ziansu's picture
Training in progress, step 1250, checkpoint
89fd9df verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.039962535123322,
"eval_steps": 50,
"global_step": 1250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00832552815069206,
"grad_norm": 0.04514288529753685,
"learning_rate": 4.999451708687114e-06,
"logits/chosen": 14.412135124206543,
"logits/rejected": 14.867518424987793,
"logps/chosen": -0.29279541969299316,
"logps/rejected": -0.33705300092697144,
"loss": 0.9248,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.43919315934181213,
"rewards/margins": 0.066386379301548,
"rewards/rejected": -0.5055795311927795,
"step": 10
},
{
"epoch": 0.01665105630138412,
"grad_norm": 0.05052826926112175,
"learning_rate": 4.997807075247147e-06,
"logits/chosen": 14.956459045410156,
"logits/rejected": 15.363263130187988,
"logps/chosen": -0.3096744120121002,
"logps/rejected": -0.36214715242385864,
"loss": 0.9355,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.46451157331466675,
"rewards/margins": 0.07870914041996002,
"rewards/rejected": -0.5432207584381104,
"step": 20
},
{
"epoch": 0.024976584452076178,
"grad_norm": 0.04879612475633621,
"learning_rate": 4.9950668210706795e-06,
"logits/chosen": 14.485757827758789,
"logits/rejected": 15.057507514953613,
"logps/chosen": -0.27136802673339844,
"logps/rejected": -0.31497400999069214,
"loss": 0.9268,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.4070519804954529,
"rewards/margins": 0.06540900468826294,
"rewards/rejected": -0.4724610447883606,
"step": 30
},
{
"epoch": 0.03330211260276824,
"grad_norm": 0.05672155320644379,
"learning_rate": 4.9912321481237616e-06,
"logits/chosen": 14.529332160949707,
"logits/rejected": 14.814855575561523,
"logps/chosen": -0.29139184951782227,
"logps/rejected": -0.31259119510650635,
"loss": 0.9267,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.4370877742767334,
"rewards/margins": 0.03179898113012314,
"rewards/rejected": -0.46888676285743713,
"step": 40
},
{
"epoch": 0.041627640753460295,
"grad_norm": 0.065071240067482,
"learning_rate": 4.986304738420684e-06,
"logits/chosen": 14.174386978149414,
"logits/rejected": 15.223234176635742,
"logps/chosen": -0.2745029330253601,
"logps/rejected": -0.37693315744400024,
"loss": 0.9243,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.41175442934036255,
"rewards/margins": 0.1536453813314438,
"rewards/rejected": -0.5653998255729675,
"step": 50
},
{
"epoch": 0.041627640753460295,
"eval_logits/chosen": 14.56569766998291,
"eval_logits/rejected": 15.157320976257324,
"eval_logps/chosen": -0.27527979016304016,
"eval_logps/rejected": -0.3633999824523926,
"eval_loss": 0.9083622694015503,
"eval_rewards/accuracies": 0.5612244606018066,
"eval_rewards/chosen": -0.41291970014572144,
"eval_rewards/margins": 0.13218028843402863,
"eval_rewards/rejected": -0.5450999736785889,
"eval_runtime": 29.029,
"eval_samples_per_second": 26.766,
"eval_steps_per_second": 3.376,
"step": 50
},
{
"epoch": 0.049953168904152356,
"grad_norm": 0.14002270996570587,
"learning_rate": 4.980286753286196e-06,
"logits/chosen": 14.408930778503418,
"logits/rejected": 14.791458129882812,
"logps/chosen": -0.285602867603302,
"logps/rejected": -0.3351826071739197,
"loss": 0.9177,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.4284043312072754,
"rewards/margins": 0.07436960190534592,
"rewards/rejected": -0.5027738809585571,
"step": 60
},
{
"epoch": 0.05827869705484442,
"grad_norm": 0.05595069006085396,
"learning_rate": 4.973180832407471e-06,
"logits/chosen": 14.41168212890625,
"logits/rejected": 14.865121841430664,
"logps/chosen": -0.25851207971572876,
"logps/rejected": -0.32240185141563416,
"loss": 0.9168,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.3877681493759155,
"rewards/margins": 0.0958346277475357,
"rewards/rejected": -0.4836028218269348,
"step": 70
},
{
"epoch": 0.06660422520553648,
"grad_norm": 0.058645494282245636,
"learning_rate": 4.964990092676263e-06,
"logits/chosen": 14.897825241088867,
"logits/rejected": 15.01073932647705,
"logps/chosen": -0.2668797969818115,
"logps/rejected": -0.3204379975795746,
"loss": 0.9242,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.4003197252750397,
"rewards/margins": 0.08033724129199982,
"rewards/rejected": -0.4806569516658783,
"step": 80
},
{
"epoch": 0.07492975335622853,
"grad_norm": 0.0597861111164093,
"learning_rate": 4.9557181268217225e-06,
"logits/chosen": 14.531021118164062,
"logits/rejected": 14.767858505249023,
"logps/chosen": -0.26787540316581726,
"logps/rejected": -0.32972821593284607,
"loss": 0.9077,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.4018131196498871,
"rewards/margins": 0.09277921915054321,
"rewards/rejected": -0.4945923686027527,
"step": 90
},
{
"epoch": 0.08325528150692059,
"grad_norm": 0.0863095372915268,
"learning_rate": 4.9453690018345144e-06,
"logits/chosen": 14.179275512695312,
"logits/rejected": 14.909070014953613,
"logps/chosen": -0.2532978057861328,
"logps/rejected": -0.35474082827568054,
"loss": 0.903,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.3799467086791992,
"rewards/margins": 0.1521645337343216,
"rewards/rejected": -0.5321112275123596,
"step": 100
},
{
"epoch": 0.08325528150692059,
"eval_logits/chosen": 14.326024055480957,
"eval_logits/rejected": 14.979863166809082,
"eval_logps/chosen": -0.2673422694206238,
"eval_logps/rejected": -0.3668619990348816,
"eval_loss": 0.8989922404289246,
"eval_rewards/accuracies": 0.6020408272743225,
"eval_rewards/chosen": -0.4010133445262909,
"eval_rewards/margins": 0.1492796391248703,
"eval_rewards/rejected": -0.5502930283546448,
"eval_runtime": 29.0209,
"eval_samples_per_second": 26.774,
"eval_steps_per_second": 3.377,
"step": 100
},
{
"epoch": 0.09158080965761266,
"grad_norm": 0.07181967049837112,
"learning_rate": 4.933947257182901e-06,
"logits/chosen": 14.118756294250488,
"logits/rejected": 14.755918502807617,
"logps/chosen": -0.27995947003364563,
"logps/rejected": -0.3749552369117737,
"loss": 0.9097,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.41993919014930725,
"rewards/margins": 0.14249366521835327,
"rewards/rejected": -0.5624328255653381,
"step": 110
},
{
"epoch": 0.09990633780830471,
"grad_norm": 0.08269819617271423,
"learning_rate": 4.921457902821578e-06,
"logits/chosen": 13.764413833618164,
"logits/rejected": 14.43315315246582,
"logps/chosen": -0.28177163004875183,
"logps/rejected": -0.3637630343437195,
"loss": 0.9075,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.42265743017196655,
"rewards/margins": 0.12298711389303207,
"rewards/rejected": -0.5456445813179016,
"step": 120
},
{
"epoch": 0.10823186595899677,
"grad_norm": 1.9071497917175293,
"learning_rate": 4.907906416994146e-06,
"logits/chosen": 14.103793144226074,
"logits/rejected": 14.727777481079102,
"logps/chosen": -0.2665451765060425,
"logps/rejected": -0.3827117085456848,
"loss": 0.9217,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.3998177647590637,
"rewards/margins": 0.1742497682571411,
"rewards/rejected": -0.5740675926208496,
"step": 130
},
{
"epoch": 0.11655739410968884,
"grad_norm": 0.12107716500759125,
"learning_rate": 4.893298743830168e-06,
"logits/chosen": 13.517863273620605,
"logits/rejected": 14.42052173614502,
"logps/chosen": -0.26627904176712036,
"logps/rejected": -0.3745174705982208,
"loss": 0.904,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.39941853284835815,
"rewards/margins": 0.16235767304897308,
"rewards/rejected": -0.5617762207984924,
"step": 140
},
{
"epoch": 0.12488292226038089,
"grad_norm": 0.1638205647468567,
"learning_rate": 4.8776412907378845e-06,
"logits/chosen": 12.83032512664795,
"logits/rejected": 13.673515319824219,
"logps/chosen": -0.24289576709270477,
"logps/rejected": -0.37163227796554565,
"loss": 0.8779,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.36434367299079895,
"rewards/margins": 0.19310477375984192,
"rewards/rejected": -0.5574483871459961,
"step": 150
},
{
"epoch": 0.12488292226038089,
"eval_logits/chosen": 12.317696571350098,
"eval_logits/rejected": 13.164616584777832,
"eval_logps/chosen": -0.266156405210495,
"eval_logps/rejected": -0.4009220004081726,
"eval_loss": 0.8768696784973145,
"eval_rewards/accuracies": 0.6224489808082581,
"eval_rewards/chosen": -0.3992346227169037,
"eval_rewards/margins": 0.20214837789535522,
"eval_rewards/rejected": -0.6013829708099365,
"eval_runtime": 29.0257,
"eval_samples_per_second": 26.769,
"eval_steps_per_second": 3.376,
"step": 150
},
{
"epoch": 0.13320845041107296,
"grad_norm": 0.1479438841342926,
"learning_rate": 4.860940925593703e-06,
"logits/chosen": 12.736433029174805,
"logits/rejected": 13.475964546203613,
"logps/chosen": -0.2913517355918884,
"logps/rejected": -0.36094629764556885,
"loss": 0.8756,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.43702763319015503,
"rewards/margins": 0.10439182817935944,
"rewards/rejected": -0.5414193868637085,
"step": 160
},
{
"epoch": 0.141533978561765,
"grad_norm": 0.17609630525112152,
"learning_rate": 4.84320497372973e-06,
"logits/chosen": 10.606362342834473,
"logits/rejected": 11.537567138671875,
"logps/chosen": -0.2560296952724457,
"logps/rejected": -0.4312233328819275,
"loss": 0.8489,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.38404449820518494,
"rewards/margins": 0.2627905011177063,
"rewards/rejected": -0.6468349695205688,
"step": 170
},
{
"epoch": 0.14985950671245707,
"grad_norm": 0.18054936826229095,
"learning_rate": 4.824441214720629e-06,
"logits/chosen": 10.13754653930664,
"logits/rejected": 10.914222717285156,
"logps/chosen": -0.29278701543807983,
"logps/rejected": -0.43448886275291443,
"loss": 0.8715,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.43918052315711975,
"rewards/margins": 0.21255281567573547,
"rewards/rejected": -0.6517333388328552,
"step": 180
},
{
"epoch": 0.15818503486314914,
"grad_norm": 0.19739146530628204,
"learning_rate": 4.804657878971252e-06,
"logits/chosen": 8.077766418457031,
"logits/rejected": 9.669368743896484,
"logps/chosen": -0.2844889760017395,
"logps/rejected": -0.5050357580184937,
"loss": 0.8582,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.42673349380493164,
"rewards/margins": 0.3308201730251312,
"rewards/rejected": -0.7575536966323853,
"step": 190
},
{
"epoch": 0.16651056301384118,
"grad_norm": 0.2397814244031906,
"learning_rate": 4.783863644106502e-06,
"logits/chosen": 6.790783882141113,
"logits/rejected": 7.849525451660156,
"logps/chosen": -0.2940555512905121,
"logps/rejected": -0.5699166059494019,
"loss": 0.8196,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.4410833418369293,
"rewards/margins": 0.41379159688949585,
"rewards/rejected": -0.8548749089241028,
"step": 200
},
{
"epoch": 0.16651056301384118,
"eval_logits/chosen": 6.290835857391357,
"eval_logits/rejected": 6.757873058319092,
"eval_logps/chosen": -0.317629337310791,
"eval_logps/rejected": -0.581989586353302,
"eval_loss": 0.8032433986663818,
"eval_rewards/accuracies": 0.6734693646430969,
"eval_rewards/chosen": -0.47644397616386414,
"eval_rewards/margins": 0.39654040336608887,
"eval_rewards/rejected": -0.8729843497276306,
"eval_runtime": 29.025,
"eval_samples_per_second": 26.77,
"eval_steps_per_second": 3.376,
"step": 200
},
{
"epoch": 0.17483609116453325,
"grad_norm": 0.2858545184135437,
"learning_rate": 4.762067631165049e-06,
"logits/chosen": 6.875879764556885,
"logits/rejected": 6.691536903381348,
"logps/chosen": -0.37194910645484924,
"logps/rejected": -0.5639354586601257,
"loss": 0.8129,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5579236745834351,
"rewards/margins": 0.2879795432090759,
"rewards/rejected": -0.8459032773971558,
"step": 210
},
{
"epoch": 0.18316161931522532,
"grad_norm": 0.30206382274627686,
"learning_rate": 4.7392794005985324e-06,
"logits/chosen": 4.656112194061279,
"logits/rejected": 4.483086585998535,
"logps/chosen": -0.360150009393692,
"logps/rejected": -0.6204283833503723,
"loss": 0.7954,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.5402250289916992,
"rewards/margins": 0.39041754603385925,
"rewards/rejected": -0.9306427240371704,
"step": 220
},
{
"epoch": 0.19148714746591736,
"grad_norm": 0.40204310417175293,
"learning_rate": 4.715508948078037e-06,
"logits/chosen": 3.9398162364959717,
"logits/rejected": 3.38537859916687,
"logps/chosen": -0.39010342955589294,
"logps/rejected": -0.7167688608169556,
"loss": 0.7664,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5851551294326782,
"rewards/margins": 0.4899981617927551,
"rewards/rejected": -1.0751533508300781,
"step": 230
},
{
"epoch": 0.19981267561660943,
"grad_norm": 0.48389795422554016,
"learning_rate": 4.690766700109659e-06,
"logits/chosen": 2.925476551055908,
"logits/rejected": 2.824068069458008,
"logps/chosen": -0.41053348779678345,
"logps/rejected": -0.8508625030517578,
"loss": 0.7606,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.6158002018928528,
"rewards/margins": 0.6604936718940735,
"rewards/rejected": -1.2762939929962158,
"step": 240
},
{
"epoch": 0.2081382037673015,
"grad_norm": 0.6687452793121338,
"learning_rate": 4.665063509461098e-06,
"logits/chosen": 2.751737594604492,
"logits/rejected": 2.2424545288085938,
"logps/chosen": -0.4365699291229248,
"logps/rejected": -0.8550359606742859,
"loss": 0.7234,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.6548548936843872,
"rewards/margins": 0.6276990175247192,
"rewards/rejected": -1.2825539112091064,
"step": 250
},
{
"epoch": 0.2081382037673015,
"eval_logits/chosen": 2.1380228996276855,
"eval_logits/rejected": 1.3922746181488037,
"eval_logps/chosen": -0.48307570815086365,
"eval_logps/rejected": -1.0382359027862549,
"eval_loss": 0.668463945388794,
"eval_rewards/accuracies": 0.6938775777816772,
"eval_rewards/chosen": -0.7246134877204895,
"eval_rewards/margins": 0.8327403664588928,
"eval_rewards/rejected": -1.5573538541793823,
"eval_runtime": 29.0228,
"eval_samples_per_second": 26.772,
"eval_steps_per_second": 3.377,
"step": 250
},
{
"epoch": 0.21646373191799353,
"grad_norm": 0.7085956335067749,
"learning_rate": 4.638410650401267e-06,
"logits/chosen": 1.7889283895492554,
"logits/rejected": 0.9420136213302612,
"logps/chosen": -0.5195389986038208,
"logps/rejected": -1.0534025430679321,
"loss": 0.6863,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.7793084979057312,
"rewards/margins": 0.8007953763008118,
"rewards/rejected": -1.580103874206543,
"step": 260
},
{
"epoch": 0.2247892600686856,
"grad_norm": 0.4416671097278595,
"learning_rate": 4.610819813755038e-06,
"logits/chosen": 1.582745909690857,
"logits/rejected": 0.3820720911026001,
"logps/chosen": -0.5181297063827515,
"logps/rejected": -1.2198141813278198,
"loss": 0.5809,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.7771945595741272,
"rewards/margins": 1.0525267124176025,
"rewards/rejected": -1.8297210931777954,
"step": 270
},
{
"epoch": 0.23311478821937767,
"grad_norm": 2.7746617794036865,
"learning_rate": 4.582303101775249e-06,
"logits/chosen": 1.2947760820388794,
"logits/rejected": 0.27237796783447266,
"logps/chosen": -0.643541693687439,
"logps/rejected": -1.7467323541641235,
"loss": 0.5775,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9653124809265137,
"rewards/margins": 1.6547861099243164,
"rewards/rejected": -2.62009859085083,
"step": 280
},
{
"epoch": 0.2414403163700697,
"grad_norm": 0.6444702744483948,
"learning_rate": 4.55287302283426e-06,
"logits/chosen": 1.2399464845657349,
"logits/rejected": 0.22667090594768524,
"logps/chosen": -0.7517040967941284,
"logps/rejected": -1.9010766744613647,
"loss": 0.5314,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.1275560855865479,
"rewards/margins": 1.724058747291565,
"rewards/rejected": -2.8516147136688232,
"step": 290
},
{
"epoch": 0.24976584452076178,
"grad_norm": 0.5103917717933655,
"learning_rate": 4.522542485937369e-06,
"logits/chosen": 1.438954472541809,
"logits/rejected": 0.5288833379745483,
"logps/chosen": -0.7871009707450867,
"logps/rejected": -2.0329811573028564,
"loss": 0.5271,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1806514263153076,
"rewards/margins": 1.8688204288482666,
"rewards/rejected": -3.049471616744995,
"step": 300
},
{
"epoch": 0.24976584452076178,
"eval_logits/chosen": 1.3706706762313843,
"eval_logits/rejected": 0.8007871508598328,
"eval_logps/chosen": -0.7460500001907349,
"eval_logps/rejected": -2.209245443344116,
"eval_loss": 0.5008835792541504,
"eval_rewards/accuracies": 0.7244898080825806,
"eval_rewards/chosen": -1.1190749406814575,
"eval_rewards/margins": 2.194793224334717,
"eval_rewards/rejected": -3.313868284225464,
"eval_runtime": 29.0227,
"eval_samples_per_second": 26.772,
"eval_steps_per_second": 3.377,
"step": 300
},
{
"epoch": 0.2580913726714538,
"grad_norm": 0.7984316945075989,
"learning_rate": 4.491324795060491e-06,
"logits/chosen": 0.9250973463058472,
"logits/rejected": 0.1887839138507843,
"logps/chosen": -0.8511486053466797,
"logps/rejected": -2.447072982788086,
"loss": 0.5506,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2767229080200195,
"rewards/margins": 2.3938865661621094,
"rewards/rejected": -3.670609712600708,
"step": 310
},
{
"epoch": 0.2664169008221459,
"grad_norm": 0.5243161916732788,
"learning_rate": 4.4592336433146e-06,
"logits/chosen": 2.437886953353882,
"logits/rejected": 1.6011940240859985,
"logps/chosen": -0.7107629776000977,
"logps/rejected": -2.132263422012329,
"loss": 0.5423,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.0661444664001465,
"rewards/margins": 2.1322507858276367,
"rewards/rejected": -3.198395013809204,
"step": 320
},
{
"epoch": 0.27474242897283796,
"grad_norm": 0.4742359220981598,
"learning_rate": 4.426283106939474e-06,
"logits/chosen": 1.8433977365493774,
"logits/rejected": 1.199568748474121,
"logps/chosen": -0.8737133145332336,
"logps/rejected": -2.1652615070343018,
"loss": 0.5015,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.3105700016021729,
"rewards/margins": 1.9373222589492798,
"rewards/rejected": -3.247892379760742,
"step": 330
},
{
"epoch": 0.28306795712353,
"grad_norm": 0.5529736280441284,
"learning_rate": 4.3924876391293915e-06,
"logits/chosen": 2.0044589042663574,
"logits/rejected": 0.9263212084770203,
"logps/chosen": -0.9175036549568176,
"logps/rejected": -2.6408374309539795,
"loss": 0.4921,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.3762553930282593,
"rewards/margins": 2.585000991821289,
"rewards/rejected": -3.961256504058838,
"step": 340
},
{
"epoch": 0.2913934852742221,
"grad_norm": 0.7060612440109253,
"learning_rate": 4.357862063693486e-06,
"logits/chosen": 2.243232250213623,
"logits/rejected": 1.6251205205917358,
"logps/chosen": -0.9481338262557983,
"logps/rejected": -2.9519124031066895,
"loss": 0.4753,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.4222007989883423,
"rewards/margins": 3.0056674480438232,
"rewards/rejected": -4.427868366241455,
"step": 350
},
{
"epoch": 0.2913934852742221,
"eval_logits/chosen": 1.7781500816345215,
"eval_logits/rejected": 1.412752628326416,
"eval_logps/chosen": -0.9692521095275879,
"eval_logps/rejected": -2.8247811794281006,
"eval_loss": 0.4446474015712738,
"eval_rewards/accuracies": 0.7346938848495483,
"eval_rewards/chosen": -1.4538781642913818,
"eval_rewards/margins": 2.7832937240600586,
"eval_rewards/rejected": -4.2371721267700195,
"eval_runtime": 29.0245,
"eval_samples_per_second": 26.77,
"eval_steps_per_second": 3.376,
"step": 350
},
{
"epoch": 0.29971901342491414,
"grad_norm": 0.9664792418479919,
"learning_rate": 4.322421568553529e-06,
"logits/chosen": 1.7094570398330688,
"logits/rejected": 1.1617993116378784,
"logps/chosen": -0.992924690246582,
"logps/rejected": -2.7834811210632324,
"loss": 0.4972,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.4893869161605835,
"rewards/margins": 2.6858346462249756,
"rewards/rejected": -4.1752214431762695,
"step": 360
},
{
"epoch": 0.3080445415756062,
"grad_norm": 0.7800536155700684,
"learning_rate": 4.286181699082008e-06,
"logits/chosen": 2.9170143604278564,
"logits/rejected": 2.384690523147583,
"logps/chosen": -1.0323909521102905,
"logps/rejected": -2.726369857788086,
"loss": 0.4689,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.548586368560791,
"rewards/margins": 2.540968418121338,
"rewards/rejected": -4.089555263519287,
"step": 370
},
{
"epoch": 0.3163700697262983,
"grad_norm": 1.3163660764694214,
"learning_rate": 4.249158351283414e-06,
"logits/chosen": 2.780831813812256,
"logits/rejected": 1.753291130065918,
"logps/chosen": -1.0468894243240356,
"logps/rejected": -2.7425389289855957,
"loss": 0.4835,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.5703339576721191,
"rewards/margins": 2.5434746742248535,
"rewards/rejected": -4.113808631896973,
"step": 380
},
{
"epoch": 0.3246955978769903,
"grad_norm": 0.6381780505180359,
"learning_rate": 4.211367764821722e-06,
"logits/chosen": 2.585071086883545,
"logits/rejected": 1.9254558086395264,
"logps/chosen": -1.2089946269989014,
"logps/rejected": -3.615030288696289,
"loss": 0.4518,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.8134920597076416,
"rewards/margins": 3.6090526580810547,
"rewards/rejected": -5.422544956207275,
"step": 390
},
{
"epoch": 0.33302112602768236,
"grad_norm": 0.9214782118797302,
"learning_rate": 4.172826515897146e-06,
"logits/chosen": 1.9765586853027344,
"logits/rejected": 1.1926987171173096,
"logps/chosen": -1.2852815389633179,
"logps/rejected": -3.786972761154175,
"loss": 0.4165,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.9279224872589111,
"rewards/margins": 3.7525367736816406,
"rewards/rejected": -5.680459022521973,
"step": 400
},
{
"epoch": 0.33302112602768236,
"eval_logits/chosen": 2.6366844177246094,
"eval_logits/rejected": 2.394319534301758,
"eval_logps/chosen": -1.322396993637085,
"eval_logps/rejected": -3.686817169189453,
"eval_loss": 0.4065541923046112,
"eval_rewards/accuracies": 0.7551020383834839,
"eval_rewards/chosen": -1.9835957288742065,
"eval_rewards/margins": 3.5466296672821045,
"eval_rewards/rejected": -5.5302252769470215,
"eval_runtime": 29.025,
"eval_samples_per_second": 26.77,
"eval_steps_per_second": 3.376,
"step": 400
},
{
"epoch": 0.34134665417837445,
"grad_norm": 1.5113208293914795,
"learning_rate": 4.133551509975264e-06,
"logits/chosen": 2.0068416595458984,
"logits/rejected": 1.5152744054794312,
"logps/chosen": -1.5090525150299072,
"logps/rejected": -3.9272122383117676,
"loss": 0.4004,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.2635788917541504,
"rewards/margins": 3.627239227294922,
"rewards/rejected": -5.890818119049072,
"step": 410
},
{
"epoch": 0.3496721823290665,
"grad_norm": 11.516369819641113,
"learning_rate": 4.093559974371725e-06,
"logits/chosen": 3.343449115753174,
"logits/rejected": 2.920070171356201,
"logps/chosen": -1.8312532901763916,
"logps/rejected": -4.115124702453613,
"loss": 0.4045,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.746879816055298,
"rewards/margins": 3.425807476043701,
"rewards/rejected": -6.17268705368042,
"step": 420
},
{
"epoch": 0.35799771047975854,
"grad_norm": 3.0497395992279053,
"learning_rate": 4.052869450695776e-06,
"logits/chosen": 2.5527279376983643,
"logits/rejected": 2.2495744228363037,
"logps/chosen": -2.2998366355895996,
"logps/rejected": -4.966278076171875,
"loss": 0.3758,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.4497551918029785,
"rewards/margins": 3.9996612071990967,
"rewards/rejected": -7.4494171142578125,
"step": 430
},
{
"epoch": 0.36632323863045063,
"grad_norm": 3.900503158569336,
"learning_rate": 4.011497787155938e-06,
"logits/chosen": 2.4560112953186035,
"logits/rejected": 2.3936328887939453,
"logps/chosen": -2.563218593597412,
"logps/rejected": -5.063398838043213,
"loss": 0.3739,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.8448281288146973,
"rewards/margins": 3.750270366668701,
"rewards/rejected": -7.595097541809082,
"step": 440
},
{
"epoch": 0.3746487667811427,
"grad_norm": 2.8846070766448975,
"learning_rate": 3.969463130731183e-06,
"logits/chosen": 2.5467796325683594,
"logits/rejected": 2.4370405673980713,
"logps/chosen": -2.4494822025299072,
"logps/rejected": -5.12601900100708,
"loss": 0.2905,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.6742234230041504,
"rewards/margins": 4.014804840087891,
"rewards/rejected": -7.689028263092041,
"step": 450
},
{
"epoch": 0.3746487667811427,
"eval_logits/chosen": 2.922081232070923,
"eval_logits/rejected": 2.879075050354004,
"eval_logps/chosen": -2.352473020553589,
"eval_logps/rejected": -5.1224799156188965,
"eval_loss": 0.3302614390850067,
"eval_rewards/accuracies": 0.8673469424247742,
"eval_rewards/chosen": -3.5287091732025146,
"eval_rewards/margins": 4.155009746551514,
"eval_rewards/rejected": -7.683719635009766,
"eval_runtime": 29.0235,
"eval_samples_per_second": 26.771,
"eval_steps_per_second": 3.377,
"step": 450
},
{
"epoch": 0.3829742949318347,
"grad_norm": 4.662614345550537,
"learning_rate": 3.92678391921108e-06,
"logits/chosen": 2.428154468536377,
"logits/rejected": 2.2403202056884766,
"logps/chosen": -2.5936172008514404,
"logps/rejected": -5.356133460998535,
"loss": 0.2881,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.89042592048645,
"rewards/margins": 4.143774509429932,
"rewards/rejected": -8.034199714660645,
"step": 460
},
{
"epoch": 0.3912998230825268,
"grad_norm": 2.716899871826172,
"learning_rate": 3.88347887310836e-06,
"logits/chosen": 2.437295436859131,
"logits/rejected": 2.271914005279541,
"logps/chosen": -2.470245361328125,
"logps/rejected": -5.719494819641113,
"loss": 0.31,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -3.70536732673645,
"rewards/margins": 4.873874187469482,
"rewards/rejected": -8.579241752624512,
"step": 470
},
{
"epoch": 0.39962535123321885,
"grad_norm": 3.343271255493164,
"learning_rate": 3.839566987447492e-06,
"logits/chosen": 2.144461154937744,
"logits/rejected": 2.0314810276031494,
"logps/chosen": -2.5805585384368896,
"logps/rejected": -5.418456077575684,
"loss": 0.3194,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.870838165283203,
"rewards/margins": 4.256844997406006,
"rewards/rejected": -8.12768268585205,
"step": 480
},
{
"epoch": 0.4079508793839109,
"grad_norm": 6.411283493041992,
"learning_rate": 3.795067523432826e-06,
"logits/chosen": 2.408092498779297,
"logits/rejected": 2.2996156215667725,
"logps/chosen": -2.8846375942230225,
"logps/rejected": -5.957771301269531,
"loss": 0.3353,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -4.326956748962402,
"rewards/margins": 4.6097002029418945,
"rewards/rejected": -8.936657905578613,
"step": 490
},
{
"epoch": 0.416276407534603,
"grad_norm": 3.2472238540649414,
"learning_rate": 3.7500000000000005e-06,
"logits/chosen": 3.0815653800964355,
"logits/rejected": 2.8496975898742676,
"logps/chosen": -3.061626434326172,
"logps/rejected": -5.966124534606934,
"loss": 0.3018,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -4.592440128326416,
"rewards/margins": 4.356747627258301,
"rewards/rejected": -8.949186325073242,
"step": 500
},
{
"epoch": 0.416276407534603,
"eval_logits/chosen": 2.7115373611450195,
"eval_logits/rejected": 2.763493061065674,
"eval_logps/chosen": -2.85333251953125,
"eval_logps/rejected": -5.915884017944336,
"eval_loss": 0.3079966604709625,
"eval_rewards/accuracies": 0.8979591727256775,
"eval_rewards/chosen": -4.279998302459717,
"eval_rewards/margins": 4.593828201293945,
"eval_rewards/rejected": -8.873826026916504,
"eval_runtime": 29.0268,
"eval_samples_per_second": 26.768,
"eval_steps_per_second": 3.376,
"step": 500
},
{
"epoch": 0.42460193568529503,
"grad_norm": 10.017457962036133,
"learning_rate": 3.7043841852542884e-06,
"logits/chosen": 2.775202989578247,
"logits/rejected": 2.6122496128082275,
"logps/chosen": -3.0054879188537598,
"logps/rejected": -6.258307456970215,
"loss": 0.3101,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -4.5082316398620605,
"rewards/margins": 4.879229545593262,
"rewards/rejected": -9.387460708618164,
"step": 510
},
{
"epoch": 0.43292746383598707,
"grad_norm": 4.494226932525635,
"learning_rate": 3.658240087799655e-06,
"logits/chosen": 2.816701889038086,
"logits/rejected": 2.4107789993286133,
"logps/chosen": -3.2932097911834717,
"logps/rejected": -6.099677562713623,
"loss": 0.2925,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.939814567565918,
"rewards/margins": 4.209702014923096,
"rewards/rejected": -9.149517059326172,
"step": 520
},
{
"epoch": 0.44125299198667917,
"grad_norm": 2.957486391067505,
"learning_rate": 3.611587947962319e-06,
"logits/chosen": 2.3626818656921387,
"logits/rejected": 2.4196550846099854,
"logps/chosen": -3.085209608078003,
"logps/rejected": -6.118277072906494,
"loss": 0.3169,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -4.627814292907715,
"rewards/margins": 4.549601078033447,
"rewards/rejected": -9.17741584777832,
"step": 530
},
{
"epoch": 0.4495785201373712,
"grad_norm": 3.429408550262451,
"learning_rate": 3.564448228912682e-06,
"logits/chosen": 2.559816360473633,
"logits/rejected": 2.598250150680542,
"logps/chosen": -3.3060078620910645,
"logps/rejected": -6.124637126922607,
"loss": 0.3271,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -4.959012031555176,
"rewards/margins": 4.227944850921631,
"rewards/rejected": -9.186956405639648,
"step": 540
},
{
"epoch": 0.45790404828806325,
"grad_norm": 2.110722780227661,
"learning_rate": 3.516841607689501e-06,
"logits/chosen": 2.4487693309783936,
"logits/rejected": 2.0568625926971436,
"logps/chosen": -3.396770477294922,
"logps/rejected": -6.35222864151001,
"loss": 0.3172,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -5.095156192779541,
"rewards/margins": 4.4331865310668945,
"rewards/rejected": -9.528343200683594,
"step": 550
},
{
"epoch": 0.45790404828806325,
"eval_logits/chosen": 2.5644595623016357,
"eval_logits/rejected": 2.6437506675720215,
"eval_logps/chosen": -3.1958370208740234,
"eval_logps/rejected": -6.542325496673584,
"eval_loss": 0.28538385033607483,
"eval_rewards/accuracies": 0.918367326259613,
"eval_rewards/chosen": -4.793755054473877,
"eval_rewards/margins": 5.0197319984436035,
"eval_rewards/rejected": -9.813486099243164,
"eval_runtime": 29.0252,
"eval_samples_per_second": 26.77,
"eval_steps_per_second": 3.376,
"step": 550
},
{
"epoch": 0.46622957643875534,
"grad_norm": 2.0929551124572754,
"learning_rate": 3.4687889661302577e-06,
"logits/chosen": 2.497122287750244,
"logits/rejected": 2.1119792461395264,
"logps/chosen": -3.586158037185669,
"logps/rejected": -6.939994812011719,
"loss": 0.2826,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -5.379237174987793,
"rewards/margins": 5.030755043029785,
"rewards/rejected": -10.409992218017578,
"step": 560
},
{
"epoch": 0.4745551045894474,
"grad_norm": 3.344160556793213,
"learning_rate": 3.4203113817116955e-06,
"logits/chosen": 3.181488275527954,
"logits/rejected": 2.8188672065734863,
"logps/chosen": -3.465902328491211,
"logps/rejected": -6.737443447113037,
"loss": 0.3027,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -5.198853492736816,
"rewards/margins": 4.90731143951416,
"rewards/rejected": -10.106164932250977,
"step": 570
},
{
"epoch": 0.4828806327401394,
"grad_norm": 6.381539344787598,
"learning_rate": 3.3714301183045382e-06,
"logits/chosen": 3.8848679065704346,
"logits/rejected": 3.54484224319458,
"logps/chosen": -3.321965456008911,
"logps/rejected": -6.796433448791504,
"loss": 0.2619,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -4.982948303222656,
"rewards/margins": 5.211700916290283,
"rewards/rejected": -10.194650650024414,
"step": 580
},
{
"epoch": 0.4912061608908315,
"grad_norm": 3.058936834335327,
"learning_rate": 3.3221666168464584e-06,
"logits/chosen": 2.9645297527313232,
"logits/rejected": 2.7630581855773926,
"logps/chosen": -3.2019195556640625,
"logps/rejected": -6.635239601135254,
"loss": 0.2573,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -4.802879810333252,
"rewards/margins": 5.149979114532471,
"rewards/rejected": -9.952859878540039,
"step": 590
},
{
"epoch": 0.49953168904152356,
"grad_norm": 4.1828155517578125,
"learning_rate": 3.272542485937369e-06,
"logits/chosen": 2.696993350982666,
"logits/rejected": 2.7842001914978027,
"logps/chosen": -3.3624558448791504,
"logps/rejected": -6.4542059898376465,
"loss": 0.2598,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -5.043683052062988,
"rewards/margins": 4.637625217437744,
"rewards/rejected": -9.68130874633789,
"step": 600
},
{
"epoch": 0.49953168904152356,
"eval_logits/chosen": 2.9141366481781006,
"eval_logits/rejected": 2.9971513748168945,
"eval_logps/chosen": -3.1258208751678467,
"eval_logps/rejected": -6.787447452545166,
"eval_loss": 0.27035781741142273,
"eval_rewards/accuracies": 0.918367326259613,
"eval_rewards/chosen": -4.688731670379639,
"eval_rewards/margins": 5.492439270019531,
"eval_rewards/rejected": -10.181171417236328,
"eval_runtime": 29.0227,
"eval_samples_per_second": 26.772,
"eval_steps_per_second": 3.377,
"step": 600
},
{
"epoch": 0.5078572171922157,
"grad_norm": 3.1104886531829834,
"learning_rate": 3.222579492361179e-06,
"logits/chosen": 2.582984447479248,
"logits/rejected": 2.424341917037964,
"logps/chosen": -3.0132031440734863,
"logps/rejected": -6.317469596862793,
"loss": 0.2598,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.519804954528809,
"rewards/margins": 4.956398963928223,
"rewards/rejected": -9.476203918457031,
"step": 610
},
{
"epoch": 0.5161827453429076,
"grad_norm": 12.320380210876465,
"learning_rate": 3.1722995515381644e-06,
"logits/chosen": 2.1016178131103516,
"logits/rejected": 2.345324754714966,
"logps/chosen": -3.1399683952331543,
"logps/rejected": -7.096994876861572,
"loss": 0.2601,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -4.709952354431152,
"rewards/margins": 5.935539722442627,
"rewards/rejected": -10.645492553710938,
"step": 620
},
{
"epoch": 0.5245082734935997,
"grad_norm": 2.704423189163208,
"learning_rate": 3.121724717912138e-06,
"logits/chosen": 2.108675718307495,
"logits/rejected": 2.369410991668701,
"logps/chosen": -3.6519737243652344,
"logps/rejected": -6.964946746826172,
"loss": 0.2351,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -5.477960586547852,
"rewards/margins": 4.96945858001709,
"rewards/rejected": -10.447419166564941,
"step": 630
},
{
"epoch": 0.5328338016442918,
"grad_norm": 4.401206970214844,
"learning_rate": 3.0708771752766397e-06,
"logits/chosen": 2.3692595958709717,
"logits/rejected": 2.5313620567321777,
"logps/chosen": -4.0485663414001465,
"logps/rejected": -7.747661590576172,
"loss": 0.2265,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -6.072849750518799,
"rewards/margins": 5.548642635345459,
"rewards/rejected": -11.621491432189941,
"step": 640
},
{
"epoch": 0.5411593297949838,
"grad_norm": 4.68662166595459,
"learning_rate": 3.019779227044398e-06,
"logits/chosen": 2.4383034706115723,
"logits/rejected": 2.4655585289001465,
"logps/chosen": -3.8650074005126953,
"logps/rejected": -7.987051963806152,
"loss": 0.263,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -5.797511100769043,
"rewards/margins": 6.183066368103027,
"rewards/rejected": -11.98057746887207,
"step": 650
},
{
"epoch": 0.5411593297949838,
"eval_logits/chosen": 2.7321341037750244,
"eval_logits/rejected": 2.906801700592041,
"eval_logps/chosen": -3.7255136966705322,
"eval_logps/rejected": -7.620375633239746,
"eval_loss": 0.26394686102867126,
"eval_rewards/accuracies": 0.9285714030265808,
"eval_rewards/chosen": -5.5882697105407715,
"eval_rewards/margins": 5.8422932624816895,
"eval_rewards/rejected": -11.430564880371094,
"eval_runtime": 29.0258,
"eval_samples_per_second": 26.769,
"eval_steps_per_second": 3.376,
"step": 650
},
{
"epoch": 0.5494848579456759,
"grad_norm": 4.704371929168701,
"learning_rate": 2.9684532864643123e-06,
"logits/chosen": 2.7277207374572754,
"logits/rejected": 2.7106287479400635,
"logps/chosen": -3.979590654373169,
"logps/rejected": -6.88008975982666,
"loss": 0.2933,
"rewards/accuracies": 0.875,
"rewards/chosen": -5.969386100769043,
"rewards/margins": 4.350748062133789,
"rewards/rejected": -10.320135116577148,
"step": 660
},
{
"epoch": 0.557810386096368,
"grad_norm": 3.2897160053253174,
"learning_rate": 2.9169218667902562e-06,
"logits/chosen": 2.207106113433838,
"logits/rejected": 2.454056978225708,
"logps/chosen": -3.760200023651123,
"logps/rejected": -7.504108428955078,
"loss": 0.2262,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -5.6402997970581055,
"rewards/margins": 5.615862846374512,
"rewards/rejected": -11.256162643432617,
"step": 670
},
{
"epoch": 0.56613591424706,
"grad_norm": 3.6699540615081787,
"learning_rate": 2.8652075714060296e-06,
"logits/chosen": 2.5904622077941895,
"logits/rejected": 2.693467617034912,
"logps/chosen": -3.2713139057159424,
"logps/rejected": -7.3422722816467285,
"loss": 0.2721,
"rewards/accuracies": 0.9375,
"rewards/chosen": -4.906970500946045,
"rewards/margins": 6.106438636779785,
"rewards/rejected": -11.013408660888672,
"step": 680
},
{
"epoch": 0.5744614423977521,
"grad_norm": 3.054532289505005,
"learning_rate": 2.813333083910761e-06,
"logits/chosen": 2.9145145416259766,
"logits/rejected": 2.7135214805603027,
"logps/chosen": -3.5082690715789795,
"logps/rejected": -7.293328762054443,
"loss": 0.271,
"rewards/accuracies": 0.9375,
"rewards/chosen": -5.26240348815918,
"rewards/margins": 5.677589416503906,
"rewards/rejected": -10.939992904663086,
"step": 690
},
{
"epoch": 0.5827869705484442,
"grad_norm": 3.5161256790161133,
"learning_rate": 2.761321158169134e-06,
"logits/chosen": 2.915343761444092,
"logits/rejected": 2.731520891189575,
"logps/chosen": -3.4292550086975098,
"logps/rejected": -8.124921798706055,
"loss": 0.1985,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -5.143881797790527,
"rewards/margins": 7.043501377105713,
"rewards/rejected": -12.187383651733398,
"step": 700
},
{
"epoch": 0.5827869705484442,
"eval_logits/chosen": 2.5902156829833984,
"eval_logits/rejected": 2.774846315383911,
"eval_logps/chosen": -3.5158140659332275,
"eval_logps/rejected": -7.544556140899658,
"eval_loss": 0.24698135256767273,
"eval_rewards/accuracies": 0.9285714030265808,
"eval_rewards/chosen": -5.273721694946289,
"eval_rewards/margins": 6.043112754821777,
"eval_rewards/rejected": -11.31683349609375,
"eval_runtime": 29.0187,
"eval_samples_per_second": 26.776,
"eval_steps_per_second": 3.377,
"step": 700
},
{
"epoch": 0.5911124986991362,
"grad_norm": 3.2246947288513184,
"learning_rate": 2.70919460833079e-06,
"logits/chosen": 2.9566922187805176,
"logits/rejected": 2.874277353286743,
"logps/chosen": -3.772322177886963,
"logps/rejected": -7.461319923400879,
"loss": 0.2565,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -5.658483028411865,
"rewards/margins": 5.533496856689453,
"rewards/rejected": -11.191980361938477,
"step": 710
},
{
"epoch": 0.5994380268498283,
"grad_norm": 4.457447052001953,
"learning_rate": 2.6569762988232838e-06,
"logits/chosen": 2.653148889541626,
"logits/rejected": 2.646437168121338,
"logps/chosen": -3.8250937461853027,
"logps/rejected": -7.855221748352051,
"loss": 0.2244,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -5.737640857696533,
"rewards/margins": 6.045191287994385,
"rewards/rejected": -11.782832145690918,
"step": 720
},
{
"epoch": 0.6077635550005204,
"grad_norm": 3.477293014526367,
"learning_rate": 2.604689134322999e-06,
"logits/chosen": 2.2635607719421387,
"logits/rejected": 2.2247064113616943,
"logps/chosen": -3.974703550338745,
"logps/rejected": -8.289571762084961,
"loss": 0.2294,
"rewards/accuracies": 0.9375,
"rewards/chosen": -5.962055206298828,
"rewards/margins": 6.4723029136657715,
"rewards/rejected": -12.434357643127441,
"step": 730
},
{
"epoch": 0.6160890831512124,
"grad_norm": 1.6821621656417847,
"learning_rate": 2.5523560497083927e-06,
"logits/chosen": 1.8432185649871826,
"logits/rejected": 1.9002739191055298,
"logps/chosen": -3.8650963306427,
"logps/rejected": -7.553779602050781,
"loss": 0.2221,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -5.79764461517334,
"rewards/margins": 5.533024787902832,
"rewards/rejected": -11.330669403076172,
"step": 740
},
{
"epoch": 0.6244146113019045,
"grad_norm": 24.729644775390625,
"learning_rate": 2.5e-06,
"logits/chosen": 2.5135562419891357,
"logits/rejected": 2.6035869121551514,
"logps/chosen": -3.6619372367858887,
"logps/rejected": -7.801999568939209,
"loss": 0.2724,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -5.492905616760254,
"rewards/margins": 6.2100934982299805,
"rewards/rejected": -11.702998161315918,
"step": 750
},
{
"epoch": 0.6244146113019045,
"eval_logits/chosen": 2.876950979232788,
"eval_logits/rejected": 3.0243964195251465,
"eval_logps/chosen": -3.517216682434082,
"eval_logps/rejected": -7.607268810272217,
"eval_loss": 0.24484822154045105,
"eval_rewards/accuracies": 0.9387755393981934,
"eval_rewards/chosen": -5.275824546813965,
"eval_rewards/margins": 6.135078430175781,
"eval_rewards/rejected": -11.410903930664062,
"eval_runtime": 28.9129,
"eval_samples_per_second": 26.874,
"eval_steps_per_second": 3.389,
"step": 750
},
{
"epoch": 0.6327401394525966,
"grad_norm": 9.702905654907227,
"learning_rate": 2.447643950291608e-06,
"logits/chosen": 2.693587064743042,
"logits/rejected": 2.6106948852539062,
"logps/chosen": -3.7441153526306152,
"logps/rejected": -7.564157009124756,
"loss": 0.2506,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -5.61617374420166,
"rewards/margins": 5.730062961578369,
"rewards/rejected": -11.346236228942871,
"step": 760
},
{
"epoch": 0.6410656676032885,
"grad_norm": 8.551860809326172,
"learning_rate": 2.3953108656770018e-06,
"logits/chosen": 2.894711971282959,
"logits/rejected": 3.036170482635498,
"logps/chosen": -3.972269058227539,
"logps/rejected": -8.38014030456543,
"loss": 0.2107,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -5.958403587341309,
"rewards/margins": 6.6118059158325195,
"rewards/rejected": -12.570208549499512,
"step": 770
},
{
"epoch": 0.6493911957539806,
"grad_norm": 2.4394350051879883,
"learning_rate": 2.3430237011767166e-06,
"logits/chosen": 3.1415820121765137,
"logits/rejected": 3.1218018531799316,
"logps/chosen": -4.007376194000244,
"logps/rejected": -8.103262901306152,
"loss": 0.1886,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -6.011064052581787,
"rewards/margins": 6.1438307762146,
"rewards/rejected": -12.154894828796387,
"step": 780
},
{
"epoch": 0.6577167239046727,
"grad_norm": 3.69184947013855,
"learning_rate": 2.290805391669212e-06,
"logits/chosen": 3.3487350940704346,
"logits/rejected": 3.5375237464904785,
"logps/chosen": -3.7646141052246094,
"logps/rejected": -7.569940090179443,
"loss": 0.2106,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -5.646921634674072,
"rewards/margins": 5.707989692687988,
"rewards/rejected": -11.354910850524902,
"step": 790
},
{
"epoch": 0.6660422520553647,
"grad_norm": 4.604506015777588,
"learning_rate": 2.238678841830867e-06,
"logits/chosen": 3.159898519515991,
"logits/rejected": 3.09334135055542,
"logps/chosen": -4.009636878967285,
"logps/rejected": -7.4454545974731445,
"loss": 0.2379,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -6.014455318450928,
"rewards/margins": 5.1537251472473145,
"rewards/rejected": -11.168180465698242,
"step": 800
},
{
"epoch": 0.6660422520553647,
"eval_logits/chosen": 2.748328924179077,
"eval_logits/rejected": 2.9500906467437744,
"eval_logps/chosen": -3.652164936065674,
"eval_logps/rejected": -7.951470375061035,
"eval_loss": 0.23568958044052124,
"eval_rewards/accuracies": 0.9387755393981934,
"eval_rewards/chosen": -5.478247165679932,
"eval_rewards/margins": 6.448958396911621,
"eval_rewards/rejected": -11.927205085754395,
"eval_runtime": 29.021,
"eval_samples_per_second": 26.774,
"eval_steps_per_second": 3.377,
"step": 800
},
{
"epoch": 0.6743677802060568,
"grad_norm": 3.968970537185669,
"learning_rate": 2.186666916089239e-06,
"logits/chosen": 2.384208917617798,
"logits/rejected": 2.3336739540100098,
"logps/chosen": -3.8832621574401855,
"logps/rejected": -7.72598123550415,
"loss": 0.2706,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -5.824892520904541,
"rewards/margins": 5.764077663421631,
"rewards/rejected": -11.588971138000488,
"step": 810
},
{
"epoch": 0.6826933083567489,
"grad_norm": 3.6892929077148438,
"learning_rate": 2.134792428593971e-06,
"logits/chosen": 3.5869107246398926,
"logits/rejected": 3.517749786376953,
"logps/chosen": -3.306342363357544,
"logps/rejected": -7.020272254943848,
"loss": 0.2398,
"rewards/accuracies": 0.875,
"rewards/chosen": -4.9595136642456055,
"rewards/margins": 5.570894718170166,
"rewards/rejected": -10.530407905578613,
"step": 820
},
{
"epoch": 0.6910188365074409,
"grad_norm": 4.89448881149292,
"learning_rate": 2.0830781332097446e-06,
"logits/chosen": 2.5076346397399902,
"logits/rejected": 2.3836727142333984,
"logps/chosen": -3.843027114868164,
"logps/rejected": -7.852384090423584,
"loss": 0.2116,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -5.764540672302246,
"rewards/margins": 6.014035701751709,
"rewards/rejected": -11.77857494354248,
"step": 830
},
{
"epoch": 0.699344364658133,
"grad_norm": 8.198432922363281,
"learning_rate": 2.031546713535688e-06,
"logits/chosen": 2.5533287525177,
"logits/rejected": 2.407637357711792,
"logps/chosen": -3.574105739593506,
"logps/rejected": -8.23727798461914,
"loss": 0.2415,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -5.361158847808838,
"rewards/margins": 6.994758605957031,
"rewards/rejected": -12.355916976928711,
"step": 840
},
{
"epoch": 0.7076698928088251,
"grad_norm": 4.123171329498291,
"learning_rate": 1.9802207729556023e-06,
"logits/chosen": 2.4909422397613525,
"logits/rejected": 2.3119165897369385,
"logps/chosen": -3.927218198776245,
"logps/rejected": -7.961021423339844,
"loss": 0.2217,
"rewards/accuracies": 0.9375,
"rewards/chosen": -5.89082670211792,
"rewards/margins": 6.050704002380371,
"rewards/rejected": -11.94153118133545,
"step": 850
},
{
"epoch": 0.7076698928088251,
"eval_logits/chosen": 2.858954668045044,
"eval_logits/rejected": 3.012629270553589,
"eval_logps/chosen": -3.577458381652832,
"eval_logps/rejected": -7.837220668792725,
"eval_loss": 0.23848077654838562,
"eval_rewards/accuracies": 0.9387755393981934,
"eval_rewards/chosen": -5.36618709564209,
"eval_rewards/margins": 6.389642715454102,
"eval_rewards/rejected": -11.755829811096191,
"eval_runtime": 29.02,
"eval_samples_per_second": 26.775,
"eval_steps_per_second": 3.377,
"step": 850
},
{
"epoch": 0.7159954209595171,
"grad_norm": 3.4179177284240723,
"learning_rate": 1.9291228247233607e-06,
"logits/chosen": 2.535378932952881,
"logits/rejected": 2.5335640907287598,
"logps/chosen": -3.541815996170044,
"logps/rejected": -7.519083499908447,
"loss": 0.2167,
"rewards/accuracies": 0.9375,
"rewards/chosen": -5.312723159790039,
"rewards/margins": 5.965902328491211,
"rewards/rejected": -11.27862548828125,
"step": 860
},
{
"epoch": 0.7243209491102092,
"grad_norm": 1.8562341928482056,
"learning_rate": 1.8782752820878636e-06,
"logits/chosen": 3.0650887489318848,
"logits/rejected": 2.7918925285339355,
"logps/chosen": -3.791342258453369,
"logps/rejected": -7.656645774841309,
"loss": 0.1925,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -5.687013149261475,
"rewards/margins": 5.7979559898376465,
"rewards/rejected": -11.484968185424805,
"step": 870
},
{
"epoch": 0.7326464772609013,
"grad_norm": 9.719799995422363,
"learning_rate": 1.827700448461836e-06,
"logits/chosen": 2.4594621658325195,
"logits/rejected": 2.4324564933776855,
"logps/chosen": -3.6558470726013184,
"logps/rejected": -8.101290702819824,
"loss": 0.1975,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -5.483770847320557,
"rewards/margins": 6.6681647300720215,
"rewards/rejected": -12.151935577392578,
"step": 880
},
{
"epoch": 0.7409720054115932,
"grad_norm": 3.240176200866699,
"learning_rate": 1.7774205076388207e-06,
"logits/chosen": 2.689762592315674,
"logits/rejected": 2.553614616394043,
"logps/chosen": -3.3837451934814453,
"logps/rejected": -7.7740020751953125,
"loss": 0.177,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.075617790222168,
"rewards/margins": 6.585384368896484,
"rewards/rejected": -11.661002159118652,
"step": 890
},
{
"epoch": 0.7492975335622853,
"grad_norm": 3.8752946853637695,
"learning_rate": 1.7274575140626318e-06,
"logits/chosen": 3.2561440467834473,
"logits/rejected": 3.13822603225708,
"logps/chosen": -3.69258451461792,
"logps/rejected": -7.472433567047119,
"loss": 0.213,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -5.538876533508301,
"rewards/margins": 5.669772624969482,
"rewards/rejected": -11.208650588989258,
"step": 900
},
{
"epoch": 0.7492975335622853,
"eval_logits/chosen": 2.8268215656280518,
"eval_logits/rejected": 3.031662702560425,
"eval_logps/chosen": -3.6311440467834473,
"eval_logps/rejected": -8.067394256591797,
"eval_loss": 0.23127013444900513,
"eval_rewards/accuracies": 0.9285714030265808,
"eval_rewards/chosen": -5.44671630859375,
"eval_rewards/margins": 6.654376029968262,
"eval_rewards/rejected": -12.101091384887695,
"eval_runtime": 29.022,
"eval_samples_per_second": 26.773,
"eval_steps_per_second": 3.377,
"step": 900
},
{
"epoch": 0.7576230617129774,
"grad_norm": 3.4024012088775635,
"learning_rate": 1.677833383153542e-06,
"logits/chosen": 2.0691773891448975,
"logits/rejected": 2.190563201904297,
"logps/chosen": -3.483668565750122,
"logps/rejected": -8.020956039428711,
"loss": 0.198,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -5.225502967834473,
"rewards/margins": 6.805932521820068,
"rewards/rejected": -12.0314359664917,
"step": 910
},
{
"epoch": 0.7659485898636694,
"grad_norm": 4.999133586883545,
"learning_rate": 1.6285698816954626e-06,
"logits/chosen": 2.4453094005584717,
"logits/rejected": 2.440931558609009,
"logps/chosen": -4.1138916015625,
"logps/rejected": -8.617280960083008,
"loss": 0.253,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -6.170836925506592,
"rewards/margins": 6.7550835609436035,
"rewards/rejected": -12.925920486450195,
"step": 920
},
{
"epoch": 0.7742741180143615,
"grad_norm": 3.1391687393188477,
"learning_rate": 1.5796886182883053e-06,
"logits/chosen": 2.892235517501831,
"logits/rejected": 2.8754334449768066,
"logps/chosen": -3.8762309551239014,
"logps/rejected": -7.991665840148926,
"loss": 0.2171,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -5.814346790313721,
"rewards/margins": 6.173151969909668,
"rewards/rejected": -11.98749828338623,
"step": 930
},
{
"epoch": 0.7825996461650536,
"grad_norm": 6.850193023681641,
"learning_rate": 1.5312110338697427e-06,
"logits/chosen": 3.0068447589874268,
"logits/rejected": 3.0385780334472656,
"logps/chosen": -3.7039177417755127,
"logps/rejected": -8.53662109375,
"loss": 0.1907,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -5.555876731872559,
"rewards/margins": 7.2490553855896,
"rewards/rejected": -12.804931640625,
"step": 940
},
{
"epoch": 0.7909251743157456,
"grad_norm": 16.202392578125,
"learning_rate": 1.4831583923105e-06,
"logits/chosen": 2.445254325866699,
"logits/rejected": 2.6017098426818848,
"logps/chosen": -4.0695037841796875,
"logps/rejected": -8.545947074890137,
"loss": 0.2033,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -6.104255676269531,
"rewards/margins": 6.714664459228516,
"rewards/rejected": -12.818921089172363,
"step": 950
},
{
"epoch": 0.7909251743157456,
"eval_logits/chosen": 2.7845335006713867,
"eval_logits/rejected": 3.037020206451416,
"eval_logps/chosen": -3.982541799545288,
"eval_logps/rejected": -8.498592376708984,
"eval_loss": 0.22774070501327515,
"eval_rewards/accuracies": 0.9387755393981934,
"eval_rewards/chosen": -5.973812580108643,
"eval_rewards/margins": 6.77407693862915,
"eval_rewards/rejected": -12.747888565063477,
"eval_runtime": 29.0201,
"eval_samples_per_second": 26.775,
"eval_steps_per_second": 3.377,
"step": 950
},
{
"epoch": 0.7992507024664377,
"grad_norm": 4.31044864654541,
"learning_rate": 1.4355517710873184e-06,
"logits/chosen": 2.1485049724578857,
"logits/rejected": 2.493374824523926,
"logps/chosen": -3.8115482330322266,
"logps/rejected": -8.553500175476074,
"loss": 0.2109,
"rewards/accuracies": 0.9375,
"rewards/chosen": -5.71732234954834,
"rewards/margins": 7.112928867340088,
"rewards/rejected": -12.83025074005127,
"step": 960
},
{
"epoch": 0.8075762306171298,
"grad_norm": 4.177423000335693,
"learning_rate": 1.388412052037682e-06,
"logits/chosen": 2.9300179481506348,
"logits/rejected": 2.9548909664154053,
"logps/chosen": -3.9784176349639893,
"logps/rejected": -8.308394432067871,
"loss": 0.2012,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -5.967626094818115,
"rewards/margins": 6.49496603012085,
"rewards/rejected": -12.462592124938965,
"step": 970
},
{
"epoch": 0.8159017587678218,
"grad_norm": 4.683027744293213,
"learning_rate": 1.3417599122003464e-06,
"logits/chosen": 2.5800061225891113,
"logits/rejected": 2.526090145111084,
"logps/chosen": -3.86810564994812,
"logps/rejected": -8.47614574432373,
"loss": 0.2141,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -5.802158355712891,
"rewards/margins": 6.9120612144470215,
"rewards/rejected": -12.714218139648438,
"step": 980
},
{
"epoch": 0.8242272869185139,
"grad_norm": 3.7419984340667725,
"learning_rate": 1.2956158147457116e-06,
"logits/chosen": 3.4706058502197266,
"logits/rejected": 3.4088757038116455,
"logps/chosen": -4.216760158538818,
"logps/rejected": -8.575207710266113,
"loss": 0.2422,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -6.325140953063965,
"rewards/margins": 6.537671089172363,
"rewards/rejected": -12.862811088562012,
"step": 990
},
{
"epoch": 0.832552815069206,
"grad_norm": 8.953512191772461,
"learning_rate": 1.2500000000000007e-06,
"logits/chosen": 2.9276206493377686,
"logits/rejected": 2.946265459060669,
"logps/chosen": -3.9976966381073,
"logps/rejected": -8.48410701751709,
"loss": 0.2139,
"rewards/accuracies": 0.9375,
"rewards/chosen": -5.996545314788818,
"rewards/margins": 6.729616641998291,
"rewards/rejected": -12.726162910461426,
"step": 1000
},
{
"epoch": 0.832552815069206,
"eval_logits/chosen": 2.9153146743774414,
"eval_logits/rejected": 3.0989012718200684,
"eval_logps/chosen": -3.6678271293640137,
"eval_logps/rejected": -8.173608779907227,
"eval_loss": 0.22841480374336243,
"eval_rewards/accuracies": 0.9285714030265808,
"eval_rewards/chosen": -5.5017409324646,
"eval_rewards/margins": 6.758671760559082,
"eval_rewards/rejected": -12.26041316986084,
"eval_runtime": 29.0504,
"eval_samples_per_second": 26.747,
"eval_steps_per_second": 3.373,
"step": 1000
},
{
"epoch": 0.840878343219898,
"grad_norm": 4.267103672027588,
"learning_rate": 1.204932476567175e-06,
"logits/chosen": 2.3057751655578613,
"logits/rejected": 2.3750340938568115,
"logps/chosen": -3.576403856277466,
"logps/rejected": -8.301278114318848,
"loss": 0.2211,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -5.364605903625488,
"rewards/margins": 7.087311744689941,
"rewards/rejected": -12.45191764831543,
"step": 1010
},
{
"epoch": 0.8492038713705901,
"grad_norm": 6.008708477020264,
"learning_rate": 1.160433012552508e-06,
"logits/chosen": 3.0167624950408936,
"logits/rejected": 2.817478895187378,
"logps/chosen": -4.053152084350586,
"logps/rejected": -8.841009140014648,
"loss": 0.195,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -6.079728126525879,
"rewards/margins": 7.181784152984619,
"rewards/rejected": -13.261512756347656,
"step": 1020
},
{
"epoch": 0.8575293995212822,
"grad_norm": 3.7652032375335693,
"learning_rate": 1.11652112689164e-06,
"logits/chosen": 2.3387794494628906,
"logits/rejected": 2.420820474624634,
"logps/chosen": -4.114675045013428,
"logps/rejected": -8.801934242248535,
"loss": 0.2226,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -6.172013282775879,
"rewards/margins": 7.030886650085449,
"rewards/rejected": -13.202900886535645,
"step": 1030
},
{
"epoch": 0.8658549276719741,
"grad_norm": 3.811018466949463,
"learning_rate": 1.073216080788921e-06,
"logits/chosen": 3.4545624256134033,
"logits/rejected": 2.934145212173462,
"logps/chosen": -3.841254472732544,
"logps/rejected": -8.547441482543945,
"loss": 0.2039,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -5.7618818283081055,
"rewards/margins": 7.059278964996338,
"rewards/rejected": -12.821161270141602,
"step": 1040
},
{
"epoch": 0.8741804558226662,
"grad_norm": 3.5039620399475098,
"learning_rate": 1.0305368692688175e-06,
"logits/chosen": 1.9293429851531982,
"logits/rejected": 2.530273914337158,
"logps/chosen": -3.6742749214172363,
"logps/rejected": -8.751821517944336,
"loss": 0.2168,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -5.511412143707275,
"rewards/margins": 7.616321563720703,
"rewards/rejected": -13.127734184265137,
"step": 1050
},
{
"epoch": 0.8741804558226662,
"eval_logits/chosen": 2.9263997077941895,
"eval_logits/rejected": 3.1277804374694824,
"eval_logps/chosen": -3.6399991512298584,
"eval_logps/rejected": -8.258426666259766,
"eval_loss": 0.2207891196012497,
"eval_rewards/accuracies": 0.9387755393981934,
"eval_rewards/chosen": -5.45999813079834,
"eval_rewards/margins": 6.92764139175415,
"eval_rewards/rejected": -12.387639045715332,
"eval_runtime": 29.0247,
"eval_samples_per_second": 26.77,
"eval_steps_per_second": 3.376,
"step": 1050
},
{
"epoch": 0.8825059839733583,
"grad_norm": 5.916813373565674,
"learning_rate": 9.88502212844063e-07,
"logits/chosen": 3.1234467029571533,
"logits/rejected": 3.058065891265869,
"logps/chosen": -3.8148319721221924,
"logps/rejected": -8.512906074523926,
"loss": 0.2123,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -5.722247123718262,
"rewards/margins": 7.047112464904785,
"rewards/rejected": -12.769360542297363,
"step": 1060
},
{
"epoch": 0.8908315121240503,
"grad_norm": 1.9670017957687378,
"learning_rate": 9.471305493042243e-07,
"logits/chosen": 3.48276948928833,
"logits/rejected": 2.9211738109588623,
"logps/chosen": -4.147209167480469,
"logps/rejected": -8.622703552246094,
"loss": 0.1951,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -6.220814228057861,
"rewards/margins": 6.7132415771484375,
"rewards/rejected": -12.934056282043457,
"step": 1070
},
{
"epoch": 0.8991570402747424,
"grad_norm": 4.044788837432861,
"learning_rate": 9.064400256282757e-07,
"logits/chosen": 1.3344472646713257,
"logits/rejected": 2.0601110458374023,
"logps/chosen": -4.18247127532959,
"logps/rejected": -9.543882369995117,
"loss": 0.2129,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -6.273707389831543,
"rewards/margins": 8.042116165161133,
"rewards/rejected": -14.315823554992676,
"step": 1080
},
{
"epoch": 0.9074825684254345,
"grad_norm": 8.981308937072754,
"learning_rate": 8.664484900247363e-07,
"logits/chosen": 3.2444870471954346,
"logits/rejected": 3.3333630561828613,
"logps/chosen": -3.4744930267333984,
"logps/rejected": -8.478456497192383,
"loss": 0.1827,
"rewards/accuracies": 0.9375,
"rewards/chosen": -5.2117390632629395,
"rewards/margins": 7.50594425201416,
"rewards/rejected": -12.717683792114258,
"step": 1090
},
{
"epoch": 0.9158080965761265,
"grad_norm": 5.613018035888672,
"learning_rate": 8.271734841028553e-07,
"logits/chosen": 3.114234209060669,
"logits/rejected": 2.8669419288635254,
"logps/chosen": -3.5276169776916504,
"logps/rejected": -7.26898193359375,
"loss": 0.1883,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -5.291424751281738,
"rewards/margins": 5.6120476722717285,
"rewards/rejected": -10.903471946716309,
"step": 1100
},
{
"epoch": 0.9158080965761265,
"eval_logits/chosen": 2.896069288253784,
"eval_logits/rejected": 3.120903730392456,
"eval_logps/chosen": -3.6780893802642822,
"eval_logps/rejected": -8.3290376663208,
"eval_loss": 0.21761386096477509,
"eval_rewards/accuracies": 0.9285714030265808,
"eval_rewards/chosen": -5.517134189605713,
"eval_rewards/margins": 6.9764227867126465,
"eval_rewards/rejected": -12.493557929992676,
"eval_runtime": 29.0228,
"eval_samples_per_second": 26.772,
"eval_steps_per_second": 3.377,
"step": 1100
},
{
"epoch": 0.9241336247268186,
"grad_norm": 6.375245571136475,
"learning_rate": 7.886322351782782e-07,
"logits/chosen": 2.5309860706329346,
"logits/rejected": 2.6255879402160645,
"logps/chosen": -4.059412002563477,
"logps/rejected": -9.413751602172852,
"loss": 0.2197,
"rewards/accuracies": 0.9375,
"rewards/chosen": -6.089118003845215,
"rewards/margins": 8.03150749206543,
"rewards/rejected": -14.120626449584961,
"step": 1110
},
{
"epoch": 0.9324591528775107,
"grad_norm": 3.5590834617614746,
"learning_rate": 7.508416487165862e-07,
"logits/chosen": 3.3510899543762207,
"logits/rejected": 3.4622738361358643,
"logps/chosen": -4.005453586578369,
"logps/rejected": -9.220897674560547,
"loss": 0.2071,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -6.008180141448975,
"rewards/margins": 7.823166847229004,
"rewards/rejected": -13.83134651184082,
"step": 1120
},
{
"epoch": 0.9407846810282027,
"grad_norm": 4.397263050079346,
"learning_rate": 7.138183009179922e-07,
"logits/chosen": 3.1275603771209717,
"logits/rejected": 2.9770944118499756,
"logps/chosen": -4.224826812744141,
"logps/rejected": -8.15820026397705,
"loss": 0.2331,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -6.337240219116211,
"rewards/margins": 5.900059700012207,
"rewards/rejected": -12.237300872802734,
"step": 1130
},
{
"epoch": 0.9491102091788948,
"grad_norm": 4.102133750915527,
"learning_rate": 6.775784314464717e-07,
"logits/chosen": 2.8464298248291016,
"logits/rejected": 2.4384379386901855,
"logps/chosen": -3.9073352813720703,
"logps/rejected": -7.805499076843262,
"loss": 0.2223,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -5.8610029220581055,
"rewards/margins": 5.847245216369629,
"rewards/rejected": -11.708248138427734,
"step": 1140
},
{
"epoch": 0.9574357373295869,
"grad_norm": 4.830289363861084,
"learning_rate": 6.421379363065142e-07,
"logits/chosen": 2.3135313987731934,
"logits/rejected": 2.514207124710083,
"logps/chosen": -4.163815498352051,
"logps/rejected": -9.554147720336914,
"loss": 0.184,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -6.245722770690918,
"rewards/margins": 8.085500717163086,
"rewards/rejected": -14.331222534179688,
"step": 1150
},
{
"epoch": 0.9574357373295869,
"eval_logits/chosen": 2.9589338302612305,
"eval_logits/rejected": 3.1358554363250732,
"eval_logps/chosen": -3.7160890102386475,
"eval_logps/rejected": -8.306242942810059,
"eval_loss": 0.22364133596420288,
"eval_rewards/accuracies": 0.9489796161651611,
"eval_rewards/chosen": -5.57413387298584,
"eval_rewards/margins": 6.8852314949035645,
"eval_rewards/rejected": -12.45936393737793,
"eval_runtime": 29.0274,
"eval_samples_per_second": 26.768,
"eval_steps_per_second": 3.376,
"step": 1150
},
{
"epoch": 0.9657612654802789,
"grad_norm": 3.550083637237549,
"learning_rate": 6.075123608706093e-07,
"logits/chosen": 3.0239412784576416,
"logits/rejected": 3.124316930770874,
"logps/chosen": -3.9214415550231934,
"logps/rejected": -8.38886547088623,
"loss": 0.2249,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -5.882162094116211,
"rewards/margins": 6.701135158538818,
"rewards/rejected": -12.583298683166504,
"step": 1160
},
{
"epoch": 0.974086793630971,
"grad_norm": 6.0046515464782715,
"learning_rate": 5.737168930605272e-07,
"logits/chosen": 3.2713863849639893,
"logits/rejected": 3.0897414684295654,
"logps/chosen": -3.4735615253448486,
"logps/rejected": -8.098161697387695,
"loss": 0.1977,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -5.2103424072265625,
"rewards/margins": 6.936898708343506,
"rewards/rejected": -12.147241592407227,
"step": 1170
},
{
"epoch": 0.982412321781663,
"grad_norm": 4.833160400390625,
"learning_rate": 5.407663566854008e-07,
"logits/chosen": 2.0586276054382324,
"logits/rejected": 2.014996290206909,
"logps/chosen": -4.311732292175293,
"logps/rejected": -9.875633239746094,
"loss": 0.2184,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -6.467599391937256,
"rewards/margins": 8.345849990844727,
"rewards/rejected": -14.813450813293457,
"step": 1180
},
{
"epoch": 0.990737849932355,
"grad_norm": 3.9890189170837402,
"learning_rate": 5.086752049395094e-07,
"logits/chosen": 3.0970911979675293,
"logits/rejected": 2.8563153743743896,
"logps/chosen": -3.985518217086792,
"logps/rejected": -8.199251174926758,
"loss": 0.2178,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -5.978277683258057,
"rewards/margins": 6.320598125457764,
"rewards/rejected": -12.29887580871582,
"step": 1190
},
{
"epoch": 0.9990633780830471,
"grad_norm": 25.71741485595703,
"learning_rate": 4.774575140626317e-07,
"logits/chosen": 2.927126884460449,
"logits/rejected": 2.802952527999878,
"logps/chosen": -3.5128173828125,
"logps/rejected": -8.298576354980469,
"loss": 0.1799,
"rewards/accuracies": 0.9375,
"rewards/chosen": -5.269227027893066,
"rewards/margins": 7.1786394119262695,
"rewards/rejected": -12.44786548614502,
"step": 1200
},
{
"epoch": 0.9990633780830471,
"eval_logits/chosen": 2.952441692352295,
"eval_logits/rejected": 3.141965389251709,
"eval_logps/chosen": -3.6855292320251465,
"eval_logps/rejected": -8.327260971069336,
"eval_loss": 0.21951240301132202,
"eval_rewards/accuracies": 0.9489796161651611,
"eval_rewards/chosen": -5.528294086456299,
"eval_rewards/margins": 6.962599277496338,
"eval_rewards/rejected": -12.490893363952637,
"eval_runtime": 29.0225,
"eval_samples_per_second": 26.772,
"eval_steps_per_second": 3.377,
"step": 1200
},
{
"epoch": 1.0066604225205535,
"grad_norm": 1.9987434148788452,
"learning_rate": 4.4712697716573994e-07,
"logits/chosen": 2.8181862831115723,
"logits/rejected": 2.7676520347595215,
"logps/chosen": -4.099400043487549,
"logps/rejected": -9.125377655029297,
"loss": 0.1878,
"rewards/accuracies": 0.9452054500579834,
"rewards/chosen": -6.149099826812744,
"rewards/margins": 7.538967132568359,
"rewards/rejected": -13.688066482543945,
"step": 1210
},
{
"epoch": 1.0149859506712458,
"grad_norm": 3.452667474746704,
"learning_rate": 4.1769689822475147e-07,
"logits/chosen": 2.893108367919922,
"logits/rejected": 2.860814332962036,
"logps/chosen": -3.798161268234253,
"logps/rejected": -7.9546356201171875,
"loss": 0.1861,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -5.697242259979248,
"rewards/margins": 6.23471212387085,
"rewards/rejected": -11.931954383850098,
"step": 1220
},
{
"epoch": 1.0233114788219377,
"grad_norm": 3.3297345638275146,
"learning_rate": 3.891801862449629e-07,
"logits/chosen": 2.500004529953003,
"logits/rejected": 2.4870145320892334,
"logps/chosen": -3.8301002979278564,
"logps/rejected": -8.578625679016113,
"loss": 0.2212,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -5.745150566101074,
"rewards/margins": 7.1227874755859375,
"rewards/rejected": -12.867938041687012,
"step": 1230
},
{
"epoch": 1.0316370069726297,
"grad_norm": 5.022886276245117,
"learning_rate": 3.615893495987335e-07,
"logits/chosen": 2.6115026473999023,
"logits/rejected": 2.8262619972229004,
"logps/chosen": -3.562458038330078,
"logps/rejected": -8.772577285766602,
"loss": 0.1869,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -5.343687057495117,
"rewards/margins": 7.815177917480469,
"rewards/rejected": -13.15886402130127,
"step": 1240
},
{
"epoch": 1.039962535123322,
"grad_norm": 5.58774995803833,
"learning_rate": 3.3493649053890325e-07,
"logits/chosen": 2.9379220008850098,
"logits/rejected": 2.9155845642089844,
"logps/chosen": -3.8242735862731934,
"logps/rejected": -8.058219909667969,
"loss": 0.1817,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -5.736410140991211,
"rewards/margins": 6.3509202003479,
"rewards/rejected": -12.08733081817627,
"step": 1250
},
{
"epoch": 1.039962535123322,
"eval_logits/chosen": 2.932387113571167,
"eval_logits/rejected": 3.14872407913208,
"eval_logps/chosen": -3.7640583515167236,
"eval_logps/rejected": -8.459839820861816,
"eval_loss": 0.21760709583759308,
"eval_rewards/accuracies": 0.9489796161651611,
"eval_rewards/chosen": -5.646087169647217,
"eval_rewards/margins": 7.043673038482666,
"eval_rewards/rejected": -12.689759254455566,
"eval_runtime": 29.0272,
"eval_samples_per_second": 26.768,
"eval_steps_per_second": 3.376,
"step": 1250
}
],
"logging_steps": 10,
"max_steps": 1500,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.0429645337117327e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}