{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.2481007388906233, "eval_steps": 50, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00832552815069206, "grad_norm": 0.04514288529753685, "learning_rate": 4.999451708687114e-06, "logits/chosen": 14.412135124206543, "logits/rejected": 14.867518424987793, "logps/chosen": -0.29279541969299316, "logps/rejected": -0.33705300092697144, "loss": 0.9248, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.43919315934181213, "rewards/margins": 0.066386379301548, "rewards/rejected": -0.5055795311927795, "step": 10 }, { "epoch": 0.01665105630138412, "grad_norm": 0.05052826926112175, "learning_rate": 4.997807075247147e-06, "logits/chosen": 14.956459045410156, "logits/rejected": 15.363263130187988, "logps/chosen": -0.3096744120121002, "logps/rejected": -0.36214715242385864, "loss": 0.9355, "rewards/accuracies": 0.5, "rewards/chosen": -0.46451157331466675, "rewards/margins": 0.07870914041996002, "rewards/rejected": -0.5432207584381104, "step": 20 }, { "epoch": 0.024976584452076178, "grad_norm": 0.04879612475633621, "learning_rate": 4.9950668210706795e-06, "logits/chosen": 14.485757827758789, "logits/rejected": 15.057507514953613, "logps/chosen": -0.27136802673339844, "logps/rejected": -0.31497400999069214, "loss": 0.9268, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.4070519804954529, "rewards/margins": 0.06540900468826294, "rewards/rejected": -0.4724610447883606, "step": 30 }, { "epoch": 0.03330211260276824, "grad_norm": 0.05672155320644379, "learning_rate": 4.9912321481237616e-06, "logits/chosen": 14.529332160949707, "logits/rejected": 14.814855575561523, "logps/chosen": -0.29139184951782227, "logps/rejected": -0.31259119510650635, "loss": 0.9267, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.4370877742767334, "rewards/margins": 0.03179898113012314, "rewards/rejected": -0.46888676285743713, "step": 40 }, { "epoch": 0.041627640753460295, "grad_norm": 0.065071240067482, "learning_rate": 4.986304738420684e-06, "logits/chosen": 14.174386978149414, "logits/rejected": 15.223234176635742, "logps/chosen": -0.2745029330253601, "logps/rejected": -0.37693315744400024, "loss": 0.9243, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.41175442934036255, "rewards/margins": 0.1536453813314438, "rewards/rejected": -0.5653998255729675, "step": 50 }, { "epoch": 0.041627640753460295, "eval_logits/chosen": 14.56569766998291, "eval_logits/rejected": 15.157320976257324, "eval_logps/chosen": -0.27527979016304016, "eval_logps/rejected": -0.3633999824523926, "eval_loss": 0.9083622694015503, "eval_rewards/accuracies": 0.5612244606018066, "eval_rewards/chosen": -0.41291970014572144, "eval_rewards/margins": 0.13218028843402863, "eval_rewards/rejected": -0.5450999736785889, "eval_runtime": 29.029, "eval_samples_per_second": 26.766, "eval_steps_per_second": 3.376, "step": 50 }, { "epoch": 0.049953168904152356, "grad_norm": 0.14002270996570587, "learning_rate": 4.980286753286196e-06, "logits/chosen": 14.408930778503418, "logits/rejected": 14.791458129882812, "logps/chosen": -0.285602867603302, "logps/rejected": -0.3351826071739197, "loss": 0.9177, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.4284043312072754, "rewards/margins": 0.07436960190534592, "rewards/rejected": -0.5027738809585571, "step": 60 }, { "epoch": 0.05827869705484442, "grad_norm": 0.05595069006085396, "learning_rate": 4.973180832407471e-06, "logits/chosen": 14.41168212890625, "logits/rejected": 14.865121841430664, "logps/chosen": -0.25851207971572876, "logps/rejected": -0.32240185141563416, "loss": 0.9168, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.3877681493759155, "rewards/margins": 0.0958346277475357, "rewards/rejected": -0.4836028218269348, "step": 70 }, { "epoch": 0.06660422520553648, "grad_norm": 0.058645494282245636, "learning_rate": 4.964990092676263e-06, "logits/chosen": 14.897825241088867, "logits/rejected": 15.01073932647705, "logps/chosen": -0.2668797969818115, "logps/rejected": -0.3204379975795746, "loss": 0.9242, "rewards/accuracies": 0.5, "rewards/chosen": -0.4003197252750397, "rewards/margins": 0.08033724129199982, "rewards/rejected": -0.4806569516658783, "step": 80 }, { "epoch": 0.07492975335622853, "grad_norm": 0.0597861111164093, "learning_rate": 4.9557181268217225e-06, "logits/chosen": 14.531021118164062, "logits/rejected": 14.767858505249023, "logps/chosen": -0.26787540316581726, "logps/rejected": -0.32972821593284607, "loss": 0.9077, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.4018131196498871, "rewards/margins": 0.09277921915054321, "rewards/rejected": -0.4945923686027527, "step": 90 }, { "epoch": 0.08325528150692059, "grad_norm": 0.0863095372915268, "learning_rate": 4.9453690018345144e-06, "logits/chosen": 14.179275512695312, "logits/rejected": 14.909070014953613, "logps/chosen": -0.2532978057861328, "logps/rejected": -0.35474082827568054, "loss": 0.903, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3799467086791992, "rewards/margins": 0.1521645337343216, "rewards/rejected": -0.5321112275123596, "step": 100 }, { "epoch": 0.08325528150692059, "eval_logits/chosen": 14.326024055480957, "eval_logits/rejected": 14.979863166809082, "eval_logps/chosen": -0.2673422694206238, "eval_logps/rejected": -0.3668619990348816, "eval_loss": 0.8989922404289246, "eval_rewards/accuracies": 0.6020408272743225, "eval_rewards/chosen": -0.4010133445262909, "eval_rewards/margins": 0.1492796391248703, "eval_rewards/rejected": -0.5502930283546448, "eval_runtime": 29.0209, "eval_samples_per_second": 26.774, "eval_steps_per_second": 3.377, "step": 100 }, { "epoch": 0.09158080965761266, "grad_norm": 0.07181967049837112, "learning_rate": 4.933947257182901e-06, "logits/chosen": 14.118756294250488, "logits/rejected": 14.755918502807617, "logps/chosen": -0.27995947003364563, "logps/rejected": -0.3749552369117737, "loss": 0.9097, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.41993919014930725, "rewards/margins": 0.14249366521835327, "rewards/rejected": -0.5624328255653381, "step": 110 }, { "epoch": 0.09990633780830471, "grad_norm": 0.08269819617271423, "learning_rate": 4.921457902821578e-06, "logits/chosen": 13.764413833618164, "logits/rejected": 14.43315315246582, "logps/chosen": -0.28177163004875183, "logps/rejected": -0.3637630343437195, "loss": 0.9075, "rewards/accuracies": 0.625, "rewards/chosen": -0.42265743017196655, "rewards/margins": 0.12298711389303207, "rewards/rejected": -0.5456445813179016, "step": 120 }, { "epoch": 0.10823186595899677, "grad_norm": 1.9071497917175293, "learning_rate": 4.907906416994146e-06, "logits/chosen": 14.103793144226074, "logits/rejected": 14.727777481079102, "logps/chosen": -0.2665451765060425, "logps/rejected": -0.3827117085456848, "loss": 0.9217, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3998177647590637, "rewards/margins": 0.1742497682571411, "rewards/rejected": -0.5740675926208496, "step": 130 }, { "epoch": 0.11655739410968884, "grad_norm": 0.12107716500759125, "learning_rate": 4.893298743830168e-06, "logits/chosen": 13.517863273620605, "logits/rejected": 14.42052173614502, "logps/chosen": -0.26627904176712036, "logps/rejected": -0.3745174705982208, "loss": 0.904, "rewards/accuracies": 0.5625, "rewards/chosen": -0.39941853284835815, "rewards/margins": 0.16235767304897308, "rewards/rejected": -0.5617762207984924, "step": 140 }, { "epoch": 0.12488292226038089, "grad_norm": 0.1638205647468567, "learning_rate": 4.8776412907378845e-06, "logits/chosen": 12.83032512664795, "logits/rejected": 13.673515319824219, "logps/chosen": -0.24289576709270477, "logps/rejected": -0.37163227796554565, "loss": 0.8779, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.36434367299079895, "rewards/margins": 0.19310477375984192, "rewards/rejected": -0.5574483871459961, "step": 150 }, { "epoch": 0.12488292226038089, "eval_logits/chosen": 12.317696571350098, "eval_logits/rejected": 13.164616584777832, "eval_logps/chosen": -0.266156405210495, "eval_logps/rejected": -0.4009220004081726, "eval_loss": 0.8768696784973145, "eval_rewards/accuracies": 0.6224489808082581, "eval_rewards/chosen": -0.3992346227169037, "eval_rewards/margins": 0.20214837789535522, "eval_rewards/rejected": -0.6013829708099365, "eval_runtime": 29.0257, "eval_samples_per_second": 26.769, "eval_steps_per_second": 3.376, "step": 150 }, { "epoch": 0.13320845041107296, "grad_norm": 0.1479438841342926, "learning_rate": 4.860940925593703e-06, "logits/chosen": 12.736433029174805, "logits/rejected": 13.475964546203613, "logps/chosen": -0.2913517355918884, "logps/rejected": -0.36094629764556885, "loss": 0.8756, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.43702763319015503, "rewards/margins": 0.10439182817935944, "rewards/rejected": -0.5414193868637085, "step": 160 }, { "epoch": 0.141533978561765, "grad_norm": 0.17609630525112152, "learning_rate": 4.84320497372973e-06, "logits/chosen": 10.606362342834473, "logits/rejected": 11.537567138671875, "logps/chosen": -0.2560296952724457, "logps/rejected": -0.4312233328819275, "loss": 0.8489, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.38404449820518494, "rewards/margins": 0.2627905011177063, "rewards/rejected": -0.6468349695205688, "step": 170 }, { "epoch": 0.14985950671245707, "grad_norm": 0.18054936826229095, "learning_rate": 4.824441214720629e-06, "logits/chosen": 10.13754653930664, "logits/rejected": 10.914222717285156, "logps/chosen": -0.29278701543807983, "logps/rejected": -0.43448886275291443, "loss": 0.8715, "rewards/accuracies": 0.625, "rewards/chosen": -0.43918052315711975, "rewards/margins": 0.21255281567573547, "rewards/rejected": -0.6517333388328552, "step": 180 }, { "epoch": 0.15818503486314914, "grad_norm": 0.19739146530628204, "learning_rate": 4.804657878971252e-06, "logits/chosen": 8.077766418457031, "logits/rejected": 9.669368743896484, "logps/chosen": -0.2844889760017395, "logps/rejected": -0.5050357580184937, "loss": 0.8582, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.42673349380493164, "rewards/margins": 0.3308201730251312, "rewards/rejected": -0.7575536966323853, "step": 190 }, { "epoch": 0.16651056301384118, "grad_norm": 0.2397814244031906, "learning_rate": 4.783863644106502e-06, "logits/chosen": 6.790783882141113, "logits/rejected": 7.849525451660156, "logps/chosen": -0.2940555512905121, "logps/rejected": -0.5699166059494019, "loss": 0.8196, "rewards/accuracies": 0.75, "rewards/chosen": -0.4410833418369293, "rewards/margins": 0.41379159688949585, "rewards/rejected": -0.8548749089241028, "step": 200 }, { "epoch": 0.16651056301384118, "eval_logits/chosen": 6.290835857391357, "eval_logits/rejected": 6.757873058319092, "eval_logps/chosen": -0.317629337310791, "eval_logps/rejected": -0.581989586353302, "eval_loss": 0.8032433986663818, "eval_rewards/accuracies": 0.6734693646430969, "eval_rewards/chosen": -0.47644397616386414, "eval_rewards/margins": 0.39654040336608887, "eval_rewards/rejected": -0.8729843497276306, "eval_runtime": 29.025, "eval_samples_per_second": 26.77, "eval_steps_per_second": 3.376, "step": 200 }, { "epoch": 0.17483609116453325, "grad_norm": 0.2858545184135437, "learning_rate": 4.762067631165049e-06, "logits/chosen": 6.875879764556885, "logits/rejected": 6.691536903381348, "logps/chosen": -0.37194910645484924, "logps/rejected": -0.5639354586601257, "loss": 0.8129, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5579236745834351, "rewards/margins": 0.2879795432090759, "rewards/rejected": -0.8459032773971558, "step": 210 }, { "epoch": 0.18316161931522532, "grad_norm": 0.30206382274627686, "learning_rate": 4.7392794005985324e-06, "logits/chosen": 4.656112194061279, "logits/rejected": 4.483086585998535, "logps/chosen": -0.360150009393692, "logps/rejected": -0.6204283833503723, "loss": 0.7954, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5402250289916992, "rewards/margins": 0.39041754603385925, "rewards/rejected": -0.9306427240371704, "step": 220 }, { "epoch": 0.19148714746591736, "grad_norm": 0.40204310417175293, "learning_rate": 4.715508948078037e-06, "logits/chosen": 3.9398162364959717, "logits/rejected": 3.38537859916687, "logps/chosen": -0.39010342955589294, "logps/rejected": -0.7167688608169556, "loss": 0.7664, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5851551294326782, "rewards/margins": 0.4899981617927551, "rewards/rejected": -1.0751533508300781, "step": 230 }, { "epoch": 0.19981267561660943, "grad_norm": 0.48389795422554016, "learning_rate": 4.690766700109659e-06, "logits/chosen": 2.925476551055908, "logits/rejected": 2.824068069458008, "logps/chosen": -0.41053348779678345, "logps/rejected": -0.8508625030517578, "loss": 0.7606, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6158002018928528, "rewards/margins": 0.6604936718940735, "rewards/rejected": -1.2762939929962158, "step": 240 }, { "epoch": 0.2081382037673015, "grad_norm": 0.6687452793121338, "learning_rate": 4.665063509461098e-06, "logits/chosen": 2.751737594604492, "logits/rejected": 2.2424545288085938, "logps/chosen": -0.4365699291229248, "logps/rejected": -0.8550359606742859, "loss": 0.7234, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6548548936843872, "rewards/margins": 0.6276990175247192, "rewards/rejected": -1.2825539112091064, "step": 250 }, { "epoch": 0.2081382037673015, "eval_logits/chosen": 2.1380228996276855, "eval_logits/rejected": 1.3922746181488037, "eval_logps/chosen": -0.48307570815086365, "eval_logps/rejected": -1.0382359027862549, "eval_loss": 0.668463945388794, "eval_rewards/accuracies": 0.6938775777816772, "eval_rewards/chosen": -0.7246134877204895, "eval_rewards/margins": 0.8327403664588928, "eval_rewards/rejected": -1.5573538541793823, "eval_runtime": 29.0228, "eval_samples_per_second": 26.772, "eval_steps_per_second": 3.377, "step": 250 }, { "epoch": 0.21646373191799353, "grad_norm": 0.7085956335067749, "learning_rate": 4.638410650401267e-06, "logits/chosen": 1.7889283895492554, "logits/rejected": 0.9420136213302612, "logps/chosen": -0.5195389986038208, "logps/rejected": -1.0534025430679321, "loss": 0.6863, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7793084979057312, "rewards/margins": 0.8007953763008118, "rewards/rejected": -1.580103874206543, "step": 260 }, { "epoch": 0.2247892600686856, "grad_norm": 0.4416671097278595, "learning_rate": 4.610819813755038e-06, "logits/chosen": 1.582745909690857, "logits/rejected": 0.3820720911026001, "logps/chosen": -0.5181297063827515, "logps/rejected": -1.2198141813278198, "loss": 0.5809, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7771945595741272, "rewards/margins": 1.0525267124176025, "rewards/rejected": -1.8297210931777954, "step": 270 }, { "epoch": 0.23311478821937767, "grad_norm": 2.7746617794036865, "learning_rate": 4.582303101775249e-06, "logits/chosen": 1.2947760820388794, "logits/rejected": 0.27237796783447266, "logps/chosen": -0.643541693687439, "logps/rejected": -1.7467323541641235, "loss": 0.5775, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9653124809265137, "rewards/margins": 1.6547861099243164, "rewards/rejected": -2.62009859085083, "step": 280 }, { "epoch": 0.2414403163700697, "grad_norm": 0.6444702744483948, "learning_rate": 4.55287302283426e-06, "logits/chosen": 1.2399464845657349, "logits/rejected": 0.22667090594768524, "logps/chosen": -0.7517040967941284, "logps/rejected": -1.9010766744613647, "loss": 0.5314, "rewards/accuracies": 0.625, "rewards/chosen": -1.1275560855865479, "rewards/margins": 1.724058747291565, "rewards/rejected": -2.8516147136688232, "step": 290 }, { "epoch": 0.24976584452076178, "grad_norm": 0.5103917717933655, "learning_rate": 4.522542485937369e-06, "logits/chosen": 1.438954472541809, "logits/rejected": 0.5288833379745483, "logps/chosen": -0.7871009707450867, "logps/rejected": -2.0329811573028564, "loss": 0.5271, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1806514263153076, "rewards/margins": 1.8688204288482666, "rewards/rejected": -3.049471616744995, "step": 300 }, { "epoch": 0.24976584452076178, "eval_logits/chosen": 1.3706706762313843, "eval_logits/rejected": 0.8007871508598328, "eval_logps/chosen": -0.7460500001907349, "eval_logps/rejected": -2.209245443344116, "eval_loss": 0.5008835792541504, "eval_rewards/accuracies": 0.7244898080825806, "eval_rewards/chosen": -1.1190749406814575, "eval_rewards/margins": 2.194793224334717, "eval_rewards/rejected": -3.313868284225464, "eval_runtime": 29.0227, "eval_samples_per_second": 26.772, "eval_steps_per_second": 3.377, "step": 300 }, { "epoch": 0.2580913726714538, "grad_norm": 0.7984316945075989, "learning_rate": 4.491324795060491e-06, "logits/chosen": 0.9250973463058472, "logits/rejected": 0.1887839138507843, "logps/chosen": -0.8511486053466797, "logps/rejected": -2.447072982788086, "loss": 0.5506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2767229080200195, "rewards/margins": 2.3938865661621094, "rewards/rejected": -3.670609712600708, "step": 310 }, { "epoch": 0.2664169008221459, "grad_norm": 0.5243161916732788, "learning_rate": 4.4592336433146e-06, "logits/chosen": 2.437886953353882, "logits/rejected": 1.6011940240859985, "logps/chosen": -0.7107629776000977, "logps/rejected": -2.132263422012329, "loss": 0.5423, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.0661444664001465, "rewards/margins": 2.1322507858276367, "rewards/rejected": -3.198395013809204, "step": 320 }, { "epoch": 0.27474242897283796, "grad_norm": 0.4742359220981598, "learning_rate": 4.426283106939474e-06, "logits/chosen": 1.8433977365493774, "logits/rejected": 1.199568748474121, "logps/chosen": -0.8737133145332336, "logps/rejected": -2.1652615070343018, "loss": 0.5015, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3105700016021729, "rewards/margins": 1.9373222589492798, "rewards/rejected": -3.247892379760742, "step": 330 }, { "epoch": 0.28306795712353, "grad_norm": 0.5529736280441284, "learning_rate": 4.3924876391293915e-06, "logits/chosen": 2.0044589042663574, "logits/rejected": 0.9263212084770203, "logps/chosen": -0.9175036549568176, "logps/rejected": -2.6408374309539795, "loss": 0.4921, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3762553930282593, "rewards/margins": 2.585000991821289, "rewards/rejected": -3.961256504058838, "step": 340 }, { "epoch": 0.2913934852742221, "grad_norm": 0.7060612440109253, "learning_rate": 4.357862063693486e-06, "logits/chosen": 2.243232250213623, "logits/rejected": 1.6251205205917358, "logps/chosen": -0.9481338262557983, "logps/rejected": -2.9519124031066895, "loss": 0.4753, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4222007989883423, "rewards/margins": 3.0056674480438232, "rewards/rejected": -4.427868366241455, "step": 350 }, { "epoch": 0.2913934852742221, "eval_logits/chosen": 1.7781500816345215, "eval_logits/rejected": 1.412752628326416, "eval_logps/chosen": -0.9692521095275879, "eval_logps/rejected": -2.8247811794281006, "eval_loss": 0.4446474015712738, "eval_rewards/accuracies": 0.7346938848495483, "eval_rewards/chosen": -1.4538781642913818, "eval_rewards/margins": 2.7832937240600586, "eval_rewards/rejected": -4.2371721267700195, "eval_runtime": 29.0245, "eval_samples_per_second": 26.77, "eval_steps_per_second": 3.376, "step": 350 }, { "epoch": 0.29971901342491414, "grad_norm": 0.9664792418479919, "learning_rate": 4.322421568553529e-06, "logits/chosen": 1.7094570398330688, "logits/rejected": 1.1617993116378784, "logps/chosen": -0.992924690246582, "logps/rejected": -2.7834811210632324, "loss": 0.4972, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4893869161605835, "rewards/margins": 2.6858346462249756, "rewards/rejected": -4.1752214431762695, "step": 360 }, { "epoch": 0.3080445415756062, "grad_norm": 0.7800536155700684, "learning_rate": 4.286181699082008e-06, "logits/chosen": 2.9170143604278564, "logits/rejected": 2.384690523147583, "logps/chosen": -1.0323909521102905, "logps/rejected": -2.726369857788086, "loss": 0.4689, "rewards/accuracies": 0.625, "rewards/chosen": -1.548586368560791, "rewards/margins": 2.540968418121338, "rewards/rejected": -4.089555263519287, "step": 370 }, { "epoch": 0.3163700697262983, "grad_norm": 1.3163660764694214, "learning_rate": 4.249158351283414e-06, "logits/chosen": 2.780831813812256, "logits/rejected": 1.753291130065918, "logps/chosen": -1.0468894243240356, "logps/rejected": -2.7425389289855957, "loss": 0.4835, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5703339576721191, "rewards/margins": 2.5434746742248535, "rewards/rejected": -4.113808631896973, "step": 380 }, { "epoch": 0.3246955978769903, "grad_norm": 0.6381780505180359, "learning_rate": 4.211367764821722e-06, "logits/chosen": 2.585071086883545, "logits/rejected": 1.9254558086395264, "logps/chosen": -1.2089946269989014, "logps/rejected": -3.615030288696289, "loss": 0.4518, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8134920597076416, "rewards/margins": 3.6090526580810547, "rewards/rejected": -5.422544956207275, "step": 390 }, { "epoch": 0.33302112602768236, "grad_norm": 0.9214782118797302, "learning_rate": 4.172826515897146e-06, "logits/chosen": 1.9765586853027344, "logits/rejected": 1.1926987171173096, "logps/chosen": -1.2852815389633179, "logps/rejected": -3.786972761154175, "loss": 0.4165, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9279224872589111, "rewards/margins": 3.7525367736816406, "rewards/rejected": -5.680459022521973, "step": 400 }, { "epoch": 0.33302112602768236, "eval_logits/chosen": 2.6366844177246094, "eval_logits/rejected": 2.394319534301758, "eval_logps/chosen": -1.322396993637085, "eval_logps/rejected": -3.686817169189453, "eval_loss": 0.4065541923046112, "eval_rewards/accuracies": 0.7551020383834839, "eval_rewards/chosen": -1.9835957288742065, "eval_rewards/margins": 3.5466296672821045, "eval_rewards/rejected": -5.5302252769470215, "eval_runtime": 29.025, "eval_samples_per_second": 26.77, "eval_steps_per_second": 3.376, "step": 400 }, { "epoch": 0.34134665417837445, "grad_norm": 1.5113208293914795, "learning_rate": 4.133551509975264e-06, "logits/chosen": 2.0068416595458984, "logits/rejected": 1.5152744054794312, "logps/chosen": -1.5090525150299072, "logps/rejected": -3.9272122383117676, "loss": 0.4004, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.2635788917541504, "rewards/margins": 3.627239227294922, "rewards/rejected": -5.890818119049072, "step": 410 }, { "epoch": 0.3496721823290665, "grad_norm": 11.516369819641113, "learning_rate": 4.093559974371725e-06, "logits/chosen": 3.343449115753174, "logits/rejected": 2.920070171356201, "logps/chosen": -1.8312532901763916, "logps/rejected": -4.115124702453613, "loss": 0.4045, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.746879816055298, "rewards/margins": 3.425807476043701, "rewards/rejected": -6.17268705368042, "step": 420 }, { "epoch": 0.35799771047975854, "grad_norm": 3.0497395992279053, "learning_rate": 4.052869450695776e-06, "logits/chosen": 2.5527279376983643, "logits/rejected": 2.2495744228363037, "logps/chosen": -2.2998366355895996, "logps/rejected": -4.966278076171875, "loss": 0.3758, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.4497551918029785, "rewards/margins": 3.9996612071990967, "rewards/rejected": -7.4494171142578125, "step": 430 }, { "epoch": 0.36632323863045063, "grad_norm": 3.900503158569336, "learning_rate": 4.011497787155938e-06, "logits/chosen": 2.4560112953186035, "logits/rejected": 2.3936328887939453, "logps/chosen": -2.563218593597412, "logps/rejected": -5.063398838043213, "loss": 0.3739, "rewards/accuracies": 0.8125, "rewards/chosen": -3.8448281288146973, "rewards/margins": 3.750270366668701, "rewards/rejected": -7.595097541809082, "step": 440 }, { "epoch": 0.3746487667811427, "grad_norm": 2.8846070766448975, "learning_rate": 3.969463130731183e-06, "logits/chosen": 2.5467796325683594, "logits/rejected": 2.4370405673980713, "logps/chosen": -2.4494822025299072, "logps/rejected": -5.12601900100708, "loss": 0.2905, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.6742234230041504, "rewards/margins": 4.014804840087891, "rewards/rejected": -7.689028263092041, "step": 450 }, { "epoch": 0.3746487667811427, "eval_logits/chosen": 2.922081232070923, "eval_logits/rejected": 2.879075050354004, "eval_logps/chosen": -2.352473020553589, "eval_logps/rejected": -5.1224799156188965, "eval_loss": 0.3302614390850067, "eval_rewards/accuracies": 0.8673469424247742, "eval_rewards/chosen": -3.5287091732025146, "eval_rewards/margins": 4.155009746551514, "eval_rewards/rejected": -7.683719635009766, "eval_runtime": 29.0235, "eval_samples_per_second": 26.771, "eval_steps_per_second": 3.377, "step": 450 }, { "epoch": 0.3829742949318347, "grad_norm": 4.662614345550537, "learning_rate": 3.92678391921108e-06, "logits/chosen": 2.428154468536377, "logits/rejected": 2.2403202056884766, "logps/chosen": -2.5936172008514404, "logps/rejected": -5.356133460998535, "loss": 0.2881, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.89042592048645, "rewards/margins": 4.143774509429932, "rewards/rejected": -8.034199714660645, "step": 460 }, { "epoch": 0.3912998230825268, "grad_norm": 2.716899871826172, "learning_rate": 3.88347887310836e-06, "logits/chosen": 2.437295436859131, "logits/rejected": 2.271914005279541, "logps/chosen": -2.470245361328125, "logps/rejected": -5.719494819641113, "loss": 0.31, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.70536732673645, "rewards/margins": 4.873874187469482, "rewards/rejected": -8.579241752624512, "step": 470 }, { "epoch": 0.39962535123321885, "grad_norm": 3.343271255493164, "learning_rate": 3.839566987447492e-06, "logits/chosen": 2.144461154937744, "logits/rejected": 2.0314810276031494, "logps/chosen": -2.5805585384368896, "logps/rejected": -5.418456077575684, "loss": 0.3194, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.870838165283203, "rewards/margins": 4.256844997406006, "rewards/rejected": -8.12768268585205, "step": 480 }, { "epoch": 0.4079508793839109, "grad_norm": 6.411283493041992, "learning_rate": 3.795067523432826e-06, "logits/chosen": 2.408092498779297, "logits/rejected": 2.2996156215667725, "logps/chosen": -2.8846375942230225, "logps/rejected": -5.957771301269531, "loss": 0.3353, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.326956748962402, "rewards/margins": 4.6097002029418945, "rewards/rejected": -8.936657905578613, "step": 490 }, { "epoch": 0.416276407534603, "grad_norm": 3.2472238540649414, "learning_rate": 3.7500000000000005e-06, "logits/chosen": 3.0815653800964355, "logits/rejected": 2.8496975898742676, "logps/chosen": -3.061626434326172, "logps/rejected": -5.966124534606934, "loss": 0.3018, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.592440128326416, "rewards/margins": 4.356747627258301, "rewards/rejected": -8.949186325073242, "step": 500 }, { "epoch": 0.416276407534603, "eval_logits/chosen": 2.7115373611450195, "eval_logits/rejected": 2.763493061065674, "eval_logps/chosen": -2.85333251953125, "eval_logps/rejected": -5.915884017944336, "eval_loss": 0.3079966604709625, "eval_rewards/accuracies": 0.8979591727256775, "eval_rewards/chosen": -4.279998302459717, "eval_rewards/margins": 4.593828201293945, "eval_rewards/rejected": -8.873826026916504, "eval_runtime": 29.0268, "eval_samples_per_second": 26.768, "eval_steps_per_second": 3.376, "step": 500 }, { "epoch": 0.42460193568529503, "grad_norm": 10.017457962036133, "learning_rate": 3.7043841852542884e-06, "logits/chosen": 2.775202989578247, "logits/rejected": 2.6122496128082275, "logps/chosen": -3.0054879188537598, "logps/rejected": -6.258307456970215, "loss": 0.3101, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.5082316398620605, "rewards/margins": 4.879229545593262, "rewards/rejected": -9.387460708618164, "step": 510 }, { "epoch": 0.43292746383598707, "grad_norm": 4.494226932525635, "learning_rate": 3.658240087799655e-06, "logits/chosen": 2.816701889038086, "logits/rejected": 2.4107789993286133, "logps/chosen": -3.2932097911834717, "logps/rejected": -6.099677562713623, "loss": 0.2925, "rewards/accuracies": 0.875, "rewards/chosen": -4.939814567565918, "rewards/margins": 4.209702014923096, "rewards/rejected": -9.149517059326172, "step": 520 }, { "epoch": 0.44125299198667917, "grad_norm": 2.957486391067505, "learning_rate": 3.611587947962319e-06, "logits/chosen": 2.3626818656921387, "logits/rejected": 2.4196550846099854, "logps/chosen": -3.085209608078003, "logps/rejected": -6.118277072906494, "loss": 0.3169, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.627814292907715, "rewards/margins": 4.549601078033447, "rewards/rejected": -9.17741584777832, "step": 530 }, { "epoch": 0.4495785201373712, "grad_norm": 3.429408550262451, "learning_rate": 3.564448228912682e-06, "logits/chosen": 2.559816360473633, "logits/rejected": 2.598250150680542, "logps/chosen": -3.3060078620910645, "logps/rejected": -6.124637126922607, "loss": 0.3271, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.959012031555176, "rewards/margins": 4.227944850921631, "rewards/rejected": -9.186956405639648, "step": 540 }, { "epoch": 0.45790404828806325, "grad_norm": 2.110722780227661, "learning_rate": 3.516841607689501e-06, "logits/chosen": 2.4487693309783936, "logits/rejected": 2.0568625926971436, "logps/chosen": -3.396770477294922, "logps/rejected": -6.35222864151001, "loss": 0.3172, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -5.095156192779541, "rewards/margins": 4.4331865310668945, "rewards/rejected": -9.528343200683594, "step": 550 }, { "epoch": 0.45790404828806325, "eval_logits/chosen": 2.5644595623016357, "eval_logits/rejected": 2.6437506675720215, "eval_logps/chosen": -3.1958370208740234, "eval_logps/rejected": -6.542325496673584, "eval_loss": 0.28538385033607483, "eval_rewards/accuracies": 0.918367326259613, "eval_rewards/chosen": -4.793755054473877, "eval_rewards/margins": 5.0197319984436035, "eval_rewards/rejected": -9.813486099243164, "eval_runtime": 29.0252, "eval_samples_per_second": 26.77, "eval_steps_per_second": 3.376, "step": 550 }, { "epoch": 0.46622957643875534, "grad_norm": 2.0929551124572754, "learning_rate": 3.4687889661302577e-06, "logits/chosen": 2.497122287750244, "logits/rejected": 2.1119792461395264, "logps/chosen": -3.586158037185669, "logps/rejected": -6.939994812011719, "loss": 0.2826, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.379237174987793, "rewards/margins": 5.030755043029785, "rewards/rejected": -10.409992218017578, "step": 560 }, { "epoch": 0.4745551045894474, "grad_norm": 3.344160556793213, "learning_rate": 3.4203113817116955e-06, "logits/chosen": 3.181488275527954, "logits/rejected": 2.8188672065734863, "logps/chosen": -3.465902328491211, "logps/rejected": -6.737443447113037, "loss": 0.3027, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.198853492736816, "rewards/margins": 4.90731143951416, "rewards/rejected": -10.106164932250977, "step": 570 }, { "epoch": 0.4828806327401394, "grad_norm": 6.381539344787598, "learning_rate": 3.3714301183045382e-06, "logits/chosen": 3.8848679065704346, "logits/rejected": 3.54484224319458, "logps/chosen": -3.321965456008911, "logps/rejected": -6.796433448791504, "loss": 0.2619, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.982948303222656, "rewards/margins": 5.211700916290283, "rewards/rejected": -10.194650650024414, "step": 580 }, { "epoch": 0.4912061608908315, "grad_norm": 3.058936834335327, "learning_rate": 3.3221666168464584e-06, "logits/chosen": 2.9645297527313232, "logits/rejected": 2.7630581855773926, "logps/chosen": -3.2019195556640625, "logps/rejected": -6.635239601135254, "loss": 0.2573, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.802879810333252, "rewards/margins": 5.149979114532471, "rewards/rejected": -9.952859878540039, "step": 590 }, { "epoch": 0.49953168904152356, "grad_norm": 4.1828155517578125, "learning_rate": 3.272542485937369e-06, "logits/chosen": 2.696993350982666, "logits/rejected": 2.7842001914978027, "logps/chosen": -3.3624558448791504, "logps/rejected": -6.4542059898376465, "loss": 0.2598, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.043683052062988, "rewards/margins": 4.637625217437744, "rewards/rejected": -9.68130874633789, "step": 600 }, { "epoch": 0.49953168904152356, "eval_logits/chosen": 2.9141366481781006, "eval_logits/rejected": 2.9971513748168945, "eval_logps/chosen": -3.1258208751678467, "eval_logps/rejected": -6.787447452545166, "eval_loss": 0.27035781741142273, "eval_rewards/accuracies": 0.918367326259613, "eval_rewards/chosen": -4.688731670379639, "eval_rewards/margins": 5.492439270019531, "eval_rewards/rejected": -10.181171417236328, "eval_runtime": 29.0227, "eval_samples_per_second": 26.772, "eval_steps_per_second": 3.377, "step": 600 }, { "epoch": 0.5078572171922157, "grad_norm": 3.1104886531829834, "learning_rate": 3.222579492361179e-06, "logits/chosen": 2.582984447479248, "logits/rejected": 2.424341917037964, "logps/chosen": -3.0132031440734863, "logps/rejected": -6.317469596862793, "loss": 0.2598, "rewards/accuracies": 0.875, "rewards/chosen": -4.519804954528809, "rewards/margins": 4.956398963928223, "rewards/rejected": -9.476203918457031, "step": 610 }, { "epoch": 0.5161827453429076, "grad_norm": 12.320380210876465, "learning_rate": 3.1722995515381644e-06, "logits/chosen": 2.1016178131103516, "logits/rejected": 2.345324754714966, "logps/chosen": -3.1399683952331543, "logps/rejected": -7.096994876861572, "loss": 0.2601, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.709952354431152, "rewards/margins": 5.935539722442627, "rewards/rejected": -10.645492553710938, "step": 620 }, { "epoch": 0.5245082734935997, "grad_norm": 2.704423189163208, "learning_rate": 3.121724717912138e-06, "logits/chosen": 2.108675718307495, "logits/rejected": 2.369410991668701, "logps/chosen": -3.6519737243652344, "logps/rejected": -6.964946746826172, "loss": 0.2351, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.477960586547852, "rewards/margins": 4.96945858001709, "rewards/rejected": -10.447419166564941, "step": 630 }, { "epoch": 0.5328338016442918, "grad_norm": 4.401206970214844, "learning_rate": 3.0708771752766397e-06, "logits/chosen": 2.3692595958709717, "logits/rejected": 2.5313620567321777, "logps/chosen": -4.0485663414001465, "logps/rejected": -7.747661590576172, "loss": 0.2265, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.072849750518799, "rewards/margins": 5.548642635345459, "rewards/rejected": -11.621491432189941, "step": 640 }, { "epoch": 0.5411593297949838, "grad_norm": 4.68662166595459, "learning_rate": 3.019779227044398e-06, "logits/chosen": 2.4383034706115723, "logits/rejected": 2.4655585289001465, "logps/chosen": -3.8650074005126953, "logps/rejected": -7.987051963806152, "loss": 0.263, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.797511100769043, "rewards/margins": 6.183066368103027, "rewards/rejected": -11.98057746887207, "step": 650 }, { "epoch": 0.5411593297949838, "eval_logits/chosen": 2.7321341037750244, "eval_logits/rejected": 2.906801700592041, "eval_logps/chosen": -3.7255136966705322, "eval_logps/rejected": -7.620375633239746, "eval_loss": 0.26394686102867126, "eval_rewards/accuracies": 0.9285714030265808, "eval_rewards/chosen": -5.5882697105407715, "eval_rewards/margins": 5.8422932624816895, "eval_rewards/rejected": -11.430564880371094, "eval_runtime": 29.0258, "eval_samples_per_second": 26.769, "eval_steps_per_second": 3.376, "step": 650 }, { "epoch": 0.5494848579456759, "grad_norm": 4.704371929168701, "learning_rate": 2.9684532864643123e-06, "logits/chosen": 2.7277207374572754, "logits/rejected": 2.7106287479400635, "logps/chosen": -3.979590654373169, "logps/rejected": -6.88008975982666, "loss": 0.2933, "rewards/accuracies": 0.875, "rewards/chosen": -5.969386100769043, "rewards/margins": 4.350748062133789, "rewards/rejected": -10.320135116577148, "step": 660 }, { "epoch": 0.557810386096368, "grad_norm": 3.2897160053253174, "learning_rate": 2.9169218667902562e-06, "logits/chosen": 2.207106113433838, "logits/rejected": 2.454056978225708, "logps/chosen": -3.760200023651123, "logps/rejected": -7.504108428955078, "loss": 0.2262, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.6402997970581055, "rewards/margins": 5.615862846374512, "rewards/rejected": -11.256162643432617, "step": 670 }, { "epoch": 0.56613591424706, "grad_norm": 3.6699540615081787, "learning_rate": 2.8652075714060296e-06, "logits/chosen": 2.5904622077941895, "logits/rejected": 2.693467617034912, "logps/chosen": -3.2713139057159424, "logps/rejected": -7.3422722816467285, "loss": 0.2721, "rewards/accuracies": 0.9375, "rewards/chosen": -4.906970500946045, "rewards/margins": 6.106438636779785, "rewards/rejected": -11.013408660888672, "step": 680 }, { "epoch": 0.5744614423977521, "grad_norm": 3.054532289505005, "learning_rate": 2.813333083910761e-06, "logits/chosen": 2.9145145416259766, "logits/rejected": 2.7135214805603027, "logps/chosen": -3.5082690715789795, "logps/rejected": -7.293328762054443, "loss": 0.271, "rewards/accuracies": 0.9375, "rewards/chosen": -5.26240348815918, "rewards/margins": 5.677589416503906, "rewards/rejected": -10.939992904663086, "step": 690 }, { "epoch": 0.5827869705484442, "grad_norm": 3.5161256790161133, "learning_rate": 2.761321158169134e-06, "logits/chosen": 2.915343761444092, "logits/rejected": 2.731520891189575, "logps/chosen": -3.4292550086975098, "logps/rejected": -8.124921798706055, "loss": 0.1985, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.143881797790527, "rewards/margins": 7.043501377105713, "rewards/rejected": -12.187383651733398, "step": 700 }, { "epoch": 0.5827869705484442, "eval_logits/chosen": 2.5902156829833984, "eval_logits/rejected": 2.774846315383911, "eval_logps/chosen": -3.5158140659332275, "eval_logps/rejected": -7.544556140899658, "eval_loss": 0.24698135256767273, "eval_rewards/accuracies": 0.9285714030265808, "eval_rewards/chosen": -5.273721694946289, "eval_rewards/margins": 6.043112754821777, "eval_rewards/rejected": -11.31683349609375, "eval_runtime": 29.0187, "eval_samples_per_second": 26.776, "eval_steps_per_second": 3.377, "step": 700 }, { "epoch": 0.5911124986991362, "grad_norm": 3.2246947288513184, "learning_rate": 2.70919460833079e-06, "logits/chosen": 2.9566922187805176, "logits/rejected": 2.874277353286743, "logps/chosen": -3.772322177886963, "logps/rejected": -7.461319923400879, "loss": 0.2565, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.658483028411865, "rewards/margins": 5.533496856689453, "rewards/rejected": -11.191980361938477, "step": 710 }, { "epoch": 0.5994380268498283, "grad_norm": 4.457447052001953, "learning_rate": 2.6569762988232838e-06, "logits/chosen": 2.653148889541626, "logits/rejected": 2.646437168121338, "logps/chosen": -3.8250937461853027, "logps/rejected": -7.855221748352051, "loss": 0.2244, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.737640857696533, "rewards/margins": 6.045191287994385, "rewards/rejected": -11.782832145690918, "step": 720 }, { "epoch": 0.6077635550005204, "grad_norm": 3.477293014526367, "learning_rate": 2.604689134322999e-06, "logits/chosen": 2.2635607719421387, "logits/rejected": 2.2247064113616943, "logps/chosen": -3.974703550338745, "logps/rejected": -8.289571762084961, "loss": 0.2294, "rewards/accuracies": 0.9375, "rewards/chosen": -5.962055206298828, "rewards/margins": 6.4723029136657715, "rewards/rejected": -12.434357643127441, "step": 730 }, { "epoch": 0.6160890831512124, "grad_norm": 1.6821621656417847, "learning_rate": 2.5523560497083927e-06, "logits/chosen": 1.8432185649871826, "logits/rejected": 1.9002739191055298, "logps/chosen": -3.8650963306427, "logps/rejected": -7.553779602050781, "loss": 0.2221, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -5.79764461517334, "rewards/margins": 5.533024787902832, "rewards/rejected": -11.330669403076172, "step": 740 }, { "epoch": 0.6244146113019045, "grad_norm": 24.729644775390625, "learning_rate": 2.5e-06, "logits/chosen": 2.5135562419891357, "logits/rejected": 2.6035869121551514, "logps/chosen": -3.6619372367858887, "logps/rejected": -7.801999568939209, "loss": 0.2724, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.492905616760254, "rewards/margins": 6.2100934982299805, "rewards/rejected": -11.702998161315918, "step": 750 }, { "epoch": 0.6244146113019045, "eval_logits/chosen": 2.876950979232788, "eval_logits/rejected": 3.0243964195251465, "eval_logps/chosen": -3.517216682434082, "eval_logps/rejected": -7.607268810272217, "eval_loss": 0.24484822154045105, "eval_rewards/accuracies": 0.9387755393981934, "eval_rewards/chosen": -5.275824546813965, "eval_rewards/margins": 6.135078430175781, "eval_rewards/rejected": -11.410903930664062, "eval_runtime": 28.9129, "eval_samples_per_second": 26.874, "eval_steps_per_second": 3.389, "step": 750 }, { "epoch": 0.6327401394525966, "grad_norm": 9.702905654907227, "learning_rate": 2.447643950291608e-06, "logits/chosen": 2.693587064743042, "logits/rejected": 2.6106948852539062, "logps/chosen": -3.7441153526306152, "logps/rejected": -7.564157009124756, "loss": 0.2506, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.61617374420166, "rewards/margins": 5.730062961578369, "rewards/rejected": -11.346236228942871, "step": 760 }, { "epoch": 0.6410656676032885, "grad_norm": 8.551860809326172, "learning_rate": 2.3953108656770018e-06, "logits/chosen": 2.894711971282959, "logits/rejected": 3.036170482635498, "logps/chosen": -3.972269058227539, "logps/rejected": -8.38014030456543, "loss": 0.2107, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.958403587341309, "rewards/margins": 6.6118059158325195, "rewards/rejected": -12.570208549499512, "step": 770 }, { "epoch": 0.6493911957539806, "grad_norm": 2.4394350051879883, "learning_rate": 2.3430237011767166e-06, "logits/chosen": 3.1415820121765137, "logits/rejected": 3.1218018531799316, "logps/chosen": -4.007376194000244, "logps/rejected": -8.103262901306152, "loss": 0.1886, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.011064052581787, "rewards/margins": 6.1438307762146, "rewards/rejected": -12.154894828796387, "step": 780 }, { "epoch": 0.6577167239046727, "grad_norm": 3.69184947013855, "learning_rate": 2.290805391669212e-06, "logits/chosen": 3.3487350940704346, "logits/rejected": 3.5375237464904785, "logps/chosen": -3.7646141052246094, "logps/rejected": -7.569940090179443, "loss": 0.2106, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.646921634674072, "rewards/margins": 5.707989692687988, "rewards/rejected": -11.354910850524902, "step": 790 }, { "epoch": 0.6660422520553647, "grad_norm": 4.604506015777588, "learning_rate": 2.238678841830867e-06, "logits/chosen": 3.159898519515991, "logits/rejected": 3.09334135055542, "logps/chosen": -4.009636878967285, "logps/rejected": -7.4454545974731445, "loss": 0.2379, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.014455318450928, "rewards/margins": 5.1537251472473145, "rewards/rejected": -11.168180465698242, "step": 800 }, { "epoch": 0.6660422520553647, "eval_logits/chosen": 2.748328924179077, "eval_logits/rejected": 2.9500906467437744, "eval_logps/chosen": -3.652164936065674, "eval_logps/rejected": -7.951470375061035, "eval_loss": 0.23568958044052124, "eval_rewards/accuracies": 0.9387755393981934, "eval_rewards/chosen": -5.478247165679932, "eval_rewards/margins": 6.448958396911621, "eval_rewards/rejected": -11.927205085754395, "eval_runtime": 29.021, "eval_samples_per_second": 26.774, "eval_steps_per_second": 3.377, "step": 800 }, { "epoch": 0.6743677802060568, "grad_norm": 3.968970537185669, "learning_rate": 2.186666916089239e-06, "logits/chosen": 2.384208917617798, "logits/rejected": 2.3336739540100098, "logps/chosen": -3.8832621574401855, "logps/rejected": -7.72598123550415, "loss": 0.2706, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.824892520904541, "rewards/margins": 5.764077663421631, "rewards/rejected": -11.588971138000488, "step": 810 }, { "epoch": 0.6826933083567489, "grad_norm": 3.6892929077148438, "learning_rate": 2.134792428593971e-06, "logits/chosen": 3.5869107246398926, "logits/rejected": 3.517749786376953, "logps/chosen": -3.306342363357544, "logps/rejected": -7.020272254943848, "loss": 0.2398, "rewards/accuracies": 0.875, "rewards/chosen": -4.9595136642456055, "rewards/margins": 5.570894718170166, "rewards/rejected": -10.530407905578613, "step": 820 }, { "epoch": 0.6910188365074409, "grad_norm": 4.89448881149292, "learning_rate": 2.0830781332097446e-06, "logits/chosen": 2.5076346397399902, "logits/rejected": 2.3836727142333984, "logps/chosen": -3.843027114868164, "logps/rejected": -7.852384090423584, "loss": 0.2116, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.764540672302246, "rewards/margins": 6.014035701751709, "rewards/rejected": -11.77857494354248, "step": 830 }, { "epoch": 0.699344364658133, "grad_norm": 8.198432922363281, "learning_rate": 2.031546713535688e-06, "logits/chosen": 2.5533287525177, "logits/rejected": 2.407637357711792, "logps/chosen": -3.574105739593506, "logps/rejected": -8.23727798461914, "loss": 0.2415, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.361158847808838, "rewards/margins": 6.994758605957031, "rewards/rejected": -12.355916976928711, "step": 840 }, { "epoch": 0.7076698928088251, "grad_norm": 4.123171329498291, "learning_rate": 1.9802207729556023e-06, "logits/chosen": 2.4909422397613525, "logits/rejected": 2.3119165897369385, "logps/chosen": -3.927218198776245, "logps/rejected": -7.961021423339844, "loss": 0.2217, "rewards/accuracies": 0.9375, "rewards/chosen": -5.89082670211792, "rewards/margins": 6.050704002380371, "rewards/rejected": -11.94153118133545, "step": 850 }, { "epoch": 0.7076698928088251, "eval_logits/chosen": 2.858954668045044, "eval_logits/rejected": 3.012629270553589, "eval_logps/chosen": -3.577458381652832, "eval_logps/rejected": -7.837220668792725, "eval_loss": 0.23848077654838562, "eval_rewards/accuracies": 0.9387755393981934, "eval_rewards/chosen": -5.36618709564209, "eval_rewards/margins": 6.389642715454102, "eval_rewards/rejected": -11.755829811096191, "eval_runtime": 29.02, "eval_samples_per_second": 26.775, "eval_steps_per_second": 3.377, "step": 850 }, { "epoch": 0.7159954209595171, "grad_norm": 3.4179177284240723, "learning_rate": 1.9291228247233607e-06, "logits/chosen": 2.535378932952881, "logits/rejected": 2.5335640907287598, "logps/chosen": -3.541815996170044, "logps/rejected": -7.519083499908447, "loss": 0.2167, "rewards/accuracies": 0.9375, "rewards/chosen": -5.312723159790039, "rewards/margins": 5.965902328491211, "rewards/rejected": -11.27862548828125, "step": 860 }, { "epoch": 0.7243209491102092, "grad_norm": 1.8562341928482056, "learning_rate": 1.8782752820878636e-06, "logits/chosen": 3.0650887489318848, "logits/rejected": 2.7918925285339355, "logps/chosen": -3.791342258453369, "logps/rejected": -7.656645774841309, "loss": 0.1925, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.687013149261475, "rewards/margins": 5.7979559898376465, "rewards/rejected": -11.484968185424805, "step": 870 }, { "epoch": 0.7326464772609013, "grad_norm": 9.719799995422363, "learning_rate": 1.827700448461836e-06, "logits/chosen": 2.4594621658325195, "logits/rejected": 2.4324564933776855, "logps/chosen": -3.6558470726013184, "logps/rejected": -8.101290702819824, "loss": 0.1975, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.483770847320557, "rewards/margins": 6.6681647300720215, "rewards/rejected": -12.151935577392578, "step": 880 }, { "epoch": 0.7409720054115932, "grad_norm": 3.240176200866699, "learning_rate": 1.7774205076388207e-06, "logits/chosen": 2.689762592315674, "logits/rejected": 2.553614616394043, "logps/chosen": -3.3837451934814453, "logps/rejected": -7.7740020751953125, "loss": 0.177, "rewards/accuracies": 1.0, "rewards/chosen": -5.075617790222168, "rewards/margins": 6.585384368896484, "rewards/rejected": -11.661002159118652, "step": 890 }, { "epoch": 0.7492975335622853, "grad_norm": 3.8752946853637695, "learning_rate": 1.7274575140626318e-06, "logits/chosen": 3.2561440467834473, "logits/rejected": 3.13822603225708, "logps/chosen": -3.69258451461792, "logps/rejected": -7.472433567047119, "loss": 0.213, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.538876533508301, "rewards/margins": 5.669772624969482, "rewards/rejected": -11.208650588989258, "step": 900 }, { "epoch": 0.7492975335622853, "eval_logits/chosen": 2.8268215656280518, "eval_logits/rejected": 3.031662702560425, "eval_logps/chosen": -3.6311440467834473, "eval_logps/rejected": -8.067394256591797, "eval_loss": 0.23127013444900513, "eval_rewards/accuracies": 0.9285714030265808, "eval_rewards/chosen": -5.44671630859375, "eval_rewards/margins": 6.654376029968262, "eval_rewards/rejected": -12.101091384887695, "eval_runtime": 29.022, "eval_samples_per_second": 26.773, "eval_steps_per_second": 3.377, "step": 900 }, { "epoch": 0.7576230617129774, "grad_norm": 3.4024012088775635, "learning_rate": 1.677833383153542e-06, "logits/chosen": 2.0691773891448975, "logits/rejected": 2.190563201904297, "logps/chosen": -3.483668565750122, "logps/rejected": -8.020956039428711, "loss": 0.198, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.225502967834473, "rewards/margins": 6.805932521820068, "rewards/rejected": -12.0314359664917, "step": 910 }, { "epoch": 0.7659485898636694, "grad_norm": 4.999133586883545, "learning_rate": 1.6285698816954626e-06, "logits/chosen": 2.4453094005584717, "logits/rejected": 2.440931558609009, "logps/chosen": -4.1138916015625, "logps/rejected": -8.617280960083008, "loss": 0.253, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.170836925506592, "rewards/margins": 6.7550835609436035, "rewards/rejected": -12.925920486450195, "step": 920 }, { "epoch": 0.7742741180143615, "grad_norm": 3.1391687393188477, "learning_rate": 1.5796886182883053e-06, "logits/chosen": 2.892235517501831, "logits/rejected": 2.8754334449768066, "logps/chosen": -3.8762309551239014, "logps/rejected": -7.991665840148926, "loss": 0.2171, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.814346790313721, "rewards/margins": 6.173151969909668, "rewards/rejected": -11.98749828338623, "step": 930 }, { "epoch": 0.7825996461650536, "grad_norm": 6.850193023681641, "learning_rate": 1.5312110338697427e-06, "logits/chosen": 3.0068447589874268, "logits/rejected": 3.0385780334472656, "logps/chosen": -3.7039177417755127, "logps/rejected": -8.53662109375, "loss": 0.1907, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.555876731872559, "rewards/margins": 7.2490553855896, "rewards/rejected": -12.804931640625, "step": 940 }, { "epoch": 0.7909251743157456, "grad_norm": 16.202392578125, "learning_rate": 1.4831583923105e-06, "logits/chosen": 2.445254325866699, "logits/rejected": 2.6017098426818848, "logps/chosen": -4.0695037841796875, "logps/rejected": -8.545947074890137, "loss": 0.2033, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.104255676269531, "rewards/margins": 6.714664459228516, "rewards/rejected": -12.818921089172363, "step": 950 }, { "epoch": 0.7909251743157456, "eval_logits/chosen": 2.7845335006713867, "eval_logits/rejected": 3.037020206451416, "eval_logps/chosen": -3.982541799545288, "eval_logps/rejected": -8.498592376708984, "eval_loss": 0.22774070501327515, "eval_rewards/accuracies": 0.9387755393981934, "eval_rewards/chosen": -5.973812580108643, "eval_rewards/margins": 6.77407693862915, "eval_rewards/rejected": -12.747888565063477, "eval_runtime": 29.0201, "eval_samples_per_second": 26.775, "eval_steps_per_second": 3.377, "step": 950 }, { "epoch": 0.7992507024664377, "grad_norm": 4.31044864654541, "learning_rate": 1.4355517710873184e-06, "logits/chosen": 2.1485049724578857, "logits/rejected": 2.493374824523926, "logps/chosen": -3.8115482330322266, "logps/rejected": -8.553500175476074, "loss": 0.2109, "rewards/accuracies": 0.9375, "rewards/chosen": -5.71732234954834, "rewards/margins": 7.112928867340088, "rewards/rejected": -12.83025074005127, "step": 960 }, { "epoch": 0.8075762306171298, "grad_norm": 4.177423000335693, "learning_rate": 1.388412052037682e-06, "logits/chosen": 2.9300179481506348, "logits/rejected": 2.9548909664154053, "logps/chosen": -3.9784176349639893, "logps/rejected": -8.308394432067871, "loss": 0.2012, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.967626094818115, "rewards/margins": 6.49496603012085, "rewards/rejected": -12.462592124938965, "step": 970 }, { "epoch": 0.8159017587678218, "grad_norm": 4.683027744293213, "learning_rate": 1.3417599122003464e-06, "logits/chosen": 2.5800061225891113, "logits/rejected": 2.526090145111084, "logps/chosen": -3.86810564994812, "logps/rejected": -8.47614574432373, "loss": 0.2141, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.802158355712891, "rewards/margins": 6.9120612144470215, "rewards/rejected": -12.714218139648438, "step": 980 }, { "epoch": 0.8242272869185139, "grad_norm": 3.7419984340667725, "learning_rate": 1.2956158147457116e-06, "logits/chosen": 3.4706058502197266, "logits/rejected": 3.4088757038116455, "logps/chosen": -4.216760158538818, "logps/rejected": -8.575207710266113, "loss": 0.2422, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.325140953063965, "rewards/margins": 6.537671089172363, "rewards/rejected": -12.862811088562012, "step": 990 }, { "epoch": 0.832552815069206, "grad_norm": 8.953512191772461, "learning_rate": 1.2500000000000007e-06, "logits/chosen": 2.9276206493377686, "logits/rejected": 2.946265459060669, "logps/chosen": -3.9976966381073, "logps/rejected": -8.48410701751709, "loss": 0.2139, "rewards/accuracies": 0.9375, "rewards/chosen": -5.996545314788818, "rewards/margins": 6.729616641998291, "rewards/rejected": -12.726162910461426, "step": 1000 }, { "epoch": 0.832552815069206, "eval_logits/chosen": 2.9153146743774414, "eval_logits/rejected": 3.0989012718200684, "eval_logps/chosen": -3.6678271293640137, "eval_logps/rejected": -8.173608779907227, "eval_loss": 0.22841480374336243, "eval_rewards/accuracies": 0.9285714030265808, "eval_rewards/chosen": -5.5017409324646, "eval_rewards/margins": 6.758671760559082, "eval_rewards/rejected": -12.26041316986084, "eval_runtime": 29.0504, "eval_samples_per_second": 26.747, "eval_steps_per_second": 3.373, "step": 1000 }, { "epoch": 0.840878343219898, "grad_norm": 4.267103672027588, "learning_rate": 1.204932476567175e-06, "logits/chosen": 2.3057751655578613, "logits/rejected": 2.3750340938568115, "logps/chosen": -3.576403856277466, "logps/rejected": -8.301278114318848, "loss": 0.2211, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.364605903625488, "rewards/margins": 7.087311744689941, "rewards/rejected": -12.45191764831543, "step": 1010 }, { "epoch": 0.8492038713705901, "grad_norm": 6.008708477020264, "learning_rate": 1.160433012552508e-06, "logits/chosen": 3.0167624950408936, "logits/rejected": 2.817478895187378, "logps/chosen": -4.053152084350586, "logps/rejected": -8.841009140014648, "loss": 0.195, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.079728126525879, "rewards/margins": 7.181784152984619, "rewards/rejected": -13.261512756347656, "step": 1020 }, { "epoch": 0.8575293995212822, "grad_norm": 3.7652032375335693, "learning_rate": 1.11652112689164e-06, "logits/chosen": 2.3387794494628906, "logits/rejected": 2.420820474624634, "logps/chosen": -4.114675045013428, "logps/rejected": -8.801934242248535, "loss": 0.2226, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.172013282775879, "rewards/margins": 7.030886650085449, "rewards/rejected": -13.202900886535645, "step": 1030 }, { "epoch": 0.8658549276719741, "grad_norm": 3.811018466949463, "learning_rate": 1.073216080788921e-06, "logits/chosen": 3.4545624256134033, "logits/rejected": 2.934145212173462, "logps/chosen": -3.841254472732544, "logps/rejected": -8.547441482543945, "loss": 0.2039, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.7618818283081055, "rewards/margins": 7.059278964996338, "rewards/rejected": -12.821161270141602, "step": 1040 }, { "epoch": 0.8741804558226662, "grad_norm": 3.5039620399475098, "learning_rate": 1.0305368692688175e-06, "logits/chosen": 1.9293429851531982, "logits/rejected": 2.530273914337158, "logps/chosen": -3.6742749214172363, "logps/rejected": -8.751821517944336, "loss": 0.2168, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.511412143707275, "rewards/margins": 7.616321563720703, "rewards/rejected": -13.127734184265137, "step": 1050 }, { "epoch": 0.8741804558226662, "eval_logits/chosen": 2.9263997077941895, "eval_logits/rejected": 3.1277804374694824, "eval_logps/chosen": -3.6399991512298584, "eval_logps/rejected": -8.258426666259766, "eval_loss": 0.2207891196012497, "eval_rewards/accuracies": 0.9387755393981934, "eval_rewards/chosen": -5.45999813079834, "eval_rewards/margins": 6.92764139175415, "eval_rewards/rejected": -12.387639045715332, "eval_runtime": 29.0247, "eval_samples_per_second": 26.77, "eval_steps_per_second": 3.376, "step": 1050 }, { "epoch": 0.8825059839733583, "grad_norm": 5.916813373565674, "learning_rate": 9.88502212844063e-07, "logits/chosen": 3.1234467029571533, "logits/rejected": 3.058065891265869, "logps/chosen": -3.8148319721221924, "logps/rejected": -8.512906074523926, "loss": 0.2123, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.722247123718262, "rewards/margins": 7.047112464904785, "rewards/rejected": -12.769360542297363, "step": 1060 }, { "epoch": 0.8908315121240503, "grad_norm": 1.9670017957687378, "learning_rate": 9.471305493042243e-07, "logits/chosen": 3.48276948928833, "logits/rejected": 2.9211738109588623, "logps/chosen": -4.147209167480469, "logps/rejected": -8.622703552246094, "loss": 0.1951, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.220814228057861, "rewards/margins": 6.7132415771484375, "rewards/rejected": -12.934056282043457, "step": 1070 }, { "epoch": 0.8991570402747424, "grad_norm": 4.044788837432861, "learning_rate": 9.064400256282757e-07, "logits/chosen": 1.3344472646713257, "logits/rejected": 2.0601110458374023, "logps/chosen": -4.18247127532959, "logps/rejected": -9.543882369995117, "loss": 0.2129, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.273707389831543, "rewards/margins": 8.042116165161133, "rewards/rejected": -14.315823554992676, "step": 1080 }, { "epoch": 0.9074825684254345, "grad_norm": 8.981308937072754, "learning_rate": 8.664484900247363e-07, "logits/chosen": 3.2444870471954346, "logits/rejected": 3.3333630561828613, "logps/chosen": -3.4744930267333984, "logps/rejected": -8.478456497192383, "loss": 0.1827, "rewards/accuracies": 0.9375, "rewards/chosen": -5.2117390632629395, "rewards/margins": 7.50594425201416, "rewards/rejected": -12.717683792114258, "step": 1090 }, { "epoch": 0.9158080965761265, "grad_norm": 5.613018035888672, "learning_rate": 8.271734841028553e-07, "logits/chosen": 3.114234209060669, "logits/rejected": 2.8669419288635254, "logps/chosen": -3.5276169776916504, "logps/rejected": -7.26898193359375, "loss": 0.1883, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.291424751281738, "rewards/margins": 5.6120476722717285, "rewards/rejected": -10.903471946716309, "step": 1100 }, { "epoch": 0.9158080965761265, "eval_logits/chosen": 2.896069288253784, "eval_logits/rejected": 3.120903730392456, "eval_logps/chosen": -3.6780893802642822, "eval_logps/rejected": -8.3290376663208, "eval_loss": 0.21761386096477509, "eval_rewards/accuracies": 0.9285714030265808, "eval_rewards/chosen": -5.517134189605713, "eval_rewards/margins": 6.9764227867126465, "eval_rewards/rejected": -12.493557929992676, "eval_runtime": 29.0228, "eval_samples_per_second": 26.772, "eval_steps_per_second": 3.377, "step": 1100 }, { "epoch": 0.9241336247268186, "grad_norm": 6.375245571136475, "learning_rate": 7.886322351782782e-07, "logits/chosen": 2.5309860706329346, "logits/rejected": 2.6255879402160645, "logps/chosen": -4.059412002563477, "logps/rejected": -9.413751602172852, "loss": 0.2197, "rewards/accuracies": 0.9375, "rewards/chosen": -6.089118003845215, "rewards/margins": 8.03150749206543, "rewards/rejected": -14.120626449584961, "step": 1110 }, { "epoch": 0.9324591528775107, "grad_norm": 3.5590834617614746, "learning_rate": 7.508416487165862e-07, "logits/chosen": 3.3510899543762207, "logits/rejected": 3.4622738361358643, "logps/chosen": -4.005453586578369, "logps/rejected": -9.220897674560547, "loss": 0.2071, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.008180141448975, "rewards/margins": 7.823166847229004, "rewards/rejected": -13.83134651184082, "step": 1120 }, { "epoch": 0.9407846810282027, "grad_norm": 4.397263050079346, "learning_rate": 7.138183009179922e-07, "logits/chosen": 3.1275603771209717, "logits/rejected": 2.9770944118499756, "logps/chosen": -4.224826812744141, "logps/rejected": -8.15820026397705, "loss": 0.2331, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -6.337240219116211, "rewards/margins": 5.900059700012207, "rewards/rejected": -12.237300872802734, "step": 1130 }, { "epoch": 0.9491102091788948, "grad_norm": 4.102133750915527, "learning_rate": 6.775784314464717e-07, "logits/chosen": 2.8464298248291016, "logits/rejected": 2.4384379386901855, "logps/chosen": -3.9073352813720703, "logps/rejected": -7.805499076843262, "loss": 0.2223, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.8610029220581055, "rewards/margins": 5.847245216369629, "rewards/rejected": -11.708248138427734, "step": 1140 }, { "epoch": 0.9574357373295869, "grad_norm": 4.830289363861084, "learning_rate": 6.421379363065142e-07, "logits/chosen": 2.3135313987731934, "logits/rejected": 2.514207124710083, "logps/chosen": -4.163815498352051, "logps/rejected": -9.554147720336914, "loss": 0.184, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.245722770690918, "rewards/margins": 8.085500717163086, "rewards/rejected": -14.331222534179688, "step": 1150 }, { "epoch": 0.9574357373295869, "eval_logits/chosen": 2.9589338302612305, "eval_logits/rejected": 3.1358554363250732, "eval_logps/chosen": -3.7160890102386475, "eval_logps/rejected": -8.306242942810059, "eval_loss": 0.22364133596420288, "eval_rewards/accuracies": 0.9489796161651611, "eval_rewards/chosen": -5.57413387298584, "eval_rewards/margins": 6.8852314949035645, "eval_rewards/rejected": -12.45936393737793, "eval_runtime": 29.0274, "eval_samples_per_second": 26.768, "eval_steps_per_second": 3.376, "step": 1150 }, { "epoch": 0.9657612654802789, "grad_norm": 3.550083637237549, "learning_rate": 6.075123608706093e-07, "logits/chosen": 3.0239412784576416, "logits/rejected": 3.124316930770874, "logps/chosen": -3.9214415550231934, "logps/rejected": -8.38886547088623, "loss": 0.2249, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.882162094116211, "rewards/margins": 6.701135158538818, "rewards/rejected": -12.583298683166504, "step": 1160 }, { "epoch": 0.974086793630971, "grad_norm": 6.0046515464782715, "learning_rate": 5.737168930605272e-07, "logits/chosen": 3.2713863849639893, "logits/rejected": 3.0897414684295654, "logps/chosen": -3.4735615253448486, "logps/rejected": -8.098161697387695, "loss": 0.1977, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.2103424072265625, "rewards/margins": 6.936898708343506, "rewards/rejected": -12.147241592407227, "step": 1170 }, { "epoch": 0.982412321781663, "grad_norm": 4.833160400390625, "learning_rate": 5.407663566854008e-07, "logits/chosen": 2.0586276054382324, "logits/rejected": 2.014996290206909, "logps/chosen": -4.311732292175293, "logps/rejected": -9.875633239746094, "loss": 0.2184, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.467599391937256, "rewards/margins": 8.345849990844727, "rewards/rejected": -14.813450813293457, "step": 1180 }, { "epoch": 0.990737849932355, "grad_norm": 3.9890189170837402, "learning_rate": 5.086752049395094e-07, "logits/chosen": 3.0970911979675293, "logits/rejected": 2.8563153743743896, "logps/chosen": -3.985518217086792, "logps/rejected": -8.199251174926758, "loss": 0.2178, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.978277683258057, "rewards/margins": 6.320598125457764, "rewards/rejected": -12.29887580871582, "step": 1190 }, { "epoch": 0.9990633780830471, "grad_norm": 25.71741485595703, "learning_rate": 4.774575140626317e-07, "logits/chosen": 2.927126884460449, "logits/rejected": 2.802952527999878, "logps/chosen": -3.5128173828125, "logps/rejected": -8.298576354980469, "loss": 0.1799, "rewards/accuracies": 0.9375, "rewards/chosen": -5.269227027893066, "rewards/margins": 7.1786394119262695, "rewards/rejected": -12.44786548614502, "step": 1200 }, { "epoch": 0.9990633780830471, "eval_logits/chosen": 2.952441692352295, "eval_logits/rejected": 3.141965389251709, "eval_logps/chosen": -3.6855292320251465, "eval_logps/rejected": -8.327260971069336, "eval_loss": 0.21951240301132202, "eval_rewards/accuracies": 0.9489796161651611, "eval_rewards/chosen": -5.528294086456299, "eval_rewards/margins": 6.962599277496338, "eval_rewards/rejected": -12.490893363952637, "eval_runtime": 29.0225, "eval_samples_per_second": 26.772, "eval_steps_per_second": 3.377, "step": 1200 }, { "epoch": 1.0066604225205535, "grad_norm": 1.9987434148788452, "learning_rate": 4.4712697716573994e-07, "logits/chosen": 2.8181862831115723, "logits/rejected": 2.7676520347595215, "logps/chosen": -4.099400043487549, "logps/rejected": -9.125377655029297, "loss": 0.1878, "rewards/accuracies": 0.9452054500579834, "rewards/chosen": -6.149099826812744, "rewards/margins": 7.538967132568359, "rewards/rejected": -13.688066482543945, "step": 1210 }, { "epoch": 1.0149859506712458, "grad_norm": 3.452667474746704, "learning_rate": 4.1769689822475147e-07, "logits/chosen": 2.893108367919922, "logits/rejected": 2.860814332962036, "logps/chosen": -3.798161268234253, "logps/rejected": -7.9546356201171875, "loss": 0.1861, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.697242259979248, "rewards/margins": 6.23471212387085, "rewards/rejected": -11.931954383850098, "step": 1220 }, { "epoch": 1.0233114788219377, "grad_norm": 3.3297345638275146, "learning_rate": 3.891801862449629e-07, "logits/chosen": 2.500004529953003, "logits/rejected": 2.4870145320892334, "logps/chosen": -3.8301002979278564, "logps/rejected": -8.578625679016113, "loss": 0.2212, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.745150566101074, "rewards/margins": 7.1227874755859375, "rewards/rejected": -12.867938041687012, "step": 1230 }, { "epoch": 1.0316370069726297, "grad_norm": 5.022886276245117, "learning_rate": 3.615893495987335e-07, "logits/chosen": 2.6115026473999023, "logits/rejected": 2.8262619972229004, "logps/chosen": -3.562458038330078, "logps/rejected": -8.772577285766602, "loss": 0.1869, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.343687057495117, "rewards/margins": 7.815177917480469, "rewards/rejected": -13.15886402130127, "step": 1240 }, { "epoch": 1.039962535123322, "grad_norm": 5.58774995803833, "learning_rate": 3.3493649053890325e-07, "logits/chosen": 2.9379220008850098, "logits/rejected": 2.9155845642089844, "logps/chosen": -3.8242735862731934, "logps/rejected": -8.058219909667969, "loss": 0.1817, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.736410140991211, "rewards/margins": 6.3509202003479, "rewards/rejected": -12.08733081817627, "step": 1250 }, { "epoch": 1.039962535123322, "eval_logits/chosen": 2.932387113571167, "eval_logits/rejected": 3.14872407913208, "eval_logps/chosen": -3.7640583515167236, "eval_logps/rejected": -8.459839820861816, "eval_loss": 0.21760709583759308, "eval_rewards/accuracies": 0.9489796161651611, "eval_rewards/chosen": -5.646087169647217, "eval_rewards/margins": 7.043673038482666, "eval_rewards/rejected": -12.689759254455566, "eval_runtime": 29.0272, "eval_samples_per_second": 26.768, "eval_steps_per_second": 3.376, "step": 1250 }, { "epoch": 1.048288063274014, "grad_norm": 19.32095718383789, "learning_rate": 3.092332998903416e-07, "logits/chosen": 2.834139585494995, "logits/rejected": 2.8379931449890137, "logps/chosen": -3.8233566284179688, "logps/rejected": -8.284521102905273, "loss": 0.2106, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.735034942626953, "rewards/margins": 6.691746711730957, "rewards/rejected": -12.426782608032227, "step": 1260 }, { "epoch": 1.056613591424706, "grad_norm": 6.384090423583984, "learning_rate": 2.844910519219632e-07, "logits/chosen": 1.747768759727478, "logits/rejected": 1.7704150676727295, "logps/chosen": -3.8434691429138184, "logps/rejected": -8.780694961547852, "loss": 0.1793, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.765204429626465, "rewards/margins": 7.4058380126953125, "rewards/rejected": -13.171041488647461, "step": 1270 }, { "epoch": 1.064939119575398, "grad_norm": 6.131472110748291, "learning_rate": 2.6072059940146775e-07, "logits/chosen": 2.4055263996124268, "logits/rejected": 2.5173308849334717, "logps/chosen": -4.25059700012207, "logps/rejected": -9.270018577575684, "loss": 0.1917, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.375895023345947, "rewards/margins": 7.529131889343262, "rewards/rejected": -13.905027389526367, "step": 1280 }, { "epoch": 1.07326464772609, "grad_norm": 2.4371416568756104, "learning_rate": 2.3793236883495164e-07, "logits/chosen": 2.3893160820007324, "logits/rejected": 2.3759751319885254, "logps/chosen": -3.7604167461395264, "logps/rejected": -8.331053733825684, "loss": 0.1773, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.640625953674316, "rewards/margins": 6.855956077575684, "rewards/rejected": -12.49658203125, "step": 1290 }, { "epoch": 1.0815901758767823, "grad_norm": 4.089534759521484, "learning_rate": 2.1613635589349756e-07, "logits/chosen": 2.4751062393188477, "logits/rejected": 2.5714690685272217, "logps/chosen": -4.401629447937012, "logps/rejected": -9.281499862670898, "loss": 0.1906, "rewards/accuracies": 0.9375, "rewards/chosen": -6.602444648742676, "rewards/margins": 7.319806098937988, "rewards/rejected": -13.922250747680664, "step": 1300 }, { "epoch": 1.0815901758767823, "eval_logits/chosen": 2.9346606731414795, "eval_logits/rejected": 3.165830135345459, "eval_logps/chosen": -3.765430450439453, "eval_logps/rejected": -8.493232727050781, "eval_loss": 0.2158125936985016, "eval_rewards/accuracies": 0.9489796161651611, "eval_rewards/chosen": -5.648146152496338, "eval_rewards/margins": 7.091703414916992, "eval_rewards/rejected": -12.739849090576172, "eval_runtime": 29.0214, "eval_samples_per_second": 26.773, "eval_steps_per_second": 3.377, "step": 1300 }, { "epoch": 1.0899157040274743, "grad_norm": 9.074567794799805, "learning_rate": 1.95342121028749e-07, "logits/chosen": 3.50212025642395, "logits/rejected": 3.3719208240509033, "logps/chosen": -4.220685005187988, "logps/rejected": -9.281393051147461, "loss": 0.1974, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.331027030944824, "rewards/margins": 7.591064453125, "rewards/rejected": -13.922090530395508, "step": 1310 }, { "epoch": 1.0982412321781663, "grad_norm": 1.6737520694732666, "learning_rate": 1.7555878527937164e-07, "logits/chosen": 2.82517147064209, "logits/rejected": 3.0003697872161865, "logps/chosen": -3.843797206878662, "logps/rejected": -8.62804889678955, "loss": 0.1709, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.765696048736572, "rewards/margins": 7.1763763427734375, "rewards/rejected": -12.942071914672852, "step": 1320 }, { "epoch": 1.1065667603288585, "grad_norm": 3.5915114879608154, "learning_rate": 1.567950262702714e-07, "logits/chosen": 2.2123959064483643, "logits/rejected": 2.28818941116333, "logps/chosen": -3.5982773303985596, "logps/rejected": -8.883399963378906, "loss": 0.2119, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.397416114807129, "rewards/margins": 7.927683353424072, "rewards/rejected": -13.325098991394043, "step": 1330 }, { "epoch": 1.1148922884795505, "grad_norm": 2.5043845176696777, "learning_rate": 1.3905907440629752e-07, "logits/chosen": 3.32414174079895, "logits/rejected": 3.320385694503784, "logps/chosen": -4.185269355773926, "logps/rejected": -8.967876434326172, "loss": 0.1964, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.2779035568237305, "rewards/margins": 7.173911094665527, "rewards/rejected": -13.451814651489258, "step": 1340 }, { "epoch": 1.1232178166302424, "grad_norm": 2.6171875, "learning_rate": 1.223587092621162e-07, "logits/chosen": 2.969780445098877, "logits/rejected": 2.9839043617248535, "logps/chosen": -3.973475933074951, "logps/rejected": -8.792182922363281, "loss": 0.1795, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.960214138031006, "rewards/margins": 7.228060722351074, "rewards/rejected": -13.188275337219238, "step": 1350 }, { "epoch": 1.1232178166302424, "eval_logits/chosen": 2.952848196029663, "eval_logits/rejected": 3.1750388145446777, "eval_logps/chosen": -3.727048873901367, "eval_logps/rejected": -8.468390464782715, "eval_loss": 0.21384017169475555, "eval_rewards/accuracies": 0.9489796161651611, "eval_rewards/chosen": -5.590573787689209, "eval_rewards/margins": 7.1120100021362305, "eval_rewards/rejected": -12.702584266662598, "eval_runtime": 29.0236, "eval_samples_per_second": 26.771, "eval_steps_per_second": 3.377, "step": 1350 }, { "epoch": 1.1315433447809347, "grad_norm": 4.194497585296631, "learning_rate": 1.067012561698319e-07, "logits/chosen": 2.2842764854431152, "logits/rejected": 2.4294095039367676, "logps/chosen": -3.902397871017456, "logps/rejected": -8.514913558959961, "loss": 0.1845, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.8535966873168945, "rewards/margins": 6.918773651123047, "rewards/rejected": -12.772371292114258, "step": 1360 }, { "epoch": 1.1398688729316266, "grad_norm": 4.395818710327148, "learning_rate": 9.209358300585474e-08, "logits/chosen": 2.9209914207458496, "logits/rejected": 3.1523938179016113, "logps/chosen": -3.4997806549072266, "logps/rejected": -8.66450023651123, "loss": 0.219, "rewards/accuracies": 0.9375, "rewards/chosen": -5.24967098236084, "rewards/margins": 7.747079372406006, "rewards/rejected": -12.99675178527832, "step": 1370 }, { "epoch": 1.1481944010823186, "grad_norm": 1.6133110523223877, "learning_rate": 7.854209717842231e-08, "logits/chosen": 2.7593953609466553, "logits/rejected": 2.8030319213867188, "logps/chosen": -4.08266544342041, "logps/rejected": -8.519843101501465, "loss": 0.2086, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -6.123998641967773, "rewards/margins": 6.655766487121582, "rewards/rejected": -12.779764175415039, "step": 1380 }, { "epoch": 1.1565199292330108, "grad_norm": 2.237539768218994, "learning_rate": 6.605274281709929e-08, "logits/chosen": 2.6362128257751465, "logits/rejected": 2.7968506813049316, "logps/chosen": -3.7666690349578857, "logps/rejected": -8.795989990234375, "loss": 0.1914, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.650003910064697, "rewards/margins": 7.543980598449707, "rewards/rejected": -13.193984985351562, "step": 1390 }, { "epoch": 1.1648454573837028, "grad_norm": 5.562379837036133, "learning_rate": 5.463099816548578e-08, "logits/chosen": 2.62992525100708, "logits/rejected": 2.607635736465454, "logps/chosen": -3.903348207473755, "logps/rejected": -8.768390655517578, "loss": 0.2078, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.855021953582764, "rewards/margins": 7.297563076019287, "rewards/rejected": -13.152585983276367, "step": 1400 }, { "epoch": 1.1648454573837028, "eval_logits/chosen": 2.9537577629089355, "eval_logits/rejected": 3.1741788387298584, "eval_logps/chosen": -3.749206781387329, "eval_logps/rejected": -8.490385055541992, "eval_loss": 0.21511533856391907, "eval_rewards/accuracies": 0.9387755393981934, "eval_rewards/chosen": -5.623809337615967, "eval_rewards/margins": 7.111767768859863, "eval_rewards/rejected": -12.735578536987305, "eval_runtime": 29.0212, "eval_samples_per_second": 26.774, "eval_steps_per_second": 3.377, "step": 1400 }, { "epoch": 1.1731709855343948, "grad_norm": 4.330221176147461, "learning_rate": 4.428187317827848e-08, "logits/chosen": 2.4681572914123535, "logits/rejected": 2.5605075359344482, "logps/chosen": -4.081616401672363, "logps/rejected": -8.574247360229492, "loss": 0.2036, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.122424602508545, "rewards/margins": 6.73894739151001, "rewards/rejected": -12.861371994018555, "step": 1410 }, { "epoch": 1.181496513685087, "grad_norm": 5.088175296783447, "learning_rate": 3.5009907323737826e-08, "logits/chosen": 2.8780219554901123, "logits/rejected": 2.9882426261901855, "logps/chosen": -3.9359214305877686, "logps/rejected": -8.451822280883789, "loss": 0.1818, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.903882026672363, "rewards/margins": 6.773850440979004, "rewards/rejected": -12.677732467651367, "step": 1420 }, { "epoch": 1.189822041835779, "grad_norm": 5.896306037902832, "learning_rate": 2.681916759252917e-08, "logits/chosen": 2.7982282638549805, "logits/rejected": 2.849905490875244, "logps/chosen": -4.141797065734863, "logps/rejected": -8.754281044006348, "loss": 0.2398, "rewards/accuracies": 0.9375, "rewards/chosen": -6.212695121765137, "rewards/margins": 6.918726921081543, "rewards/rejected": -13.131421089172363, "step": 1430 }, { "epoch": 1.198147569986471, "grad_norm": 7.218684673309326, "learning_rate": 1.9713246713805588e-08, "logits/chosen": 3.2387046813964844, "logits/rejected": 2.9275758266448975, "logps/chosen": -4.322108268737793, "logps/rejected": -8.867931365966797, "loss": 0.2167, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.483162879943848, "rewards/margins": 6.818734169006348, "rewards/rejected": -13.301897048950195, "step": 1440 }, { "epoch": 1.2064730981371632, "grad_norm": 5.04719352722168, "learning_rate": 1.3695261579316776e-08, "logits/chosen": 3.054384231567383, "logits/rejected": 3.1834464073181152, "logps/chosen": -4.091288089752197, "logps/rejected": -8.895919799804688, "loss": 0.1655, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -6.136931419372559, "rewards/margins": 7.206948280334473, "rewards/rejected": -13.343879699707031, "step": 1450 }, { "epoch": 1.2064730981371632, "eval_logits/chosen": 2.9581472873687744, "eval_logits/rejected": 3.1791799068450928, "eval_logps/chosen": -3.739999771118164, "eval_logps/rejected": -8.482653617858887, "eval_loss": 0.21536922454833984, "eval_rewards/accuracies": 0.9387755393981934, "eval_rewards/chosen": -5.610000133514404, "eval_rewards/margins": 7.113979816436768, "eval_rewards/rejected": -12.723978996276855, "eval_runtime": 29.0197, "eval_samples_per_second": 26.775, "eval_steps_per_second": 3.377, "step": 1450 }, { "epoch": 1.2147986262878552, "grad_norm": 3.047269105911255, "learning_rate": 8.767851876239075e-09, "logits/chosen": 2.667764186859131, "logits/rejected": 2.8484127521514893, "logps/chosen": -4.095530986785889, "logps/rejected": -9.387045860290527, "loss": 0.1886, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.143296241760254, "rewards/margins": 7.937272548675537, "rewards/rejected": -14.08056926727295, "step": 1460 }, { "epoch": 1.2231241544385472, "grad_norm": 3.252483606338501, "learning_rate": 4.933178929321103e-09, "logits/chosen": 3.266549587249756, "logits/rejected": 3.2040858268737793, "logps/chosen": -3.994887113571167, "logps/rejected": -9.390957832336426, "loss": 0.1917, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.992330551147461, "rewards/margins": 8.094103813171387, "rewards/rejected": -14.086435317993164, "step": 1470 }, { "epoch": 1.2314496825892394, "grad_norm": 7.136248588562012, "learning_rate": 2.192924752854042e-09, "logits/chosen": 2.7313778400421143, "logits/rejected": 2.9808616638183594, "logps/chosen": -4.332667350769043, "logps/rejected": -9.161802291870117, "loss": 0.1683, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.499000549316406, "rewards/margins": 7.2437005043029785, "rewards/rejected": -13.742703437805176, "step": 1480 }, { "epoch": 1.2397752107399314, "grad_norm": 3.236614465713501, "learning_rate": 5.48291312886251e-10, "logits/chosen": 2.6956756114959717, "logits/rejected": 2.7045562267303467, "logps/chosen": -3.7984039783477783, "logps/rejected": -9.055222511291504, "loss": 0.1915, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.697606086730957, "rewards/margins": 7.885225772857666, "rewards/rejected": -13.582832336425781, "step": 1490 }, { "epoch": 1.2481007388906233, "grad_norm": 5.244942665100098, "learning_rate": 0.0, "logits/chosen": 2.7035531997680664, "logits/rejected": 2.9069600105285645, "logps/chosen": -3.9928905963897705, "logps/rejected": -8.639092445373535, "loss": 0.2109, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.989335536956787, "rewards/margins": 6.969302177429199, "rewards/rejected": -12.958638191223145, "step": 1500 }, { "epoch": 1.2481007388906233, "eval_logits/chosen": 2.942487955093384, "eval_logits/rejected": 3.1809263229370117, "eval_logps/chosen": -3.7497386932373047, "eval_logps/rejected": -8.50186538696289, "eval_loss": 0.21422816812992096, "eval_rewards/accuracies": 0.9489796161651611, "eval_rewards/chosen": -5.624608039855957, "eval_rewards/margins": 7.128190040588379, "eval_rewards/rejected": -12.752798080444336, "eval_runtime": 29.025, "eval_samples_per_second": 26.77, "eval_steps_per_second": 3.376, "step": 1500 } ], "logging_steps": 10, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.6541963383687086e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }