{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.45790404828806325, "eval_steps": 50, "global_step": 550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00832552815069206, "grad_norm": 0.04514288529753685, "learning_rate": 4.999451708687114e-06, "logits/chosen": 14.412135124206543, "logits/rejected": 14.867518424987793, "logps/chosen": -0.29279541969299316, "logps/rejected": -0.33705300092697144, "loss": 0.9248, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.43919315934181213, "rewards/margins": 0.066386379301548, "rewards/rejected": -0.5055795311927795, "step": 10 }, { "epoch": 0.01665105630138412, "grad_norm": 0.05052826926112175, "learning_rate": 4.997807075247147e-06, "logits/chosen": 14.956459045410156, "logits/rejected": 15.363263130187988, "logps/chosen": -0.3096744120121002, "logps/rejected": -0.36214715242385864, "loss": 0.9355, "rewards/accuracies": 0.5, "rewards/chosen": -0.46451157331466675, "rewards/margins": 0.07870914041996002, "rewards/rejected": -0.5432207584381104, "step": 20 }, { "epoch": 0.024976584452076178, "grad_norm": 0.04879612475633621, "learning_rate": 4.9950668210706795e-06, "logits/chosen": 14.485757827758789, "logits/rejected": 15.057507514953613, "logps/chosen": -0.27136802673339844, "logps/rejected": -0.31497400999069214, "loss": 0.9268, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.4070519804954529, "rewards/margins": 0.06540900468826294, "rewards/rejected": -0.4724610447883606, "step": 30 }, { "epoch": 0.03330211260276824, "grad_norm": 0.05672155320644379, "learning_rate": 4.9912321481237616e-06, "logits/chosen": 14.529332160949707, "logits/rejected": 14.814855575561523, "logps/chosen": -0.29139184951782227, "logps/rejected": -0.31259119510650635, "loss": 0.9267, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.4370877742767334, "rewards/margins": 0.03179898113012314, "rewards/rejected": -0.46888676285743713, "step": 40 }, { "epoch": 0.041627640753460295, "grad_norm": 0.065071240067482, "learning_rate": 4.986304738420684e-06, "logits/chosen": 14.174386978149414, "logits/rejected": 15.223234176635742, "logps/chosen": -0.2745029330253601, "logps/rejected": -0.37693315744400024, "loss": 0.9243, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.41175442934036255, "rewards/margins": 0.1536453813314438, "rewards/rejected": -0.5653998255729675, "step": 50 }, { "epoch": 0.041627640753460295, "eval_logits/chosen": 14.56569766998291, "eval_logits/rejected": 15.157320976257324, "eval_logps/chosen": -0.27527979016304016, "eval_logps/rejected": -0.3633999824523926, "eval_loss": 0.9083622694015503, "eval_rewards/accuracies": 0.5612244606018066, "eval_rewards/chosen": -0.41291970014572144, "eval_rewards/margins": 0.13218028843402863, "eval_rewards/rejected": -0.5450999736785889, "eval_runtime": 29.029, "eval_samples_per_second": 26.766, "eval_steps_per_second": 3.376, "step": 50 }, { "epoch": 0.049953168904152356, "grad_norm": 0.14002270996570587, "learning_rate": 4.980286753286196e-06, "logits/chosen": 14.408930778503418, "logits/rejected": 14.791458129882812, "logps/chosen": -0.285602867603302, "logps/rejected": -0.3351826071739197, "loss": 0.9177, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.4284043312072754, "rewards/margins": 0.07436960190534592, "rewards/rejected": -0.5027738809585571, "step": 60 }, { "epoch": 0.05827869705484442, "grad_norm": 0.05595069006085396, "learning_rate": 4.973180832407471e-06, "logits/chosen": 14.41168212890625, "logits/rejected": 14.865121841430664, "logps/chosen": -0.25851207971572876, "logps/rejected": -0.32240185141563416, "loss": 0.9168, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.3877681493759155, "rewards/margins": 0.0958346277475357, "rewards/rejected": -0.4836028218269348, "step": 70 }, { "epoch": 0.06660422520553648, "grad_norm": 0.058645494282245636, "learning_rate": 4.964990092676263e-06, "logits/chosen": 14.897825241088867, "logits/rejected": 15.01073932647705, "logps/chosen": -0.2668797969818115, "logps/rejected": -0.3204379975795746, "loss": 0.9242, "rewards/accuracies": 0.5, "rewards/chosen": -0.4003197252750397, "rewards/margins": 0.08033724129199982, "rewards/rejected": -0.4806569516658783, "step": 80 }, { "epoch": 0.07492975335622853, "grad_norm": 0.0597861111164093, "learning_rate": 4.9557181268217225e-06, "logits/chosen": 14.531021118164062, "logits/rejected": 14.767858505249023, "logps/chosen": -0.26787540316581726, "logps/rejected": -0.32972821593284607, "loss": 0.9077, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.4018131196498871, "rewards/margins": 0.09277921915054321, "rewards/rejected": -0.4945923686027527, "step": 90 }, { "epoch": 0.08325528150692059, "grad_norm": 0.0863095372915268, "learning_rate": 4.9453690018345144e-06, "logits/chosen": 14.179275512695312, "logits/rejected": 14.909070014953613, "logps/chosen": -0.2532978057861328, "logps/rejected": -0.35474082827568054, "loss": 0.903, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3799467086791992, "rewards/margins": 0.1521645337343216, "rewards/rejected": -0.5321112275123596, "step": 100 }, { "epoch": 0.08325528150692059, "eval_logits/chosen": 14.326024055480957, "eval_logits/rejected": 14.979863166809082, "eval_logps/chosen": -0.2673422694206238, "eval_logps/rejected": -0.3668619990348816, "eval_loss": 0.8989922404289246, "eval_rewards/accuracies": 0.6020408272743225, "eval_rewards/chosen": -0.4010133445262909, "eval_rewards/margins": 0.1492796391248703, "eval_rewards/rejected": -0.5502930283546448, "eval_runtime": 29.0209, "eval_samples_per_second": 26.774, "eval_steps_per_second": 3.377, "step": 100 }, { "epoch": 0.09158080965761266, "grad_norm": 0.07181967049837112, "learning_rate": 4.933947257182901e-06, "logits/chosen": 14.118756294250488, "logits/rejected": 14.755918502807617, "logps/chosen": -0.27995947003364563, "logps/rejected": -0.3749552369117737, "loss": 0.9097, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.41993919014930725, "rewards/margins": 0.14249366521835327, "rewards/rejected": -0.5624328255653381, "step": 110 }, { "epoch": 0.09990633780830471, "grad_norm": 0.08269819617271423, "learning_rate": 4.921457902821578e-06, "logits/chosen": 13.764413833618164, "logits/rejected": 14.43315315246582, "logps/chosen": -0.28177163004875183, "logps/rejected": -0.3637630343437195, "loss": 0.9075, "rewards/accuracies": 0.625, "rewards/chosen": -0.42265743017196655, "rewards/margins": 0.12298711389303207, "rewards/rejected": -0.5456445813179016, "step": 120 }, { "epoch": 0.10823186595899677, "grad_norm": 1.9071497917175293, "learning_rate": 4.907906416994146e-06, "logits/chosen": 14.103793144226074, "logits/rejected": 14.727777481079102, "logps/chosen": -0.2665451765060425, "logps/rejected": -0.3827117085456848, "loss": 0.9217, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3998177647590637, "rewards/margins": 0.1742497682571411, "rewards/rejected": -0.5740675926208496, "step": 130 }, { "epoch": 0.11655739410968884, "grad_norm": 0.12107716500759125, "learning_rate": 4.893298743830168e-06, "logits/chosen": 13.517863273620605, "logits/rejected": 14.42052173614502, "logps/chosen": -0.26627904176712036, "logps/rejected": -0.3745174705982208, "loss": 0.904, "rewards/accuracies": 0.5625, "rewards/chosen": -0.39941853284835815, "rewards/margins": 0.16235767304897308, "rewards/rejected": -0.5617762207984924, "step": 140 }, { "epoch": 0.12488292226038089, "grad_norm": 0.1638205647468567, "learning_rate": 4.8776412907378845e-06, "logits/chosen": 12.83032512664795, "logits/rejected": 13.673515319824219, "logps/chosen": -0.24289576709270477, "logps/rejected": -0.37163227796554565, "loss": 0.8779, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.36434367299079895, "rewards/margins": 0.19310477375984192, "rewards/rejected": -0.5574483871459961, "step": 150 }, { "epoch": 0.12488292226038089, "eval_logits/chosen": 12.317696571350098, "eval_logits/rejected": 13.164616584777832, "eval_logps/chosen": -0.266156405210495, "eval_logps/rejected": -0.4009220004081726, "eval_loss": 0.8768696784973145, "eval_rewards/accuracies": 0.6224489808082581, "eval_rewards/chosen": -0.3992346227169037, "eval_rewards/margins": 0.20214837789535522, "eval_rewards/rejected": -0.6013829708099365, "eval_runtime": 29.0257, "eval_samples_per_second": 26.769, "eval_steps_per_second": 3.376, "step": 150 }, { "epoch": 0.13320845041107296, "grad_norm": 0.1479438841342926, "learning_rate": 4.860940925593703e-06, "logits/chosen": 12.736433029174805, "logits/rejected": 13.475964546203613, "logps/chosen": -0.2913517355918884, "logps/rejected": -0.36094629764556885, "loss": 0.8756, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.43702763319015503, "rewards/margins": 0.10439182817935944, "rewards/rejected": -0.5414193868637085, "step": 160 }, { "epoch": 0.141533978561765, "grad_norm": 0.17609630525112152, "learning_rate": 4.84320497372973e-06, "logits/chosen": 10.606362342834473, "logits/rejected": 11.537567138671875, "logps/chosen": -0.2560296952724457, "logps/rejected": -0.4312233328819275, "loss": 0.8489, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.38404449820518494, "rewards/margins": 0.2627905011177063, "rewards/rejected": -0.6468349695205688, "step": 170 }, { "epoch": 0.14985950671245707, "grad_norm": 0.18054936826229095, "learning_rate": 4.824441214720629e-06, "logits/chosen": 10.13754653930664, "logits/rejected": 10.914222717285156, "logps/chosen": -0.29278701543807983, "logps/rejected": -0.43448886275291443, "loss": 0.8715, "rewards/accuracies": 0.625, "rewards/chosen": -0.43918052315711975, "rewards/margins": 0.21255281567573547, "rewards/rejected": -0.6517333388328552, "step": 180 }, { "epoch": 0.15818503486314914, "grad_norm": 0.19739146530628204, "learning_rate": 4.804657878971252e-06, "logits/chosen": 8.077766418457031, "logits/rejected": 9.669368743896484, "logps/chosen": -0.2844889760017395, "logps/rejected": -0.5050357580184937, "loss": 0.8582, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.42673349380493164, "rewards/margins": 0.3308201730251312, "rewards/rejected": -0.7575536966323853, "step": 190 }, { "epoch": 0.16651056301384118, "grad_norm": 0.2397814244031906, "learning_rate": 4.783863644106502e-06, "logits/chosen": 6.790783882141113, "logits/rejected": 7.849525451660156, "logps/chosen": -0.2940555512905121, "logps/rejected": -0.5699166059494019, "loss": 0.8196, "rewards/accuracies": 0.75, "rewards/chosen": -0.4410833418369293, "rewards/margins": 0.41379159688949585, "rewards/rejected": -0.8548749089241028, "step": 200 }, { "epoch": 0.16651056301384118, "eval_logits/chosen": 6.290835857391357, "eval_logits/rejected": 6.757873058319092, "eval_logps/chosen": -0.317629337310791, "eval_logps/rejected": -0.581989586353302, "eval_loss": 0.8032433986663818, "eval_rewards/accuracies": 0.6734693646430969, "eval_rewards/chosen": -0.47644397616386414, "eval_rewards/margins": 0.39654040336608887, "eval_rewards/rejected": -0.8729843497276306, "eval_runtime": 29.025, "eval_samples_per_second": 26.77, "eval_steps_per_second": 3.376, "step": 200 }, { "epoch": 0.17483609116453325, "grad_norm": 0.2858545184135437, "learning_rate": 4.762067631165049e-06, "logits/chosen": 6.875879764556885, "logits/rejected": 6.691536903381348, "logps/chosen": -0.37194910645484924, "logps/rejected": -0.5639354586601257, "loss": 0.8129, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5579236745834351, "rewards/margins": 0.2879795432090759, "rewards/rejected": -0.8459032773971558, "step": 210 }, { "epoch": 0.18316161931522532, "grad_norm": 0.30206382274627686, "learning_rate": 4.7392794005985324e-06, "logits/chosen": 4.656112194061279, "logits/rejected": 4.483086585998535, "logps/chosen": -0.360150009393692, "logps/rejected": -0.6204283833503723, "loss": 0.7954, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5402250289916992, "rewards/margins": 0.39041754603385925, "rewards/rejected": -0.9306427240371704, "step": 220 }, { "epoch": 0.19148714746591736, "grad_norm": 0.40204310417175293, "learning_rate": 4.715508948078037e-06, "logits/chosen": 3.9398162364959717, "logits/rejected": 3.38537859916687, "logps/chosen": -0.39010342955589294, "logps/rejected": -0.7167688608169556, "loss": 0.7664, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5851551294326782, "rewards/margins": 0.4899981617927551, "rewards/rejected": -1.0751533508300781, "step": 230 }, { "epoch": 0.19981267561660943, "grad_norm": 0.48389795422554016, "learning_rate": 4.690766700109659e-06, "logits/chosen": 2.925476551055908, "logits/rejected": 2.824068069458008, "logps/chosen": -0.41053348779678345, "logps/rejected": -0.8508625030517578, "loss": 0.7606, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6158002018928528, "rewards/margins": 0.6604936718940735, "rewards/rejected": -1.2762939929962158, "step": 240 }, { "epoch": 0.2081382037673015, "grad_norm": 0.6687452793121338, "learning_rate": 4.665063509461098e-06, "logits/chosen": 2.751737594604492, "logits/rejected": 2.2424545288085938, "logps/chosen": -0.4365699291229248, "logps/rejected": -0.8550359606742859, "loss": 0.7234, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6548548936843872, "rewards/margins": 0.6276990175247192, "rewards/rejected": -1.2825539112091064, "step": 250 }, { "epoch": 0.2081382037673015, "eval_logits/chosen": 2.1380228996276855, "eval_logits/rejected": 1.3922746181488037, "eval_logps/chosen": -0.48307570815086365, "eval_logps/rejected": -1.0382359027862549, "eval_loss": 0.668463945388794, "eval_rewards/accuracies": 0.6938775777816772, "eval_rewards/chosen": -0.7246134877204895, "eval_rewards/margins": 0.8327403664588928, "eval_rewards/rejected": -1.5573538541793823, "eval_runtime": 29.0228, "eval_samples_per_second": 26.772, "eval_steps_per_second": 3.377, "step": 250 }, { "epoch": 0.21646373191799353, "grad_norm": 0.7085956335067749, "learning_rate": 4.638410650401267e-06, "logits/chosen": 1.7889283895492554, "logits/rejected": 0.9420136213302612, "logps/chosen": -0.5195389986038208, "logps/rejected": -1.0534025430679321, "loss": 0.6863, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7793084979057312, "rewards/margins": 0.8007953763008118, "rewards/rejected": -1.580103874206543, "step": 260 }, { "epoch": 0.2247892600686856, "grad_norm": 0.4416671097278595, "learning_rate": 4.610819813755038e-06, "logits/chosen": 1.582745909690857, "logits/rejected": 0.3820720911026001, "logps/chosen": -0.5181297063827515, "logps/rejected": -1.2198141813278198, "loss": 0.5809, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7771945595741272, "rewards/margins": 1.0525267124176025, "rewards/rejected": -1.8297210931777954, "step": 270 }, { "epoch": 0.23311478821937767, "grad_norm": 2.7746617794036865, "learning_rate": 4.582303101775249e-06, "logits/chosen": 1.2947760820388794, "logits/rejected": 0.27237796783447266, "logps/chosen": -0.643541693687439, "logps/rejected": -1.7467323541641235, "loss": 0.5775, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9653124809265137, "rewards/margins": 1.6547861099243164, "rewards/rejected": -2.62009859085083, "step": 280 }, { "epoch": 0.2414403163700697, "grad_norm": 0.6444702744483948, "learning_rate": 4.55287302283426e-06, "logits/chosen": 1.2399464845657349, "logits/rejected": 0.22667090594768524, "logps/chosen": -0.7517040967941284, "logps/rejected": -1.9010766744613647, "loss": 0.5314, "rewards/accuracies": 0.625, "rewards/chosen": -1.1275560855865479, "rewards/margins": 1.724058747291565, "rewards/rejected": -2.8516147136688232, "step": 290 }, { "epoch": 0.24976584452076178, "grad_norm": 0.5103917717933655, "learning_rate": 4.522542485937369e-06, "logits/chosen": 1.438954472541809, "logits/rejected": 0.5288833379745483, "logps/chosen": -0.7871009707450867, "logps/rejected": -2.0329811573028564, "loss": 0.5271, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1806514263153076, "rewards/margins": 1.8688204288482666, "rewards/rejected": -3.049471616744995, "step": 300 }, { "epoch": 0.24976584452076178, "eval_logits/chosen": 1.3706706762313843, "eval_logits/rejected": 0.8007871508598328, "eval_logps/chosen": -0.7460500001907349, "eval_logps/rejected": -2.209245443344116, "eval_loss": 0.5008835792541504, "eval_rewards/accuracies": 0.7244898080825806, "eval_rewards/chosen": -1.1190749406814575, "eval_rewards/margins": 2.194793224334717, "eval_rewards/rejected": -3.313868284225464, "eval_runtime": 29.0227, "eval_samples_per_second": 26.772, "eval_steps_per_second": 3.377, "step": 300 }, { "epoch": 0.2580913726714538, "grad_norm": 0.7984316945075989, "learning_rate": 4.491324795060491e-06, "logits/chosen": 0.9250973463058472, "logits/rejected": 0.1887839138507843, "logps/chosen": -0.8511486053466797, "logps/rejected": -2.447072982788086, "loss": 0.5506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2767229080200195, "rewards/margins": 2.3938865661621094, "rewards/rejected": -3.670609712600708, "step": 310 }, { "epoch": 0.2664169008221459, "grad_norm": 0.5243161916732788, "learning_rate": 4.4592336433146e-06, "logits/chosen": 2.437886953353882, "logits/rejected": 1.6011940240859985, "logps/chosen": -0.7107629776000977, "logps/rejected": -2.132263422012329, "loss": 0.5423, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.0661444664001465, "rewards/margins": 2.1322507858276367, "rewards/rejected": -3.198395013809204, "step": 320 }, { "epoch": 0.27474242897283796, "grad_norm": 0.4742359220981598, "learning_rate": 4.426283106939474e-06, "logits/chosen": 1.8433977365493774, "logits/rejected": 1.199568748474121, "logps/chosen": -0.8737133145332336, "logps/rejected": -2.1652615070343018, "loss": 0.5015, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3105700016021729, "rewards/margins": 1.9373222589492798, "rewards/rejected": -3.247892379760742, "step": 330 }, { "epoch": 0.28306795712353, "grad_norm": 0.5529736280441284, "learning_rate": 4.3924876391293915e-06, "logits/chosen": 2.0044589042663574, "logits/rejected": 0.9263212084770203, "logps/chosen": -0.9175036549568176, "logps/rejected": -2.6408374309539795, "loss": 0.4921, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3762553930282593, "rewards/margins": 2.585000991821289, "rewards/rejected": -3.961256504058838, "step": 340 }, { "epoch": 0.2913934852742221, "grad_norm": 0.7060612440109253, "learning_rate": 4.357862063693486e-06, "logits/chosen": 2.243232250213623, "logits/rejected": 1.6251205205917358, "logps/chosen": -0.9481338262557983, "logps/rejected": -2.9519124031066895, "loss": 0.4753, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4222007989883423, "rewards/margins": 3.0056674480438232, "rewards/rejected": -4.427868366241455, "step": 350 }, { "epoch": 0.2913934852742221, "eval_logits/chosen": 1.7781500816345215, "eval_logits/rejected": 1.412752628326416, "eval_logps/chosen": -0.9692521095275879, "eval_logps/rejected": -2.8247811794281006, "eval_loss": 0.4446474015712738, "eval_rewards/accuracies": 0.7346938848495483, "eval_rewards/chosen": -1.4538781642913818, "eval_rewards/margins": 2.7832937240600586, "eval_rewards/rejected": -4.2371721267700195, "eval_runtime": 29.0245, "eval_samples_per_second": 26.77, "eval_steps_per_second": 3.376, "step": 350 }, { "epoch": 0.29971901342491414, "grad_norm": 0.9664792418479919, "learning_rate": 4.322421568553529e-06, "logits/chosen": 1.7094570398330688, "logits/rejected": 1.1617993116378784, "logps/chosen": -0.992924690246582, "logps/rejected": -2.7834811210632324, "loss": 0.4972, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4893869161605835, "rewards/margins": 2.6858346462249756, "rewards/rejected": -4.1752214431762695, "step": 360 }, { "epoch": 0.3080445415756062, "grad_norm": 0.7800536155700684, "learning_rate": 4.286181699082008e-06, "logits/chosen": 2.9170143604278564, "logits/rejected": 2.384690523147583, "logps/chosen": -1.0323909521102905, "logps/rejected": -2.726369857788086, "loss": 0.4689, "rewards/accuracies": 0.625, "rewards/chosen": -1.548586368560791, "rewards/margins": 2.540968418121338, "rewards/rejected": -4.089555263519287, "step": 370 }, { "epoch": 0.3163700697262983, "grad_norm": 1.3163660764694214, "learning_rate": 4.249158351283414e-06, "logits/chosen": 2.780831813812256, "logits/rejected": 1.753291130065918, "logps/chosen": -1.0468894243240356, "logps/rejected": -2.7425389289855957, "loss": 0.4835, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5703339576721191, "rewards/margins": 2.5434746742248535, "rewards/rejected": -4.113808631896973, "step": 380 }, { "epoch": 0.3246955978769903, "grad_norm": 0.6381780505180359, "learning_rate": 4.211367764821722e-06, "logits/chosen": 2.585071086883545, "logits/rejected": 1.9254558086395264, "logps/chosen": -1.2089946269989014, "logps/rejected": -3.615030288696289, "loss": 0.4518, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8134920597076416, "rewards/margins": 3.6090526580810547, "rewards/rejected": -5.422544956207275, "step": 390 }, { "epoch": 0.33302112602768236, "grad_norm": 0.9214782118797302, "learning_rate": 4.172826515897146e-06, "logits/chosen": 1.9765586853027344, "logits/rejected": 1.1926987171173096, "logps/chosen": -1.2852815389633179, "logps/rejected": -3.786972761154175, "loss": 0.4165, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9279224872589111, "rewards/margins": 3.7525367736816406, "rewards/rejected": -5.680459022521973, "step": 400 }, { "epoch": 0.33302112602768236, "eval_logits/chosen": 2.6366844177246094, "eval_logits/rejected": 2.394319534301758, "eval_logps/chosen": -1.322396993637085, "eval_logps/rejected": -3.686817169189453, "eval_loss": 0.4065541923046112, "eval_rewards/accuracies": 0.7551020383834839, "eval_rewards/chosen": -1.9835957288742065, "eval_rewards/margins": 3.5466296672821045, "eval_rewards/rejected": -5.5302252769470215, "eval_runtime": 29.025, "eval_samples_per_second": 26.77, "eval_steps_per_second": 3.376, "step": 400 }, { "epoch": 0.34134665417837445, "grad_norm": 1.5113208293914795, "learning_rate": 4.133551509975264e-06, "logits/chosen": 2.0068416595458984, "logits/rejected": 1.5152744054794312, "logps/chosen": -1.5090525150299072, "logps/rejected": -3.9272122383117676, "loss": 0.4004, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.2635788917541504, "rewards/margins": 3.627239227294922, "rewards/rejected": -5.890818119049072, "step": 410 }, { "epoch": 0.3496721823290665, "grad_norm": 11.516369819641113, "learning_rate": 4.093559974371725e-06, "logits/chosen": 3.343449115753174, "logits/rejected": 2.920070171356201, "logps/chosen": -1.8312532901763916, "logps/rejected": -4.115124702453613, "loss": 0.4045, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.746879816055298, "rewards/margins": 3.425807476043701, "rewards/rejected": -6.17268705368042, "step": 420 }, { "epoch": 0.35799771047975854, "grad_norm": 3.0497395992279053, "learning_rate": 4.052869450695776e-06, "logits/chosen": 2.5527279376983643, "logits/rejected": 2.2495744228363037, "logps/chosen": -2.2998366355895996, "logps/rejected": -4.966278076171875, "loss": 0.3758, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.4497551918029785, "rewards/margins": 3.9996612071990967, "rewards/rejected": -7.4494171142578125, "step": 430 }, { "epoch": 0.36632323863045063, "grad_norm": 3.900503158569336, "learning_rate": 4.011497787155938e-06, "logits/chosen": 2.4560112953186035, "logits/rejected": 2.3936328887939453, "logps/chosen": -2.563218593597412, "logps/rejected": -5.063398838043213, "loss": 0.3739, "rewards/accuracies": 0.8125, "rewards/chosen": -3.8448281288146973, "rewards/margins": 3.750270366668701, "rewards/rejected": -7.595097541809082, "step": 440 }, { "epoch": 0.3746487667811427, "grad_norm": 2.8846070766448975, "learning_rate": 3.969463130731183e-06, "logits/chosen": 2.5467796325683594, "logits/rejected": 2.4370405673980713, "logps/chosen": -2.4494822025299072, "logps/rejected": -5.12601900100708, "loss": 0.2905, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.6742234230041504, "rewards/margins": 4.014804840087891, "rewards/rejected": -7.689028263092041, "step": 450 }, { "epoch": 0.3746487667811427, "eval_logits/chosen": 2.922081232070923, "eval_logits/rejected": 2.879075050354004, "eval_logps/chosen": -2.352473020553589, "eval_logps/rejected": -5.1224799156188965, "eval_loss": 0.3302614390850067, "eval_rewards/accuracies": 0.8673469424247742, "eval_rewards/chosen": -3.5287091732025146, "eval_rewards/margins": 4.155009746551514, "eval_rewards/rejected": -7.683719635009766, "eval_runtime": 29.0235, "eval_samples_per_second": 26.771, "eval_steps_per_second": 3.377, "step": 450 }, { "epoch": 0.3829742949318347, "grad_norm": 4.662614345550537, "learning_rate": 3.92678391921108e-06, "logits/chosen": 2.428154468536377, "logits/rejected": 2.2403202056884766, "logps/chosen": -2.5936172008514404, "logps/rejected": -5.356133460998535, "loss": 0.2881, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.89042592048645, "rewards/margins": 4.143774509429932, "rewards/rejected": -8.034199714660645, "step": 460 }, { "epoch": 0.3912998230825268, "grad_norm": 2.716899871826172, "learning_rate": 3.88347887310836e-06, "logits/chosen": 2.437295436859131, "logits/rejected": 2.271914005279541, "logps/chosen": -2.470245361328125, "logps/rejected": -5.719494819641113, "loss": 0.31, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.70536732673645, "rewards/margins": 4.873874187469482, "rewards/rejected": -8.579241752624512, "step": 470 }, { "epoch": 0.39962535123321885, "grad_norm": 3.343271255493164, "learning_rate": 3.839566987447492e-06, "logits/chosen": 2.144461154937744, "logits/rejected": 2.0314810276031494, "logps/chosen": -2.5805585384368896, "logps/rejected": -5.418456077575684, "loss": 0.3194, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.870838165283203, "rewards/margins": 4.256844997406006, "rewards/rejected": -8.12768268585205, "step": 480 }, { "epoch": 0.4079508793839109, "grad_norm": 6.411283493041992, "learning_rate": 3.795067523432826e-06, "logits/chosen": 2.408092498779297, "logits/rejected": 2.2996156215667725, "logps/chosen": -2.8846375942230225, "logps/rejected": -5.957771301269531, "loss": 0.3353, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.326956748962402, "rewards/margins": 4.6097002029418945, "rewards/rejected": -8.936657905578613, "step": 490 }, { "epoch": 0.416276407534603, "grad_norm": 3.2472238540649414, "learning_rate": 3.7500000000000005e-06, "logits/chosen": 3.0815653800964355, "logits/rejected": 2.8496975898742676, "logps/chosen": -3.061626434326172, "logps/rejected": -5.966124534606934, "loss": 0.3018, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.592440128326416, "rewards/margins": 4.356747627258301, "rewards/rejected": -8.949186325073242, "step": 500 }, { "epoch": 0.416276407534603, "eval_logits/chosen": 2.7115373611450195, "eval_logits/rejected": 2.763493061065674, "eval_logps/chosen": -2.85333251953125, "eval_logps/rejected": -5.915884017944336, "eval_loss": 0.3079966604709625, "eval_rewards/accuracies": 0.8979591727256775, "eval_rewards/chosen": -4.279998302459717, "eval_rewards/margins": 4.593828201293945, "eval_rewards/rejected": -8.873826026916504, "eval_runtime": 29.0268, "eval_samples_per_second": 26.768, "eval_steps_per_second": 3.376, "step": 500 }, { "epoch": 0.42460193568529503, "grad_norm": 10.017457962036133, "learning_rate": 3.7043841852542884e-06, "logits/chosen": 2.775202989578247, "logits/rejected": 2.6122496128082275, "logps/chosen": -3.0054879188537598, "logps/rejected": -6.258307456970215, "loss": 0.3101, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.5082316398620605, "rewards/margins": 4.879229545593262, "rewards/rejected": -9.387460708618164, "step": 510 }, { "epoch": 0.43292746383598707, "grad_norm": 4.494226932525635, "learning_rate": 3.658240087799655e-06, "logits/chosen": 2.816701889038086, "logits/rejected": 2.4107789993286133, "logps/chosen": -3.2932097911834717, "logps/rejected": -6.099677562713623, "loss": 0.2925, "rewards/accuracies": 0.875, "rewards/chosen": -4.939814567565918, "rewards/margins": 4.209702014923096, "rewards/rejected": -9.149517059326172, "step": 520 }, { "epoch": 0.44125299198667917, "grad_norm": 2.957486391067505, "learning_rate": 3.611587947962319e-06, "logits/chosen": 2.3626818656921387, "logits/rejected": 2.4196550846099854, "logps/chosen": -3.085209608078003, "logps/rejected": -6.118277072906494, "loss": 0.3169, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.627814292907715, "rewards/margins": 4.549601078033447, "rewards/rejected": -9.17741584777832, "step": 530 }, { "epoch": 0.4495785201373712, "grad_norm": 3.429408550262451, "learning_rate": 3.564448228912682e-06, "logits/chosen": 2.559816360473633, "logits/rejected": 2.598250150680542, "logps/chosen": -3.3060078620910645, "logps/rejected": -6.124637126922607, "loss": 0.3271, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.959012031555176, "rewards/margins": 4.227944850921631, "rewards/rejected": -9.186956405639648, "step": 540 }, { "epoch": 0.45790404828806325, "grad_norm": 2.110722780227661, "learning_rate": 3.516841607689501e-06, "logits/chosen": 2.4487693309783936, "logits/rejected": 2.0568625926971436, "logps/chosen": -3.396770477294922, "logps/rejected": -6.35222864151001, "loss": 0.3172, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -5.095156192779541, "rewards/margins": 4.4331865310668945, "rewards/rejected": -9.528343200683594, "step": 550 }, { "epoch": 0.45790404828806325, "eval_logits/chosen": 2.5644595623016357, "eval_logits/rejected": 2.6437506675720215, "eval_logps/chosen": -3.1958370208740234, "eval_logps/rejected": -6.542325496673584, "eval_loss": 0.28538385033607483, "eval_rewards/accuracies": 0.918367326259613, "eval_rewards/chosen": -4.793755054473877, "eval_rewards/margins": 5.0197319984436035, "eval_rewards/rejected": -9.813486099243164, "eval_runtime": 29.0252, "eval_samples_per_second": 26.77, "eval_steps_per_second": 3.376, "step": 550 } ], "logging_steps": 10, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3403270445961052e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }