{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2913934852742221, "eval_steps": 50, "global_step": 350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00832552815069206, "grad_norm": 0.04514288529753685, "learning_rate": 4.999451708687114e-06, "logits/chosen": 14.412135124206543, "logits/rejected": 14.867518424987793, "logps/chosen": -0.29279541969299316, "logps/rejected": -0.33705300092697144, "loss": 0.9248, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.43919315934181213, "rewards/margins": 0.066386379301548, "rewards/rejected": -0.5055795311927795, "step": 10 }, { "epoch": 0.01665105630138412, "grad_norm": 0.05052826926112175, "learning_rate": 4.997807075247147e-06, "logits/chosen": 14.956459045410156, "logits/rejected": 15.363263130187988, "logps/chosen": -0.3096744120121002, "logps/rejected": -0.36214715242385864, "loss": 0.9355, "rewards/accuracies": 0.5, "rewards/chosen": -0.46451157331466675, "rewards/margins": 0.07870914041996002, "rewards/rejected": -0.5432207584381104, "step": 20 }, { "epoch": 0.024976584452076178, "grad_norm": 0.04879612475633621, "learning_rate": 4.9950668210706795e-06, "logits/chosen": 14.485757827758789, "logits/rejected": 15.057507514953613, "logps/chosen": -0.27136802673339844, "logps/rejected": -0.31497400999069214, "loss": 0.9268, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.4070519804954529, "rewards/margins": 0.06540900468826294, "rewards/rejected": -0.4724610447883606, "step": 30 }, { "epoch": 0.03330211260276824, "grad_norm": 0.05672155320644379, "learning_rate": 4.9912321481237616e-06, "logits/chosen": 14.529332160949707, "logits/rejected": 14.814855575561523, "logps/chosen": -0.29139184951782227, "logps/rejected": -0.31259119510650635, "loss": 0.9267, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.4370877742767334, "rewards/margins": 0.03179898113012314, "rewards/rejected": -0.46888676285743713, "step": 40 }, { "epoch": 0.041627640753460295, "grad_norm": 0.065071240067482, "learning_rate": 4.986304738420684e-06, "logits/chosen": 14.174386978149414, "logits/rejected": 15.223234176635742, "logps/chosen": -0.2745029330253601, "logps/rejected": -0.37693315744400024, "loss": 0.9243, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.41175442934036255, "rewards/margins": 0.1536453813314438, "rewards/rejected": -0.5653998255729675, "step": 50 }, { "epoch": 0.041627640753460295, "eval_logits/chosen": 14.56569766998291, "eval_logits/rejected": 15.157320976257324, "eval_logps/chosen": -0.27527979016304016, "eval_logps/rejected": -0.3633999824523926, "eval_loss": 0.9083622694015503, "eval_rewards/accuracies": 0.5612244606018066, "eval_rewards/chosen": -0.41291970014572144, "eval_rewards/margins": 0.13218028843402863, "eval_rewards/rejected": -0.5450999736785889, "eval_runtime": 29.029, "eval_samples_per_second": 26.766, "eval_steps_per_second": 3.376, "step": 50 }, { "epoch": 0.049953168904152356, "grad_norm": 0.14002270996570587, "learning_rate": 4.980286753286196e-06, "logits/chosen": 14.408930778503418, "logits/rejected": 14.791458129882812, "logps/chosen": -0.285602867603302, "logps/rejected": -0.3351826071739197, "loss": 0.9177, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.4284043312072754, "rewards/margins": 0.07436960190534592, "rewards/rejected": -0.5027738809585571, "step": 60 }, { "epoch": 0.05827869705484442, "grad_norm": 0.05595069006085396, "learning_rate": 4.973180832407471e-06, "logits/chosen": 14.41168212890625, "logits/rejected": 14.865121841430664, "logps/chosen": -0.25851207971572876, "logps/rejected": -0.32240185141563416, "loss": 0.9168, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.3877681493759155, "rewards/margins": 0.0958346277475357, "rewards/rejected": -0.4836028218269348, "step": 70 }, { "epoch": 0.06660422520553648, "grad_norm": 0.058645494282245636, "learning_rate": 4.964990092676263e-06, "logits/chosen": 14.897825241088867, "logits/rejected": 15.01073932647705, "logps/chosen": -0.2668797969818115, "logps/rejected": -0.3204379975795746, "loss": 0.9242, "rewards/accuracies": 0.5, "rewards/chosen": -0.4003197252750397, "rewards/margins": 0.08033724129199982, "rewards/rejected": -0.4806569516658783, "step": 80 }, { "epoch": 0.07492975335622853, "grad_norm": 0.0597861111164093, "learning_rate": 4.9557181268217225e-06, "logits/chosen": 14.531021118164062, "logits/rejected": 14.767858505249023, "logps/chosen": -0.26787540316581726, "logps/rejected": -0.32972821593284607, "loss": 0.9077, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.4018131196498871, "rewards/margins": 0.09277921915054321, "rewards/rejected": -0.4945923686027527, "step": 90 }, { "epoch": 0.08325528150692059, "grad_norm": 0.0863095372915268, "learning_rate": 4.9453690018345144e-06, "logits/chosen": 14.179275512695312, "logits/rejected": 14.909070014953613, "logps/chosen": -0.2532978057861328, "logps/rejected": -0.35474082827568054, "loss": 0.903, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3799467086791992, "rewards/margins": 0.1521645337343216, "rewards/rejected": -0.5321112275123596, "step": 100 }, { "epoch": 0.08325528150692059, "eval_logits/chosen": 14.326024055480957, "eval_logits/rejected": 14.979863166809082, "eval_logps/chosen": -0.2673422694206238, "eval_logps/rejected": -0.3668619990348816, "eval_loss": 0.8989922404289246, "eval_rewards/accuracies": 0.6020408272743225, "eval_rewards/chosen": -0.4010133445262909, "eval_rewards/margins": 0.1492796391248703, "eval_rewards/rejected": -0.5502930283546448, "eval_runtime": 29.0209, "eval_samples_per_second": 26.774, "eval_steps_per_second": 3.377, "step": 100 }, { "epoch": 0.09158080965761266, "grad_norm": 0.07181967049837112, "learning_rate": 4.933947257182901e-06, "logits/chosen": 14.118756294250488, "logits/rejected": 14.755918502807617, "logps/chosen": -0.27995947003364563, "logps/rejected": -0.3749552369117737, "loss": 0.9097, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.41993919014930725, "rewards/margins": 0.14249366521835327, "rewards/rejected": -0.5624328255653381, "step": 110 }, { "epoch": 0.09990633780830471, "grad_norm": 0.08269819617271423, "learning_rate": 4.921457902821578e-06, "logits/chosen": 13.764413833618164, "logits/rejected": 14.43315315246582, "logps/chosen": -0.28177163004875183, "logps/rejected": -0.3637630343437195, "loss": 0.9075, "rewards/accuracies": 0.625, "rewards/chosen": -0.42265743017196655, "rewards/margins": 0.12298711389303207, "rewards/rejected": -0.5456445813179016, "step": 120 }, { "epoch": 0.10823186595899677, "grad_norm": 1.9071497917175293, "learning_rate": 4.907906416994146e-06, "logits/chosen": 14.103793144226074, "logits/rejected": 14.727777481079102, "logps/chosen": -0.2665451765060425, "logps/rejected": -0.3827117085456848, "loss": 0.9217, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3998177647590637, "rewards/margins": 0.1742497682571411, "rewards/rejected": -0.5740675926208496, "step": 130 }, { "epoch": 0.11655739410968884, "grad_norm": 0.12107716500759125, "learning_rate": 4.893298743830168e-06, "logits/chosen": 13.517863273620605, "logits/rejected": 14.42052173614502, "logps/chosen": -0.26627904176712036, "logps/rejected": -0.3745174705982208, "loss": 0.904, "rewards/accuracies": 0.5625, "rewards/chosen": -0.39941853284835815, "rewards/margins": 0.16235767304897308, "rewards/rejected": -0.5617762207984924, "step": 140 }, { "epoch": 0.12488292226038089, "grad_norm": 0.1638205647468567, "learning_rate": 4.8776412907378845e-06, "logits/chosen": 12.83032512664795, "logits/rejected": 13.673515319824219, "logps/chosen": -0.24289576709270477, "logps/rejected": -0.37163227796554565, "loss": 0.8779, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.36434367299079895, "rewards/margins": 0.19310477375984192, "rewards/rejected": -0.5574483871459961, "step": 150 }, { "epoch": 0.12488292226038089, "eval_logits/chosen": 12.317696571350098, "eval_logits/rejected": 13.164616584777832, "eval_logps/chosen": -0.266156405210495, "eval_logps/rejected": -0.4009220004081726, "eval_loss": 0.8768696784973145, "eval_rewards/accuracies": 0.6224489808082581, "eval_rewards/chosen": -0.3992346227169037, "eval_rewards/margins": 0.20214837789535522, "eval_rewards/rejected": -0.6013829708099365, "eval_runtime": 29.0257, "eval_samples_per_second": 26.769, "eval_steps_per_second": 3.376, "step": 150 }, { "epoch": 0.13320845041107296, "grad_norm": 0.1479438841342926, "learning_rate": 4.860940925593703e-06, "logits/chosen": 12.736433029174805, "logits/rejected": 13.475964546203613, "logps/chosen": -0.2913517355918884, "logps/rejected": -0.36094629764556885, "loss": 0.8756, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.43702763319015503, "rewards/margins": 0.10439182817935944, "rewards/rejected": -0.5414193868637085, "step": 160 }, { "epoch": 0.141533978561765, "grad_norm": 0.17609630525112152, "learning_rate": 4.84320497372973e-06, "logits/chosen": 10.606362342834473, "logits/rejected": 11.537567138671875, "logps/chosen": -0.2560296952724457, "logps/rejected": -0.4312233328819275, "loss": 0.8489, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.38404449820518494, "rewards/margins": 0.2627905011177063, "rewards/rejected": -0.6468349695205688, "step": 170 }, { "epoch": 0.14985950671245707, "grad_norm": 0.18054936826229095, "learning_rate": 4.824441214720629e-06, "logits/chosen": 10.13754653930664, "logits/rejected": 10.914222717285156, "logps/chosen": -0.29278701543807983, "logps/rejected": -0.43448886275291443, "loss": 0.8715, "rewards/accuracies": 0.625, "rewards/chosen": -0.43918052315711975, "rewards/margins": 0.21255281567573547, "rewards/rejected": -0.6517333388328552, "step": 180 }, { "epoch": 0.15818503486314914, "grad_norm": 0.19739146530628204, "learning_rate": 4.804657878971252e-06, "logits/chosen": 8.077766418457031, "logits/rejected": 9.669368743896484, "logps/chosen": -0.2844889760017395, "logps/rejected": -0.5050357580184937, "loss": 0.8582, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.42673349380493164, "rewards/margins": 0.3308201730251312, "rewards/rejected": -0.7575536966323853, "step": 190 }, { "epoch": 0.16651056301384118, "grad_norm": 0.2397814244031906, "learning_rate": 4.783863644106502e-06, "logits/chosen": 6.790783882141113, "logits/rejected": 7.849525451660156, "logps/chosen": -0.2940555512905121, "logps/rejected": -0.5699166059494019, "loss": 0.8196, "rewards/accuracies": 0.75, "rewards/chosen": -0.4410833418369293, "rewards/margins": 0.41379159688949585, "rewards/rejected": -0.8548749089241028, "step": 200 }, { "epoch": 0.16651056301384118, "eval_logits/chosen": 6.290835857391357, "eval_logits/rejected": 6.757873058319092, "eval_logps/chosen": -0.317629337310791, "eval_logps/rejected": -0.581989586353302, "eval_loss": 0.8032433986663818, "eval_rewards/accuracies": 0.6734693646430969, "eval_rewards/chosen": -0.47644397616386414, "eval_rewards/margins": 0.39654040336608887, "eval_rewards/rejected": -0.8729843497276306, "eval_runtime": 29.025, "eval_samples_per_second": 26.77, "eval_steps_per_second": 3.376, "step": 200 }, { "epoch": 0.17483609116453325, "grad_norm": 0.2858545184135437, "learning_rate": 4.762067631165049e-06, "logits/chosen": 6.875879764556885, "logits/rejected": 6.691536903381348, "logps/chosen": -0.37194910645484924, "logps/rejected": -0.5639354586601257, "loss": 0.8129, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5579236745834351, "rewards/margins": 0.2879795432090759, "rewards/rejected": -0.8459032773971558, "step": 210 }, { "epoch": 0.18316161931522532, "grad_norm": 0.30206382274627686, "learning_rate": 4.7392794005985324e-06, "logits/chosen": 4.656112194061279, "logits/rejected": 4.483086585998535, "logps/chosen": -0.360150009393692, "logps/rejected": -0.6204283833503723, "loss": 0.7954, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5402250289916992, "rewards/margins": 0.39041754603385925, "rewards/rejected": -0.9306427240371704, "step": 220 }, { "epoch": 0.19148714746591736, "grad_norm": 0.40204310417175293, "learning_rate": 4.715508948078037e-06, "logits/chosen": 3.9398162364959717, "logits/rejected": 3.38537859916687, "logps/chosen": -0.39010342955589294, "logps/rejected": -0.7167688608169556, "loss": 0.7664, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5851551294326782, "rewards/margins": 0.4899981617927551, "rewards/rejected": -1.0751533508300781, "step": 230 }, { "epoch": 0.19981267561660943, "grad_norm": 0.48389795422554016, "learning_rate": 4.690766700109659e-06, "logits/chosen": 2.925476551055908, "logits/rejected": 2.824068069458008, "logps/chosen": -0.41053348779678345, "logps/rejected": -0.8508625030517578, "loss": 0.7606, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6158002018928528, "rewards/margins": 0.6604936718940735, "rewards/rejected": -1.2762939929962158, "step": 240 }, { "epoch": 0.2081382037673015, "grad_norm": 0.6687452793121338, "learning_rate": 4.665063509461098e-06, "logits/chosen": 2.751737594604492, "logits/rejected": 2.2424545288085938, "logps/chosen": -0.4365699291229248, "logps/rejected": -0.8550359606742859, "loss": 0.7234, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6548548936843872, "rewards/margins": 0.6276990175247192, "rewards/rejected": -1.2825539112091064, "step": 250 }, { "epoch": 0.2081382037673015, "eval_logits/chosen": 2.1380228996276855, "eval_logits/rejected": 1.3922746181488037, "eval_logps/chosen": -0.48307570815086365, "eval_logps/rejected": -1.0382359027862549, "eval_loss": 0.668463945388794, "eval_rewards/accuracies": 0.6938775777816772, "eval_rewards/chosen": -0.7246134877204895, "eval_rewards/margins": 0.8327403664588928, "eval_rewards/rejected": -1.5573538541793823, "eval_runtime": 29.0228, "eval_samples_per_second": 26.772, "eval_steps_per_second": 3.377, "step": 250 }, { "epoch": 0.21646373191799353, "grad_norm": 0.7085956335067749, "learning_rate": 4.638410650401267e-06, "logits/chosen": 1.7889283895492554, "logits/rejected": 0.9420136213302612, "logps/chosen": -0.5195389986038208, "logps/rejected": -1.0534025430679321, "loss": 0.6863, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7793084979057312, "rewards/margins": 0.8007953763008118, "rewards/rejected": -1.580103874206543, "step": 260 }, { "epoch": 0.2247892600686856, "grad_norm": 0.4416671097278595, "learning_rate": 4.610819813755038e-06, "logits/chosen": 1.582745909690857, "logits/rejected": 0.3820720911026001, "logps/chosen": -0.5181297063827515, "logps/rejected": -1.2198141813278198, "loss": 0.5809, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7771945595741272, "rewards/margins": 1.0525267124176025, "rewards/rejected": -1.8297210931777954, "step": 270 }, { "epoch": 0.23311478821937767, "grad_norm": 2.7746617794036865, "learning_rate": 4.582303101775249e-06, "logits/chosen": 1.2947760820388794, "logits/rejected": 0.27237796783447266, "logps/chosen": -0.643541693687439, "logps/rejected": -1.7467323541641235, "loss": 0.5775, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9653124809265137, "rewards/margins": 1.6547861099243164, "rewards/rejected": -2.62009859085083, "step": 280 }, { "epoch": 0.2414403163700697, "grad_norm": 0.6444702744483948, "learning_rate": 4.55287302283426e-06, "logits/chosen": 1.2399464845657349, "logits/rejected": 0.22667090594768524, "logps/chosen": -0.7517040967941284, "logps/rejected": -1.9010766744613647, "loss": 0.5314, "rewards/accuracies": 0.625, "rewards/chosen": -1.1275560855865479, "rewards/margins": 1.724058747291565, "rewards/rejected": -2.8516147136688232, "step": 290 }, { "epoch": 0.24976584452076178, "grad_norm": 0.5103917717933655, "learning_rate": 4.522542485937369e-06, "logits/chosen": 1.438954472541809, "logits/rejected": 0.5288833379745483, "logps/chosen": -0.7871009707450867, "logps/rejected": -2.0329811573028564, "loss": 0.5271, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1806514263153076, "rewards/margins": 1.8688204288482666, "rewards/rejected": -3.049471616744995, "step": 300 }, { "epoch": 0.24976584452076178, "eval_logits/chosen": 1.3706706762313843, "eval_logits/rejected": 0.8007871508598328, "eval_logps/chosen": -0.7460500001907349, "eval_logps/rejected": -2.209245443344116, "eval_loss": 0.5008835792541504, "eval_rewards/accuracies": 0.7244898080825806, "eval_rewards/chosen": -1.1190749406814575, "eval_rewards/margins": 2.194793224334717, "eval_rewards/rejected": -3.313868284225464, "eval_runtime": 29.0227, "eval_samples_per_second": 26.772, "eval_steps_per_second": 3.377, "step": 300 }, { "epoch": 0.2580913726714538, "grad_norm": 0.7984316945075989, "learning_rate": 4.491324795060491e-06, "logits/chosen": 0.9250973463058472, "logits/rejected": 0.1887839138507843, "logps/chosen": -0.8511486053466797, "logps/rejected": -2.447072982788086, "loss": 0.5506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2767229080200195, "rewards/margins": 2.3938865661621094, "rewards/rejected": -3.670609712600708, "step": 310 }, { "epoch": 0.2664169008221459, "grad_norm": 0.5243161916732788, "learning_rate": 4.4592336433146e-06, "logits/chosen": 2.437886953353882, "logits/rejected": 1.6011940240859985, "logps/chosen": -0.7107629776000977, "logps/rejected": -2.132263422012329, "loss": 0.5423, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.0661444664001465, "rewards/margins": 2.1322507858276367, "rewards/rejected": -3.198395013809204, "step": 320 }, { "epoch": 0.27474242897283796, "grad_norm": 0.4742359220981598, "learning_rate": 4.426283106939474e-06, "logits/chosen": 1.8433977365493774, "logits/rejected": 1.199568748474121, "logps/chosen": -0.8737133145332336, "logps/rejected": -2.1652615070343018, "loss": 0.5015, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3105700016021729, "rewards/margins": 1.9373222589492798, "rewards/rejected": -3.247892379760742, "step": 330 }, { "epoch": 0.28306795712353, "grad_norm": 0.5529736280441284, "learning_rate": 4.3924876391293915e-06, "logits/chosen": 2.0044589042663574, "logits/rejected": 0.9263212084770203, "logps/chosen": -0.9175036549568176, "logps/rejected": -2.6408374309539795, "loss": 0.4921, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3762553930282593, "rewards/margins": 2.585000991821289, "rewards/rejected": -3.961256504058838, "step": 340 }, { "epoch": 0.2913934852742221, "grad_norm": 0.7060612440109253, "learning_rate": 4.357862063693486e-06, "logits/chosen": 2.243232250213623, "logits/rejected": 1.6251205205917358, "logps/chosen": -0.9481338262557983, "logps/rejected": -2.9519124031066895, "loss": 0.4753, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4222007989883423, "rewards/margins": 3.0056674480438232, "rewards/rejected": -4.427868366241455, "step": 350 }, { "epoch": 0.2913934852742221, "eval_logits/chosen": 1.7781500816345215, "eval_logits/rejected": 1.412752628326416, "eval_logps/chosen": -0.9692521095275879, "eval_logps/rejected": -2.8247811794281006, "eval_loss": 0.4446474015712738, "eval_rewards/accuracies": 0.7346938848495483, "eval_rewards/chosen": -1.4538781642913818, "eval_rewards/margins": 2.7832937240600586, "eval_rewards/rejected": -4.2371721267700195, "eval_runtime": 29.0245, "eval_samples_per_second": 26.77, "eval_steps_per_second": 3.376, "step": 350 } ], "logging_steps": 10, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.5289594781696e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }