diff --git "a/QLoRA_french_dpo/trainer_state.json" "b/QLoRA_french_dpo/trainer_state.json" new file mode 100644--- /dev/null +++ "b/QLoRA_french_dpo/trainer_state.json" @@ -0,0 +1,15042 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001, + "grad_norm": 39.71950149536133, + "learning_rate": 5.0000000000000004e-08, + "logits/chosen": -1.2357934713363647, + "logits/rejected": -0.7058947682380676, + "logps/chosen": -220.3852081298828, + "logps/rejected": -257.87994384765625, + "loss": 0.8354, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.054030876606702805, + "rewards/margins": 0.5196583867073059, + "rewards/rejected": -0.573689341545105, + "step": 10 + }, + { + "epoch": 0.002, + "grad_norm": 27.253252029418945, + "learning_rate": 1.0000000000000001e-07, + "logits/chosen": -1.2473429441452026, + "logits/rejected": -0.5767286419868469, + "logps/chosen": -336.3631896972656, + "logps/rejected": -438.285400390625, + "loss": 1.0858, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.097655177116394, + "rewards/margins": 0.39187589287757874, + "rewards/rejected": -1.4895310401916504, + "step": 20 + }, + { + "epoch": 0.003, + "grad_norm": 2.4999659061431885, + "learning_rate": 1.5000000000000002e-07, + "logits/chosen": -1.7245657444000244, + "logits/rejected": -0.5277743935585022, + "logps/chosen": -109.9853286743164, + "logps/rejected": -330.37506103515625, + "loss": 1.3177, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6884216666221619, + "rewards/margins": 0.37672415375709534, + "rewards/rejected": -1.06514573097229, + "step": 30 + }, + { + "epoch": 0.004, + "grad_norm": 193.43942260742188, + "learning_rate": 2.0000000000000002e-07, + "logits/chosen": -1.815363883972168, + "logits/rejected": -0.4569586217403412, + "logps/chosen": -248.228759765625, + "logps/rejected": -446.4189453125, + "loss": 0.6804, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4480059146881104, + "rewards/margins": 1.2186508178710938, + "rewards/rejected": -2.666656732559204, + "step": 40 + }, + { + "epoch": 0.005, + "grad_norm": 6.413777828216553, + "learning_rate": 2.5000000000000004e-07, + "logits/chosen": -1.0549522638320923, + "logits/rejected": -0.3890644907951355, + "logps/chosen": -196.66943359375, + "logps/rejected": -329.15325927734375, + "loss": 1.0372, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -1.0999469757080078, + "rewards/margins": 0.1718175858259201, + "rewards/rejected": -1.2717645168304443, + "step": 50 + }, + { + "epoch": 0.006, + "grad_norm": 44.80453109741211, + "learning_rate": 3.0000000000000004e-07, + "logits/chosen": -1.1564265489578247, + "logits/rejected": -0.7624977231025696, + "logps/chosen": -212.6322479248047, + "logps/rejected": -237.65902709960938, + "loss": 1.6069, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -1.7349717617034912, + "rewards/margins": -0.5888309478759766, + "rewards/rejected": -1.146140694618225, + "step": 60 + }, + { + "epoch": 0.007, + "grad_norm": 51.307899475097656, + "learning_rate": 3.5000000000000004e-07, + "logits/chosen": -1.3306106328964233, + "logits/rejected": -0.49902287125587463, + "logps/chosen": -187.5395965576172, + "logps/rejected": -324.9596252441406, + "loss": 1.3632, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4904448986053467, + "rewards/margins": -0.47634443640708923, + "rewards/rejected": -1.014100432395935, + "step": 70 + }, + { + "epoch": 0.008, + "grad_norm": 196.0592498779297, + "learning_rate": 4.0000000000000003e-07, + "logits/chosen": -1.136975884437561, + "logits/rejected": -0.4456964135169983, + "logps/chosen": -223.9425048828125, + "logps/rejected": -334.194580078125, + "loss": 0.5711, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8610240817070007, + "rewards/margins": 1.452368974685669, + "rewards/rejected": -2.3133931159973145, + "step": 80 + }, + { + "epoch": 0.009, + "grad_norm": 58.4428596496582, + "learning_rate": 4.5000000000000003e-07, + "logits/chosen": -1.4022276401519775, + "logits/rejected": -0.5699952244758606, + "logps/chosen": -159.07693481445312, + "logps/rejected": -277.25469970703125, + "loss": 0.3843, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.21312984824180603, + "rewards/margins": 1.7562767267227173, + "rewards/rejected": -1.9694064855575562, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 117.02696228027344, + "learning_rate": 5.000000000000001e-07, + "logits/chosen": -1.142896056175232, + "logits/rejected": -0.6683061718940735, + "logps/chosen": -203.57827758789062, + "logps/rejected": -340.763671875, + "loss": 1.0243, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9323091506958008, + "rewards/margins": 0.5186213254928589, + "rewards/rejected": -1.4509305953979492, + "step": 100 + }, + { + "epoch": 0.011, + "grad_norm": 80.57569885253906, + "learning_rate": 5.5e-07, + "logits/chosen": -1.0948550701141357, + "logits/rejected": -0.6490032076835632, + "logps/chosen": -186.33920288085938, + "logps/rejected": -293.6062316894531, + "loss": 0.579, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.49649447202682495, + "rewards/margins": 1.350467562675476, + "rewards/rejected": -1.8469619750976562, + "step": 110 + }, + { + "epoch": 0.012, + "grad_norm": 34.219322204589844, + "learning_rate": 6.000000000000001e-07, + "logits/chosen": -1.7040865421295166, + "logits/rejected": -0.4477524161338806, + "logps/chosen": -151.40792846679688, + "logps/rejected": -396.0971984863281, + "loss": 1.1696, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -1.2884950637817383, + "rewards/margins": -0.33469006419181824, + "rewards/rejected": -0.9538049697875977, + "step": 120 + }, + { + "epoch": 0.013, + "grad_norm": 3.8949766159057617, + "learning_rate": 6.5e-07, + "logits/chosen": -1.3104069232940674, + "logits/rejected": -0.5389059782028198, + "logps/chosen": -209.498291015625, + "logps/rejected": -299.7087097167969, + "loss": 0.7542, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2069940567016602, + "rewards/margins": 1.0890920162200928, + "rewards/rejected": -2.296085834503174, + "step": 130 + }, + { + "epoch": 0.014, + "grad_norm": 12.070839881896973, + "learning_rate": 7.000000000000001e-07, + "logits/chosen": -1.1613848209381104, + "logits/rejected": -0.6944864392280579, + "logps/chosen": -353.5556335449219, + "logps/rejected": -409.09857177734375, + "loss": 0.2059, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.0929415225982666, + "rewards/margins": 2.8078646659851074, + "rewards/rejected": -3.900806427001953, + "step": 140 + }, + { + "epoch": 0.015, + "grad_norm": 22.197856903076172, + "learning_rate": 7.5e-07, + "logits/chosen": -0.8517942428588867, + "logits/rejected": -0.323356568813324, + "logps/chosen": -480.0712890625, + "logps/rejected": -459.5816955566406, + "loss": 0.1128, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7829483151435852, + "rewards/margins": 3.3262500762939453, + "rewards/rejected": -4.109198093414307, + "step": 150 + }, + { + "epoch": 0.016, + "grad_norm": 208.33311462402344, + "learning_rate": 8.000000000000001e-07, + "logits/chosen": -1.3437260389328003, + "logits/rejected": -0.5882256031036377, + "logps/chosen": -356.7096862792969, + "logps/rejected": -366.26605224609375, + "loss": 0.9627, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2848951816558838, + "rewards/margins": 1.0965783596038818, + "rewards/rejected": -2.3814735412597656, + "step": 160 + }, + { + "epoch": 0.017, + "grad_norm": 0.00499558774754405, + "learning_rate": 8.500000000000001e-07, + "logits/chosen": -1.5300209522247314, + "logits/rejected": -0.47886618971824646, + "logps/chosen": -170.71347045898438, + "logps/rejected": -377.40386962890625, + "loss": 0.122, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4191989004611969, + "rewards/margins": 3.7101001739501953, + "rewards/rejected": -4.129299163818359, + "step": 170 + }, + { + "epoch": 0.018, + "grad_norm": 0.31560298800468445, + "learning_rate": 9.000000000000001e-07, + "logits/chosen": -1.1970791816711426, + "logits/rejected": -0.5007954835891724, + "logps/chosen": -171.7400665283203, + "logps/rejected": -343.6292419433594, + "loss": 0.3737, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7679370641708374, + "rewards/margins": 2.9629974365234375, + "rewards/rejected": -4.7309346199035645, + "step": 180 + }, + { + "epoch": 0.019, + "grad_norm": 690.4447631835938, + "learning_rate": 9.500000000000001e-07, + "logits/chosen": -1.0829429626464844, + "logits/rejected": -0.633941113948822, + "logps/chosen": -325.78363037109375, + "logps/rejected": -416.4991760253906, + "loss": 0.806, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.0230331420898438, + "rewards/margins": 1.7831542491912842, + "rewards/rejected": -4.806187629699707, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 28.971961975097656, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": -1.0844286680221558, + "logits/rejected": -0.4173709750175476, + "logps/chosen": -377.9032897949219, + "logps/rejected": -432.1002502441406, + "loss": 0.2169, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.7420201301574707, + "rewards/margins": 3.7154648303985596, + "rewards/rejected": -6.457485198974609, + "step": 200 + }, + { + "epoch": 0.021, + "grad_norm": 92.62471771240234, + "learning_rate": 1.0500000000000001e-06, + "logits/chosen": -1.0448482036590576, + "logits/rejected": -0.4706448018550873, + "logps/chosen": -294.4362487792969, + "logps/rejected": -384.4219970703125, + "loss": 0.136, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8045363426208496, + "rewards/margins": 5.647749423980713, + "rewards/rejected": -8.452284812927246, + "step": 210 + }, + { + "epoch": 0.022, + "grad_norm": 6.008492946624756, + "learning_rate": 1.1e-06, + "logits/chosen": -1.2321628332138062, + "logits/rejected": -0.7217515707015991, + "logps/chosen": -389.41192626953125, + "logps/rejected": -409.8401794433594, + "loss": 1.2207, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.440738677978516, + "rewards/margins": 1.8787305355072021, + "rewards/rejected": -6.319469451904297, + "step": 220 + }, + { + "epoch": 0.023, + "grad_norm": 0.00596708245575428, + "learning_rate": 1.1500000000000002e-06, + "logits/chosen": -1.1558558940887451, + "logits/rejected": -0.4543713629245758, + "logps/chosen": -224.6162567138672, + "logps/rejected": -349.88677978515625, + "loss": 0.8165, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.1587278842926025, + "rewards/margins": 2.9656124114990234, + "rewards/rejected": -6.124340057373047, + "step": 230 + }, + { + "epoch": 0.024, + "grad_norm": 9.121099472045898, + "learning_rate": 1.2000000000000002e-06, + "logits/chosen": -1.4009921550750732, + "logits/rejected": -0.6744478940963745, + "logps/chosen": -300.53961181640625, + "logps/rejected": -365.09185791015625, + "loss": 0.2748, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.324301242828369, + "rewards/margins": 2.40966796875, + "rewards/rejected": -5.733969211578369, + "step": 240 + }, + { + "epoch": 0.025, + "grad_norm": 262.2490234375, + "learning_rate": 1.25e-06, + "logits/chosen": -0.9986278414726257, + "logits/rejected": -0.6002539396286011, + "logps/chosen": -246.1274871826172, + "logps/rejected": -286.7688903808594, + "loss": 0.6446, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.917508363723755, + "rewards/margins": 2.402920961380005, + "rewards/rejected": -6.320428371429443, + "step": 250 + }, + { + "epoch": 0.026, + "grad_norm": 2.1036490579717793e-05, + "learning_rate": 1.3e-06, + "logits/chosen": -1.250001072883606, + "logits/rejected": -0.5378462672233582, + "logps/chosen": -335.57952880859375, + "logps/rejected": -407.97222900390625, + "loss": 0.3227, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.664595127105713, + "rewards/margins": 4.549985408782959, + "rewards/rejected": -9.214579582214355, + "step": 260 + }, + { + "epoch": 0.027, + "grad_norm": 4.041048049926758, + "learning_rate": 1.3500000000000002e-06, + "logits/chosen": -0.947732150554657, + "logits/rejected": -0.7202231884002686, + "logps/chosen": -326.36785888671875, + "logps/rejected": -365.80517578125, + "loss": 0.631, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -5.729133129119873, + "rewards/margins": 1.104986310005188, + "rewards/rejected": -6.834118843078613, + "step": 270 + }, + { + "epoch": 0.028, + "grad_norm": 18.695680618286133, + "learning_rate": 1.4000000000000001e-06, + "logits/chosen": -0.966740608215332, + "logits/rejected": -0.5254305005073547, + "logps/chosen": -485.3287658691406, + "logps/rejected": -571.0950927734375, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.315737724304199, + "rewards/margins": 8.604612350463867, + "rewards/rejected": -13.920351028442383, + "step": 280 + }, + { + "epoch": 0.029, + "grad_norm": 0.7097064256668091, + "learning_rate": 1.45e-06, + "logits/chosen": -1.3276937007904053, + "logits/rejected": -0.7066922187805176, + "logps/chosen": -268.1031494140625, + "logps/rejected": -441.033935546875, + "loss": 0.0695, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8867766857147217, + "rewards/margins": 8.865959167480469, + "rewards/rejected": -12.75273609161377, + "step": 290 + }, + { + "epoch": 0.03, + "grad_norm": 0.003646882250905037, + "learning_rate": 1.5e-06, + "logits/chosen": -1.3603465557098389, + "logits/rejected": -0.40795159339904785, + "logps/chosen": -275.05108642578125, + "logps/rejected": -438.65948486328125, + "loss": 0.0554, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.856437683105469, + "rewards/margins": 7.816611289978027, + "rewards/rejected": -12.673048973083496, + "step": 300 + }, + { + "epoch": 0.031, + "grad_norm": 0.048718515783548355, + "learning_rate": 1.5500000000000002e-06, + "logits/chosen": -1.641971230506897, + "logits/rejected": -0.41742610931396484, + "logps/chosen": -204.53335571289062, + "logps/rejected": -590.5496826171875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.203894138336182, + "rewards/margins": 12.468754768371582, + "rewards/rejected": -17.67264747619629, + "step": 310 + }, + { + "epoch": 0.032, + "grad_norm": 2.144958972930908, + "learning_rate": 1.6000000000000001e-06, + "logits/chosen": -1.0715140104293823, + "logits/rejected": -0.611113429069519, + "logps/chosen": -185.79281616210938, + "logps/rejected": -347.6463623046875, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.1656951904296875, + "rewards/margins": 8.577848434448242, + "rewards/rejected": -12.743544578552246, + "step": 320 + }, + { + "epoch": 0.033, + "grad_norm": 15.040058135986328, + "learning_rate": 1.6500000000000003e-06, + "logits/chosen": -1.2094731330871582, + "logits/rejected": -0.5106289386749268, + "logps/chosen": -277.95721435546875, + "logps/rejected": -438.82293701171875, + "loss": 0.027, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.8129706382751465, + "rewards/margins": 7.339131832122803, + "rewards/rejected": -15.15210247039795, + "step": 330 + }, + { + "epoch": 0.034, + "grad_norm": 484.30682373046875, + "learning_rate": 1.7000000000000002e-06, + "logits/chosen": -0.9956814050674438, + "logits/rejected": -0.4930063784122467, + "logps/chosen": -486.2621154785156, + "logps/rejected": -595.5841064453125, + "loss": 0.8495, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.835428237915039, + "rewards/margins": 12.453630447387695, + "rewards/rejected": -23.289060592651367, + "step": 340 + }, + { + "epoch": 0.035, + "grad_norm": 80.75074768066406, + "learning_rate": 1.75e-06, + "logits/chosen": -0.7541705369949341, + "logits/rejected": -0.5088824033737183, + "logps/chosen": -468.2110290527344, + "logps/rejected": -566.8349609375, + "loss": 0.0637, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.546435356140137, + "rewards/margins": 11.740583419799805, + "rewards/rejected": -24.287019729614258, + "step": 350 + }, + { + "epoch": 0.036, + "grad_norm": 5.622852086162311e-07, + "learning_rate": 1.8000000000000001e-06, + "logits/chosen": -1.2729134559631348, + "logits/rejected": -0.425194650888443, + "logps/chosen": -341.8668212890625, + "logps/rejected": -553.38818359375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.173413276672363, + "rewards/margins": 13.597102165222168, + "rewards/rejected": -20.77051544189453, + "step": 360 + }, + { + "epoch": 0.037, + "grad_norm": 4.885341020610667e-10, + "learning_rate": 1.85e-06, + "logits/chosen": -1.3934372663497925, + "logits/rejected": -0.39337393641471863, + "logps/chosen": -333.2331848144531, + "logps/rejected": -671.9036254882812, + "loss": 0.0656, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.352590560913086, + "rewards/margins": 14.552865982055664, + "rewards/rejected": -23.90545654296875, + "step": 370 + }, + { + "epoch": 0.038, + "grad_norm": 0.003559031756594777, + "learning_rate": 1.9000000000000002e-06, + "logits/chosen": -1.3900734186172485, + "logits/rejected": -0.4074910581111908, + "logps/chosen": -415.312744140625, + "logps/rejected": -717.7049560546875, + "loss": 0.037, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.72658634185791, + "rewards/margins": 18.283475875854492, + "rewards/rejected": -24.01006317138672, + "step": 380 + }, + { + "epoch": 0.039, + "grad_norm": 618.7286376953125, + "learning_rate": 1.9500000000000004e-06, + "logits/chosen": -1.2673394680023193, + "logits/rejected": -0.5207124948501587, + "logps/chosen": -428.8023376464844, + "logps/rejected": -595.412353515625, + "loss": 0.9008, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -10.832281112670898, + "rewards/margins": 11.771617889404297, + "rewards/rejected": -22.603899002075195, + "step": 390 + }, + { + "epoch": 0.04, + "grad_norm": 2.328858670352929e-07, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -1.086723804473877, + "logits/rejected": -0.42811456322669983, + "logps/chosen": -209.59927368164062, + "logps/rejected": -433.703125, + "loss": 0.0888, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.72649621963501, + "rewards/margins": 12.687189102172852, + "rewards/rejected": -19.413684844970703, + "step": 400 + }, + { + "epoch": 0.041, + "grad_norm": 0.0021633415017277002, + "learning_rate": 2.05e-06, + "logits/chosen": -1.3655961751937866, + "logits/rejected": -0.2905605435371399, + "logps/chosen": -250.3804473876953, + "logps/rejected": -517.5274047851562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.846493721008301, + "rewards/margins": 13.903947830200195, + "rewards/rejected": -19.75044059753418, + "step": 410 + }, + { + "epoch": 0.042, + "grad_norm": 0.0014651113888248801, + "learning_rate": 2.1000000000000002e-06, + "logits/chosen": -0.803868293762207, + "logits/rejected": -0.6163159012794495, + "logps/chosen": -296.12939453125, + "logps/rejected": -395.0677490234375, + "loss": 0.5212, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -7.992659568786621, + "rewards/margins": 8.027377128601074, + "rewards/rejected": -16.020038604736328, + "step": 420 + }, + { + "epoch": 0.043, + "grad_norm": 268.1305847167969, + "learning_rate": 2.15e-06, + "logits/chosen": -1.315779447555542, + "logits/rejected": -0.3827098309993744, + "logps/chosen": -395.30950927734375, + "logps/rejected": -537.7557373046875, + "loss": 0.5238, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -7.556341648101807, + "rewards/margins": 8.877912521362305, + "rewards/rejected": -16.434253692626953, + "step": 430 + }, + { + "epoch": 0.044, + "grad_norm": 2.4609992124169366e-06, + "learning_rate": 2.2e-06, + "logits/chosen": -1.0926239490509033, + "logits/rejected": -0.5294037461280823, + "logps/chosen": -199.20274353027344, + "logps/rejected": -308.11114501953125, + "loss": 0.1815, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.401622772216797, + "rewards/margins": 7.701199531555176, + "rewards/rejected": -14.102824211120605, + "step": 440 + }, + { + "epoch": 0.045, + "grad_norm": 2.4242572180810384e-05, + "learning_rate": 2.25e-06, + "logits/chosen": -1.2972681522369385, + "logits/rejected": -0.34338143467903137, + "logps/chosen": -239.63858032226562, + "logps/rejected": -592.6209106445312, + "loss": 0.0933, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.572920799255371, + "rewards/margins": 17.80848503112793, + "rewards/rejected": -25.381404876708984, + "step": 450 + }, + { + "epoch": 0.046, + "grad_norm": 2.4251374242112433e-09, + "learning_rate": 2.3000000000000004e-06, + "logits/chosen": -1.2712339162826538, + "logits/rejected": -0.3596915602684021, + "logps/chosen": -288.246337890625, + "logps/rejected": -570.5148315429688, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.5534467697143555, + "rewards/margins": 17.334653854370117, + "rewards/rejected": -23.888103485107422, + "step": 460 + }, + { + "epoch": 0.047, + "grad_norm": 0.00030805158894509077, + "learning_rate": 2.35e-06, + "logits/chosen": -1.4388515949249268, + "logits/rejected": -0.34689822793006897, + "logps/chosen": -268.9949035644531, + "logps/rejected": -542.9310302734375, + "loss": 0.1292, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.221593856811523, + "rewards/margins": 14.034378051757812, + "rewards/rejected": -23.255971908569336, + "step": 470 + }, + { + "epoch": 0.048, + "grad_norm": 0.32315555214881897, + "learning_rate": 2.4000000000000003e-06, + "logits/chosen": -1.1578876972198486, + "logits/rejected": -0.5471175312995911, + "logps/chosen": -498.71588134765625, + "logps/rejected": -660.1671752929688, + "loss": 0.3037, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -13.711542129516602, + "rewards/margins": 13.745755195617676, + "rewards/rejected": -27.45729637145996, + "step": 480 + }, + { + "epoch": 0.049, + "grad_norm": 3.87372857789804e-15, + "learning_rate": 2.4500000000000003e-06, + "logits/chosen": -0.8546003103256226, + "logits/rejected": -0.39226824045181274, + "logps/chosen": -500.3169860839844, + "logps/rejected": -690.2376098632812, + "loss": 0.6083, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -14.116811752319336, + "rewards/margins": 16.512969970703125, + "rewards/rejected": -30.62978172302246, + "step": 490 + }, + { + "epoch": 0.05, + "grad_norm": 132.11865234375, + "learning_rate": 2.5e-06, + "logits/chosen": -0.9677556753158569, + "logits/rejected": -0.3073219656944275, + "logps/chosen": -347.23687744140625, + "logps/rejected": -564.311279296875, + "loss": 0.024, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.032984733581543, + "rewards/margins": 16.148744583129883, + "rewards/rejected": -29.181732177734375, + "step": 500 + }, + { + "epoch": 0.051, + "grad_norm": 5.183964965193438e-14, + "learning_rate": 2.55e-06, + "logits/chosen": -0.9282774925231934, + "logits/rejected": -0.3282240927219391, + "logps/chosen": -284.4826354980469, + "logps/rejected": -543.125, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.672806739807129, + "rewards/margins": 19.902965545654297, + "rewards/rejected": -29.57577133178711, + "step": 510 + }, + { + "epoch": 0.052, + "grad_norm": 0.0002157751878257841, + "learning_rate": 2.6e-06, + "logits/chosen": -1.4002107381820679, + "logits/rejected": -0.2400444746017456, + "logps/chosen": -269.77008056640625, + "logps/rejected": -616.1730346679688, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.905624389648438, + "rewards/margins": 22.148664474487305, + "rewards/rejected": -31.054290771484375, + "step": 520 + }, + { + "epoch": 0.053, + "grad_norm": 7.059870767989196e-06, + "learning_rate": 2.6500000000000005e-06, + "logits/chosen": -0.8650500178337097, + "logits/rejected": -0.3586713671684265, + "logps/chosen": -244.1106719970703, + "logps/rejected": -523.0113525390625, + "loss": 0.1536, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.576187133789062, + "rewards/margins": 16.308609008789062, + "rewards/rejected": -26.884796142578125, + "step": 530 + }, + { + "epoch": 0.054, + "grad_norm": 5.5057416958881333e-11, + "learning_rate": 2.7000000000000004e-06, + "logits/chosen": -1.052830696105957, + "logits/rejected": -0.8504500389099121, + "logps/chosen": -346.8498229980469, + "logps/rejected": -457.3299255371094, + "loss": 2.0321, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -11.125707626342773, + "rewards/margins": 10.678735733032227, + "rewards/rejected": -21.804443359375, + "step": 540 + }, + { + "epoch": 0.055, + "grad_norm": 1.947714372363407e-05, + "learning_rate": 2.7500000000000004e-06, + "logits/chosen": -1.0468206405639648, + "logits/rejected": -0.416832834482193, + "logps/chosen": -450.96832275390625, + "logps/rejected": -750.3878173828125, + "loss": 0.2331, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -13.610055923461914, + "rewards/margins": 20.343788146972656, + "rewards/rejected": -33.9538459777832, + "step": 550 + }, + { + "epoch": 0.056, + "grad_norm": 5.5382918383977406e-17, + "learning_rate": 2.8000000000000003e-06, + "logits/chosen": -1.5744976997375488, + "logits/rejected": -0.3118807077407837, + "logps/chosen": -226.67398071289062, + "logps/rejected": -629.4376831054688, + "loss": 0.9147, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -9.122535705566406, + "rewards/margins": 20.281320571899414, + "rewards/rejected": -29.403854370117188, + "step": 560 + }, + { + "epoch": 0.057, + "grad_norm": 0.5190900564193726, + "learning_rate": 2.85e-06, + "logits/chosen": -1.0775177478790283, + "logits/rejected": -0.6953670382499695, + "logps/chosen": -338.7844543457031, + "logps/rejected": -576.4647216796875, + "loss": 0.5182, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.195273399353027, + "rewards/margins": 19.976835250854492, + "rewards/rejected": -25.172109603881836, + "step": 570 + }, + { + "epoch": 0.058, + "grad_norm": 2.0154071535216644e-05, + "learning_rate": 2.9e-06, + "logits/chosen": -0.8628554344177246, + "logits/rejected": -0.36490216851234436, + "logps/chosen": -266.5061340332031, + "logps/rejected": -409.20050048828125, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.164038181304932, + "rewards/margins": 11.171427726745605, + "rewards/rejected": -17.335468292236328, + "step": 580 + }, + { + "epoch": 0.059, + "grad_norm": 0.007016741205006838, + "learning_rate": 2.95e-06, + "logits/chosen": -1.1316006183624268, + "logits/rejected": -0.4583218991756439, + "logps/chosen": -468.5044860839844, + "logps/rejected": -696.3089599609375, + "loss": 0.0496, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.475954055786133, + "rewards/margins": 20.412355422973633, + "rewards/rejected": -29.8883113861084, + "step": 590 + }, + { + "epoch": 0.06, + "grad_norm": 713.2523803710938, + "learning_rate": 3e-06, + "logits/chosen": -1.00506591796875, + "logits/rejected": -0.45325785875320435, + "logps/chosen": -389.286865234375, + "logps/rejected": -647.7471923828125, + "loss": 0.5942, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -14.856890678405762, + "rewards/margins": 18.655689239501953, + "rewards/rejected": -33.51258087158203, + "step": 600 + }, + { + "epoch": 0.061, + "grad_norm": 0.0021474172826856375, + "learning_rate": 3.05e-06, + "logits/chosen": -1.1691076755523682, + "logits/rejected": -0.463383287191391, + "logps/chosen": -381.8735046386719, + "logps/rejected": -664.1435546875, + "loss": 0.5041, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.865372657775879, + "rewards/margins": 17.90451431274414, + "rewards/rejected": -28.769886016845703, + "step": 610 + }, + { + "epoch": 0.062, + "grad_norm": 5.3986898285174334e-11, + "learning_rate": 3.1000000000000004e-06, + "logits/chosen": -1.3650020360946655, + "logits/rejected": -0.35710233449935913, + "logps/chosen": -326.7483215332031, + "logps/rejected": -618.0527954101562, + "loss": 0.1221, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.112927436828613, + "rewards/margins": 21.5576171875, + "rewards/rejected": -28.670541763305664, + "step": 620 + }, + { + "epoch": 0.063, + "grad_norm": 402.3836669921875, + "learning_rate": 3.1500000000000003e-06, + "logits/chosen": -1.0086907148361206, + "logits/rejected": -0.33054283261299133, + "logps/chosen": -393.40740966796875, + "logps/rejected": -667.0352783203125, + "loss": 0.4131, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.809080123901367, + "rewards/margins": 17.650827407836914, + "rewards/rejected": -27.459903717041016, + "step": 630 + }, + { + "epoch": 0.064, + "grad_norm": 0.00016561997472308576, + "learning_rate": 3.2000000000000003e-06, + "logits/chosen": -0.7735254168510437, + "logits/rejected": -0.4900113642215729, + "logps/chosen": -365.861572265625, + "logps/rejected": -569.7633056640625, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.899888038635254, + "rewards/margins": 16.789934158325195, + "rewards/rejected": -28.6898193359375, + "step": 640 + }, + { + "epoch": 0.065, + "grad_norm": 59.89263916015625, + "learning_rate": 3.2500000000000002e-06, + "logits/chosen": -0.9595259428024292, + "logits/rejected": -0.35813766717910767, + "logps/chosen": -261.4634094238281, + "logps/rejected": -478.71539306640625, + "loss": 0.0998, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.686963081359863, + "rewards/margins": 16.35685157775879, + "rewards/rejected": -24.043813705444336, + "step": 650 + }, + { + "epoch": 0.066, + "grad_norm": 1.7982006161876285e-10, + "learning_rate": 3.3000000000000006e-06, + "logits/chosen": -1.4923456907272339, + "logits/rejected": -0.4172714352607727, + "logps/chosen": -519.4299926757812, + "logps/rejected": -703.1495971679688, + "loss": 1.4947, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.720640182495117, + "rewards/margins": 21.21224594116211, + "rewards/rejected": -30.932886123657227, + "step": 660 + }, + { + "epoch": 0.067, + "grad_norm": 2.2147442436571483e-18, + "learning_rate": 3.3500000000000005e-06, + "logits/chosen": -1.2784268856048584, + "logits/rejected": -0.3738354742527008, + "logps/chosen": -467.1693420410156, + "logps/rejected": -785.3607177734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.926630020141602, + "rewards/margins": 24.06552505493164, + "rewards/rejected": -34.992156982421875, + "step": 670 + }, + { + "epoch": 0.068, + "grad_norm": 2.3722683973464997e-12, + "learning_rate": 3.4000000000000005e-06, + "logits/chosen": -1.2794643640518188, + "logits/rejected": -0.5191472172737122, + "logps/chosen": -374.89996337890625, + "logps/rejected": -682.2388916015625, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.074565887451172, + "rewards/margins": 20.966999053955078, + "rewards/rejected": -30.04156494140625, + "step": 680 + }, + { + "epoch": 0.069, + "grad_norm": 3.0734440058070855e-13, + "learning_rate": 3.45e-06, + "logits/chosen": -1.0704734325408936, + "logits/rejected": -0.36915844678878784, + "logps/chosen": -282.9461975097656, + "logps/rejected": -606.0206909179688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.159383773803711, + "rewards/margins": 23.78475570678711, + "rewards/rejected": -31.944141387939453, + "step": 690 + }, + { + "epoch": 0.07, + "grad_norm": 0.0012723951367661357, + "learning_rate": 3.5e-06, + "logits/chosen": -1.42892324924469, + "logits/rejected": -0.40216541290283203, + "logps/chosen": -279.84637451171875, + "logps/rejected": -653.5191040039062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.145906448364258, + "rewards/margins": 22.8740234375, + "rewards/rejected": -31.019927978515625, + "step": 700 + }, + { + "epoch": 0.071, + "grad_norm": 14.484932899475098, + "learning_rate": 3.5500000000000003e-06, + "logits/chosen": -1.364404320716858, + "logits/rejected": -0.25203460454940796, + "logps/chosen": -239.27487182617188, + "logps/rejected": -695.9551391601562, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.248779296875, + "rewards/margins": 23.697933197021484, + "rewards/rejected": -31.946712493896484, + "step": 710 + }, + { + "epoch": 0.072, + "grad_norm": 3.2707375794416294e-06, + "learning_rate": 3.6000000000000003e-06, + "logits/chosen": -1.447434663772583, + "logits/rejected": -0.28548040986061096, + "logps/chosen": -227.4899139404297, + "logps/rejected": -585.2236938476562, + "loss": 0.1291, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.030697345733643, + "rewards/margins": 25.32795524597168, + "rewards/rejected": -30.358654022216797, + "step": 720 + }, + { + "epoch": 0.073, + "grad_norm": 0.06150020286440849, + "learning_rate": 3.65e-06, + "logits/chosen": -1.1551002264022827, + "logits/rejected": -0.3233141005039215, + "logps/chosen": -284.21990966796875, + "logps/rejected": -589.0573120117188, + "loss": 0.4251, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.090296745300293, + "rewards/margins": 24.372478485107422, + "rewards/rejected": -32.46277618408203, + "step": 730 + }, + { + "epoch": 0.074, + "grad_norm": 226.98077392578125, + "learning_rate": 3.7e-06, + "logits/chosen": -1.0338466167449951, + "logits/rejected": -0.30206966400146484, + "logps/chosen": -294.2165832519531, + "logps/rejected": -544.0670166015625, + "loss": 0.0822, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.504319190979004, + "rewards/margins": 18.94892692565918, + "rewards/rejected": -28.4532470703125, + "step": 740 + }, + { + "epoch": 0.075, + "grad_norm": 0.00013524027599487454, + "learning_rate": 3.7500000000000005e-06, + "logits/chosen": -1.2259372472763062, + "logits/rejected": -0.408051073551178, + "logps/chosen": -450.68182373046875, + "logps/rejected": -706.7760620117188, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.297445297241211, + "rewards/margins": 19.66615867614746, + "rewards/rejected": -31.963603973388672, + "step": 750 + }, + { + "epoch": 0.076, + "grad_norm": 3.848186491683947e-10, + "learning_rate": 3.8000000000000005e-06, + "logits/chosen": -0.7417271137237549, + "logits/rejected": -0.5415674448013306, + "logps/chosen": -414.8028259277344, + "logps/rejected": -559.133544921875, + "loss": 0.1328, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.061386108398438, + "rewards/margins": 16.20886993408203, + "rewards/rejected": -27.270259857177734, + "step": 760 + }, + { + "epoch": 0.077, + "grad_norm": 7.364305562931883e-13, + "learning_rate": 3.85e-06, + "logits/chosen": -1.117499589920044, + "logits/rejected": -0.2406352460384369, + "logps/chosen": -344.18463134765625, + "logps/rejected": -646.6490478515625, + "loss": 0.0194, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.47022819519043, + "rewards/margins": 19.650503158569336, + "rewards/rejected": -33.12073516845703, + "step": 770 + }, + { + "epoch": 0.078, + "grad_norm": 0.0572834387421608, + "learning_rate": 3.900000000000001e-06, + "logits/chosen": -1.432673454284668, + "logits/rejected": -0.17732997238636017, + "logps/chosen": -254.5869903564453, + "logps/rejected": -625.7298583984375, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.25689697265625, + "rewards/margins": 25.62160301208496, + "rewards/rejected": -35.878501892089844, + "step": 780 + }, + { + "epoch": 0.079, + "grad_norm": 6.07656394820133e-13, + "learning_rate": 3.95e-06, + "logits/chosen": -0.8815616369247437, + "logits/rejected": -0.30584144592285156, + "logps/chosen": -766.3916625976562, + "logps/rejected": -845.8123779296875, + "loss": 0.1973, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -16.423290252685547, + "rewards/margins": 19.526325225830078, + "rewards/rejected": -35.94961166381836, + "step": 790 + }, + { + "epoch": 0.08, + "grad_norm": 3.452759151782028e-20, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -1.6766525506973267, + "logits/rejected": -0.2753424346446991, + "logps/chosen": -246.43896484375, + "logps/rejected": -725.5803833007812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.301102638244629, + "rewards/margins": 28.94573974609375, + "rewards/rejected": -41.2468376159668, + "step": 800 + }, + { + "epoch": 0.081, + "grad_norm": 6.12564349466993e-07, + "learning_rate": 4.05e-06, + "logits/chosen": -0.8568048477172852, + "logits/rejected": -0.28903594613075256, + "logps/chosen": -604.75244140625, + "logps/rejected": -807.2450561523438, + "loss": 0.1291, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.469955444335938, + "rewards/margins": 20.681560516357422, + "rewards/rejected": -42.151512145996094, + "step": 810 + }, + { + "epoch": 0.082, + "grad_norm": 0.0315057635307312, + "learning_rate": 4.1e-06, + "logits/chosen": -0.8729526400566101, + "logits/rejected": -0.36734262108802795, + "logps/chosen": -472.77264404296875, + "logps/rejected": -788.4251708984375, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.225198745727539, + "rewards/margins": 28.6204776763916, + "rewards/rejected": -43.845680236816406, + "step": 820 + }, + { + "epoch": 0.083, + "grad_norm": 2.3944246768951416, + "learning_rate": 4.15e-06, + "logits/chosen": -1.023874044418335, + "logits/rejected": -0.07073228061199188, + "logps/chosen": -164.30848693847656, + "logps/rejected": -547.1531982421875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.095312595367432, + "rewards/margins": 27.135452270507812, + "rewards/rejected": -34.23076629638672, + "step": 830 + }, + { + "epoch": 0.084, + "grad_norm": 0.006124300882220268, + "learning_rate": 4.2000000000000004e-06, + "logits/chosen": -0.9126752614974976, + "logits/rejected": -0.3035658299922943, + "logps/chosen": -397.1790466308594, + "logps/rejected": -735.6592407226562, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.258554458618164, + "rewards/margins": 26.314579010009766, + "rewards/rejected": -44.57312774658203, + "step": 840 + }, + { + "epoch": 0.085, + "grad_norm": 3.089939588841582e-18, + "learning_rate": 4.25e-06, + "logits/chosen": -1.015491247177124, + "logits/rejected": -0.30377858877182007, + "logps/chosen": -352.0212097167969, + "logps/rejected": -681.9417724609375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.543761253356934, + "rewards/margins": 27.535030364990234, + "rewards/rejected": -39.07879638671875, + "step": 850 + }, + { + "epoch": 0.086, + "grad_norm": 2.16502828340149e-15, + "learning_rate": 4.3e-06, + "logits/chosen": -1.081601858139038, + "logits/rejected": -0.15704300999641418, + "logps/chosen": -481.53973388671875, + "logps/rejected": -850.2933349609375, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.392127990722656, + "rewards/margins": 25.024600982666016, + "rewards/rejected": -43.416725158691406, + "step": 860 + }, + { + "epoch": 0.087, + "grad_norm": 4.893959339824505e-05, + "learning_rate": 4.350000000000001e-06, + "logits/chosen": -1.2104352712631226, + "logits/rejected": -0.07070871442556381, + "logps/chosen": -370.2465515136719, + "logps/rejected": -821.4221801757812, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.73651123046875, + "rewards/margins": 32.19016647338867, + "rewards/rejected": -47.92667770385742, + "step": 870 + }, + { + "epoch": 0.088, + "grad_norm": 1201.88623046875, + "learning_rate": 4.4e-06, + "logits/chosen": -1.0273677110671997, + "logits/rejected": -0.21135012805461884, + "logps/chosen": -353.56280517578125, + "logps/rejected": -722.2070922851562, + "loss": 1.691, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.636672973632812, + "rewards/margins": 27.89716148376465, + "rewards/rejected": -46.533836364746094, + "step": 880 + }, + { + "epoch": 0.089, + "grad_norm": 3.308795930320285e-15, + "learning_rate": 4.450000000000001e-06, + "logits/chosen": -1.327782154083252, + "logits/rejected": -0.0714651569724083, + "logps/chosen": -502.51007080078125, + "logps/rejected": -904.955078125, + "loss": 1.6639, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.43936538696289, + "rewards/margins": 28.087310791015625, + "rewards/rejected": -52.52668380737305, + "step": 890 + }, + { + "epoch": 0.09, + "grad_norm": 4.506963812572762e-12, + "learning_rate": 4.5e-06, + "logits/chosen": -1.178138017654419, + "logits/rejected": -0.036952096968889236, + "logps/chosen": -337.31756591796875, + "logps/rejected": -817.9973754882812, + "loss": 0.1378, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -15.041440963745117, + "rewards/margins": 30.560115814208984, + "rewards/rejected": -45.60155487060547, + "step": 900 + }, + { + "epoch": 0.091, + "grad_norm": 5.517401950783096e-05, + "learning_rate": 4.5500000000000005e-06, + "logits/chosen": -1.073327898979187, + "logits/rejected": -0.4490521550178528, + "logps/chosen": -490.901611328125, + "logps/rejected": -721.9581909179688, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.316640853881836, + "rewards/margins": 29.4993953704834, + "rewards/rejected": -43.8160400390625, + "step": 910 + }, + { + "epoch": 0.092, + "grad_norm": 3.1243777994617106e-15, + "learning_rate": 4.600000000000001e-06, + "logits/chosen": -1.8327325582504272, + "logits/rejected": -0.01891680620610714, + "logps/chosen": -203.77676391601562, + "logps/rejected": -841.4299926757812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.827959060668945, + "rewards/margins": 43.51868438720703, + "rewards/rejected": -53.34663772583008, + "step": 920 + }, + { + "epoch": 0.093, + "grad_norm": 3.558796279889975e-08, + "learning_rate": 4.65e-06, + "logits/chosen": -1.027311086654663, + "logits/rejected": -0.20076104998588562, + "logps/chosen": -326.1154479980469, + "logps/rejected": -669.1527099609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.236804962158203, + "rewards/margins": 28.084369659423828, + "rewards/rejected": -40.32117462158203, + "step": 930 + }, + { + "epoch": 0.094, + "grad_norm": 7.486784133018432e-23, + "learning_rate": 4.7e-06, + "logits/chosen": -0.8670459985733032, + "logits/rejected": -0.19873929023742676, + "logps/chosen": -254.91256713867188, + "logps/rejected": -628.9371948242188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.05659008026123, + "rewards/margins": 27.286083221435547, + "rewards/rejected": -38.342674255371094, + "step": 940 + }, + { + "epoch": 0.095, + "grad_norm": 4.481721733536244e-16, + "learning_rate": 4.75e-06, + "logits/chosen": -1.0405689477920532, + "logits/rejected": -0.050393976271152496, + "logps/chosen": -293.336669921875, + "logps/rejected": -628.1806640625, + "loss": 0.8787, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -13.126751899719238, + "rewards/margins": 26.10247802734375, + "rewards/rejected": -39.229225158691406, + "step": 950 + }, + { + "epoch": 0.096, + "grad_norm": 0.6695166826248169, + "learning_rate": 4.800000000000001e-06, + "logits/chosen": -1.4227111339569092, + "logits/rejected": -0.11280278861522675, + "logps/chosen": -249.92056274414062, + "logps/rejected": -675.5670776367188, + "loss": 0.2883, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.505855560302734, + "rewards/margins": 25.19675064086914, + "rewards/rejected": -34.702606201171875, + "step": 960 + }, + { + "epoch": 0.097, + "grad_norm": 4.120292729226094e-08, + "learning_rate": 4.85e-06, + "logits/chosen": -1.407098412513733, + "logits/rejected": -0.2420966923236847, + "logps/chosen": -257.65374755859375, + "logps/rejected": -694.7686157226562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.209831237792969, + "rewards/margins": 27.865127563476562, + "rewards/rejected": -38.07495880126953, + "step": 970 + }, + { + "epoch": 0.098, + "grad_norm": 1.1203562983556366e-19, + "learning_rate": 4.9000000000000005e-06, + "logits/chosen": -1.0926339626312256, + "logits/rejected": -0.24714651703834534, + "logps/chosen": -388.4772644042969, + "logps/rejected": -831.8443603515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.813573837280273, + "rewards/margins": 32.880027770996094, + "rewards/rejected": -43.693603515625, + "step": 980 + }, + { + "epoch": 0.099, + "grad_norm": 0.03495605289936066, + "learning_rate": 4.95e-06, + "logits/chosen": -1.2097073793411255, + "logits/rejected": -0.46265825629234314, + "logps/chosen": -424.83245849609375, + "logps/rejected": -783.1539916992188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.046907424926758, + "rewards/margins": 30.06099510192871, + "rewards/rejected": -40.10790252685547, + "step": 990 + }, + { + "epoch": 0.1, + "grad_norm": 1.0239310510584687e-09, + "learning_rate": 5e-06, + "logits/chosen": -1.1762347221374512, + "logits/rejected": -0.045468103140592575, + "logps/chosen": -182.35528564453125, + "logps/rejected": -574.6000366210938, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.4294304847717285, + "rewards/margins": 24.870960235595703, + "rewards/rejected": -31.30039405822754, + "step": 1000 + }, + { + "epoch": 0.101, + "grad_norm": 0.08554869890213013, + "learning_rate": 4.999984769144476e-06, + "logits/chosen": -1.0135505199432373, + "logits/rejected": -0.1592954695224762, + "logps/chosen": -429.33612060546875, + "logps/rejected": -704.2899169921875, + "loss": 0.3016, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.181246757507324, + "rewards/margins": 22.7156925201416, + "rewards/rejected": -32.89693832397461, + "step": 1010 + }, + { + "epoch": 0.102, + "grad_norm": 202.30699157714844, + "learning_rate": 4.999939076763487e-06, + "logits/chosen": -1.2693045139312744, + "logits/rejected": -0.2478354275226593, + "logps/chosen": -160.41709899902344, + "logps/rejected": -502.111083984375, + "loss": 0.0408, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.172953128814697, + "rewards/margins": 22.934494018554688, + "rewards/rejected": -30.107446670532227, + "step": 1020 + }, + { + "epoch": 0.103, + "grad_norm": 6.811764317040607e-16, + "learning_rate": 4.999862923413781e-06, + "logits/chosen": -0.9106602668762207, + "logits/rejected": -0.4323008060455322, + "logps/chosen": -550.83837890625, + "logps/rejected": -812.7847900390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.511116981506348, + "rewards/margins": 26.43093490600586, + "rewards/rejected": -34.94205093383789, + "step": 1030 + }, + { + "epoch": 0.104, + "grad_norm": 6.298041515390151e-13, + "learning_rate": 4.999756310023261e-06, + "logits/chosen": -1.1837074756622314, + "logits/rejected": -0.42569518089294434, + "logps/chosen": -483.1153259277344, + "logps/rejected": -856.2667236328125, + "loss": 1.8217, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -13.297500610351562, + "rewards/margins": 29.203968048095703, + "rewards/rejected": -42.501468658447266, + "step": 1040 + }, + { + "epoch": 0.105, + "grad_norm": 0.015196479856967926, + "learning_rate": 4.9996192378909785e-06, + "logits/chosen": -1.2038367986679077, + "logits/rejected": -0.36220741271972656, + "logps/chosen": -272.37530517578125, + "logps/rejected": -554.7415161132812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.584593772888184, + "rewards/margins": 23.410310745239258, + "rewards/rejected": -28.994903564453125, + "step": 1050 + }, + { + "epoch": 0.106, + "grad_norm": 0.07376130670309067, + "learning_rate": 4.999451708687114e-06, + "logits/chosen": -1.2006012201309204, + "logits/rejected": -0.46303287148475647, + "logps/chosen": -300.4767761230469, + "logps/rejected": -579.3289794921875, + "loss": 1.0262, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.391415119171143, + "rewards/margins": 13.561741828918457, + "rewards/rejected": -18.953155517578125, + "step": 1060 + }, + { + "epoch": 0.107, + "grad_norm": 5.374231726307244e-09, + "learning_rate": 4.9992537244529585e-06, + "logits/chosen": -0.9604480862617493, + "logits/rejected": -0.30329519510269165, + "logps/chosen": -316.0964660644531, + "logps/rejected": -505.31396484375, + "loss": 0.0341, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.198069095611572, + "rewards/margins": 17.348926544189453, + "rewards/rejected": -21.546995162963867, + "step": 1070 + }, + { + "epoch": 0.108, + "grad_norm": 0.00011263292981311679, + "learning_rate": 4.999025287600886e-06, + "logits/chosen": -1.090127944946289, + "logits/rejected": -0.5740963220596313, + "logps/chosen": -243.5624237060547, + "logps/rejected": -525.05126953125, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.5334296226501465, + "rewards/margins": 18.794811248779297, + "rewards/rejected": -25.3282413482666, + "step": 1080 + }, + { + "epoch": 0.109, + "grad_norm": 112.01258850097656, + "learning_rate": 4.998766400914329e-06, + "logits/chosen": -1.1208736896514893, + "logits/rejected": -0.2704886198043823, + "logps/chosen": -215.65731811523438, + "logps/rejected": -516.1124267578125, + "loss": 0.0734, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.3899407386779785, + "rewards/margins": 19.51913070678711, + "rewards/rejected": -25.909076690673828, + "step": 1090 + }, + { + "epoch": 0.11, + "grad_norm": 2.881012919550563e-12, + "learning_rate": 4.99847706754774e-06, + "logits/chosen": -1.248564600944519, + "logits/rejected": -0.23786959052085876, + "logps/chosen": -295.33233642578125, + "logps/rejected": -660.6017456054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.113610744476318, + "rewards/margins": 29.882919311523438, + "rewards/rejected": -35.99652862548828, + "step": 1100 + }, + { + "epoch": 0.111, + "grad_norm": 8.900973014203117e-11, + "learning_rate": 4.998157291026553e-06, + "logits/chosen": -0.8095799684524536, + "logits/rejected": -0.5679833889007568, + "logps/chosen": -364.50518798828125, + "logps/rejected": -604.2783203125, + "loss": 1.2719, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -10.384870529174805, + "rewards/margins": 17.38606834411621, + "rewards/rejected": -27.77094078063965, + "step": 1110 + }, + { + "epoch": 0.112, + "grad_norm": 1.76853864886084e-10, + "learning_rate": 4.997807075247147e-06, + "logits/chosen": -1.0766984224319458, + "logits/rejected": -0.40137988328933716, + "logps/chosen": -344.74493408203125, + "logps/rejected": -616.8096923828125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.687139987945557, + "rewards/margins": 19.36139488220215, + "rewards/rejected": -26.048538208007812, + "step": 1120 + }, + { + "epoch": 0.113, + "grad_norm": 0.18956993520259857, + "learning_rate": 4.997426424476787e-06, + "logits/chosen": -1.1012532711029053, + "logits/rejected": -0.47621792554855347, + "logps/chosen": -413.59075927734375, + "logps/rejected": -553.7152099609375, + "loss": 0.0778, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.19697380065918, + "rewards/margins": 17.737911224365234, + "rewards/rejected": -22.934885025024414, + "step": 1130 + }, + { + "epoch": 0.114, + "grad_norm": 4.789621829986572, + "learning_rate": 4.9970153433535855e-06, + "logits/chosen": -0.9956803321838379, + "logits/rejected": -0.15662881731987, + "logps/chosen": -178.8624725341797, + "logps/rejected": -406.60443115234375, + "loss": 0.0942, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.233434200286865, + "rewards/margins": 15.290153503417969, + "rewards/rejected": -21.523588180541992, + "step": 1140 + }, + { + "epoch": 0.115, + "grad_norm": 1.3092710560158594e-07, + "learning_rate": 4.9965738368864345e-06, + "logits/chosen": -1.0208417177200317, + "logits/rejected": -0.3284724950790405, + "logps/chosen": -319.4895935058594, + "logps/rejected": -578.760009765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.61468505859375, + "rewards/margins": 19.40301513671875, + "rewards/rejected": -28.017696380615234, + "step": 1150 + }, + { + "epoch": 0.116, + "grad_norm": 2.5855173589661717e-05, + "learning_rate": 4.996101910454953e-06, + "logits/chosen": -1.500450611114502, + "logits/rejected": -0.21137702465057373, + "logps/chosen": -271.80279541015625, + "logps/rejected": -712.6658935546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.964298248291016, + "rewards/margins": 27.46476173400879, + "rewards/rejected": -36.42905807495117, + "step": 1160 + }, + { + "epoch": 0.117, + "grad_norm": 3.229709277796644e-10, + "learning_rate": 4.995599569809414e-06, + "logits/chosen": -1.355883240699768, + "logits/rejected": -0.21876247227191925, + "logps/chosen": -178.31338500976562, + "logps/rejected": -683.889892578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.416735649108887, + "rewards/margins": 30.885372161865234, + "rewards/rejected": -38.30210876464844, + "step": 1170 + }, + { + "epoch": 0.118, + "grad_norm": 0.000150295440107584, + "learning_rate": 4.9950668210706795e-06, + "logits/chosen": -0.8052603602409363, + "logits/rejected": -0.5494809746742249, + "logps/chosen": -378.8117370605469, + "logps/rejected": -612.0853271484375, + "loss": 0.268, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.668352127075195, + "rewards/margins": 22.803325653076172, + "rewards/rejected": -33.4716796875, + "step": 1180 + }, + { + "epoch": 0.119, + "grad_norm": 9.76726077794865e-09, + "learning_rate": 4.994503670730126e-06, + "logits/chosen": -0.9996203184127808, + "logits/rejected": -0.3798294961452484, + "logps/chosen": -486.4266052246094, + "logps/rejected": -707.1964111328125, + "loss": 0.2434, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -11.749849319458008, + "rewards/margins": 24.026813507080078, + "rewards/rejected": -35.77666473388672, + "step": 1190 + }, + { + "epoch": 0.12, + "grad_norm": 7.2385314409118e-10, + "learning_rate": 4.993910125649561e-06, + "logits/chosen": -1.1892848014831543, + "logits/rejected": -0.22287265956401825, + "logps/chosen": -295.4376525878906, + "logps/rejected": -631.9881591796875, + "loss": 0.0264, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.681472301483154, + "rewards/margins": 18.257129669189453, + "rewards/rejected": -23.9385986328125, + "step": 1200 + }, + { + "epoch": 0.121, + "grad_norm": 4.749460824626794e-10, + "learning_rate": 4.993286193061145e-06, + "logits/chosen": -1.0906132459640503, + "logits/rejected": -0.20747146010398865, + "logps/chosen": -257.7426452636719, + "logps/rejected": -655.1525268554688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.186422348022461, + "rewards/margins": 27.005573272705078, + "rewards/rejected": -35.19198989868164, + "step": 1210 + }, + { + "epoch": 0.122, + "grad_norm": 6.223758930445911e-08, + "learning_rate": 4.992631880567301e-06, + "logits/chosen": -1.656432867050171, + "logits/rejected": -0.3102510869503021, + "logps/chosen": -348.0914306640625, + "logps/rejected": -825.349609375, + "loss": 0.0834, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.202226638793945, + "rewards/margins": 29.386104583740234, + "rewards/rejected": -39.58833312988281, + "step": 1220 + }, + { + "epoch": 0.123, + "grad_norm": 1.3237407074484508e-05, + "learning_rate": 4.991947196140619e-06, + "logits/chosen": -1.154322624206543, + "logits/rejected": -0.4798669219017029, + "logps/chosen": -273.0415344238281, + "logps/rejected": -574.0501708984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.697595119476318, + "rewards/margins": 23.833728790283203, + "rewards/rejected": -30.531320571899414, + "step": 1230 + }, + { + "epoch": 0.124, + "grad_norm": 0.009432843886315823, + "learning_rate": 4.9912321481237616e-06, + "logits/chosen": -1.3529380559921265, + "logits/rejected": -0.28025001287460327, + "logps/chosen": -214.38296508789062, + "logps/rejected": -579.0044555664062, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.91778564453125, + "rewards/margins": 20.354816436767578, + "rewards/rejected": -26.27260398864746, + "step": 1240 + }, + { + "epoch": 0.125, + "grad_norm": 1.7649913475192847e-10, + "learning_rate": 4.990486745229364e-06, + "logits/chosen": -1.2412010431289673, + "logits/rejected": -0.19147519767284393, + "logps/chosen": -286.7811279296875, + "logps/rejected": -753.50341796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.454304218292236, + "rewards/margins": 29.799936294555664, + "rewards/rejected": -35.25423812866211, + "step": 1250 + }, + { + "epoch": 0.126, + "grad_norm": 1.9105982074218986e-10, + "learning_rate": 4.989710996539926e-06, + "logits/chosen": -1.4346072673797607, + "logits/rejected": -0.3352917730808258, + "logps/chosen": -304.7495422363281, + "logps/rejected": -753.637939453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.5735673904418945, + "rewards/margins": 38.21772003173828, + "rewards/rejected": -42.791290283203125, + "step": 1260 + }, + { + "epoch": 0.127, + "grad_norm": 2.4584993596477034e-08, + "learning_rate": 4.9889049115077e-06, + "logits/chosen": -1.0849168300628662, + "logits/rejected": -0.21878328919410706, + "logps/chosen": -340.9671325683594, + "logps/rejected": -746.2839965820312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.222436904907227, + "rewards/margins": 35.21474838256836, + "rewards/rejected": -43.437191009521484, + "step": 1270 + }, + { + "epoch": 0.128, + "grad_norm": 9.595669325790368e-07, + "learning_rate": 4.988068499954578e-06, + "logits/chosen": -1.0678956508636475, + "logits/rejected": -0.3092970848083496, + "logps/chosen": -186.111572265625, + "logps/rejected": -427.2423400878906, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.491952419281006, + "rewards/margins": 16.506975173950195, + "rewards/rejected": -22.998926162719727, + "step": 1280 + }, + { + "epoch": 0.129, + "grad_norm": 6.616324241953686e-12, + "learning_rate": 4.987201772071971e-06, + "logits/chosen": -0.824511706829071, + "logits/rejected": -0.7110381126403809, + "logps/chosen": -341.7249450683594, + "logps/rejected": -489.94921875, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.455601692199707, + "rewards/margins": 21.240055084228516, + "rewards/rejected": -27.69565773010254, + "step": 1290 + }, + { + "epoch": 0.13, + "grad_norm": 0.00014927101437933743, + "learning_rate": 4.986304738420684e-06, + "logits/chosen": -1.0902425050735474, + "logits/rejected": -0.05907214805483818, + "logps/chosen": -247.59384155273438, + "logps/rejected": -632.3173828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.22404956817627, + "rewards/margins": 30.62563705444336, + "rewards/rejected": -39.84968566894531, + "step": 1300 + }, + { + "epoch": 0.131, + "grad_norm": 0.0, + "learning_rate": 4.985377409930789e-06, + "logits/chosen": -1.1010768413543701, + "logits/rejected": -0.34575071930885315, + "logps/chosen": -489.7151794433594, + "logps/rejected": -749.903564453125, + "loss": 1.3087, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.157520294189453, + "rewards/margins": 30.689640045166016, + "rewards/rejected": -39.84716033935547, + "step": 1310 + }, + { + "epoch": 0.132, + "grad_norm": 1.2251684909647675e-11, + "learning_rate": 4.984419797901491e-06, + "logits/chosen": -0.7680908441543579, + "logits/rejected": -0.5849398374557495, + "logps/chosen": -323.7127685546875, + "logps/rejected": -463.91302490234375, + "loss": 0.1654, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.299878120422363, + "rewards/margins": 16.57649040222168, + "rewards/rejected": -25.876373291015625, + "step": 1320 + }, + { + "epoch": 0.133, + "grad_norm": 4.208574894831729e-12, + "learning_rate": 4.983431914000991e-06, + "logits/chosen": -0.7924326658248901, + "logits/rejected": -0.30698415637016296, + "logps/chosen": -515.7220458984375, + "logps/rejected": -723.1101684570312, + "loss": 0.0457, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.746968269348145, + "rewards/margins": 22.760456085205078, + "rewards/rejected": -37.507423400878906, + "step": 1330 + }, + { + "epoch": 0.134, + "grad_norm": 1.1323597230195467e-19, + "learning_rate": 4.9824137702663424e-06, + "logits/chosen": -1.2904155254364014, + "logits/rejected": 0.13465338945388794, + "logps/chosen": -374.4416198730469, + "logps/rejected": -968.5813598632812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.409658432006836, + "rewards/margins": 43.10974884033203, + "rewards/rejected": -60.5194091796875, + "step": 1340 + }, + { + "epoch": 0.135, + "grad_norm": 1.4024823240688794e-13, + "learning_rate": 4.981365379103306e-06, + "logits/chosen": -0.8922684788703918, + "logits/rejected": -0.23835989832878113, + "logps/chosen": -437.84259033203125, + "logps/rejected": -763.39013671875, + "loss": 0.765, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -19.31096649169922, + "rewards/margins": 28.40448570251465, + "rewards/rejected": -47.7154541015625, + "step": 1350 + }, + { + "epoch": 0.136, + "grad_norm": 8.513531676510033e-13, + "learning_rate": 4.980286753286196e-06, + "logits/chosen": -1.1230740547180176, + "logits/rejected": -0.06482603400945663, + "logps/chosen": -502.35980224609375, + "logps/rejected": -846.9089965820312, + "loss": 0.2196, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -11.695874214172363, + "rewards/margins": 28.752582550048828, + "rewards/rejected": -40.448455810546875, + "step": 1360 + }, + { + "epoch": 0.137, + "grad_norm": 3.9868555505584435e-12, + "learning_rate": 4.979177905957726e-06, + "logits/chosen": -0.9407769441604614, + "logits/rejected": 0.15109823644161224, + "logps/chosen": -332.65191650390625, + "logps/rejected": -824.77392578125, + "loss": 0.4068, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.708237648010254, + "rewards/margins": 37.805335998535156, + "rewards/rejected": -45.513572692871094, + "step": 1370 + }, + { + "epoch": 0.138, + "grad_norm": 0.012242639437317848, + "learning_rate": 4.978038850628855e-06, + "logits/chosen": -0.960767924785614, + "logits/rejected": -0.6809446811676025, + "logps/chosen": -340.8134765625, + "logps/rejected": -660.6018676757812, + "loss": 1.068, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -12.148280143737793, + "rewards/margins": 26.553054809570312, + "rewards/rejected": -38.70133590698242, + "step": 1380 + }, + { + "epoch": 0.139, + "grad_norm": 1.2719906408165116e-05, + "learning_rate": 4.9768696011786095e-06, + "logits/chosen": -1.1656242609024048, + "logits/rejected": -0.02300162985920906, + "logps/chosen": -227.0721893310547, + "logps/rejected": -624.4703369140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.238847732543945, + "rewards/margins": 24.26767921447754, + "rewards/rejected": -34.50652313232422, + "step": 1390 + }, + { + "epoch": 0.14, + "grad_norm": 18.823410034179688, + "learning_rate": 4.975670171853926e-06, + "logits/chosen": -1.070709466934204, + "logits/rejected": -0.17539706826210022, + "logps/chosen": -367.56500244140625, + "logps/rejected": -743.7613525390625, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.786466121673584, + "rewards/margins": 26.3763484954834, + "rewards/rejected": -34.162811279296875, + "step": 1400 + }, + { + "epoch": 0.141, + "grad_norm": 0.0017390275606885552, + "learning_rate": 4.974440577269473e-06, + "logits/chosen": -0.8289377093315125, + "logits/rejected": -0.35058996081352234, + "logps/chosen": -452.47021484375, + "logps/rejected": -699.7356567382812, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.515665054321289, + "rewards/margins": 25.03122329711914, + "rewards/rejected": -34.54689025878906, + "step": 1410 + }, + { + "epoch": 0.142, + "grad_norm": 4.752753739012405e-05, + "learning_rate": 4.973180832407471e-06, + "logits/chosen": -0.5075998902320862, + "logits/rejected": -0.3585384786128998, + "logps/chosen": -574.3099365234375, + "logps/rejected": -675.3820190429688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.680159568786621, + "rewards/margins": 25.572025299072266, + "rewards/rejected": -33.25218200683594, + "step": 1420 + }, + { + "epoch": 0.143, + "grad_norm": 2.4035329156310446e-17, + "learning_rate": 4.971890952617515e-06, + "logits/chosen": -1.3064539432525635, + "logits/rejected": 0.013357448391616344, + "logps/chosen": -317.8672790527344, + "logps/rejected": -717.834716796875, + "loss": 0.399, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.242793083190918, + "rewards/margins": 26.032501220703125, + "rewards/rejected": -36.275291442871094, + "step": 1430 + }, + { + "epoch": 0.144, + "grad_norm": 6.969142060317401e-13, + "learning_rate": 4.970570953616383e-06, + "logits/chosen": -1.1992871761322021, + "logits/rejected": -0.16932205855846405, + "logps/chosen": -274.62652587890625, + "logps/rejected": -713.0188598632812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.339066505432129, + "rewards/margins": 26.44083595275879, + "rewards/rejected": -34.77989959716797, + "step": 1440 + }, + { + "epoch": 0.145, + "grad_norm": 2.2085606946926776e-15, + "learning_rate": 4.9692208514878445e-06, + "logits/chosen": -1.0547511577606201, + "logits/rejected": -0.20317073166370392, + "logps/chosen": -360.881103515625, + "logps/rejected": -724.4630737304688, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.842625141143799, + "rewards/margins": 26.726482391357422, + "rewards/rejected": -32.5691032409668, + "step": 1450 + }, + { + "epoch": 0.146, + "grad_norm": 1.7620060965839457e-09, + "learning_rate": 4.96784066268247e-06, + "logits/chosen": -1.0773088932037354, + "logits/rejected": -0.14823777973651886, + "logps/chosen": -206.27163696289062, + "logps/rejected": -585.3616333007812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.059189796447754, + "rewards/margins": 25.024410247802734, + "rewards/rejected": -31.083599090576172, + "step": 1460 + }, + { + "epoch": 0.147, + "grad_norm": 5.868219886906445e-05, + "learning_rate": 4.966430404017424e-06, + "logits/chosen": -0.9046875238418579, + "logits/rejected": -0.4450520873069763, + "logps/chosen": -223.2705535888672, + "logps/rejected": -525.7073974609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.235110282897949, + "rewards/margins": 22.660573959350586, + "rewards/rejected": -28.89568519592285, + "step": 1470 + }, + { + "epoch": 0.148, + "grad_norm": 7.98956989456201e-09, + "learning_rate": 4.964990092676263e-06, + "logits/chosen": -1.0882080793380737, + "logits/rejected": -0.07989266514778137, + "logps/chosen": -289.68768310546875, + "logps/rejected": -685.7807006835938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.977403163909912, + "rewards/margins": 24.825477600097656, + "rewards/rejected": -31.802881240844727, + "step": 1480 + }, + { + "epoch": 0.149, + "grad_norm": 3.933014531458737e-14, + "learning_rate": 4.963519746208726e-06, + "logits/chosen": -1.6329920291900635, + "logits/rejected": -0.03951167315244675, + "logps/chosen": -350.8020935058594, + "logps/rejected": -891.4786987304688, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.784034729003906, + "rewards/margins": 27.927047729492188, + "rewards/rejected": -32.711082458496094, + "step": 1490 + }, + { + "epoch": 0.15, + "grad_norm": 2.4590647220611572, + "learning_rate": 4.962019382530521e-06, + "logits/chosen": -0.8447348475456238, + "logits/rejected": -0.46400079131126404, + "logps/chosen": -466.80584716796875, + "logps/rejected": -604.7357788085938, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.835474967956543, + "rewards/margins": 16.11906623840332, + "rewards/rejected": -22.954544067382812, + "step": 1500 + }, + { + "epoch": 0.151, + "grad_norm": 6.710806227123306e-14, + "learning_rate": 4.960489019923105e-06, + "logits/chosen": -1.0921003818511963, + "logits/rejected": -0.08635418117046356, + "logps/chosen": -269.69671630859375, + "logps/rejected": -679.5653686523438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.616307258605957, + "rewards/margins": 28.96999740600586, + "rewards/rejected": -35.5863037109375, + "step": 1510 + }, + { + "epoch": 0.152, + "grad_norm": 3.868865228184859e-10, + "learning_rate": 4.958928677033465e-06, + "logits/chosen": -1.3451675176620483, + "logits/rejected": -0.0031303453724831343, + "logps/chosen": -277.45928955078125, + "logps/rejected": -788.315673828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.414937973022461, + "rewards/margins": 32.910972595214844, + "rewards/rejected": -41.3259162902832, + "step": 1520 + }, + { + "epoch": 0.153, + "grad_norm": 3.853562976531555e-13, + "learning_rate": 4.957338372873886e-06, + "logits/chosen": -0.8953266143798828, + "logits/rejected": -0.23004481196403503, + "logps/chosen": -368.2197265625, + "logps/rejected": -699.2587280273438, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.798627853393555, + "rewards/margins": 28.176212310791016, + "rewards/rejected": -36.97483444213867, + "step": 1530 + }, + { + "epoch": 0.154, + "grad_norm": 3.0009095668792725, + "learning_rate": 4.9557181268217225e-06, + "logits/chosen": -1.0614800453186035, + "logits/rejected": -0.3679501712322235, + "logps/chosen": -328.5702819824219, + "logps/rejected": -513.2587890625, + "loss": 1.0402, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.790850639343262, + "rewards/margins": 17.721195220947266, + "rewards/rejected": -26.512048721313477, + "step": 1540 + }, + { + "epoch": 0.155, + "grad_norm": 1.3953936096877673e-11, + "learning_rate": 4.9540679586191605e-06, + "logits/chosen": -0.8217973709106445, + "logits/rejected": -0.2370542585849762, + "logps/chosen": -196.01568603515625, + "logps/rejected": -472.8330078125, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.229257106781006, + "rewards/margins": 22.110279083251953, + "rewards/rejected": -26.33953857421875, + "step": 1550 + }, + { + "epoch": 0.156, + "grad_norm": 5.3244052141692783e-20, + "learning_rate": 4.9523878883729794e-06, + "logits/chosen": -1.1830456256866455, + "logits/rejected": -0.013311699032783508, + "logps/chosen": -356.19744873046875, + "logps/rejected": -797.1712646484375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.410015106201172, + "rewards/margins": 31.568416595458984, + "rewards/rejected": -39.978431701660156, + "step": 1560 + }, + { + "epoch": 0.157, + "grad_norm": 8.391878054681001e-07, + "learning_rate": 4.9506779365543054e-06, + "logits/chosen": -0.6549090147018433, + "logits/rejected": -0.2016439139842987, + "logps/chosen": -334.9619140625, + "logps/rejected": -612.3310546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.262808799743652, + "rewards/margins": 28.336029052734375, + "rewards/rejected": -35.598838806152344, + "step": 1570 + }, + { + "epoch": 0.158, + "grad_norm": 105.29165649414062, + "learning_rate": 4.94893812399836e-06, + "logits/chosen": -1.018554449081421, + "logits/rejected": -0.3415969908237457, + "logps/chosen": -261.0390625, + "logps/rejected": -572.3067626953125, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.529544830322266, + "rewards/margins": 26.439651489257812, + "rewards/rejected": -32.96919631958008, + "step": 1580 + }, + { + "epoch": 0.159, + "grad_norm": 0.03807740658521652, + "learning_rate": 4.947168471904213e-06, + "logits/chosen": -0.9052284955978394, + "logits/rejected": -0.2753170132637024, + "logps/chosen": -466.19232177734375, + "logps/rejected": -726.24755859375, + "loss": 0.0445, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.326812744140625, + "rewards/margins": 23.40597152709961, + "rewards/rejected": -32.732784271240234, + "step": 1590 + }, + { + "epoch": 0.16, + "grad_norm": 1.2223373897259082e-13, + "learning_rate": 4.9453690018345144e-06, + "logits/chosen": -1.063926100730896, + "logits/rejected": -0.015577336773276329, + "logps/chosen": -194.1289825439453, + "logps/rejected": -624.2291259765625, + "loss": 0.0661, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.718228340148926, + "rewards/margins": 30.950618743896484, + "rewards/rejected": -37.66884994506836, + "step": 1600 + }, + { + "epoch": 0.161, + "grad_norm": 3.4670115628236686e-13, + "learning_rate": 4.9435397357152406e-06, + "logits/chosen": -0.7654945850372314, + "logits/rejected": -0.07977879047393799, + "logps/chosen": -287.39947509765625, + "logps/rejected": -611.6721801757812, + "loss": 0.0506, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.743795394897461, + "rewards/margins": 24.993785858154297, + "rewards/rejected": -36.737579345703125, + "step": 1610 + }, + { + "epoch": 0.162, + "grad_norm": 5.27442256716182e-19, + "learning_rate": 4.9416806958354206e-06, + "logits/chosen": -1.0056555271148682, + "logits/rejected": -0.032809026539325714, + "logps/chosen": -188.92874145507812, + "logps/rejected": -566.5960693359375, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.255312442779541, + "rewards/margins": 29.489330291748047, + "rewards/rejected": -36.74464797973633, + "step": 1620 + }, + { + "epoch": 0.163, + "grad_norm": 2.0571063841061388e-13, + "learning_rate": 4.939791904846869e-06, + "logits/chosen": -1.0535011291503906, + "logits/rejected": 0.179019033908844, + "logps/chosen": -235.6096649169922, + "logps/rejected": -637.8615112304688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.38923168182373, + "rewards/margins": 27.224655151367188, + "rewards/rejected": -37.61388397216797, + "step": 1630 + }, + { + "epoch": 0.164, + "grad_norm": 0.007213903125375509, + "learning_rate": 4.937873385763909e-06, + "logits/chosen": -0.9827578663825989, + "logits/rejected": 0.1371154934167862, + "logps/chosen": -250.7228546142578, + "logps/rejected": -675.5855712890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.908210754394531, + "rewards/margins": 28.934223175048828, + "rewards/rejected": -36.842437744140625, + "step": 1640 + }, + { + "epoch": 0.165, + "grad_norm": 0.0, + "learning_rate": 4.935925161963089e-06, + "logits/chosen": -0.8703482747077942, + "logits/rejected": 0.06418517976999283, + "logps/chosen": -367.58697509765625, + "logps/rejected": -749.47509765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.039400100708008, + "rewards/margins": 35.043800354003906, + "rewards/rejected": -49.08320236206055, + "step": 1650 + }, + { + "epoch": 0.166, + "grad_norm": 6.4045049645578e-12, + "learning_rate": 4.933947257182901e-06, + "logits/chosen": -0.9856742024421692, + "logits/rejected": 0.08307775110006332, + "logps/chosen": -305.7294006347656, + "logps/rejected": -904.21240234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.920564651489258, + "rewards/margins": 46.30979919433594, + "rewards/rejected": -55.23036575317383, + "step": 1660 + }, + { + "epoch": 0.167, + "grad_norm": 1.2415427403392098e-22, + "learning_rate": 4.9319396955234925e-06, + "logits/chosen": -0.8148317337036133, + "logits/rejected": -0.1875368058681488, + "logps/chosen": -424.0921936035156, + "logps/rejected": -860.6064453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.587820053100586, + "rewards/margins": 38.1686897277832, + "rewards/rejected": -49.75651168823242, + "step": 1670 + }, + { + "epoch": 0.168, + "grad_norm": 0.0003383158764336258, + "learning_rate": 4.9299025014463665e-06, + "logits/chosen": -1.0005062818527222, + "logits/rejected": 0.22835354506969452, + "logps/chosen": -508.1582946777344, + "logps/rejected": -1031.61572265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.51125717163086, + "rewards/margins": 43.42790603637695, + "rewards/rejected": -61.93916702270508, + "step": 1680 + }, + { + "epoch": 0.169, + "grad_norm": 1.0438952626574613e-13, + "learning_rate": 4.92783569977409e-06, + "logits/chosen": -0.7535207867622375, + "logits/rejected": 0.13958851993083954, + "logps/chosen": -349.7854309082031, + "logps/rejected": -842.4319458007812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.62995433807373, + "rewards/margins": 43.466609954833984, + "rewards/rejected": -59.09656524658203, + "step": 1690 + }, + { + "epoch": 0.17, + "grad_norm": 3.889030228090189e-15, + "learning_rate": 4.925739315689991e-06, + "logits/chosen": -0.6942230463027954, + "logits/rejected": -0.04554635286331177, + "logps/chosen": -540.0942993164062, + "logps/rejected": -736.609130859375, + "loss": 0.1435, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -11.868269920349121, + "rewards/margins": 18.347684860229492, + "rewards/rejected": -30.215953826904297, + "step": 1700 + }, + { + "epoch": 0.171, + "grad_norm": 1.0066810395264331e-13, + "learning_rate": 4.923613374737848e-06, + "logits/chosen": -1.0682138204574585, + "logits/rejected": 0.1531905233860016, + "logps/chosen": -301.99566650390625, + "logps/rejected": -800.46923828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.131755828857422, + "rewards/margins": 36.60453796386719, + "rewards/rejected": -45.73629379272461, + "step": 1710 + }, + { + "epoch": 0.172, + "grad_norm": 0.000590948446188122, + "learning_rate": 4.921457902821578e-06, + "logits/chosen": -1.2912073135375977, + "logits/rejected": 0.09540309756994247, + "logps/chosen": -261.936767578125, + "logps/rejected": -738.4759521484375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.333407402038574, + "rewards/margins": 31.073110580444336, + "rewards/rejected": -37.406517028808594, + "step": 1720 + }, + { + "epoch": 0.173, + "grad_norm": 5.637766364863239e-10, + "learning_rate": 4.9192729262049285e-06, + "logits/chosen": -0.7073559165000916, + "logits/rejected": 0.033843234181404114, + "logps/chosen": -345.47894287109375, + "logps/rejected": -699.8843994140625, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.881582260131836, + "rewards/margins": 29.07110023498535, + "rewards/rejected": -42.95268630981445, + "step": 1730 + }, + { + "epoch": 0.174, + "grad_norm": 2.4128008023104536e-19, + "learning_rate": 4.917058471511149e-06, + "logits/chosen": -0.7510659694671631, + "logits/rejected": -0.06710796803236008, + "logps/chosen": -461.6541442871094, + "logps/rejected": -844.5467529296875, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.30162525177002, + "rewards/margins": 33.08237838745117, + "rewards/rejected": -44.38400650024414, + "step": 1740 + }, + { + "epoch": 0.175, + "grad_norm": 9.65522123906729e-19, + "learning_rate": 4.914814565722671e-06, + "logits/chosen": -0.9065462350845337, + "logits/rejected": 0.027768870815634727, + "logps/chosen": -382.0977478027344, + "logps/rejected": -884.6822509765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.617523193359375, + "rewards/margins": 45.349388122558594, + "rewards/rejected": -57.96691131591797, + "step": 1750 + }, + { + "epoch": 0.176, + "grad_norm": 1.2494966172837962e-09, + "learning_rate": 4.912541236180779e-06, + "logits/chosen": -0.8603243827819824, + "logits/rejected": 0.08707042783498764, + "logps/chosen": -439.8019104003906, + "logps/rejected": -842.1697998046875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.627752304077148, + "rewards/margins": 33.403865814208984, + "rewards/rejected": -47.0316162109375, + "step": 1760 + }, + { + "epoch": 0.177, + "grad_norm": 1.0479344451455618e-15, + "learning_rate": 4.910238510585275e-06, + "logits/chosen": -1.1826056241989136, + "logits/rejected": 0.29609158635139465, + "logps/chosen": -289.4844665527344, + "logps/rejected": -989.2132568359375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.005671501159668, + "rewards/margins": 54.07866287231445, + "rewards/rejected": -67.08433532714844, + "step": 1770 + }, + { + "epoch": 0.178, + "grad_norm": 3.945892224077596e-10, + "learning_rate": 4.907906416994146e-06, + "logits/chosen": -0.7862873077392578, + "logits/rejected": 0.3076401948928833, + "logps/chosen": -392.3423156738281, + "logps/rejected": -1059.8935546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.646921157836914, + "rewards/margins": 50.55992889404297, + "rewards/rejected": -64.20684814453125, + "step": 1780 + }, + { + "epoch": 0.179, + "grad_norm": 1.2721311702071532e-14, + "learning_rate": 4.905544983823214e-06, + "logits/chosen": -0.8739240765571594, + "logits/rejected": 0.2608945965766907, + "logps/chosen": -400.95867919921875, + "logps/rejected": -937.3480224609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.51372718811035, + "rewards/margins": 46.725791931152344, + "rewards/rejected": -63.23952102661133, + "step": 1790 + }, + { + "epoch": 0.18, + "grad_norm": 1.5841317382157312e-16, + "learning_rate": 4.903154239845798e-06, + "logits/chosen": -0.866929829120636, + "logits/rejected": 0.0072061000391840935, + "logps/chosen": -300.40277099609375, + "logps/rejected": -879.0672607421875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.66291618347168, + "rewards/margins": 46.29401397705078, + "rewards/rejected": -58.956932067871094, + "step": 1800 + }, + { + "epoch": 0.181, + "grad_norm": 0.0, + "learning_rate": 4.900734214192358e-06, + "logits/chosen": -0.9717338681221008, + "logits/rejected": 0.19119112193584442, + "logps/chosen": -286.30841064453125, + "logps/rejected": -806.5407104492188, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.181900024414062, + "rewards/margins": 40.651390075683594, + "rewards/rejected": -50.833290100097656, + "step": 1810 + }, + { + "epoch": 0.182, + "grad_norm": 0.03354150429368019, + "learning_rate": 4.898284936350144e-06, + "logits/chosen": -0.608110249042511, + "logits/rejected": 0.10701987892389297, + "logps/chosen": -460.31756591796875, + "logps/rejected": -805.1393432617188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.571718215942383, + "rewards/margins": 26.98175621032715, + "rewards/rejected": -45.55347442626953, + "step": 1820 + }, + { + "epoch": 0.183, + "grad_norm": 4.13684983868734e-06, + "learning_rate": 4.8958064361628334e-06, + "logits/chosen": -0.7738394141197205, + "logits/rejected": 0.21792516112327576, + "logps/chosen": -432.724609375, + "logps/rejected": -925.1007690429688, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.735455513000488, + "rewards/margins": 44.763633728027344, + "rewards/rejected": -59.49909210205078, + "step": 1830 + }, + { + "epoch": 0.184, + "grad_norm": 7.981440584233542e-16, + "learning_rate": 4.893298743830168e-06, + "logits/chosen": -0.6500649452209473, + "logits/rejected": 0.09442566335201263, + "logps/chosen": -341.99603271484375, + "logps/rejected": -855.1522216796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.761459350585938, + "rewards/margins": 49.43480682373047, + "rewards/rejected": -60.19626998901367, + "step": 1840 + }, + { + "epoch": 0.185, + "grad_norm": 7.473514168632178e-11, + "learning_rate": 4.890761889907589e-06, + "logits/chosen": -0.5870779752731323, + "logits/rejected": 0.15625113248825073, + "logps/chosen": -409.39178466796875, + "logps/rejected": -909.2233276367188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.405719757080078, + "rewards/margins": 41.44813537597656, + "rewards/rejected": -59.853851318359375, + "step": 1850 + }, + { + "epoch": 0.186, + "grad_norm": 4.363639305579245e-14, + "learning_rate": 4.888195905305859e-06, + "logits/chosen": -0.59303218126297, + "logits/rejected": 0.18804967403411865, + "logps/chosen": -403.5731506347656, + "logps/rejected": -938.0885009765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.057333946228027, + "rewards/margins": 49.87173843383789, + "rewards/rejected": -58.9290771484375, + "step": 1860 + }, + { + "epoch": 0.187, + "grad_norm": 4.640702172764577e-06, + "learning_rate": 4.885600821290692e-06, + "logits/chosen": -0.6436842679977417, + "logits/rejected": 0.3434585630893707, + "logps/chosen": -349.7669372558594, + "logps/rejected": -794.9974975585938, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.01045036315918, + "rewards/margins": 38.5369987487793, + "rewards/rejected": -50.54745101928711, + "step": 1870 + }, + { + "epoch": 0.188, + "grad_norm": 2.3586930070771187e-14, + "learning_rate": 4.882976669482368e-06, + "logits/chosen": -1.0222156047821045, + "logits/rejected": 0.18538489937782288, + "logps/chosen": -431.86614990234375, + "logps/rejected": -936.9752197265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.006241798400879, + "rewards/margins": 40.09410858154297, + "rewards/rejected": -50.1003532409668, + "step": 1880 + }, + { + "epoch": 0.189, + "grad_norm": 3.1524606411897062e-21, + "learning_rate": 4.880323481855347e-06, + "logits/chosen": -0.9518159031867981, + "logits/rejected": 0.019279232248663902, + "logps/chosen": -267.8340759277344, + "logps/rejected": -740.6573486328125, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.248370170593262, + "rewards/margins": 34.88971710205078, + "rewards/rejected": -45.138084411621094, + "step": 1890 + }, + { + "epoch": 0.19, + "grad_norm": 1.8914584597745732e-19, + "learning_rate": 4.8776412907378845e-06, + "logits/chosen": -1.017465353012085, + "logits/rejected": 0.13340437412261963, + "logps/chosen": -412.73297119140625, + "logps/rejected": -891.4244384765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.114110946655273, + "rewards/margins": 41.254695892333984, + "rewards/rejected": -52.368812561035156, + "step": 1900 + }, + { + "epoch": 0.191, + "grad_norm": 0.011506685987114906, + "learning_rate": 4.874930128811631e-06, + "logits/chosen": -1.0823355913162231, + "logits/rejected": 0.020175794139504433, + "logps/chosen": -399.35394287109375, + "logps/rejected": -887.8450927734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.699014663696289, + "rewards/margins": 38.75952911376953, + "rewards/rejected": -53.45853805541992, + "step": 1910 + }, + { + "epoch": 0.192, + "grad_norm": 0.0, + "learning_rate": 4.8721900291112415e-06, + "logits/chosen": -0.6110566854476929, + "logits/rejected": 0.08111194521188736, + "logps/chosen": -354.4425964355469, + "logps/rejected": -915.6475830078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.294811248779297, + "rewards/margins": 48.587364196777344, + "rewards/rejected": -60.882171630859375, + "step": 1920 + }, + { + "epoch": 0.193, + "grad_norm": 5.552347877824371e-22, + "learning_rate": 4.869421025023965e-06, + "logits/chosen": -1.0768553018569946, + "logits/rejected": 0.2780148983001709, + "logps/chosen": -283.3021545410156, + "logps/rejected": -874.9591064453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.546971321105957, + "rewards/margins": 46.320335388183594, + "rewards/rejected": -59.86730194091797, + "step": 1930 + }, + { + "epoch": 0.194, + "grad_norm": 2.024776508438119e-17, + "learning_rate": 4.866623150289241e-06, + "logits/chosen": -1.423117756843567, + "logits/rejected": 0.02501138485968113, + "logps/chosen": -237.41915893554688, + "logps/rejected": -842.1968994140625, + "loss": 0.5865, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.444206237792969, + "rewards/margins": 47.00324249267578, + "rewards/rejected": -56.44744873046875, + "step": 1940 + }, + { + "epoch": 0.195, + "grad_norm": 7.398041645956255e-08, + "learning_rate": 4.863796438998293e-06, + "logits/chosen": -0.9332197308540344, + "logits/rejected": 0.01928057335317135, + "logps/chosen": -153.62330627441406, + "logps/rejected": -553.70361328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.0888495445251465, + "rewards/margins": 29.092769622802734, + "rewards/rejected": -35.181617736816406, + "step": 1950 + }, + { + "epoch": 0.196, + "grad_norm": 0.09112061560153961, + "learning_rate": 4.860940925593703e-06, + "logits/chosen": -0.7799338102340698, + "logits/rejected": -0.06467507779598236, + "logps/chosen": -520.5501708984375, + "logps/rejected": -877.4957275390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.793745040893555, + "rewards/margins": 35.70417785644531, + "rewards/rejected": -48.497928619384766, + "step": 1960 + }, + { + "epoch": 0.197, + "grad_norm": 0.011287910863757133, + "learning_rate": 4.858056644869002e-06, + "logits/chosen": -0.8448853492736816, + "logits/rejected": -0.1531025767326355, + "logps/chosen": -388.1729431152344, + "logps/rejected": -773.3935546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.335466384887695, + "rewards/margins": 34.20120620727539, + "rewards/rejected": -46.53667068481445, + "step": 1970 + }, + { + "epoch": 0.198, + "grad_norm": 2.031313246360152e-19, + "learning_rate": 4.855143631968242e-06, + "logits/chosen": -0.9060547947883606, + "logits/rejected": 0.058340221643447876, + "logps/chosen": -466.2227478027344, + "logps/rejected": -1013.80908203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.147106170654297, + "rewards/margins": 42.48836135864258, + "rewards/rejected": -53.635467529296875, + "step": 1980 + }, + { + "epoch": 0.199, + "grad_norm": 3.3271295874631734e-12, + "learning_rate": 4.852201922385564e-06, + "logits/chosen": -1.5826104879379272, + "logits/rejected": 0.2186201810836792, + "logps/chosen": -353.6684875488281, + "logps/rejected": -870.4959716796875, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.283282279968262, + "rewards/margins": 34.60364532470703, + "rewards/rejected": -42.886932373046875, + "step": 1990 + }, + { + "epoch": 0.2, + "grad_norm": 4.831719453110865e-16, + "learning_rate": 4.849231551964771e-06, + "logits/chosen": -0.8290309906005859, + "logits/rejected": 0.015381842851638794, + "logps/chosen": -291.46063232421875, + "logps/rejected": -669.02685546875, + "loss": 0.0872, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -11.124984741210938, + "rewards/margins": 33.23019027709961, + "rewards/rejected": -44.35517883300781, + "step": 2000 + }, + { + "epoch": 0.201, + "grad_norm": 0.002687457948923111, + "learning_rate": 4.84623255689889e-06, + "logits/chosen": -0.6981030702590942, + "logits/rejected": 0.045853037387132645, + "logps/chosen": -382.01776123046875, + "logps/rejected": -782.2008666992188, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.968143463134766, + "rewards/margins": 36.116371154785156, + "rewards/rejected": -53.08452224731445, + "step": 2010 + }, + { + "epoch": 0.202, + "grad_norm": 2.2274776711128652e-05, + "learning_rate": 4.84320497372973e-06, + "logits/chosen": -1.1311920881271362, + "logits/rejected": 0.06535493582487106, + "logps/chosen": -234.904296875, + "logps/rejected": -763.780517578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.619631290435791, + "rewards/margins": 35.88097381591797, + "rewards/rejected": -43.50060272216797, + "step": 2020 + }, + { + "epoch": 0.203, + "grad_norm": 3.651647127039803e-15, + "learning_rate": 4.840148839347434e-06, + "logits/chosen": -1.1937439441680908, + "logits/rejected": 0.05392221733927727, + "logps/chosen": -231.7136688232422, + "logps/rejected": -714.2911987304688, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.07800579071045, + "rewards/margins": 35.264305114746094, + "rewards/rejected": -46.342308044433594, + "step": 2030 + }, + { + "epoch": 0.204, + "grad_norm": 5.8405490221957734e-08, + "learning_rate": 4.837064190990036e-06, + "logits/chosen": -0.9981630444526672, + "logits/rejected": 0.05016200616955757, + "logps/chosen": -309.5982971191406, + "logps/rejected": -757.8245239257812, + "loss": 0.1777, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.973468780517578, + "rewards/margins": 33.17815017700195, + "rewards/rejected": -44.15161895751953, + "step": 2040 + }, + { + "epoch": 0.205, + "grad_norm": 6.093002491436295e-11, + "learning_rate": 4.833951066243004e-06, + "logits/chosen": -0.9039271473884583, + "logits/rejected": 0.15411342680454254, + "logps/chosen": -275.2000427246094, + "logps/rejected": -727.4915771484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.230191230773926, + "rewards/margins": 36.9615364074707, + "rewards/rejected": -44.19172668457031, + "step": 2050 + }, + { + "epoch": 0.206, + "grad_norm": 9.243760677691767e-14, + "learning_rate": 4.830809503038781e-06, + "logits/chosen": -1.0659441947937012, + "logits/rejected": -0.04560966417193413, + "logps/chosen": -430.9668884277344, + "logps/rejected": -842.630859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.900915145874023, + "rewards/margins": 39.50886154174805, + "rewards/rejected": -52.4097785949707, + "step": 2060 + }, + { + "epoch": 0.207, + "grad_norm": 3.012393055812877e-14, + "learning_rate": 4.8276395396563215e-06, + "logits/chosen": -0.707872211933136, + "logits/rejected": -0.12487606704235077, + "logps/chosen": -333.6909484863281, + "logps/rejected": -650.5589599609375, + "loss": 0.0424, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.892992973327637, + "rewards/margins": 29.976953506469727, + "rewards/rejected": -42.86994552612305, + "step": 2070 + }, + { + "epoch": 0.208, + "grad_norm": 0.020325161516666412, + "learning_rate": 4.824441214720629e-06, + "logits/chosen": -1.0603435039520264, + "logits/rejected": -0.13006174564361572, + "logps/chosen": -430.6055603027344, + "logps/rejected": -714.2709350585938, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.49073600769043, + "rewards/margins": 26.808090209960938, + "rewards/rejected": -37.298828125, + "step": 2080 + }, + { + "epoch": 0.209, + "grad_norm": 4.8418461080779185e-12, + "learning_rate": 4.821214567202284e-06, + "logits/chosen": -0.6019529104232788, + "logits/rejected": -0.05509559437632561, + "logps/chosen": -446.13134765625, + "logps/rejected": -764.00830078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.890316009521484, + "rewards/margins": 34.51970672607422, + "rewards/rejected": -44.4100227355957, + "step": 2090 + }, + { + "epoch": 0.21, + "grad_norm": 0.015145066194236279, + "learning_rate": 4.817959636416969e-06, + "logits/chosen": -0.5784981846809387, + "logits/rejected": -0.10248645395040512, + "logps/chosen": -570.4688720703125, + "logps/rejected": -822.8307495117188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.669044494628906, + "rewards/margins": 33.286407470703125, + "rewards/rejected": -42.95545196533203, + "step": 2100 + }, + { + "epoch": 0.211, + "grad_norm": 2.6829666960326293e-15, + "learning_rate": 4.814676462024988e-06, + "logits/chosen": -0.9595147371292114, + "logits/rejected": 0.07999895513057709, + "logps/chosen": -278.6533203125, + "logps/rejected": -740.11767578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.203185081481934, + "rewards/margins": 35.281524658203125, + "rewards/rejected": -45.48470687866211, + "step": 2110 + }, + { + "epoch": 0.212, + "grad_norm": 1.7712128943383588e-19, + "learning_rate": 4.811365084030784e-06, + "logits/chosen": -1.53053879737854, + "logits/rejected": 0.17698611319065094, + "logps/chosen": -161.68276977539062, + "logps/rejected": -739.840087890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.296278953552246, + "rewards/margins": 40.40182113647461, + "rewards/rejected": -47.69810104370117, + "step": 2120 + }, + { + "epoch": 0.213, + "grad_norm": 5.7248204881482545e-21, + "learning_rate": 4.808025542782453e-06, + "logits/chosen": -1.048194408416748, + "logits/rejected": 0.06344493478536606, + "logps/chosen": -280.8482360839844, + "logps/rejected": -703.5811767578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.806572914123535, + "rewards/margins": 31.60160255432129, + "rewards/rejected": -43.40817642211914, + "step": 2130 + }, + { + "epoch": 0.214, + "grad_norm": 1.4793377484237152e-17, + "learning_rate": 4.804657878971252e-06, + "logits/chosen": -1.3861225843429565, + "logits/rejected": 0.1794586479663849, + "logps/chosen": -385.51275634765625, + "logps/rejected": -1037.645751953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.886439323425293, + "rewards/margins": 45.353580474853516, + "rewards/rejected": -61.240013122558594, + "step": 2140 + }, + { + "epoch": 0.215, + "grad_norm": 2.505604057567723e-10, + "learning_rate": 4.801262133631101e-06, + "logits/chosen": -0.9277293086051941, + "logits/rejected": -0.0969119742512703, + "logps/chosen": -468.418212890625, + "logps/rejected": -712.3648681640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.228230476379395, + "rewards/margins": 27.422122955322266, + "rewards/rejected": -39.650352478027344, + "step": 2150 + }, + { + "epoch": 0.216, + "grad_norm": 0.2817370891571045, + "learning_rate": 4.7978383481380865e-06, + "logits/chosen": -1.1354224681854248, + "logits/rejected": 0.02800583280622959, + "logps/chosen": -397.59820556640625, + "logps/rejected": -760.2446899414062, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.743389129638672, + "rewards/margins": 31.957714080810547, + "rewards/rejected": -42.701107025146484, + "step": 2160 + }, + { + "epoch": 0.217, + "grad_norm": 1.5132579434321003e-12, + "learning_rate": 4.794386564209953e-06, + "logits/chosen": -1.0182678699493408, + "logits/rejected": 0.03556183725595474, + "logps/chosen": -417.7530212402344, + "logps/rejected": -964.2975463867188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.372791290283203, + "rewards/margins": 37.62430191040039, + "rewards/rejected": -53.997093200683594, + "step": 2170 + }, + { + "epoch": 0.218, + "grad_norm": 5.079949820211189e-18, + "learning_rate": 4.790906823905599e-06, + "logits/chosen": -1.0524531602859497, + "logits/rejected": -0.14839516580104828, + "logps/chosen": -273.4908142089844, + "logps/rejected": -714.5278930664062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.830060958862305, + "rewards/margins": 30.115550994873047, + "rewards/rejected": -41.94561004638672, + "step": 2180 + }, + { + "epoch": 0.219, + "grad_norm": 0.0, + "learning_rate": 4.787399169624562e-06, + "logits/chosen": -1.0520641803741455, + "logits/rejected": -0.10660145431756973, + "logps/chosen": -434.2923889160156, + "logps/rejected": -936.0968017578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.649426460266113, + "rewards/margins": 35.36643600463867, + "rewards/rejected": -49.01586151123047, + "step": 2190 + }, + { + "epoch": 0.22, + "grad_norm": 2.784252162157941e-09, + "learning_rate": 4.783863644106502e-06, + "logits/chosen": -0.8814069032669067, + "logits/rejected": -0.05634657293558121, + "logps/chosen": -431.2822265625, + "logps/rejected": -949.1829223632812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.263689994812012, + "rewards/margins": 38.06841278076172, + "rewards/rejected": -51.33210372924805, + "step": 2200 + }, + { + "epoch": 0.221, + "grad_norm": 6.444813432926466e-11, + "learning_rate": 4.780300290430683e-06, + "logits/chosen": -1.072237253189087, + "logits/rejected": -0.06441137939691544, + "logps/chosen": -376.80572509765625, + "logps/rejected": -824.3206176757812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.14746379852295, + "rewards/margins": 35.05595779418945, + "rewards/rejected": -48.20341873168945, + "step": 2210 + }, + { + "epoch": 0.222, + "grad_norm": 2.647671499414526e-19, + "learning_rate": 4.776709152015443e-06, + "logits/chosen": -0.9050602912902832, + "logits/rejected": -0.0204143263399601, + "logps/chosen": -308.73077392578125, + "logps/rejected": -743.6929931640625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.167070388793945, + "rewards/margins": 32.204490661621094, + "rewards/rejected": -44.371559143066406, + "step": 2220 + }, + { + "epoch": 0.223, + "grad_norm": 0.0, + "learning_rate": 4.773090272617672e-06, + "logits/chosen": -1.1379512548446655, + "logits/rejected": 0.2818123996257782, + "logps/chosen": -341.7289123535156, + "logps/rejected": -922.5020751953125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.875950813293457, + "rewards/margins": 41.55849075317383, + "rewards/rejected": -52.43444061279297, + "step": 2230 + }, + { + "epoch": 0.224, + "grad_norm": 5.748112752042268e-16, + "learning_rate": 4.769443696332272e-06, + "logits/chosen": -1.0871905088424683, + "logits/rejected": 0.021607961505651474, + "logps/chosen": -422.33416748046875, + "logps/rejected": -950.8518676757812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.902830123901367, + "rewards/margins": 45.880855560302734, + "rewards/rejected": -58.78368377685547, + "step": 2240 + }, + { + "epoch": 0.225, + "grad_norm": 2.0195723493543483e-07, + "learning_rate": 4.765769467591626e-06, + "logits/chosen": -0.7570234537124634, + "logits/rejected": 0.23347148299217224, + "logps/chosen": -582.53515625, + "logps/rejected": -1147.514892578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -23.481950759887695, + "rewards/margins": 51.846229553222656, + "rewards/rejected": -75.32818603515625, + "step": 2250 + }, + { + "epoch": 0.226, + "grad_norm": 2.10311114904509e-14, + "learning_rate": 4.762067631165049e-06, + "logits/chosen": -1.2956483364105225, + "logits/rejected": 0.10478191077709198, + "logps/chosen": -331.7798767089844, + "logps/rejected": -1047.0546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.469011306762695, + "rewards/margins": 46.84741973876953, + "rewards/rejected": -62.316429138183594, + "step": 2260 + }, + { + "epoch": 0.227, + "grad_norm": 1.4005004621286954e-11, + "learning_rate": 4.7583382321582525e-06, + "logits/chosen": -0.673801600933075, + "logits/rejected": -0.1823035627603531, + "logps/chosen": -460.52874755859375, + "logps/rejected": -798.1029052734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.7214298248291, + "rewards/margins": 31.615991592407227, + "rewards/rejected": -48.337425231933594, + "step": 2270 + }, + { + "epoch": 0.228, + "grad_norm": 1.375444922278239e-16, + "learning_rate": 4.754581316012785e-06, + "logits/chosen": -0.8537979125976562, + "logits/rejected": 0.008811051957309246, + "logps/chosen": -436.7643127441406, + "logps/rejected": -1024.1982421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.984895706176758, + "rewards/margins": 50.079132080078125, + "rewards/rejected": -67.06402587890625, + "step": 2280 + }, + { + "epoch": 0.229, + "grad_norm": 1.5123860094102626e-15, + "learning_rate": 4.750796928505484e-06, + "logits/chosen": -0.7344772219657898, + "logits/rejected": -0.0585576593875885, + "logps/chosen": -478.2090759277344, + "logps/rejected": -905.4718627929688, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -21.288328170776367, + "rewards/margins": 33.283172607421875, + "rewards/rejected": -54.571502685546875, + "step": 2290 + }, + { + "epoch": 0.23, + "grad_norm": 5.140918073187617e-13, + "learning_rate": 4.746985115747918e-06, + "logits/chosen": -0.9128534197807312, + "logits/rejected": 0.12439526617527008, + "logps/chosen": -514.3945922851562, + "logps/rejected": -916.0233154296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.596830368041992, + "rewards/margins": 31.888294219970703, + "rewards/rejected": -48.48512649536133, + "step": 2300 + }, + { + "epoch": 0.231, + "grad_norm": 6.90247385077927e-22, + "learning_rate": 4.743145924185821e-06, + "logits/chosen": -0.7148122787475586, + "logits/rejected": -0.020510563626885414, + "logps/chosen": -363.60162353515625, + "logps/rejected": -751.8236083984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.753090858459473, + "rewards/margins": 31.371822357177734, + "rewards/rejected": -43.124916076660156, + "step": 2310 + }, + { + "epoch": 0.232, + "grad_norm": 2.671577152210669e-13, + "learning_rate": 4.7392794005985324e-06, + "logits/chosen": -0.8572785258293152, + "logits/rejected": -0.11579354107379913, + "logps/chosen": -425.34954833984375, + "logps/rejected": -812.0877075195312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.852200508117676, + "rewards/margins": 35.16447448730469, + "rewards/rejected": -50.01667404174805, + "step": 2320 + }, + { + "epoch": 0.233, + "grad_norm": 7.430420967973477e-22, + "learning_rate": 4.735385592098421e-06, + "logits/chosen": -1.1626355648040771, + "logits/rejected": -0.20799453556537628, + "logps/chosen": -299.6154479980469, + "logps/rejected": -671.9234008789062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.785792350769043, + "rewards/margins": 30.775033950805664, + "rewards/rejected": -42.56082534790039, + "step": 2330 + }, + { + "epoch": 0.234, + "grad_norm": 1.9862537974128506e-18, + "learning_rate": 4.731464546130315e-06, + "logits/chosen": -1.1884291172027588, + "logits/rejected": 0.12569603323936462, + "logps/chosen": -248.02633666992188, + "logps/rejected": -826.1163940429688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.018312454223633, + "rewards/margins": 39.91654586791992, + "rewards/rejected": -49.93485641479492, + "step": 2340 + }, + { + "epoch": 0.235, + "grad_norm": 0.0, + "learning_rate": 4.72751631047092e-06, + "logits/chosen": -0.9497979879379272, + "logits/rejected": 0.24937982857227325, + "logps/chosen": -394.09246826171875, + "logps/rejected": -831.47900390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.291522026062012, + "rewards/margins": 34.22481155395508, + "rewards/rejected": -47.516334533691406, + "step": 2350 + }, + { + "epoch": 0.236, + "grad_norm": 0.0, + "learning_rate": 4.723540933228245e-06, + "logits/chosen": -0.6847087144851685, + "logits/rejected": -0.33126509189605713, + "logps/chosen": -548.4906005859375, + "logps/rejected": -793.0479736328125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.37607192993164, + "rewards/margins": 32.81710433959961, + "rewards/rejected": -49.193180084228516, + "step": 2360 + }, + { + "epoch": 0.237, + "grad_norm": 2.270067621580634e-16, + "learning_rate": 4.719538462841003e-06, + "logits/chosen": -0.1746879518032074, + "logits/rejected": 0.21270795166492462, + "logps/chosen": -448.89263916015625, + "logps/rejected": -816.49609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -21.152996063232422, + "rewards/margins": 38.69011688232422, + "rewards/rejected": -59.843109130859375, + "step": 2370 + }, + { + "epoch": 0.238, + "grad_norm": 3.4636649104413664e-10, + "learning_rate": 4.715508948078037e-06, + "logits/chosen": -0.8395630717277527, + "logits/rejected": 0.0964946523308754, + "logps/chosen": -501.96148681640625, + "logps/rejected": -984.0113525390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.775581359863281, + "rewards/margins": 43.83132553100586, + "rewards/rejected": -58.606910705566406, + "step": 2380 + }, + { + "epoch": 0.239, + "grad_norm": 7.653852551747775e-11, + "learning_rate": 4.71145243803771e-06, + "logits/chosen": -1.217986822128296, + "logits/rejected": 0.4775959551334381, + "logps/chosen": -415.57720947265625, + "logps/rejected": -1103.2398681640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.806949615478516, + "rewards/margins": 50.73517608642578, + "rewards/rejected": -69.54212951660156, + "step": 2390 + }, + { + "epoch": 0.24, + "grad_norm": 1.6787088386038818e-10, + "learning_rate": 4.707368982147318e-06, + "logits/chosen": -0.7131022214889526, + "logits/rejected": 0.4084620475769043, + "logps/chosen": -376.76824951171875, + "logps/rejected": -944.67236328125, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.048091888427734, + "rewards/margins": 53.066627502441406, + "rewards/rejected": -73.11471557617188, + "step": 2400 + }, + { + "epoch": 0.241, + "grad_norm": 0.0, + "learning_rate": 4.703258630162481e-06, + "logits/chosen": -0.9870785474777222, + "logits/rejected": 0.13874481618404388, + "logps/chosen": -472.11407470703125, + "logps/rejected": -1179.2464599609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.6230525970459, + "rewards/margins": 60.36846160888672, + "rewards/rejected": -77.99150848388672, + "step": 2410 + }, + { + "epoch": 0.242, + "grad_norm": 1.0387123823165894, + "learning_rate": 4.699121432166541e-06, + "logits/chosen": -0.8836095929145813, + "logits/rejected": 0.36422693729400635, + "logps/chosen": -379.01629638671875, + "logps/rejected": -1056.7281494140625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.028507232666016, + "rewards/margins": 54.816200256347656, + "rewards/rejected": -74.84471130371094, + "step": 2420 + }, + { + "epoch": 0.243, + "grad_norm": 0.0, + "learning_rate": 4.6949574385699514e-06, + "logits/chosen": -0.4048551917076111, + "logits/rejected": 0.24003490805625916, + "logps/chosen": -461.79315185546875, + "logps/rejected": -1028.343505859375, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.345182418823242, + "rewards/margins": 55.839576721191406, + "rewards/rejected": -75.18475341796875, + "step": 2430 + }, + { + "epoch": 0.244, + "grad_norm": 0.0, + "learning_rate": 4.690766700109659e-06, + "logits/chosen": -0.5087685585021973, + "logits/rejected": 0.617567777633667, + "logps/chosen": -413.0282287597656, + "logps/rejected": -1033.424560546875, + "loss": 0.3753, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.37348747253418, + "rewards/margins": 55.75347900390625, + "rewards/rejected": -76.12696075439453, + "step": 2440 + }, + { + "epoch": 0.245, + "grad_norm": 0.0, + "learning_rate": 4.68654926784849e-06, + "logits/chosen": -0.9591018557548523, + "logits/rejected": 0.39723971486091614, + "logps/chosen": -504.660400390625, + "logps/rejected": -1122.2666015625, + "loss": 0.0374, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.985147476196289, + "rewards/margins": 48.736328125, + "rewards/rejected": -64.72147369384766, + "step": 2450 + }, + { + "epoch": 0.246, + "grad_norm": 4.3823277605042146e-21, + "learning_rate": 4.682305193174524e-06, + "logits/chosen": -0.8720195889472961, + "logits/rejected": 0.5519598126411438, + "logps/chosen": -364.02899169921875, + "logps/rejected": -1117.5640869140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.054271697998047, + "rewards/margins": 64.96049499511719, + "rewards/rejected": -84.0147705078125, + "step": 2460 + }, + { + "epoch": 0.247, + "grad_norm": 0.0, + "learning_rate": 4.6780345278004744e-06, + "logits/chosen": -0.40707603096961975, + "logits/rejected": 0.39848193526268005, + "logps/chosen": -595.8785400390625, + "logps/rejected": -996.6804809570312, + "loss": 0.1193, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.818096160888672, + "rewards/margins": 48.075130462646484, + "rewards/rejected": -68.89323425292969, + "step": 2470 + }, + { + "epoch": 0.248, + "grad_norm": 0.0, + "learning_rate": 4.673737323763048e-06, + "logits/chosen": -0.6710236668586731, + "logits/rejected": 0.3840904235839844, + "logps/chosen": -578.5338745117188, + "logps/rejected": -1234.1817626953125, + "loss": 0.4578, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -26.0264949798584, + "rewards/margins": 55.99785232543945, + "rewards/rejected": -82.02433776855469, + "step": 2480 + }, + { + "epoch": 0.249, + "grad_norm": 0.0, + "learning_rate": 4.669413633422322e-06, + "logits/chosen": -0.7051594853401184, + "logits/rejected": 0.35642680525779724, + "logps/chosen": -423.958984375, + "logps/rejected": -1116.8492431640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.242204666137695, + "rewards/margins": 61.63847732543945, + "rewards/rejected": -78.88069152832031, + "step": 2490 + }, + { + "epoch": 0.25, + "grad_norm": 3.743392066509216e-23, + "learning_rate": 4.665063509461098e-06, + "logits/chosen": -0.799991250038147, + "logits/rejected": 0.2828425168991089, + "logps/chosen": -328.34417724609375, + "logps/rejected": -939.0185546875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.603776931762695, + "rewards/margins": 48.95137405395508, + "rewards/rejected": -61.555145263671875, + "step": 2500 + }, + { + "epoch": 0.251, + "grad_norm": 2.941124880411644e-21, + "learning_rate": 4.6606870048842626e-06, + "logits/chosen": -1.1419764757156372, + "logits/rejected": 0.26133501529693604, + "logps/chosen": -367.69720458984375, + "logps/rejected": -1034.8980712890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.324525833129883, + "rewards/margins": 53.705535888671875, + "rewards/rejected": -71.03005981445312, + "step": 2510 + }, + { + "epoch": 0.252, + "grad_norm": 0.0, + "learning_rate": 4.656284173018144e-06, + "logits/chosen": -1.3266280889511108, + "logits/rejected": 0.24299263954162598, + "logps/chosen": -277.9893798828125, + "logps/rejected": -921.3558349609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.854076385498047, + "rewards/margins": 48.599815368652344, + "rewards/rejected": -59.453895568847656, + "step": 2520 + }, + { + "epoch": 0.253, + "grad_norm": 855.6547241210938, + "learning_rate": 4.65185506750986e-06, + "logits/chosen": -0.75602787733078, + "logits/rejected": -0.04198075085878372, + "logps/chosen": -404.5797119140625, + "logps/rejected": -844.5486450195312, + "loss": 0.3552, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -12.992825508117676, + "rewards/margins": 37.5327033996582, + "rewards/rejected": -50.52552795410156, + "step": 2530 + }, + { + "epoch": 0.254, + "grad_norm": 2.341215069034952e-11, + "learning_rate": 4.6473997423266615e-06, + "logits/chosen": -1.0960681438446045, + "logits/rejected": 0.11165539175271988, + "logps/chosen": -309.8856201171875, + "logps/rejected": -850.849609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.645660400390625, + "rewards/margins": 41.12796401977539, + "rewards/rejected": -50.77361297607422, + "step": 2540 + }, + { + "epoch": 0.255, + "grad_norm": 0.0, + "learning_rate": 4.642918251755281e-06, + "logits/chosen": -1.3027342557907104, + "logits/rejected": 0.24640479683876038, + "logps/chosen": -373.795654296875, + "logps/rejected": -1020.3546752929688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.509068489074707, + "rewards/margins": 44.65599822998047, + "rewards/rejected": -54.165069580078125, + "step": 2550 + }, + { + "epoch": 0.256, + "grad_norm": 7.732006110927614e-07, + "learning_rate": 4.638410650401267e-06, + "logits/chosen": -1.2951546907424927, + "logits/rejected": 0.06202126666903496, + "logps/chosen": -278.08746337890625, + "logps/rejected": -755.7003173828125, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.85218620300293, + "rewards/margins": 38.408973693847656, + "rewards/rejected": -47.26116180419922, + "step": 2560 + }, + { + "epoch": 0.257, + "grad_norm": 2.750820075636127e-22, + "learning_rate": 4.633876993188319e-06, + "logits/chosen": -0.5697834491729736, + "logits/rejected": -0.17692281305789948, + "logps/chosen": -330.2970886230469, + "logps/rejected": -675.277587890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.79739761352539, + "rewards/margins": 36.20167922973633, + "rewards/rejected": -46.99907684326172, + "step": 2570 + }, + { + "epoch": 0.258, + "grad_norm": 0.0, + "learning_rate": 4.62931733535762e-06, + "logits/chosen": -0.24441662430763245, + "logits/rejected": 0.09682003408670425, + "logps/chosen": -453.4608459472656, + "logps/rejected": -752.7393188476562, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.770639419555664, + "rewards/margins": 33.42501449584961, + "rewards/rejected": -44.195655822753906, + "step": 2580 + }, + { + "epoch": 0.259, + "grad_norm": 0.0, + "learning_rate": 4.62473173246716e-06, + "logits/chosen": -0.736251950263977, + "logits/rejected": 0.019975418224930763, + "logps/chosen": -448.28643798828125, + "logps/rejected": -904.33642578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.213622093200684, + "rewards/margins": 47.143733978271484, + "rewards/rejected": -57.35735321044922, + "step": 2590 + }, + { + "epoch": 0.26, + "grad_norm": 0.0, + "learning_rate": 4.620120240391065e-06, + "logits/chosen": -0.46649056673049927, + "logits/rejected": 0.018019551411271095, + "logps/chosen": -447.64166259765625, + "logps/rejected": -842.9020385742188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.979809761047363, + "rewards/margins": 42.84097671508789, + "rewards/rejected": -58.8207893371582, + "step": 2600 + }, + { + "epoch": 0.261, + "grad_norm": 0.0, + "learning_rate": 4.6154829153189105e-06, + "logits/chosen": -0.7219793796539307, + "logits/rejected": 0.47301802039146423, + "logps/chosen": -324.66229248046875, + "logps/rejected": -1142.9036865234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.965510368347168, + "rewards/margins": 67.4233627319336, + "rewards/rejected": -82.38887023925781, + "step": 2610 + }, + { + "epoch": 0.262, + "grad_norm": 1256.3182373046875, + "learning_rate": 4.610819813755038e-06, + "logits/chosen": -0.7228553891181946, + "logits/rejected": 0.12379207462072372, + "logps/chosen": -552.47900390625, + "logps/rejected": -960.6402587890625, + "loss": 0.5346, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -24.032733917236328, + "rewards/margins": 34.79325485229492, + "rewards/rejected": -58.82598876953125, + "step": 2620 + }, + { + "epoch": 0.263, + "grad_norm": 30.419069290161133, + "learning_rate": 4.60613099251787e-06, + "logits/chosen": -1.010801076889038, + "logits/rejected": 0.1571781188249588, + "logps/chosen": -313.34747314453125, + "logps/rejected": -845.5224609375, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.521732330322266, + "rewards/margins": 42.821800231933594, + "rewards/rejected": -53.343536376953125, + "step": 2630 + }, + { + "epoch": 0.264, + "grad_norm": 4.838548232731629e-15, + "learning_rate": 4.601416508739211e-06, + "logits/chosen": -1.073610544204712, + "logits/rejected": 0.5915216207504272, + "logps/chosen": -371.45989990234375, + "logps/rejected": -1123.2381591796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.641399383544922, + "rewards/margins": 60.57389450073242, + "rewards/rejected": -81.21529388427734, + "step": 2640 + }, + { + "epoch": 0.265, + "grad_norm": 2.1994967028149404e-05, + "learning_rate": 4.596676419863561e-06, + "logits/chosen": -0.4245632290840149, + "logits/rejected": -0.18038161098957062, + "logps/chosen": -603.9671630859375, + "logps/rejected": -949.8527221679688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.18203353881836, + "rewards/margins": 47.4343376159668, + "rewards/rejected": -66.61637878417969, + "step": 2650 + }, + { + "epoch": 0.266, + "grad_norm": 0.0, + "learning_rate": 4.591910783647405e-06, + "logits/chosen": -0.6803088784217834, + "logits/rejected": 0.3073822855949402, + "logps/chosen": -396.9436340332031, + "logps/rejected": -895.3616333007812, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.408302307128906, + "rewards/margins": 48.797462463378906, + "rewards/rejected": -64.20576477050781, + "step": 2660 + }, + { + "epoch": 0.267, + "grad_norm": 0.0, + "learning_rate": 4.587119658158517e-06, + "logits/chosen": -0.844752311706543, + "logits/rejected": 1.0756970643997192, + "logps/chosen": -336.9431457519531, + "logps/rejected": -1189.891357421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.892147064208984, + "rewards/margins": 72.17032623291016, + "rewards/rejected": -93.06246948242188, + "step": 2670 + }, + { + "epoch": 0.268, + "grad_norm": 1.235100492332914e-18, + "learning_rate": 4.582303101775249e-06, + "logits/chosen": -0.6267115473747253, + "logits/rejected": 0.35415276885032654, + "logps/chosen": -635.7947998046875, + "logps/rejected": -1453.327880859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.361858367919922, + "rewards/margins": 78.53101348876953, + "rewards/rejected": -98.89286804199219, + "step": 2680 + }, + { + "epoch": 0.269, + "grad_norm": 2.9888548233603096e-13, + "learning_rate": 4.577461173185821e-06, + "logits/chosen": -0.8019062280654907, + "logits/rejected": 0.57016521692276, + "logps/chosen": -379.5988464355469, + "logps/rejected": -1071.9979248046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.178808212280273, + "rewards/margins": 59.92768478393555, + "rewards/rejected": -77.10649108886719, + "step": 2690 + }, + { + "epoch": 0.27, + "grad_norm": 4.660129422205473e-20, + "learning_rate": 4.572593931387604e-06, + "logits/chosen": -0.770391583442688, + "logits/rejected": 0.47237473726272583, + "logps/chosen": -390.8031311035156, + "logps/rejected": -1198.1607666015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.715108871459961, + "rewards/margins": 70.4637451171875, + "rewards/rejected": -85.1788558959961, + "step": 2700 + }, + { + "epoch": 0.271, + "grad_norm": 0.0, + "learning_rate": 4.567701435686405e-06, + "logits/chosen": -1.1127533912658691, + "logits/rejected": 0.7476900815963745, + "logps/chosen": -383.3504943847656, + "logps/rejected": -1261.5062255859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.624284744262695, + "rewards/margins": 68.47001647949219, + "rewards/rejected": -87.09429931640625, + "step": 2710 + }, + { + "epoch": 0.272, + "grad_norm": 0.0, + "learning_rate": 4.562783745695738e-06, + "logits/chosen": -0.4293700158596039, + "logits/rejected": -0.32799363136291504, + "logps/chosen": -723.4006958007812, + "logps/rejected": -1026.2906494140625, + "loss": 1.5142, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.579858779907227, + "rewards/margins": 37.50405502319336, + "rewards/rejected": -59.08391571044922, + "step": 2720 + }, + { + "epoch": 0.273, + "grad_norm": 8.267878001788631e-06, + "learning_rate": 4.5578409213361055e-06, + "logits/chosen": -0.38455820083618164, + "logits/rejected": -0.13795824348926544, + "logps/chosen": -422.9434509277344, + "logps/rejected": -579.631591796875, + "loss": 1.4694, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.918153762817383, + "rewards/margins": 20.949848175048828, + "rewards/rejected": -30.868000030517578, + "step": 2730 + }, + { + "epoch": 0.274, + "grad_norm": 3.323042983538471e-05, + "learning_rate": 4.55287302283426e-06, + "logits/chosen": -1.2258391380310059, + "logits/rejected": 0.2508848309516907, + "logps/chosen": -230.49356079101562, + "logps/rejected": -710.0572509765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.474408149719238, + "rewards/margins": 33.696800231933594, + "rewards/rejected": -40.17120361328125, + "step": 2740 + }, + { + "epoch": 0.275, + "grad_norm": 2.5122176339209545e-07, + "learning_rate": 4.54788011072248e-06, + "logits/chosen": -1.135868787765503, + "logits/rejected": -0.020803770050406456, + "logps/chosen": -258.2652282714844, + "logps/rejected": -616.9526977539062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.1399359703063965, + "rewards/margins": 25.91520118713379, + "rewards/rejected": -33.055137634277344, + "step": 2750 + }, + { + "epoch": 0.276, + "grad_norm": 4.9381639660099445e-15, + "learning_rate": 4.542862245837821e-06, + "logits/chosen": -0.38430148363113403, + "logits/rejected": -0.3277333378791809, + "logps/chosen": -369.1678771972656, + "logps/rejected": -599.8237915039062, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.9210052490234375, + "rewards/margins": 25.239404678344727, + "rewards/rejected": -31.160409927368164, + "step": 2760 + }, + { + "epoch": 0.277, + "grad_norm": 2.8379170894622803, + "learning_rate": 4.537819489321385e-06, + "logits/chosen": -1.19253408908844, + "logits/rejected": -0.09504680335521698, + "logps/chosen": -227.212646484375, + "logps/rejected": -592.4996948242188, + "loss": 0.0937, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.070375919342041, + "rewards/margins": 26.809539794921875, + "rewards/rejected": -32.87991714477539, + "step": 2770 + }, + { + "epoch": 0.278, + "grad_norm": 0.0, + "learning_rate": 4.5327519026175694e-06, + "logits/chosen": -1.2971299886703491, + "logits/rejected": 0.0600772388279438, + "logps/chosen": -272.6226501464844, + "logps/rejected": -866.8453979492188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.302969932556152, + "rewards/margins": 43.80091094970703, + "rewards/rejected": -50.1038818359375, + "step": 2780 + }, + { + "epoch": 0.279, + "grad_norm": 0.0, + "learning_rate": 4.527659547473317e-06, + "logits/chosen": -0.9313802719116211, + "logits/rejected": -0.12935884296894073, + "logps/chosen": -352.4999694824219, + "logps/rejected": -728.7339477539062, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.947843074798584, + "rewards/margins": 34.59426498413086, + "rewards/rejected": -42.54210662841797, + "step": 2790 + }, + { + "epoch": 0.28, + "grad_norm": 4.836014401432553e-13, + "learning_rate": 4.522542485937369e-06, + "logits/chosen": -1.0397319793701172, + "logits/rejected": -0.26670709252357483, + "logps/chosen": -367.3852233886719, + "logps/rejected": -756.3072509765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.932214736938477, + "rewards/margins": 33.279319763183594, + "rewards/rejected": -43.2115364074707, + "step": 2800 + }, + { + "epoch": 0.281, + "grad_norm": 5.379221512669119e-10, + "learning_rate": 4.517400780359505e-06, + "logits/chosen": -0.9027656316757202, + "logits/rejected": -0.13679035007953644, + "logps/chosen": -492.4688415527344, + "logps/rejected": -851.7306518554688, + "loss": 0.2074, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -12.402759552001953, + "rewards/margins": 34.22053527832031, + "rewards/rejected": -46.62329864501953, + "step": 2810 + }, + { + "epoch": 0.282, + "grad_norm": 0.0, + "learning_rate": 4.512234493389785e-06, + "logits/chosen": -1.3217861652374268, + "logits/rejected": 0.2540954053401947, + "logps/chosen": -446.2145080566406, + "logps/rejected": -1161.365234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.036980628967285, + "rewards/margins": 58.81218338012695, + "rewards/rejected": -68.84915924072266, + "step": 2820 + }, + { + "epoch": 0.283, + "grad_norm": 5.972985661628627e-08, + "learning_rate": 4.507043687977787e-06, + "logits/chosen": -0.5777202844619751, + "logits/rejected": 0.18531468510627747, + "logps/chosen": -338.59832763671875, + "logps/rejected": -736.2530517578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.795352935791016, + "rewards/margins": 31.938003540039062, + "rewards/rejected": -49.73335647583008, + "step": 2830 + }, + { + "epoch": 0.284, + "grad_norm": 0.004476075526326895, + "learning_rate": 4.501828427371834e-06, + "logits/chosen": -0.9521886110305786, + "logits/rejected": 0.17324507236480713, + "logps/chosen": -289.2124328613281, + "logps/rejected": -889.6448974609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.43809700012207, + "rewards/margins": 41.17052459716797, + "rewards/rejected": -51.60862350463867, + "step": 2840 + }, + { + "epoch": 0.285, + "grad_norm": 1.6598479533058708e-06, + "learning_rate": 4.496588775118232e-06, + "logits/chosen": -1.029706358909607, + "logits/rejected": 0.3757559061050415, + "logps/chosen": -307.02191162109375, + "logps/rejected": -935.701171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.742807388305664, + "rewards/margins": 45.651023864746094, + "rewards/rejected": -58.393836975097656, + "step": 2850 + }, + { + "epoch": 0.286, + "grad_norm": 0.0, + "learning_rate": 4.491324795060491e-06, + "logits/chosen": -0.9051336050033569, + "logits/rejected": 0.1347285807132721, + "logps/chosen": -230.3036651611328, + "logps/rejected": -741.8186645507812, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.721261978149414, + "rewards/margins": 39.301753997802734, + "rewards/rejected": -51.023014068603516, + "step": 2860 + }, + { + "epoch": 0.287, + "grad_norm": 2.454706430494147e-22, + "learning_rate": 4.4860365513385456e-06, + "logits/chosen": -0.9631183743476868, + "logits/rejected": 0.08518068492412567, + "logps/chosen": -438.61004638671875, + "logps/rejected": -948.8082275390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.66820240020752, + "rewards/margins": 44.123191833496094, + "rewards/rejected": -59.7913932800293, + "step": 2870 + }, + { + "epoch": 0.288, + "grad_norm": 1.4058292646456716e-15, + "learning_rate": 4.4807241083879774e-06, + "logits/chosen": -0.9526158571243286, + "logits/rejected": 0.23263370990753174, + "logps/chosen": -412.12567138671875, + "logps/rejected": -976.13916015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.246286392211914, + "rewards/margins": 44.95268630981445, + "rewards/rejected": -62.19896697998047, + "step": 2880 + }, + { + "epoch": 0.289, + "grad_norm": 3.208458630850027e-16, + "learning_rate": 4.475387530939226e-06, + "logits/chosen": -0.9484020471572876, + "logits/rejected": 0.1921675205230713, + "logps/chosen": -313.9803466796875, + "logps/rejected": -839.4591064453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.41915225982666, + "rewards/margins": 41.73972702026367, + "rewards/rejected": -53.15887451171875, + "step": 2890 + }, + { + "epoch": 0.29, + "grad_norm": 1.1192635156476172e-06, + "learning_rate": 4.470026884016805e-06, + "logits/chosen": -1.1647913455963135, + "logits/rejected": 0.28769174218177795, + "logps/chosen": -178.21139526367188, + "logps/rejected": -632.3584594726562, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.1957526206970215, + "rewards/margins": 32.56772994995117, + "rewards/rejected": -39.76348114013672, + "step": 2900 + }, + { + "epoch": 0.291, + "grad_norm": 3.7680575104559466e-08, + "learning_rate": 4.464642232938505e-06, + "logits/chosen": -0.1608423888683319, + "logits/rejected": -0.09430716931819916, + "logps/chosen": -674.2434692382812, + "logps/rejected": -874.7601318359375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -26.605106353759766, + "rewards/margins": 29.54248046875, + "rewards/rejected": -56.14759063720703, + "step": 2910 + }, + { + "epoch": 0.292, + "grad_norm": 0.0, + "learning_rate": 4.4592336433146e-06, + "logits/chosen": -0.9661940336227417, + "logits/rejected": 0.5986965894699097, + "logps/chosen": -401.9140930175781, + "logps/rejected": -1143.014892578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.071043014526367, + "rewards/margins": 57.54827880859375, + "rewards/rejected": -72.61932373046875, + "step": 2920 + }, + { + "epoch": 0.293, + "grad_norm": 3.743392066509216e-23, + "learning_rate": 4.453801181047047e-06, + "logits/chosen": -0.7017570734024048, + "logits/rejected": 0.8444482088088989, + "logps/chosen": -495.0025939941406, + "logps/rejected": -1380.618896484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -26.271066665649414, + "rewards/margins": 69.16044616699219, + "rewards/rejected": -95.4314956665039, + "step": 2930 + }, + { + "epoch": 0.294, + "grad_norm": 0.0, + "learning_rate": 4.448344912328686e-06, + "logits/chosen": -0.6854699850082397, + "logits/rejected": 0.7347670197486877, + "logps/chosen": -314.5595397949219, + "logps/rejected": -929.7980346679688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.546504974365234, + "rewards/margins": 46.76772689819336, + "rewards/rejected": -63.314231872558594, + "step": 2940 + }, + { + "epoch": 0.295, + "grad_norm": 0.0, + "learning_rate": 4.442864903642428e-06, + "logits/chosen": -1.2334846258163452, + "logits/rejected": 0.5767850875854492, + "logps/chosen": -262.81158447265625, + "logps/rejected": -999.1419067382812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.631780624389648, + "rewards/margins": 57.38239288330078, + "rewards/rejected": -73.01416778564453, + "step": 2950 + }, + { + "epoch": 0.296, + "grad_norm": 0.0, + "learning_rate": 4.437361221760449e-06, + "logits/chosen": -0.767090916633606, + "logits/rejected": 0.47413578629493713, + "logps/chosen": -399.1095275878906, + "logps/rejected": -1085.1121826171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.238618850708008, + "rewards/margins": 58.78046417236328, + "rewards/rejected": -74.01908111572266, + "step": 2960 + }, + { + "epoch": 0.297, + "grad_norm": 0.0, + "learning_rate": 4.431833933743378e-06, + "logits/chosen": -0.8993236422538757, + "logits/rejected": -0.05611775070428848, + "logps/chosen": -527.3430786132812, + "logps/rejected": -1073.151611328125, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -28.32375144958496, + "rewards/margins": 49.06525802612305, + "rewards/rejected": -77.38900756835938, + "step": 2970 + }, + { + "epoch": 0.298, + "grad_norm": 3.127561820637226e-10, + "learning_rate": 4.426283106939474e-06, + "logits/chosen": -0.27920812368392944, + "logits/rejected": 0.6835567951202393, + "logps/chosen": -497.9610290527344, + "logps/rejected": -932.0389404296875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -22.19385528564453, + "rewards/margins": 40.395111083984375, + "rewards/rejected": -62.588966369628906, + "step": 2980 + }, + { + "epoch": 0.299, + "grad_norm": 2.7995953999493395e-08, + "learning_rate": 4.420708808983809e-06, + "logits/chosen": -0.8772599101066589, + "logits/rejected": 0.5049712061882019, + "logps/chosen": -325.65692138671875, + "logps/rejected": -930.7125244140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.707733154296875, + "rewards/margins": 54.16173553466797, + "rewards/rejected": -67.86946868896484, + "step": 2990 + }, + { + "epoch": 0.3, + "grad_norm": 0.023355349898338318, + "learning_rate": 4.415111107797445e-06, + "logits/chosen": -0.9795050621032715, + "logits/rejected": 0.5347188115119934, + "logps/chosen": -458.56268310546875, + "logps/rejected": -967.0360107421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.780598640441895, + "rewards/margins": 42.03343200683594, + "rewards/rejected": -51.81402587890625, + "step": 3000 + }, + { + "epoch": 0.301, + "grad_norm": 7.46494047132451e-11, + "learning_rate": 4.409490071586606e-06, + "logits/chosen": -0.8965972065925598, + "logits/rejected": 0.2855593264102936, + "logps/chosen": -317.4532775878906, + "logps/rejected": -826.0565185546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.005922794342041, + "rewards/margins": 46.22938537597656, + "rewards/rejected": -53.23530960083008, + "step": 3010 + }, + { + "epoch": 0.302, + "grad_norm": 8.980111374512489e-07, + "learning_rate": 4.403845768841842e-06, + "logits/chosen": -0.7695995569229126, + "logits/rejected": 0.45763593912124634, + "logps/chosen": -346.1946716308594, + "logps/rejected": -922.2404174804688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.814911842346191, + "rewards/margins": 51.58049774169922, + "rewards/rejected": -61.395408630371094, + "step": 3020 + }, + { + "epoch": 0.303, + "grad_norm": 1.9820722507823787e-20, + "learning_rate": 4.398178268337203e-06, + "logits/chosen": -0.7960144877433777, + "logits/rejected": 0.06469273567199707, + "logps/chosen": -330.5350341796875, + "logps/rejected": -910.6071166992188, + "loss": 0.0216, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.916448593139648, + "rewards/margins": 42.928802490234375, + "rewards/rejected": -55.84525680541992, + "step": 3030 + }, + { + "epoch": 0.304, + "grad_norm": 9.17138249002274e-20, + "learning_rate": 4.3924876391293915e-06, + "logits/chosen": -0.8251537084579468, + "logits/rejected": 0.2735294699668884, + "logps/chosen": -439.69549560546875, + "logps/rejected": -902.1564331054688, + "loss": 0.646, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -11.860895156860352, + "rewards/margins": 38.2791748046875, + "rewards/rejected": -50.140071868896484, + "step": 3040 + }, + { + "epoch": 0.305, + "grad_norm": 0.001080155256204307, + "learning_rate": 4.386773950556931e-06, + "logits/chosen": -0.98908531665802, + "logits/rejected": 0.057259947061538696, + "logps/chosen": -372.20709228515625, + "logps/rejected": -811.25732421875, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.590067863464355, + "rewards/margins": 33.14325714111328, + "rewards/rejected": -41.73332595825195, + "step": 3050 + }, + { + "epoch": 0.306, + "grad_norm": 4.190588143160312e-08, + "learning_rate": 4.381037272239311e-06, + "logits/chosen": -1.0001757144927979, + "logits/rejected": -0.20020675659179688, + "logps/chosen": -442.28668212890625, + "logps/rejected": -700.0807495117188, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.053180694580078, + "rewards/margins": 25.24466896057129, + "rewards/rejected": -32.2978515625, + "step": 3060 + }, + { + "epoch": 0.307, + "grad_norm": 9.719526133267209e-06, + "learning_rate": 4.3752776740761495e-06, + "logits/chosen": -0.9625173807144165, + "logits/rejected": -0.23919124901294708, + "logps/chosen": -319.67620849609375, + "logps/rejected": -535.2568969726562, + "loss": 0.0351, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.780137538909912, + "rewards/margins": 20.768360137939453, + "rewards/rejected": -28.54849624633789, + "step": 3070 + }, + { + "epoch": 0.308, + "grad_norm": 1.6489254852588456e-19, + "learning_rate": 4.36949522624633e-06, + "logits/chosen": -0.850079357624054, + "logits/rejected": -0.02385178580880165, + "logps/chosen": -372.00897216796875, + "logps/rejected": -700.4833374023438, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.652767181396484, + "rewards/margins": 29.697641372680664, + "rewards/rejected": -37.35040283203125, + "step": 3080 + }, + { + "epoch": 0.309, + "grad_norm": 0.0, + "learning_rate": 4.3636899992071555e-06, + "logits/chosen": -1.0359759330749512, + "logits/rejected": 0.22866709530353546, + "logps/chosen": -348.95001220703125, + "logps/rejected": -818.4954833984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.694208145141602, + "rewards/margins": 37.28623580932617, + "rewards/rejected": -46.980445861816406, + "step": 3090 + }, + { + "epoch": 0.31, + "grad_norm": 0.0, + "learning_rate": 4.357862063693486e-06, + "logits/chosen": -1.073272943496704, + "logits/rejected": -0.13734038174152374, + "logps/chosen": -310.79510498046875, + "logps/rejected": -881.04638671875, + "loss": 0.1594, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.433976173400879, + "rewards/margins": 35.776695251464844, + "rewards/rejected": -44.210670471191406, + "step": 3100 + }, + { + "epoch": 0.311, + "grad_norm": 1.0155697793834406e-07, + "learning_rate": 4.352011490716875e-06, + "logits/chosen": -0.9790974855422974, + "logits/rejected": 0.12780261039733887, + "logps/chosen": -317.59039306640625, + "logps/rejected": -703.695068359375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.203834533691406, + "rewards/margins": 30.016159057617188, + "rewards/rejected": -39.219993591308594, + "step": 3110 + }, + { + "epoch": 0.312, + "grad_norm": 9.904084925088661e-23, + "learning_rate": 4.346138351564711e-06, + "logits/chosen": -1.1815786361694336, + "logits/rejected": 0.22094345092773438, + "logps/chosen": -266.4033203125, + "logps/rejected": -817.2267456054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.6234712600708, + "rewards/margins": 35.64247512817383, + "rewards/rejected": -45.26594161987305, + "step": 3120 + }, + { + "epoch": 0.313, + "grad_norm": 5.889712003694092e-13, + "learning_rate": 4.340242717799337e-06, + "logits/chosen": -1.1714904308319092, + "logits/rejected": 0.30348244309425354, + "logps/chosen": -227.6770782470703, + "logps/rejected": -735.3243408203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.901235103607178, + "rewards/margins": 38.13142776489258, + "rewards/rejected": -44.03266525268555, + "step": 3130 + }, + { + "epoch": 0.314, + "grad_norm": 0.00010150240268558264, + "learning_rate": 4.334324661257191e-06, + "logits/chosen": -0.5792279839515686, + "logits/rejected": -0.24451354146003723, + "logps/chosen": -522.0756225585938, + "logps/rejected": -817.6600341796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.947957992553711, + "rewards/margins": 35.51698303222656, + "rewards/rejected": -44.46493911743164, + "step": 3140 + }, + { + "epoch": 0.315, + "grad_norm": 0.0, + "learning_rate": 4.328384254047927e-06, + "logits/chosen": -0.7550392746925354, + "logits/rejected": -0.0565187931060791, + "logps/chosen": -419.688720703125, + "logps/rejected": -683.5443115234375, + "loss": 1.1448, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -13.32347297668457, + "rewards/margins": 25.387094497680664, + "rewards/rejected": -38.7105712890625, + "step": 3150 + }, + { + "epoch": 0.316, + "grad_norm": 4.217108611345691e-20, + "learning_rate": 4.322421568553529e-06, + "logits/chosen": -0.8440915942192078, + "logits/rejected": 0.08169057220220566, + "logps/chosen": -244.05636596679688, + "logps/rejected": -744.2340087890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.935759544372559, + "rewards/margins": 36.457157135009766, + "rewards/rejected": -41.392913818359375, + "step": 3160 + }, + { + "epoch": 0.317, + "grad_norm": 7.900644760638897e-08, + "learning_rate": 4.316436677427441e-06, + "logits/chosen": -0.670063853263855, + "logits/rejected": 0.19559910893440247, + "logps/chosen": -433.0946350097656, + "logps/rejected": -748.577392578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.119277000427246, + "rewards/margins": 27.639232635498047, + "rewards/rejected": -34.758506774902344, + "step": 3170 + }, + { + "epoch": 0.318, + "grad_norm": 4.573404710495055e-10, + "learning_rate": 4.3104296535936695e-06, + "logits/chosen": -0.7517415285110474, + "logits/rejected": -0.16787463426589966, + "logps/chosen": -229.44692993164062, + "logps/rejected": -462.67431640625, + "loss": 0.09, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -7.413804531097412, + "rewards/margins": 22.188800811767578, + "rewards/rejected": -29.602609634399414, + "step": 3180 + }, + { + "epoch": 0.319, + "grad_norm": 7.644340000072052e-19, + "learning_rate": 4.3044005702459055e-06, + "logits/chosen": -1.4360209703445435, + "logits/rejected": 0.1630072295665741, + "logps/chosen": -207.827880859375, + "logps/rejected": -807.9542846679688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.8019585609436035, + "rewards/margins": 40.77233123779297, + "rewards/rejected": -47.57428741455078, + "step": 3190 + }, + { + "epoch": 0.32, + "grad_norm": 1.3456431064914898e-12, + "learning_rate": 4.2983495008466285e-06, + "logits/chosen": -0.7747661471366882, + "logits/rejected": 0.05215495079755783, + "logps/chosen": -299.6119079589844, + "logps/rejected": -676.51806640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.976496696472168, + "rewards/margins": 34.04780578613281, + "rewards/rejected": -42.0243034362793, + "step": 3200 + }, + { + "epoch": 0.321, + "grad_norm": 16.59024429321289, + "learning_rate": 4.2922765191262075e-06, + "logits/chosen": -1.094308614730835, + "logits/rejected": 0.11995135247707367, + "logps/chosen": -300.5848693847656, + "logps/rejected": -866.1476440429688, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.33760929107666, + "rewards/margins": 39.02849197387695, + "rewards/rejected": -47.36610412597656, + "step": 3210 + }, + { + "epoch": 0.322, + "grad_norm": 0.0, + "learning_rate": 4.286181699082008e-06, + "logits/chosen": -1.0322264432907104, + "logits/rejected": 0.15850770473480225, + "logps/chosen": -371.9905700683594, + "logps/rejected": -892.07568359375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.305109977722168, + "rewards/margins": 43.74433135986328, + "rewards/rejected": -57.0494384765625, + "step": 3220 + }, + { + "epoch": 0.323, + "grad_norm": 1.3465262645469115e-15, + "learning_rate": 4.280065114977492e-06, + "logits/chosen": -1.3070619106292725, + "logits/rejected": 0.4935874044895172, + "logps/chosen": -305.3742370605469, + "logps/rejected": -1267.24609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.339349746704102, + "rewards/margins": 78.47554016113281, + "rewards/rejected": -91.81489562988281, + "step": 3230 + }, + { + "epoch": 0.324, + "grad_norm": 7.401005694337914e-19, + "learning_rate": 4.273926841341303e-06, + "logits/chosen": -0.48427170515060425, + "logits/rejected": 0.372748464345932, + "logps/chosen": -363.49005126953125, + "logps/rejected": -985.8108520507812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.015512466430664, + "rewards/margins": 58.766265869140625, + "rewards/rejected": -74.78177642822266, + "step": 3240 + }, + { + "epoch": 0.325, + "grad_norm": 0.0, + "learning_rate": 4.267766952966369e-06, + "logits/chosen": -0.9825299382209778, + "logits/rejected": 0.47985076904296875, + "logps/chosen": -534.361328125, + "logps/rejected": -1301.0528564453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.222763061523438, + "rewards/margins": 67.33207702636719, + "rewards/rejected": -86.55484008789062, + "step": 3250 + }, + { + "epoch": 0.326, + "grad_norm": 0.0, + "learning_rate": 4.261585524908987e-06, + "logits/chosen": -0.5122109651565552, + "logits/rejected": 0.4423252046108246, + "logps/chosen": -444.8743591308594, + "logps/rejected": -1195.684814453125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.829586029052734, + "rewards/margins": 67.60523986816406, + "rewards/rejected": -87.43482208251953, + "step": 3260 + }, + { + "epoch": 0.327, + "grad_norm": 0.0, + "learning_rate": 4.255382632487907e-06, + "logits/chosen": -1.0329724550247192, + "logits/rejected": 0.7542210221290588, + "logps/chosen": -447.3409729003906, + "logps/rejected": -1327.537841796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -25.9887638092041, + "rewards/margins": 72.96781921386719, + "rewards/rejected": -98.95658111572266, + "step": 3270 + }, + { + "epoch": 0.328, + "grad_norm": 0.0, + "learning_rate": 4.249158351283414e-06, + "logits/chosen": -0.4629250168800354, + "logits/rejected": 1.0543975830078125, + "logps/chosen": -495.45379638671875, + "logps/rejected": -1429.62744140625, + "loss": 0.0993, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -27.047439575195312, + "rewards/margins": 84.02159881591797, + "rewards/rejected": -111.06903076171875, + "step": 3280 + }, + { + "epoch": 0.329, + "grad_norm": 5.326713596085457e-17, + "learning_rate": 4.242912757136412e-06, + "logits/chosen": -0.8477737307548523, + "logits/rejected": 0.3507903814315796, + "logps/chosen": -372.31951904296875, + "logps/rejected": -992.7218627929688, + "loss": 1.0371, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -13.680524826049805, + "rewards/margins": 61.86509323120117, + "rewards/rejected": -75.54561614990234, + "step": 3290 + }, + { + "epoch": 0.33, + "grad_norm": 0.0, + "learning_rate": 4.236645926147493e-06, + "logits/chosen": -0.4431152939796448, + "logits/rejected": 0.42956599593162537, + "logps/chosen": -286.32318115234375, + "logps/rejected": -851.01611328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.283034324645996, + "rewards/margins": 55.66416549682617, + "rewards/rejected": -65.94720458984375, + "step": 3300 + }, + { + "epoch": 0.331, + "grad_norm": 7.99737845599944e-21, + "learning_rate": 4.230357934676017e-06, + "logits/chosen": -0.689271867275238, + "logits/rejected": 0.30687215924263, + "logps/chosen": -594.4046020507812, + "logps/rejected": -1044.365234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.995326042175293, + "rewards/margins": 45.54576873779297, + "rewards/rejected": -60.54109573364258, + "step": 3310 + }, + { + "epoch": 0.332, + "grad_norm": 76.74764251708984, + "learning_rate": 4.224048859339175e-06, + "logits/chosen": -0.7327234148979187, + "logits/rejected": 0.20926149189472198, + "logps/chosen": -365.96612548828125, + "logps/rejected": -895.5828247070312, + "loss": 0.0936, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -8.642107009887695, + "rewards/margins": 46.72553634643555, + "rewards/rejected": -55.367645263671875, + "step": 3320 + }, + { + "epoch": 0.333, + "grad_norm": 4.214549327455149e-19, + "learning_rate": 4.217718777011058e-06, + "logits/chosen": -0.9751136898994446, + "logits/rejected": 0.4269269108772278, + "logps/chosen": -265.28314208984375, + "logps/rejected": -883.4361572265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.531183242797852, + "rewards/margins": 47.17866134643555, + "rewards/rejected": -57.7098503112793, + "step": 3330 + }, + { + "epoch": 0.334, + "grad_norm": 9.039357564688544e-07, + "learning_rate": 4.211367764821722e-06, + "logits/chosen": -1.0815322399139404, + "logits/rejected": 0.3311161398887634, + "logps/chosen": -205.88211059570312, + "logps/rejected": -701.0584716796875, + "loss": 0.8769, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.06307315826416, + "rewards/margins": 33.633060455322266, + "rewards/rejected": -42.696136474609375, + "step": 3340 + }, + { + "epoch": 0.335, + "grad_norm": 1.3225919914816586e-08, + "learning_rate": 4.204995900156247e-06, + "logits/chosen": -0.7970871925354004, + "logits/rejected": -0.21015918254852295, + "logps/chosen": -584.626220703125, + "logps/rejected": -836.6129760742188, + "loss": 0.031, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.297735214233398, + "rewards/margins": 29.705364227294922, + "rewards/rejected": -39.00310134887695, + "step": 3350 + }, + { + "epoch": 0.336, + "grad_norm": 4.926764821598923e-17, + "learning_rate": 4.198603260653792e-06, + "logits/chosen": -0.6659067869186401, + "logits/rejected": -0.021963249891996384, + "logps/chosen": -236.69851684570312, + "logps/rejected": -597.7718505859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.031260013580322, + "rewards/margins": 31.933868408203125, + "rewards/rejected": -37.965126037597656, + "step": 3360 + }, + { + "epoch": 0.337, + "grad_norm": 0.04632464796304703, + "learning_rate": 4.192189924206652e-06, + "logits/chosen": -0.6960038542747498, + "logits/rejected": 0.19569489359855652, + "logps/chosen": -214.2620086669922, + "logps/rejected": -617.9694213867188, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.799983024597168, + "rewards/margins": 27.065723419189453, + "rewards/rejected": -34.86570358276367, + "step": 3370 + }, + { + "epoch": 0.338, + "grad_norm": 1.2611774125037556e-10, + "learning_rate": 4.185755968959308e-06, + "logits/chosen": -0.9139487147331238, + "logits/rejected": 0.12455719709396362, + "logps/chosen": -476.13140869140625, + "logps/rejected": -759.7525634765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.588300704956055, + "rewards/margins": 31.280990600585938, + "rewards/rejected": -41.869293212890625, + "step": 3380 + }, + { + "epoch": 0.339, + "grad_norm": 6.151973502710462e-05, + "learning_rate": 4.179301473307476e-06, + "logits/chosen": -0.8834859132766724, + "logits/rejected": -0.13724976778030396, + "logps/chosen": -228.32272338867188, + "logps/rejected": -701.5811767578125, + "loss": 0.0581, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.848291873931885, + "rewards/margins": 34.67478561401367, + "rewards/rejected": -42.52307891845703, + "step": 3390 + }, + { + "epoch": 0.34, + "grad_norm": 0.0, + "learning_rate": 4.172826515897146e-06, + "logits/chosen": -1.3233671188354492, + "logits/rejected": 0.1761193573474884, + "logps/chosen": -345.5697326660156, + "logps/rejected": -839.4622802734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.214780807495117, + "rewards/margins": 35.262630462646484, + "rewards/rejected": -41.47740936279297, + "step": 3400 + }, + { + "epoch": 0.341, + "grad_norm": 8.89336535423426e-18, + "learning_rate": 4.166331175623631e-06, + "logits/chosen": -0.9410387277603149, + "logits/rejected": 0.25650379061698914, + "logps/chosen": -382.94012451171875, + "logps/rejected": -868.2106323242188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.65563440322876, + "rewards/margins": 41.03944778442383, + "rewards/rejected": -47.69507598876953, + "step": 3410 + }, + { + "epoch": 0.342, + "grad_norm": 6.152333132706959e-19, + "learning_rate": 4.159815531630604e-06, + "logits/chosen": -0.9895069003105164, + "logits/rejected": -0.024330515414476395, + "logps/chosen": -419.55145263671875, + "logps/rejected": -816.4887084960938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.520910739898682, + "rewards/margins": 36.27510070800781, + "rewards/rejected": -42.79601287841797, + "step": 3420 + }, + { + "epoch": 0.343, + "grad_norm": 1.754929014607942e-11, + "learning_rate": 4.15327966330913e-06, + "logits/chosen": -0.9556158781051636, + "logits/rejected": 0.02099316194653511, + "logps/chosen": -331.62164306640625, + "logps/rejected": -952.9747924804688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.674660682678223, + "rewards/margins": 45.470130920410156, + "rewards/rejected": -54.1447868347168, + "step": 3430 + }, + { + "epoch": 0.344, + "grad_norm": 4.288543058541573e-15, + "learning_rate": 4.146723650296701e-06, + "logits/chosen": -0.505741536617279, + "logits/rejected": 0.18241751194000244, + "logps/chosen": -326.7845458984375, + "logps/rejected": -704.0743408203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.242865562438965, + "rewards/margins": 33.074039459228516, + "rewards/rejected": -41.31690216064453, + "step": 3440 + }, + { + "epoch": 0.345, + "grad_norm": 5.038234667154029e-05, + "learning_rate": 4.140147572476269e-06, + "logits/chosen": -0.7677274942398071, + "logits/rejected": -0.07271343469619751, + "logps/chosen": -296.08465576171875, + "logps/rejected": -587.8889770507812, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.46233081817627, + "rewards/margins": 25.770553588867188, + "rewards/rejected": -34.23288345336914, + "step": 3450 + }, + { + "epoch": 0.346, + "grad_norm": 1.3172983603804974e-17, + "learning_rate": 4.133551509975264e-06, + "logits/chosen": -1.312073826789856, + "logits/rejected": 0.2874998450279236, + "logps/chosen": -234.2183380126953, + "logps/rejected": -836.1720581054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.3478875160217285, + "rewards/margins": 40.78498077392578, + "rewards/rejected": -46.13286590576172, + "step": 3460 + }, + { + "epoch": 0.347, + "grad_norm": 3.578258837236975e-13, + "learning_rate": 4.126935543164628e-06, + "logits/chosen": -0.6281191110610962, + "logits/rejected": 0.1928253471851349, + "logps/chosen": -385.08843994140625, + "logps/rejected": -773.7591552734375, + "loss": 1.2362, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -11.442838668823242, + "rewards/margins": 35.947547912597656, + "rewards/rejected": -47.39038848876953, + "step": 3470 + }, + { + "epoch": 0.348, + "grad_norm": 0.02128647267818451, + "learning_rate": 4.120299752657828e-06, + "logits/chosen": -1.0916345119476318, + "logits/rejected": -0.13186690211296082, + "logps/chosen": -405.6370849609375, + "logps/rejected": -581.0313720703125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9594109058380127, + "rewards/margins": 23.054386138916016, + "rewards/rejected": -27.013797760009766, + "step": 3480 + }, + { + "epoch": 0.349, + "grad_norm": 0.692663311958313, + "learning_rate": 4.113644219309877e-06, + "logits/chosen": -0.8682696223258972, + "logits/rejected": -0.07819642126560211, + "logps/chosen": -241.39566040039062, + "logps/rejected": -459.0397033691406, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.898946762084961, + "rewards/margins": 18.163551330566406, + "rewards/rejected": -21.062496185302734, + "step": 3490 + }, + { + "epoch": 0.35, + "grad_norm": 4.095468061904306e-11, + "learning_rate": 4.106969024216348e-06, + "logits/chosen": -1.165206789970398, + "logits/rejected": 0.32706567645072937, + "logps/chosen": -347.5617370605469, + "logps/rejected": -740.993896484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.557514190673828, + "rewards/margins": 27.075342178344727, + "rewards/rejected": -32.63285446166992, + "step": 3500 + }, + { + "epoch": 0.351, + "grad_norm": 1.331050469674763e-16, + "learning_rate": 4.1002742487123896e-06, + "logits/chosen": -0.9754883050918579, + "logits/rejected": 0.30378806591033936, + "logps/chosen": -375.8541259765625, + "logps/rejected": -662.5955810546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.981966495513916, + "rewards/margins": 23.170042037963867, + "rewards/rejected": -30.152008056640625, + "step": 3510 + }, + { + "epoch": 0.352, + "grad_norm": 4.5469066098300764e-17, + "learning_rate": 4.093559974371725e-06, + "logits/chosen": -0.9044982194900513, + "logits/rejected": 0.12653622031211853, + "logps/chosen": -287.69097900390625, + "logps/rejected": -752.1959228515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.752053260803223, + "rewards/margins": 33.69719696044922, + "rewards/rejected": -42.449249267578125, + "step": 3520 + }, + { + "epoch": 0.353, + "grad_norm": 1.1403776073152431e-20, + "learning_rate": 4.086826283005669e-06, + "logits/chosen": -0.9870929718017578, + "logits/rejected": 0.21940307319164276, + "logps/chosen": -317.22210693359375, + "logps/rejected": -676.0667114257812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.004629611968994, + "rewards/margins": 28.83846664428711, + "rewards/rejected": -35.84309768676758, + "step": 3530 + }, + { + "epoch": 0.354, + "grad_norm": 2.823813923990982e-19, + "learning_rate": 4.080073256662128e-06, + "logits/chosen": -0.7896633744239807, + "logits/rejected": 0.13053257763385773, + "logps/chosen": -226.0159149169922, + "logps/rejected": -630.7973022460938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.601428031921387, + "rewards/margins": 30.512859344482422, + "rewards/rejected": -38.114288330078125, + "step": 3540 + }, + { + "epoch": 0.355, + "grad_norm": 122.45453643798828, + "learning_rate": 4.073300977624594e-06, + "logits/chosen": -0.7716548442840576, + "logits/rejected": 0.1828223615884781, + "logps/chosen": -394.8473205566406, + "logps/rejected": -690.2156372070312, + "loss": 0.0363, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.993786334991455, + "rewards/margins": 25.322904586791992, + "rewards/rejected": -31.316692352294922, + "step": 3550 + }, + { + "epoch": 0.356, + "grad_norm": 2.582539650880511e-12, + "learning_rate": 4.066509528411151e-06, + "logits/chosen": -0.7894098162651062, + "logits/rejected": 0.20214462280273438, + "logps/chosen": -178.9608612060547, + "logps/rejected": -506.6351623535156, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.481031894683838, + "rewards/margins": 23.32442855834961, + "rewards/rejected": -29.805461883544922, + "step": 3560 + }, + { + "epoch": 0.357, + "grad_norm": 3.1474918671392516e-08, + "learning_rate": 4.059698991773466e-06, + "logits/chosen": -0.5627990961074829, + "logits/rejected": -0.15371878445148468, + "logps/chosen": -318.6998596191406, + "logps/rejected": -579.3805541992188, + "loss": 0.1134, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.315168380737305, + "rewards/margins": 27.846643447875977, + "rewards/rejected": -38.16181182861328, + "step": 3570 + }, + { + "epoch": 0.358, + "grad_norm": 8.990660717245191e-06, + "learning_rate": 4.052869450695776e-06, + "logits/chosen": -1.002862811088562, + "logits/rejected": 0.23829932510852814, + "logps/chosen": -290.24456787109375, + "logps/rejected": -817.5097045898438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.639854431152344, + "rewards/margins": 39.795753479003906, + "rewards/rejected": -48.435604095458984, + "step": 3580 + }, + { + "epoch": 0.359, + "grad_norm": 0.005798167083412409, + "learning_rate": 4.046020988393886e-06, + "logits/chosen": -0.8494859933853149, + "logits/rejected": 0.033515565097332, + "logps/chosen": -402.35382080078125, + "logps/rejected": -763.16552734375, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.38691234588623, + "rewards/margins": 29.28324317932129, + "rewards/rejected": -38.67015838623047, + "step": 3590 + }, + { + "epoch": 0.36, + "grad_norm": 6.382561840156953e-11, + "learning_rate": 4.039153688314146e-06, + "logits/chosen": -1.2625701427459717, + "logits/rejected": 0.5352746248245239, + "logps/chosen": -263.8447265625, + "logps/rejected": -885.6497192382812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.839212417602539, + "rewards/margins": 43.649513244628906, + "rewards/rejected": -52.48872756958008, + "step": 3600 + }, + { + "epoch": 0.361, + "grad_norm": 1.465044301375816e-20, + "learning_rate": 4.032267634132442e-06, + "logits/chosen": -0.5378260016441345, + "logits/rejected": 0.22525055706501007, + "logps/chosen": -348.68145751953125, + "logps/rejected": -841.7462158203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.665252685546875, + "rewards/margins": 47.320892333984375, + "rewards/rejected": -55.98614501953125, + "step": 3610 + }, + { + "epoch": 0.362, + "grad_norm": 0.0037902365438640118, + "learning_rate": 4.02536290975317e-06, + "logits/chosen": -0.46131354570388794, + "logits/rejected": 0.11978636682033539, + "logps/chosen": -471.6434631347656, + "logps/rejected": -775.079833984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.731196403503418, + "rewards/margins": 34.168792724609375, + "rewards/rejected": -44.899986267089844, + "step": 3620 + }, + { + "epoch": 0.363, + "grad_norm": 1.2722707273555428e-13, + "learning_rate": 4.018439599308217e-06, + "logits/chosen": -0.8846076130867004, + "logits/rejected": 0.47798728942871094, + "logps/chosen": -317.46466064453125, + "logps/rejected": -997.0470581054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.721329689025879, + "rewards/margins": 53.88054275512695, + "rewards/rejected": -64.60187530517578, + "step": 3630 + }, + { + "epoch": 0.364, + "grad_norm": 1.3620032538678645e-17, + "learning_rate": 4.011497787155938e-06, + "logits/chosen": -0.5411085486412048, + "logits/rejected": 0.30354979634284973, + "logps/chosen": -375.63720703125, + "logps/rejected": -837.1346435546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.404248237609863, + "rewards/margins": 42.27107620239258, + "rewards/rejected": -52.675315856933594, + "step": 3640 + }, + { + "epoch": 0.365, + "grad_norm": 0.0, + "learning_rate": 4.0045375578801216e-06, + "logits/chosen": -0.9094980955123901, + "logits/rejected": 0.5757162570953369, + "logps/chosen": -241.68801879882812, + "logps/rejected": -861.6231689453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.014677047729492, + "rewards/margins": 42.840049743652344, + "rewards/rejected": -52.8547248840332, + "step": 3650 + }, + { + "epoch": 0.366, + "grad_norm": 7.864214921632362e-13, + "learning_rate": 3.997558996288965e-06, + "logits/chosen": -0.6316531896591187, + "logits/rejected": 0.5172852873802185, + "logps/chosen": -322.84295654296875, + "logps/rejected": -878.1337890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.732378959655762, + "rewards/margins": 43.56822204589844, + "rewards/rejected": -53.30059814453125, + "step": 3660 + }, + { + "epoch": 0.367, + "grad_norm": 1.152663465690726e-18, + "learning_rate": 3.9905621874140396e-06, + "logits/chosen": -1.038159966468811, + "logits/rejected": 0.21830615401268005, + "logps/chosen": -243.6871337890625, + "logps/rejected": -831.5692138671875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.551340579986572, + "rewards/margins": 44.354488372802734, + "rewards/rejected": -51.90583038330078, + "step": 3670 + }, + { + "epoch": 0.368, + "grad_norm": 1.1498552876775108e-20, + "learning_rate": 3.983547216509254e-06, + "logits/chosen": -0.6559784412384033, + "logits/rejected": 0.5230801105499268, + "logps/chosen": -262.62261962890625, + "logps/rejected": -746.4537963867188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.013997077941895, + "rewards/margins": 36.55512237548828, + "rewards/rejected": -46.56912612915039, + "step": 3680 + }, + { + "epoch": 0.369, + "grad_norm": 7.899832930232531e-18, + "learning_rate": 3.976514169049814e-06, + "logits/chosen": -0.8977417945861816, + "logits/rejected": 0.6272139549255371, + "logps/chosen": -352.23822021484375, + "logps/rejected": -923.0206298828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.092788696289062, + "rewards/margins": 47.015174865722656, + "rewards/rejected": -60.10795974731445, + "step": 3690 + }, + { + "epoch": 0.37, + "grad_norm": 4.87312933061812e-11, + "learning_rate": 3.969463130731183e-06, + "logits/chosen": -0.737012505531311, + "logits/rejected": 0.5239855051040649, + "logps/chosen": -440.50286865234375, + "logps/rejected": -1080.082275390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.673296928405762, + "rewards/margins": 50.824745178222656, + "rewards/rejected": -64.498046875, + "step": 3700 + }, + { + "epoch": 0.371, + "grad_norm": 0.0011436374625191092, + "learning_rate": 3.962394187468039e-06, + "logits/chosen": -0.7259347438812256, + "logits/rejected": 0.4603849947452545, + "logps/chosen": -348.16156005859375, + "logps/rejected": -792.4857788085938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.061820983886719, + "rewards/margins": 41.306678771972656, + "rewards/rejected": -53.368507385253906, + "step": 3710 + }, + { + "epoch": 0.372, + "grad_norm": 3.8318094357225885e-12, + "learning_rate": 3.955307425393224e-06, + "logits/chosen": -0.9659935832023621, + "logits/rejected": 0.44564709067344666, + "logps/chosen": -257.97076416015625, + "logps/rejected": -842.8034057617188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.789884567260742, + "rewards/margins": 40.528785705566406, + "rewards/rejected": -50.31867218017578, + "step": 3720 + }, + { + "epoch": 0.373, + "grad_norm": 0.0, + "learning_rate": 3.948202930856697e-06, + "logits/chosen": -0.8581470251083374, + "logits/rejected": 0.4110802114009857, + "logps/chosen": -404.4557800292969, + "logps/rejected": -1082.4560546875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.478370666503906, + "rewards/margins": 45.96970748901367, + "rewards/rejected": -59.44807052612305, + "step": 3730 + }, + { + "epoch": 0.374, + "grad_norm": 2.034799845979096e-12, + "learning_rate": 3.941080790424483e-06, + "logits/chosen": -0.8508933186531067, + "logits/rejected": 0.5776023268699646, + "logps/chosen": -221.8153076171875, + "logps/rejected": -761.3008422851562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.045284271240234, + "rewards/margins": 43.52240753173828, + "rewards/rejected": -50.56769561767578, + "step": 3740 + }, + { + "epoch": 0.375, + "grad_norm": 1.2156111672823045e-09, + "learning_rate": 3.933941090877615e-06, + "logits/chosen": -0.8493801951408386, + "logits/rejected": 0.4712657332420349, + "logps/chosen": -307.35791015625, + "logps/rejected": -789.6890869140625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.67790412902832, + "rewards/margins": 38.20088577270508, + "rewards/rejected": -46.87879180908203, + "step": 3750 + }, + { + "epoch": 0.376, + "grad_norm": 0.0, + "learning_rate": 3.92678391921108e-06, + "logits/chosen": -0.7485553622245789, + "logits/rejected": 0.5399482250213623, + "logps/chosen": -329.4286193847656, + "logps/rejected": -833.7262573242188, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.084185600280762, + "rewards/margins": 44.51618194580078, + "rewards/rejected": -57.600372314453125, + "step": 3760 + }, + { + "epoch": 0.377, + "grad_norm": 9.002846889207342e-14, + "learning_rate": 3.9196093626327535e-06, + "logits/chosen": -0.64151531457901, + "logits/rejected": 0.6924604177474976, + "logps/chosen": -387.8374328613281, + "logps/rejected": -1071.4794921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.649490356445312, + "rewards/margins": 58.88508987426758, + "rewards/rejected": -74.53457641601562, + "step": 3770 + }, + { + "epoch": 0.378, + "grad_norm": 0.0, + "learning_rate": 3.912417508562345e-06, + "logits/chosen": -0.8945374488830566, + "logits/rejected": 0.7975891828536987, + "logps/chosen": -276.60089111328125, + "logps/rejected": -1138.664794921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.599028587341309, + "rewards/margins": 64.93830871582031, + "rewards/rejected": -77.53733825683594, + "step": 3780 + }, + { + "epoch": 0.379, + "grad_norm": 2.029291677120462e-15, + "learning_rate": 3.905208444630326e-06, + "logits/chosen": -0.4724903106689453, + "logits/rejected": 0.20428940653800964, + "logps/chosen": -431.5040588378906, + "logps/rejected": -842.90673828125, + "loss": 2.9491, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -19.42316246032715, + "rewards/margins": 38.17705154418945, + "rewards/rejected": -57.60021209716797, + "step": 3790 + }, + { + "epoch": 0.38, + "grad_norm": 3.7381135605496125e-18, + "learning_rate": 3.897982258676867e-06, + "logits/chosen": -0.48396188020706177, + "logits/rejected": 0.40527671575546265, + "logps/chosen": -599.632080078125, + "logps/rejected": -958.6070556640625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.483598709106445, + "rewards/margins": 35.09453582763672, + "rewards/rejected": -50.57814025878906, + "step": 3800 + }, + { + "epoch": 0.381, + "grad_norm": 1.3835478346662982e-17, + "learning_rate": 3.890739038750763e-06, + "logits/chosen": -0.07472027093172073, + "logits/rejected": 0.3322257399559021, + "logps/chosen": -547.6071166992188, + "logps/rejected": -853.6226806640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.650531768798828, + "rewards/margins": 36.77111053466797, + "rewards/rejected": -55.4216423034668, + "step": 3810 + }, + { + "epoch": 0.382, + "grad_norm": 0.0, + "learning_rate": 3.88347887310836e-06, + "logits/chosen": -0.4241139888763428, + "logits/rejected": 0.4764328896999359, + "logps/chosen": -441.03485107421875, + "logps/rejected": -940.2620239257812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.40770149230957, + "rewards/margins": 46.130409240722656, + "rewards/rejected": -61.538108825683594, + "step": 3820 + }, + { + "epoch": 0.383, + "grad_norm": 0.0, + "learning_rate": 3.876201850212489e-06, + "logits/chosen": -0.9169275164604187, + "logits/rejected": 0.6456999778747559, + "logps/chosen": -374.56854248046875, + "logps/rejected": -972.72314453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.147981643676758, + "rewards/margins": 49.80410385131836, + "rewards/rejected": -65.95207977294922, + "step": 3830 + }, + { + "epoch": 0.384, + "grad_norm": 1.0824854061080094e-13, + "learning_rate": 3.868908058731376e-06, + "logits/chosen": -0.5485990047454834, + "logits/rejected": 0.7526308298110962, + "logps/chosen": -478.01312255859375, + "logps/rejected": -938.0384521484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.822885513305664, + "rewards/margins": 40.522560119628906, + "rewards/rejected": -53.3454475402832, + "step": 3840 + }, + { + "epoch": 0.385, + "grad_norm": 0.0, + "learning_rate": 3.861597587537568e-06, + "logits/chosen": -0.8074597120285034, + "logits/rejected": 0.5264551639556885, + "logps/chosen": -325.8951721191406, + "logps/rejected": -919.2064208984375, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.664430618286133, + "rewards/margins": 47.888641357421875, + "rewards/rejected": -60.553077697753906, + "step": 3850 + }, + { + "epoch": 0.386, + "grad_norm": 9.141786717338023e-18, + "learning_rate": 3.85427052570685e-06, + "logits/chosen": -0.5131690502166748, + "logits/rejected": 0.6525561809539795, + "logps/chosen": -399.3219909667969, + "logps/rejected": -844.6796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.690951347351074, + "rewards/margins": 45.79986572265625, + "rewards/rejected": -61.49082565307617, + "step": 3860 + }, + { + "epoch": 0.387, + "grad_norm": 0.0, + "learning_rate": 3.846926962517158e-06, + "logits/chosen": -0.5702225565910339, + "logits/rejected": 0.5192240476608276, + "logps/chosen": -446.7099609375, + "logps/rejected": -1046.5726318359375, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.955255508422852, + "rewards/margins": 48.32957077026367, + "rewards/rejected": -63.284828186035156, + "step": 3870 + }, + { + "epoch": 0.388, + "grad_norm": 0.0, + "learning_rate": 3.839566987447492e-06, + "logits/chosen": -0.44374722242355347, + "logits/rejected": 0.702018141746521, + "logps/chosen": -257.0533752441406, + "logps/rejected": -927.4334716796875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.47410774230957, + "rewards/margins": 53.65742874145508, + "rewards/rejected": -66.13153076171875, + "step": 3880 + }, + { + "epoch": 0.389, + "grad_norm": 1.449809592382247e-22, + "learning_rate": 3.832190690176825e-06, + "logits/chosen": -0.4135567545890808, + "logits/rejected": 0.45063215494155884, + "logps/chosen": -438.75189208984375, + "logps/rejected": -829.2301635742188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.356510162353516, + "rewards/margins": 36.14095687866211, + "rewards/rejected": -56.497467041015625, + "step": 3890 + }, + { + "epoch": 0.39, + "grad_norm": 0.0, + "learning_rate": 3.824798160583012e-06, + "logits/chosen": -0.7455593347549438, + "logits/rejected": 0.6117419004440308, + "logps/chosen": -559.6799926757812, + "logps/rejected": -1304.4791259765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -22.246612548828125, + "rewards/margins": 63.397003173828125, + "rewards/rejected": -85.64362335205078, + "step": 3900 + }, + { + "epoch": 0.391, + "grad_norm": 0.00011770037235692143, + "learning_rate": 3.817389488741694e-06, + "logits/chosen": -0.7981809973716736, + "logits/rejected": 0.6145745515823364, + "logps/chosen": -287.59698486328125, + "logps/rejected": -947.4500732421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.564549446105957, + "rewards/margins": 51.583091735839844, + "rewards/rejected": -66.14763641357422, + "step": 3910 + }, + { + "epoch": 0.392, + "grad_norm": 8.74399904153729e-14, + "learning_rate": 3.8099647649251984e-06, + "logits/chosen": -0.3144210875034332, + "logits/rejected": 0.36411142349243164, + "logps/chosen": -704.8829345703125, + "logps/rejected": -1009.3225708007812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -26.28389549255371, + "rewards/margins": 38.85230255126953, + "rewards/rejected": -65.13619232177734, + "step": 3920 + }, + { + "epoch": 0.393, + "grad_norm": 0.0, + "learning_rate": 3.802524079601442e-06, + "logits/chosen": -0.5111797451972961, + "logits/rejected": 0.7962819337844849, + "logps/chosen": -282.1951904296875, + "logps/rejected": -826.8674926757812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.543550491333008, + "rewards/margins": 45.35955047607422, + "rewards/rejected": -61.903106689453125, + "step": 3930 + }, + { + "epoch": 0.394, + "grad_norm": 1.9319190344702086e-12, + "learning_rate": 3.795067523432826e-06, + "logits/chosen": -0.7026056051254272, + "logits/rejected": 0.7360762357711792, + "logps/chosen": -218.7373809814453, + "logps/rejected": -990.8726806640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.464958190917969, + "rewards/margins": 58.12116622924805, + "rewards/rejected": -70.58612823486328, + "step": 3940 + }, + { + "epoch": 0.395, + "grad_norm": 1.2454868176733741e-21, + "learning_rate": 3.787595187275136e-06, + "logits/chosen": -0.38544806838035583, + "logits/rejected": 0.5606005191802979, + "logps/chosen": -564.4012451171875, + "logps/rejected": -1214.3851318359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -24.259593963623047, + "rewards/margins": 62.833946228027344, + "rewards/rejected": -87.09355163574219, + "step": 3950 + }, + { + "epoch": 0.396, + "grad_norm": 398.5549621582031, + "learning_rate": 3.780107162176429e-06, + "logits/chosen": -0.12771472334861755, + "logits/rejected": 0.23889890313148499, + "logps/chosen": -632.0416870117188, + "logps/rejected": -1030.476806640625, + "loss": 0.053, + "rewards/accuracies": 1.0, + "rewards/chosen": -31.639551162719727, + "rewards/margins": 40.21255874633789, + "rewards/rejected": -71.85210418701172, + "step": 3960 + }, + { + "epoch": 0.397, + "grad_norm": 0.0, + "learning_rate": 3.772603539375929e-06, + "logits/chosen": -0.5894684195518494, + "logits/rejected": 0.7861677408218384, + "logps/chosen": -346.8887634277344, + "logps/rejected": -944.8414306640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.525707244873047, + "rewards/margins": 49.367393493652344, + "rewards/rejected": -66.89309692382812, + "step": 3970 + }, + { + "epoch": 0.398, + "grad_norm": 5.1069120672764257e-05, + "learning_rate": 3.7650844103029093e-06, + "logits/chosen": -0.4754267632961273, + "logits/rejected": 0.25245314836502075, + "logps/chosen": -240.3736114501953, + "logps/rejected": -768.532470703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.96320915222168, + "rewards/margins": 42.275733947753906, + "rewards/rejected": -54.23894500732422, + "step": 3980 + }, + { + "epoch": 0.399, + "grad_norm": 1.6205316821047843e-15, + "learning_rate": 3.7575498665755884e-06, + "logits/chosen": -0.3462293744087219, + "logits/rejected": 0.3235887885093689, + "logps/chosen": -371.97552490234375, + "logps/rejected": -866.0631103515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.929563522338867, + "rewards/margins": 45.36902618408203, + "rewards/rejected": -59.29859161376953, + "step": 3990 + }, + { + "epoch": 0.4, + "grad_norm": 0.0, + "learning_rate": 3.7500000000000005e-06, + "logits/chosen": -0.8836024403572083, + "logits/rejected": 0.7240092754364014, + "logps/chosen": -441.99041748046875, + "logps/rejected": -1129.660888671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.415254592895508, + "rewards/margins": 50.131492614746094, + "rewards/rejected": -70.5467529296875, + "step": 4000 + }, + { + "epoch": 0.401, + "grad_norm": 4.55244343114814e-20, + "learning_rate": 3.742434902568889e-06, + "logits/chosen": -0.3526178002357483, + "logits/rejected": 0.26132869720458984, + "logps/chosen": -562.0494384765625, + "logps/rejected": -974.6246337890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.889423370361328, + "rewards/margins": 43.921607971191406, + "rewards/rejected": -63.81103515625, + "step": 4010 + }, + { + "epoch": 0.402, + "grad_norm": 341.7988586425781, + "learning_rate": 3.7348546664605777e-06, + "logits/chosen": -1.0505037307739258, + "logits/rejected": 0.5458782911300659, + "logps/chosen": -359.09869384765625, + "logps/rejected": -1040.2222900390625, + "loss": 0.1245, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -16.71851921081543, + "rewards/margins": 46.79050827026367, + "rewards/rejected": -63.5090217590332, + "step": 4020 + }, + { + "epoch": 0.403, + "grad_norm": 3.106318462903774e-20, + "learning_rate": 3.7272593840378526e-06, + "logits/chosen": 0.07517627626657486, + "logits/rejected": 0.17533142864704132, + "logps/chosen": -382.37725830078125, + "logps/rejected": -732.4088745117188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.196589469909668, + "rewards/margins": 34.664756774902344, + "rewards/rejected": -49.86134338378906, + "step": 4030 + }, + { + "epoch": 0.404, + "grad_norm": 1.848615184540936e-11, + "learning_rate": 3.7196491478468322e-06, + "logits/chosen": -0.8824928402900696, + "logits/rejected": 0.4116589426994324, + "logps/chosen": -323.4385681152344, + "logps/rejected": -935.02294921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.299783706665039, + "rewards/margins": 46.94359588623047, + "rewards/rejected": -59.243377685546875, + "step": 4040 + }, + { + "epoch": 0.405, + "grad_norm": 2.899619184764494e-22, + "learning_rate": 3.7120240506158433e-06, + "logits/chosen": -0.8381205797195435, + "logits/rejected": 0.7291172742843628, + "logps/chosen": -309.32269287109375, + "logps/rejected": -971.3074951171875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.42654800415039, + "rewards/margins": 52.61445236206055, + "rewards/rejected": -64.04100036621094, + "step": 4050 + }, + { + "epoch": 0.406, + "grad_norm": 6.420053136489467e-17, + "learning_rate": 3.7043841852542884e-06, + "logits/chosen": -0.557745099067688, + "logits/rejected": 0.6661397218704224, + "logps/chosen": -288.09661865234375, + "logps/rejected": -966.2178955078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.155458450317383, + "rewards/margins": 58.315834045410156, + "rewards/rejected": -70.4712905883789, + "step": 4060 + }, + { + "epoch": 0.407, + "grad_norm": 0.0, + "learning_rate": 3.6967296448515176e-06, + "logits/chosen": -0.6943304538726807, + "logits/rejected": 0.5316852331161499, + "logps/chosen": -434.56268310546875, + "logps/rejected": -1232.92626953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.114227294921875, + "rewards/margins": 66.32272338867188, + "rewards/rejected": -82.43694305419922, + "step": 4070 + }, + { + "epoch": 0.408, + "grad_norm": 4.842999699189443e-12, + "learning_rate": 3.689060522675689e-06, + "logits/chosen": -0.53111332654953, + "logits/rejected": 0.36400312185287476, + "logps/chosen": -230.18508911132812, + "logps/rejected": -699.6070556640625, + "loss": 0.4617, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.08073616027832, + "rewards/margins": 40.68952560424805, + "rewards/rejected": -49.770259857177734, + "step": 4080 + }, + { + "epoch": 0.409, + "grad_norm": 3.8501983823380215e-08, + "learning_rate": 3.6813769121726356e-06, + "logits/chosen": -1.0677297115325928, + "logits/rejected": 0.30235835909843445, + "logps/chosen": -262.8753662109375, + "logps/rejected": -824.7024536132812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.651956558227539, + "rewards/margins": 38.70851516723633, + "rewards/rejected": -47.360477447509766, + "step": 4090 + }, + { + "epoch": 0.41, + "grad_norm": 2.6643040191057753e-16, + "learning_rate": 3.6736789069647273e-06, + "logits/chosen": -0.6208855509757996, + "logits/rejected": -0.03054434061050415, + "logps/chosen": -350.40325927734375, + "logps/rejected": -694.046630859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.904715061187744, + "rewards/margins": 29.830276489257812, + "rewards/rejected": -37.73499298095703, + "step": 4100 + }, + { + "epoch": 0.411, + "grad_norm": 0.0013503417139872909, + "learning_rate": 3.6659666008497287e-06, + "logits/chosen": -0.6803125143051147, + "logits/rejected": 0.3102254867553711, + "logps/chosen": -315.3930969238281, + "logps/rejected": -651.2322998046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.613018989562988, + "rewards/margins": 29.166412353515625, + "rewards/rejected": -35.7794303894043, + "step": 4110 + }, + { + "epoch": 0.412, + "grad_norm": 1.428150566556985e-12, + "learning_rate": 3.658240087799655e-06, + "logits/chosen": -0.40848007798194885, + "logits/rejected": -0.06087536737322807, + "logps/chosen": -300.522705078125, + "logps/rejected": -697.1531372070312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.41520881652832, + "rewards/margins": 36.236854553222656, + "rewards/rejected": -45.652061462402344, + "step": 4120 + }, + { + "epoch": 0.413, + "grad_norm": 0.00011740612535504624, + "learning_rate": 3.6504994619596295e-06, + "logits/chosen": -0.5807197690010071, + "logits/rejected": 0.24162797629833221, + "logps/chosen": -490.38043212890625, + "logps/rejected": -848.3361206054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.073907852172852, + "rewards/margins": 37.16918182373047, + "rewards/rejected": -47.24308776855469, + "step": 4130 + }, + { + "epoch": 0.414, + "grad_norm": 7.407800755595368e-17, + "learning_rate": 3.642744817646736e-06, + "logits/chosen": -0.9697567224502563, + "logits/rejected": 0.26281073689460754, + "logps/chosen": -361.99359130859375, + "logps/rejected": -823.3468627929688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.846258640289307, + "rewards/margins": 30.67578125, + "rewards/rejected": -38.52204513549805, + "step": 4140 + }, + { + "epoch": 0.415, + "grad_norm": 2.6733165742909525e-22, + "learning_rate": 3.634976249348867e-06, + "logits/chosen": -0.7604053616523743, + "logits/rejected": 0.2656019628047943, + "logps/chosen": -296.73431396484375, + "logps/rejected": -780.6051635742188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.004801750183105, + "rewards/margins": 36.96376419067383, + "rewards/rejected": -44.96856689453125, + "step": 4150 + }, + { + "epoch": 0.416, + "grad_norm": 1.125780147481541e-19, + "learning_rate": 3.627193851723577e-06, + "logits/chosen": -0.502686619758606, + "logits/rejected": 0.29140302538871765, + "logps/chosen": -234.8794403076172, + "logps/rejected": -619.0558471679688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.264492988586426, + "rewards/margins": 30.473552703857422, + "rewards/rejected": -40.73804473876953, + "step": 4160 + }, + { + "epoch": 0.417, + "grad_norm": 1.4964207200773406e-21, + "learning_rate": 3.6193977195969243e-06, + "logits/chosen": -0.7825326919555664, + "logits/rejected": 0.1626361906528473, + "logps/chosen": -405.54864501953125, + "logps/rejected": -663.46484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.345398902893066, + "rewards/margins": 32.6074333190918, + "rewards/rejected": -37.95283126831055, + "step": 4170 + }, + { + "epoch": 0.418, + "grad_norm": 0.0008746925159357488, + "learning_rate": 3.611587947962319e-06, + "logits/chosen": -0.8463672399520874, + "logits/rejected": 0.26045385003089905, + "logps/chosen": -236.1060028076172, + "logps/rejected": -748.9749755859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.019121170043945, + "rewards/margins": 37.229774475097656, + "rewards/rejected": -45.24889373779297, + "step": 4180 + }, + { + "epoch": 0.419, + "grad_norm": 2.3645246871595305e-10, + "learning_rate": 3.6037646319793635e-06, + "logits/chosen": -1.3852901458740234, + "logits/rejected": 0.21314740180969238, + "logps/chosen": -202.6114044189453, + "logps/rejected": -827.9249267578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.685309410095215, + "rewards/margins": 42.572757720947266, + "rewards/rejected": -48.25806427001953, + "step": 4190 + }, + { + "epoch": 0.42, + "grad_norm": 0.0, + "learning_rate": 3.595927866972694e-06, + "logits/chosen": -0.7606481313705444, + "logits/rejected": 0.3610491454601288, + "logps/chosen": -369.33905029296875, + "logps/rejected": -958.4986572265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.40889835357666, + "rewards/margins": 44.8622932434082, + "rewards/rejected": -54.27119064331055, + "step": 4200 + }, + { + "epoch": 0.421, + "grad_norm": 3.889532672474161e-05, + "learning_rate": 3.5880777484308193e-06, + "logits/chosen": -0.8923002481460571, + "logits/rejected": 0.17552146315574646, + "logps/chosen": -297.636474609375, + "logps/rejected": -713.8099365234375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.230313301086426, + "rewards/margins": 31.368526458740234, + "rewards/rejected": -40.598838806152344, + "step": 4210 + }, + { + "epoch": 0.422, + "grad_norm": 6.331940527459778e-10, + "learning_rate": 3.5802143720049565e-06, + "logits/chosen": -0.9664111137390137, + "logits/rejected": 0.3709142804145813, + "logps/chosen": -392.75384521484375, + "logps/rejected": -856.77685546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.62370777130127, + "rewards/margins": 36.612483978271484, + "rewards/rejected": -45.23619079589844, + "step": 4220 + }, + { + "epoch": 0.423, + "grad_norm": 0.0, + "learning_rate": 3.5723378335078653e-06, + "logits/chosen": -0.5414325594902039, + "logits/rejected": 0.0008636951679363847, + "logps/chosen": -414.93902587890625, + "logps/rejected": -768.058349609375, + "loss": 0.292, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -14.064050674438477, + "rewards/margins": 29.415279388427734, + "rewards/rejected": -43.47932815551758, + "step": 4230 + }, + { + "epoch": 0.424, + "grad_norm": 5.148122454556869e-06, + "learning_rate": 3.564448228912682e-06, + "logits/chosen": -0.6183091402053833, + "logits/rejected": 0.3114756643772125, + "logps/chosen": -337.8242492675781, + "logps/rejected": -824.0460815429688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.286993026733398, + "rewards/margins": 39.22076416015625, + "rewards/rejected": -48.507755279541016, + "step": 4240 + }, + { + "epoch": 0.425, + "grad_norm": 0.2487691193819046, + "learning_rate": 3.556545654351749e-06, + "logits/chosen": -0.6846747398376465, + "logits/rejected": 0.16172091662883759, + "logps/chosen": -416.21160888671875, + "logps/rejected": -663.5516357421875, + "loss": 0.2262, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -11.881389617919922, + "rewards/margins": 25.58749771118164, + "rewards/rejected": -37.46889114379883, + "step": 4250 + }, + { + "epoch": 0.426, + "grad_norm": 1.436825769474126e-09, + "learning_rate": 3.5486302061154433e-06, + "logits/chosen": -0.6829143166542053, + "logits/rejected": 0.5160936713218689, + "logps/chosen": -241.07302856445312, + "logps/rejected": -719.6329956054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.42330265045166, + "rewards/margins": 37.69516372680664, + "rewards/rejected": -46.11846160888672, + "step": 4260 + }, + { + "epoch": 0.427, + "grad_norm": 0.0, + "learning_rate": 3.5407019806510035e-06, + "logits/chosen": -0.4164047837257385, + "logits/rejected": 0.5037349462509155, + "logps/chosen": -414.34600830078125, + "logps/rejected": -780.9544067382812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.40142822265625, + "rewards/margins": 32.7313346862793, + "rewards/rejected": -43.13277053833008, + "step": 4270 + }, + { + "epoch": 0.428, + "grad_norm": 1.8435288018370244e-12, + "learning_rate": 3.532761074561355e-06, + "logits/chosen": -0.7431804537773132, + "logits/rejected": 0.388200581073761, + "logps/chosen": -275.937744140625, + "logps/rejected": -827.150390625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.089512825012207, + "rewards/margins": 42.278560638427734, + "rewards/rejected": -52.368072509765625, + "step": 4280 + }, + { + "epoch": 0.429, + "grad_norm": 0.008692407049238682, + "learning_rate": 3.524807584603932e-06, + "logits/chosen": -0.6186565160751343, + "logits/rejected": 0.3314044177532196, + "logps/chosen": -253.70889282226562, + "logps/rejected": -659.0180053710938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.876142978668213, + "rewards/margins": 37.75662612915039, + "rewards/rejected": -44.63277053833008, + "step": 4290 + }, + { + "epoch": 0.43, + "grad_norm": 1.1581534427367252e-15, + "learning_rate": 3.516841607689501e-06, + "logits/chosen": -1.0289478302001953, + "logits/rejected": 0.5168917179107666, + "logps/chosen": -458.88623046875, + "logps/rejected": -1042.71484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.337991714477539, + "rewards/margins": 48.008358001708984, + "rewards/rejected": -58.346351623535156, + "step": 4300 + }, + { + "epoch": 0.431, + "grad_norm": 2.3897616524548582e-11, + "learning_rate": 3.5088632408809757e-06, + "logits/chosen": -0.8067126274108887, + "logits/rejected": 0.3740237355232239, + "logps/chosen": -226.9942169189453, + "logps/rejected": -701.123291015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.234461784362793, + "rewards/margins": 38.51856231689453, + "rewards/rejected": -45.753013610839844, + "step": 4310 + }, + { + "epoch": 0.432, + "grad_norm": 4.3961008444615146e-11, + "learning_rate": 3.5008725813922383e-06, + "logits/chosen": -1.3616108894348145, + "logits/rejected": 0.6536494493484497, + "logps/chosen": -339.8933410644531, + "logps/rejected": -918.8294677734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.569557189941406, + "rewards/margins": 45.761993408203125, + "rewards/rejected": -52.33155059814453, + "step": 4320 + }, + { + "epoch": 0.433, + "grad_norm": 2.4098374662960553e-13, + "learning_rate": 3.4928697265869516e-06, + "logits/chosen": -0.5547953248023987, + "logits/rejected": 0.3869093954563141, + "logps/chosen": -310.59234619140625, + "logps/rejected": -775.0176391601562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.434072494506836, + "rewards/margins": 42.483253479003906, + "rewards/rejected": -52.917320251464844, + "step": 4330 + }, + { + "epoch": 0.434, + "grad_norm": 3.553122372197551e-15, + "learning_rate": 3.4848547739773782e-06, + "logits/chosen": -0.2578112483024597, + "logits/rejected": 0.28735774755477905, + "logps/chosen": -309.19537353515625, + "logps/rejected": -687.3353271484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.1480131149292, + "rewards/margins": 32.68153381347656, + "rewards/rejected": -45.82954788208008, + "step": 4340 + }, + { + "epoch": 0.435, + "grad_norm": 2.7313951165776243e-16, + "learning_rate": 3.476827821223184e-06, + "logits/chosen": -0.5213783383369446, + "logits/rejected": 0.43540406227111816, + "logps/chosen": -269.1460876464844, + "logps/rejected": -684.8394775390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.5526556968688965, + "rewards/margins": 39.1630859375, + "rewards/rejected": -46.71574020385742, + "step": 4350 + }, + { + "epoch": 0.436, + "grad_norm": 7.970552360347938e-07, + "learning_rate": 3.4687889661302577e-06, + "logits/chosen": -0.5087043046951294, + "logits/rejected": 0.5694769024848938, + "logps/chosen": -435.17156982421875, + "logps/rejected": -951.3097534179688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.336833953857422, + "rewards/margins": 45.699851989746094, + "rewards/rejected": -62.03668975830078, + "step": 4360 + }, + { + "epoch": 0.437, + "grad_norm": 9.015243307430865e-08, + "learning_rate": 3.460738306649509e-06, + "logits/chosen": -0.6573070287704468, + "logits/rejected": 0.7821485996246338, + "logps/chosen": -173.24493408203125, + "logps/rejected": -626.5826416015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.188490390777588, + "rewards/margins": 35.53300476074219, + "rewards/rejected": -42.721492767333984, + "step": 4370 + }, + { + "epoch": 0.438, + "grad_norm": 7.699915783887652e-18, + "learning_rate": 3.452675940875686e-06, + "logits/chosen": -0.7135838270187378, + "logits/rejected": 0.1635311394929886, + "logps/chosen": -472.3910217285156, + "logps/rejected": -774.1685791015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.920271873474121, + "rewards/margins": 31.03672218322754, + "rewards/rejected": -41.956993103027344, + "step": 4380 + }, + { + "epoch": 0.439, + "grad_norm": 0.0, + "learning_rate": 3.4446019670461684e-06, + "logits/chosen": -1.4899814128875732, + "logits/rejected": 0.6516152024269104, + "logps/chosen": -200.21481323242188, + "logps/rejected": -914.6165161132812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.7514543533325195, + "rewards/margins": 51.47700881958008, + "rewards/rejected": -57.22846603393555, + "step": 4390 + }, + { + "epoch": 0.44, + "grad_norm": 3.991723588114837e-06, + "learning_rate": 3.436516483539781e-06, + "logits/chosen": -0.6385399103164673, + "logits/rejected": 0.32198888063430786, + "logps/chosen": -381.22479248046875, + "logps/rejected": -689.6920776367188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.779531955718994, + "rewards/margins": 30.917633056640625, + "rewards/rejected": -38.697166442871094, + "step": 4400 + }, + { + "epoch": 0.441, + "grad_norm": 6.0199471275945715e-12, + "learning_rate": 3.4284195888755877e-06, + "logits/chosen": -0.6760299205780029, + "logits/rejected": 0.44728073477745056, + "logps/chosen": -283.923583984375, + "logps/rejected": -700.1207275390625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.251309394836426, + "rewards/margins": 32.73466491699219, + "rewards/rejected": -41.9859733581543, + "step": 4410 + }, + { + "epoch": 0.442, + "grad_norm": 1.9233450943012542e-10, + "learning_rate": 3.4203113817116955e-06, + "logits/chosen": -0.8296216726303101, + "logits/rejected": 0.1775527447462082, + "logps/chosen": -386.5934143066406, + "logps/rejected": -863.5372924804688, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.466753959655762, + "rewards/margins": 36.731040954589844, + "rewards/rejected": -51.197792053222656, + "step": 4420 + }, + { + "epoch": 0.443, + "grad_norm": 0.0, + "learning_rate": 3.412191960844049e-06, + "logits/chosen": -0.366151362657547, + "logits/rejected": 0.12416459619998932, + "logps/chosen": -387.76910400390625, + "logps/rejected": -773.8753662109375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.397980690002441, + "rewards/margins": 43.43345260620117, + "rewards/rejected": -52.8314323425293, + "step": 4430 + }, + { + "epoch": 0.444, + "grad_norm": 0.020295394584536552, + "learning_rate": 3.4040614252052305e-06, + "logits/chosen": -0.7474874258041382, + "logits/rejected": 0.4018009305000305, + "logps/chosen": -328.6078796386719, + "logps/rejected": -1040.136962890625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.811203002929688, + "rewards/margins": 60.68671417236328, + "rewards/rejected": -72.49790954589844, + "step": 4440 + }, + { + "epoch": 0.445, + "grad_norm": 0.0, + "learning_rate": 3.39591987386325e-06, + "logits/chosen": -0.22013449668884277, + "logits/rejected": 0.8452490568161011, + "logps/chosen": -458.42352294921875, + "logps/rejected": -1054.8280029296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.357414245605469, + "rewards/margins": 60.5202522277832, + "rewards/rejected": -75.87767028808594, + "step": 4450 + }, + { + "epoch": 0.446, + "grad_norm": 5.777521206575345e-16, + "learning_rate": 3.387767406020343e-06, + "logits/chosen": -0.4100729823112488, + "logits/rejected": 0.9636079668998718, + "logps/chosen": -517.3120727539062, + "logps/rejected": -1221.5855712890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.92448616027832, + "rewards/margins": 66.79444122314453, + "rewards/rejected": -80.71891784667969, + "step": 4460 + }, + { + "epoch": 0.447, + "grad_norm": 200.02029418945312, + "learning_rate": 3.3796041210117545e-06, + "logits/chosen": -0.5789368152618408, + "logits/rejected": 1.0708125829696655, + "logps/chosen": -411.73980712890625, + "logps/rejected": -1207.94091796875, + "loss": 0.1911, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -13.856058120727539, + "rewards/margins": 72.57444763183594, + "rewards/rejected": -86.43049621582031, + "step": 4470 + }, + { + "epoch": 0.448, + "grad_norm": 3.0700267127912767e-16, + "learning_rate": 3.3714301183045382e-06, + "logits/chosen": -0.26980918645858765, + "logits/rejected": 0.574004590511322, + "logps/chosen": -414.5419921875, + "logps/rejected": -1148.537353515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.525867462158203, + "rewards/margins": 74.47563171386719, + "rewards/rejected": -91.00149536132812, + "step": 4480 + }, + { + "epoch": 0.449, + "grad_norm": 7.729781224818932e-12, + "learning_rate": 3.3632454974963368e-06, + "logits/chosen": -0.4832285940647125, + "logits/rejected": 0.8683622479438782, + "logps/chosen": -475.9676208496094, + "logps/rejected": -1218.5394287109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.89743995666504, + "rewards/margins": 69.6901626586914, + "rewards/rejected": -90.58760070800781, + "step": 4490 + }, + { + "epoch": 0.45, + "grad_norm": 5.4706279506433475e-09, + "learning_rate": 3.3550503583141726e-06, + "logits/chosen": -0.607457160949707, + "logits/rejected": 1.0746209621429443, + "logps/chosen": -425.61590576171875, + "logps/rejected": -996.9503784179688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.851272583007812, + "rewards/margins": 54.49498748779297, + "rewards/rejected": -74.34626770019531, + "step": 4500 + }, + { + "epoch": 0.451, + "grad_norm": 3.78396756474126e-14, + "learning_rate": 3.346844800613229e-06, + "logits/chosen": -0.2813403606414795, + "logits/rejected": 1.1134769916534424, + "logps/chosen": -579.0421142578125, + "logps/rejected": -1489.37060546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -25.425365447998047, + "rewards/margins": 87.05371856689453, + "rewards/rejected": -112.47908020019531, + "step": 4510 + }, + { + "epoch": 0.452, + "grad_norm": 6.463025947756051e-11, + "learning_rate": 3.338628924375638e-06, + "logits/chosen": -0.7063466310501099, + "logits/rejected": 0.9095737338066101, + "logps/chosen": -304.01470947265625, + "logps/rejected": -1101.2586669921875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.164072036743164, + "rewards/margins": 64.8074722290039, + "rewards/rejected": -80.97154235839844, + "step": 4520 + }, + { + "epoch": 0.453, + "grad_norm": 0.0, + "learning_rate": 3.3304028297092583e-06, + "logits/chosen": -0.31411752104759216, + "logits/rejected": 0.9088078737258911, + "logps/chosen": -433.57940673828125, + "logps/rejected": -1088.935302734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.69645118713379, + "rewards/margins": 62.624244689941406, + "rewards/rejected": -80.32068634033203, + "step": 4530 + }, + { + "epoch": 0.454, + "grad_norm": 0.0, + "learning_rate": 3.3221666168464584e-06, + "logits/chosen": -0.5670292377471924, + "logits/rejected": 0.8151572942733765, + "logps/chosen": -518.0890502929688, + "logps/rejected": -1576.5970458984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -21.283605575561523, + "rewards/margins": 96.97727966308594, + "rewards/rejected": -118.26087951660156, + "step": 4540 + }, + { + "epoch": 0.455, + "grad_norm": 3.6856651064454127e-10, + "learning_rate": 3.313920386142892e-06, + "logits/chosen": -0.3371369242668152, + "logits/rejected": 0.8347536325454712, + "logps/chosen": -359.16168212890625, + "logps/rejected": -1016.52685546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.546493530273438, + "rewards/margins": 59.38935470581055, + "rewards/rejected": -76.93585205078125, + "step": 4550 + }, + { + "epoch": 0.456, + "grad_norm": 0.0, + "learning_rate": 3.3056642380762783e-06, + "logits/chosen": -0.4648679792881012, + "logits/rejected": 1.0613789558410645, + "logps/chosen": -707.9065551757812, + "logps/rejected": -1881.05078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -31.829044342041016, + "rewards/margins": 105.40827941894531, + "rewards/rejected": -137.2373046875, + "step": 4560 + }, + { + "epoch": 0.457, + "grad_norm": 0.0, + "learning_rate": 3.2973982732451753e-06, + "logits/chosen": -0.9866671562194824, + "logits/rejected": 1.4066946506500244, + "logps/chosen": -392.55731201171875, + "logps/rejected": -1470.923095703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.702869415283203, + "rewards/margins": 90.46757507324219, + "rewards/rejected": -110.17044830322266, + "step": 4570 + }, + { + "epoch": 0.458, + "grad_norm": 0.0, + "learning_rate": 3.2891225923677565e-06, + "logits/chosen": -0.5084947943687439, + "logits/rejected": 0.9717354774475098, + "logps/chosen": -396.46392822265625, + "logps/rejected": -1423.1839599609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.467742919921875, + "rewards/margins": 92.16461944580078, + "rewards/rejected": -106.63236236572266, + "step": 4580 + }, + { + "epoch": 0.459, + "grad_norm": 5.6872564788067e-17, + "learning_rate": 3.280837296280582e-06, + "logits/chosen": -0.8941437005996704, + "logits/rejected": 1.4038536548614502, + "logps/chosen": -263.62640380859375, + "logps/rejected": -1279.1226806640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.20585823059082, + "rewards/margins": 80.79095458984375, + "rewards/rejected": -94.9968032836914, + "step": 4590 + }, + { + "epoch": 0.46, + "grad_norm": 0.0, + "learning_rate": 3.272542485937369e-06, + "logits/chosen": -0.31348443031311035, + "logits/rejected": 0.8007584810256958, + "logps/chosen": -546.2010498046875, + "logps/rejected": -1243.5308837890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.36667251586914, + "rewards/margins": 70.93089294433594, + "rewards/rejected": -89.29756927490234, + "step": 4600 + }, + { + "epoch": 0.461, + "grad_norm": 0.0, + "learning_rate": 3.2642382624077647e-06, + "logits/chosen": -0.5122408270835876, + "logits/rejected": 0.698052704334259, + "logps/chosen": -435.46142578125, + "logps/rejected": -1183.85986328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.03237533569336, + "rewards/margins": 63.591339111328125, + "rewards/rejected": -79.62370300292969, + "step": 4610 + }, + { + "epoch": 0.462, + "grad_norm": 6.614533741655918e-19, + "learning_rate": 3.2559247268761117e-06, + "logits/chosen": -0.789227306842804, + "logits/rejected": 1.2054466009140015, + "logps/chosen": -334.6946105957031, + "logps/rejected": -1168.644775390625, + "loss": 0.1734, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -19.957490921020508, + "rewards/margins": 64.99995422363281, + "rewards/rejected": -84.95745086669922, + "step": 4620 + }, + { + "epoch": 0.463, + "grad_norm": 0.0, + "learning_rate": 3.247601980640217e-06, + "logits/chosen": -0.46238309144973755, + "logits/rejected": 0.8354955911636353, + "logps/chosen": -530.7396850585938, + "logps/rejected": -1429.529541015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.647016525268555, + "rewards/margins": 84.20267486572266, + "rewards/rejected": -101.84969329833984, + "step": 4630 + }, + { + "epoch": 0.464, + "grad_norm": 0.0, + "learning_rate": 3.2392701251101172e-06, + "logits/chosen": -0.7151135206222534, + "logits/rejected": 0.6666629910469055, + "logps/chosen": -438.103515625, + "logps/rejected": -1222.823974609375, + "loss": 0.0251, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.228551864624023, + "rewards/margins": 80.06420135498047, + "rewards/rejected": -92.29276275634766, + "step": 4640 + }, + { + "epoch": 0.465, + "grad_norm": 0.0, + "learning_rate": 3.230929261806842e-06, + "logits/chosen": -0.47254037857055664, + "logits/rejected": 0.9429537057876587, + "logps/chosen": -334.66949462890625, + "logps/rejected": -1285.926025390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.019699096679688, + "rewards/margins": 82.53520202636719, + "rewards/rejected": -101.55490112304688, + "step": 4650 + }, + { + "epoch": 0.466, + "grad_norm": 0.0, + "learning_rate": 3.222579492361179e-06, + "logits/chosen": -0.31999364495277405, + "logits/rejected": 1.0213924646377563, + "logps/chosen": -595.5615234375, + "logps/rejected": -1488.1405029296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.56376838684082, + "rewards/margins": 82.70274353027344, + "rewards/rejected": -102.26651000976562, + "step": 4660 + }, + { + "epoch": 0.467, + "grad_norm": 0.0, + "learning_rate": 3.214220918512434e-06, + "logits/chosen": -0.6097769737243652, + "logits/rejected": 0.9890214800834656, + "logps/chosen": -217.25765991210938, + "logps/rejected": -1152.97607421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.308805465698242, + "rewards/margins": 80.55377960205078, + "rewards/rejected": -92.86257934570312, + "step": 4670 + }, + { + "epoch": 0.468, + "grad_norm": 0.0, + "learning_rate": 3.205853642107192e-06, + "logits/chosen": -0.7291964292526245, + "logits/rejected": 0.6837356090545654, + "logps/chosen": -446.716796875, + "logps/rejected": -1542.17431640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.69511604309082, + "rewards/margins": 97.58460998535156, + "rewards/rejected": -117.27970886230469, + "step": 4680 + }, + { + "epoch": 0.469, + "grad_norm": 1.9307333374218513e-14, + "learning_rate": 3.1974777650980737e-06, + "logits/chosen": -0.882508397102356, + "logits/rejected": 0.8033136129379272, + "logps/chosen": -353.4367980957031, + "logps/rejected": -1384.420654296875, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.956860542297363, + "rewards/margins": 85.45812225341797, + "rewards/rejected": -100.41497802734375, + "step": 4690 + }, + { + "epoch": 0.47, + "grad_norm": 0.0, + "learning_rate": 3.189093389542498e-06, + "logits/chosen": -0.5763040781021118, + "logits/rejected": 1.2356733083724976, + "logps/chosen": -531.9615478515625, + "logps/rejected": -1474.219970703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -27.269664764404297, + "rewards/margins": 83.10777282714844, + "rewards/rejected": -110.3774185180664, + "step": 4700 + }, + { + "epoch": 0.471, + "grad_norm": 0.0, + "learning_rate": 3.180700617601436e-06, + "logits/chosen": -0.5389689207077026, + "logits/rejected": 0.7355275750160217, + "logps/chosen": -611.3015747070312, + "logps/rejected": -1604.859619140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -37.51515197753906, + "rewards/margins": 84.79194641113281, + "rewards/rejected": -122.3071060180664, + "step": 4710 + }, + { + "epoch": 0.472, + "grad_norm": 5.714040351989524e-13, + "learning_rate": 3.1722995515381644e-06, + "logits/chosen": -0.3022814393043518, + "logits/rejected": 1.2010146379470825, + "logps/chosen": -409.2784423828125, + "logps/rejected": -1302.8153076171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.96738052368164, + "rewards/margins": 79.1131362915039, + "rewards/rejected": -98.08050537109375, + "step": 4720 + }, + { + "epoch": 0.473, + "grad_norm": 1.995590777248055e-15, + "learning_rate": 3.1638902937170224e-06, + "logits/chosen": -0.5371532440185547, + "logits/rejected": 1.079555630683899, + "logps/chosen": -493.83477783203125, + "logps/rejected": -1238.79150390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.327775955200195, + "rewards/margins": 65.20509338378906, + "rewards/rejected": -85.53287506103516, + "step": 4730 + }, + { + "epoch": 0.474, + "grad_norm": 0.0, + "learning_rate": 3.155472946602162e-06, + "logits/chosen": 0.05416768044233322, + "logits/rejected": 0.6128655672073364, + "logps/chosen": -610.4722900390625, + "logps/rejected": -1401.691162109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -33.495906829833984, + "rewards/margins": 79.49934387207031, + "rewards/rejected": -112.99525451660156, + "step": 4740 + }, + { + "epoch": 0.475, + "grad_norm": 0.0, + "learning_rate": 3.147047612756302e-06, + "logits/chosen": -0.7330142259597778, + "logits/rejected": 1.4546329975128174, + "logps/chosen": -310.0386657714844, + "logps/rejected": -1607.7252197265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.920520782470703, + "rewards/margins": 114.64827728271484, + "rewards/rejected": -133.56878662109375, + "step": 4750 + }, + { + "epoch": 0.476, + "grad_norm": 0.0, + "learning_rate": 3.1386143948394764e-06, + "logits/chosen": -1.1134151220321655, + "logits/rejected": 1.2869594097137451, + "logps/chosen": -354.29571533203125, + "logps/rejected": -1476.500732421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -24.353538513183594, + "rewards/margins": 91.77238464355469, + "rewards/rejected": -116.12593841552734, + "step": 4760 + }, + { + "epoch": 0.477, + "grad_norm": 0.0, + "learning_rate": 3.130173395607785e-06, + "logits/chosen": -0.9510402679443359, + "logits/rejected": 1.2608510255813599, + "logps/chosen": -573.7791137695312, + "logps/rejected": -1927.2662353515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -23.785295486450195, + "rewards/margins": 120.95957946777344, + "rewards/rejected": -144.744873046875, + "step": 4770 + }, + { + "epoch": 0.478, + "grad_norm": 0.0, + "learning_rate": 3.121724717912138e-06, + "logits/chosen": -0.5686971545219421, + "logits/rejected": 1.2843066453933716, + "logps/chosen": -423.05084228515625, + "logps/rejected": -1424.4683837890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -25.960468292236328, + "rewards/margins": 82.43710327148438, + "rewards/rejected": -108.3975830078125, + "step": 4780 + }, + { + "epoch": 0.479, + "grad_norm": 0.0, + "learning_rate": 3.1132684646970068e-06, + "logits/chosen": -0.25530606508255005, + "logits/rejected": 0.879818320274353, + "logps/chosen": -614.8173217773438, + "logps/rejected": -1260.456787109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -27.6771183013916, + "rewards/margins": 61.40053176879883, + "rewards/rejected": -89.07764434814453, + "step": 4790 + }, + { + "epoch": 0.48, + "grad_norm": 0.0, + "learning_rate": 3.1048047389991693e-06, + "logits/chosen": -0.5389418601989746, + "logits/rejected": 1.1465704441070557, + "logps/chosen": -342.2206115722656, + "logps/rejected": -1164.1463623046875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.74749755859375, + "rewards/margins": 69.92078399658203, + "rewards/rejected": -88.66828918457031, + "step": 4800 + }, + { + "epoch": 0.481, + "grad_norm": 0.0, + "learning_rate": 3.0963336439464527e-06, + "logits/chosen": -1.0253163576126099, + "logits/rejected": 0.9419560432434082, + "logps/chosen": -423.51068115234375, + "logps/rejected": -1376.6455078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.159902572631836, + "rewards/margins": 77.23731994628906, + "rewards/rejected": -95.39723205566406, + "step": 4810 + }, + { + "epoch": 0.482, + "grad_norm": 0.0, + "learning_rate": 3.087855282756475e-06, + "logits/chosen": -0.06365472078323364, + "logits/rejected": 0.9617937803268433, + "logps/chosen": -586.2677001953125, + "logps/rejected": -1372.3779296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -33.02089309692383, + "rewards/margins": 73.61542510986328, + "rewards/rejected": -106.63630676269531, + "step": 4820 + }, + { + "epoch": 0.483, + "grad_norm": 2.512538492409947e-21, + "learning_rate": 3.079369758735393e-06, + "logits/chosen": -0.1162848025560379, + "logits/rejected": 1.0244472026824951, + "logps/chosen": -283.7181091308594, + "logps/rejected": -922.8565673828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.87861442565918, + "rewards/margins": 58.47466278076172, + "rewards/rejected": -74.35327911376953, + "step": 4830 + }, + { + "epoch": 0.484, + "grad_norm": 2.2687768005437425e-13, + "learning_rate": 3.0708771752766397e-06, + "logits/chosen": -0.730124294757843, + "logits/rejected": 1.3653849363327026, + "logps/chosen": -306.2144470214844, + "logps/rejected": -1348.135009765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.26485824584961, + "rewards/margins": 90.71617126464844, + "rewards/rejected": -106.98104095458984, + "step": 4840 + }, + { + "epoch": 0.485, + "grad_norm": 3.276530235132772e-18, + "learning_rate": 3.062377635859663e-06, + "logits/chosen": -0.5650082230567932, + "logits/rejected": 0.9500142931938171, + "logps/chosen": -571.1583862304688, + "logps/rejected": -1898.913818359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -24.382946014404297, + "rewards/margins": 123.2589111328125, + "rewards/rejected": -147.641845703125, + "step": 4850 + }, + { + "epoch": 0.486, + "grad_norm": 0.0, + "learning_rate": 3.053871244048669e-06, + "logits/chosen": -0.1732010543346405, + "logits/rejected": 0.6027408242225647, + "logps/chosen": -812.1961669921875, + "logps/rejected": -1584.22314453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -24.407480239868164, + "rewards/margins": 88.35913848876953, + "rewards/rejected": -112.7666015625, + "step": 4860 + }, + { + "epoch": 0.487, + "grad_norm": 0.0, + "learning_rate": 3.045358103491357e-06, + "logits/chosen": -0.3686712086200714, + "logits/rejected": 1.344543695449829, + "logps/chosen": -415.87420654296875, + "logps/rejected": -1310.935791015625, + "loss": 0.1168, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -24.52159309387207, + "rewards/margins": 77.90599060058594, + "rewards/rejected": -102.42757415771484, + "step": 4870 + }, + { + "epoch": 0.488, + "grad_norm": 0.0, + "learning_rate": 3.0368383179176584e-06, + "logits/chosen": -0.8000626564025879, + "logits/rejected": 1.0974117517471313, + "logps/chosen": -320.93768310546875, + "logps/rejected": -1404.354736328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -21.79820442199707, + "rewards/margins": 94.18359375, + "rewards/rejected": -115.98179626464844, + "step": 4880 + }, + { + "epoch": 0.489, + "grad_norm": 0.0, + "learning_rate": 3.0283119911384724e-06, + "logits/chosen": -0.8725460171699524, + "logits/rejected": 1.4595402479171753, + "logps/chosen": -270.36187744140625, + "logps/rejected": -1621.961669921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.967273712158203, + "rewards/margins": 112.51634216308594, + "rewards/rejected": -129.48361206054688, + "step": 4890 + }, + { + "epoch": 0.49, + "grad_norm": 0.0, + "learning_rate": 3.019779227044398e-06, + "logits/chosen": -0.5901892781257629, + "logits/rejected": 1.4911832809448242, + "logps/chosen": -424.73095703125, + "logps/rejected": -1390.6676025390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -22.552160263061523, + "rewards/margins": 86.83820343017578, + "rewards/rejected": -109.39036560058594, + "step": 4900 + }, + { + "epoch": 0.491, + "grad_norm": 0.0, + "learning_rate": 3.0112401296044756e-06, + "logits/chosen": -0.31066763401031494, + "logits/rejected": 1.3882120847702026, + "logps/chosen": -620.4592895507812, + "logps/rejected": -1867.9720458984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -32.121437072753906, + "rewards/margins": 115.75643157958984, + "rewards/rejected": -147.8778533935547, + "step": 4910 + }, + { + "epoch": 0.492, + "grad_norm": 0.0, + "learning_rate": 3.002694802864912e-06, + "logits/chosen": -0.33419251441955566, + "logits/rejected": 1.1376793384552002, + "logps/chosen": -537.1612548828125, + "logps/rejected": -1480.853515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -31.045330047607422, + "rewards/margins": 86.95547485351562, + "rewards/rejected": -118.00080871582031, + "step": 4920 + }, + { + "epoch": 0.493, + "grad_norm": 0.0, + "learning_rate": 2.9941433509478157e-06, + "logits/chosen": -0.39663586020469666, + "logits/rejected": 1.4704856872558594, + "logps/chosen": -459.516845703125, + "logps/rejected": -1432.1171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -23.825571060180664, + "rewards/margins": 90.44802856445312, + "rewards/rejected": -114.27359771728516, + "step": 4930 + }, + { + "epoch": 0.494, + "grad_norm": 0.0, + "learning_rate": 2.98558587804993e-06, + "logits/chosen": -0.7232345342636108, + "logits/rejected": 1.2291861772537231, + "logps/chosen": -293.2331237792969, + "logps/rejected": -1354.670654296875, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.588459014892578, + "rewards/margins": 89.76078796386719, + "rewards/rejected": -107.3492431640625, + "step": 4940 + }, + { + "epoch": 0.495, + "grad_norm": 0.0, + "learning_rate": 2.9770224884413625e-06, + "logits/chosen": -0.5555292963981628, + "logits/rejected": 1.3182722330093384, + "logps/chosen": -564.3573608398438, + "logps/rejected": -2021.4749755859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -33.60253143310547, + "rewards/margins": 131.98068237304688, + "rewards/rejected": -165.58322143554688, + "step": 4950 + }, + { + "epoch": 0.496, + "grad_norm": 0.0, + "learning_rate": 2.9684532864643123e-06, + "logits/chosen": 0.037823986262083054, + "logits/rejected": 1.2553424835205078, + "logps/chosen": -715.589111328125, + "logps/rejected": -1661.09765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -40.89625549316406, + "rewards/margins": 89.43692779541016, + "rewards/rejected": -130.33316040039062, + "step": 4960 + }, + { + "epoch": 0.497, + "grad_norm": 0.0, + "learning_rate": 2.9598783765318005e-06, + "logits/chosen": -0.36400288343429565, + "logits/rejected": 1.1979119777679443, + "logps/chosen": -501.673583984375, + "logps/rejected": -1469.694580078125, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -27.255168914794922, + "rewards/margins": 88.84143829345703, + "rewards/rejected": -116.09659576416016, + "step": 4970 + }, + { + "epoch": 0.498, + "grad_norm": 0.0, + "learning_rate": 2.9512978631264006e-06, + "logits/chosen": -0.011648990213871002, + "logits/rejected": 1.1334331035614014, + "logps/chosen": -927.916015625, + "logps/rejected": -2148.081298828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -45.16447067260742, + "rewards/margins": 127.28688049316406, + "rewards/rejected": -172.4513397216797, + "step": 4980 + }, + { + "epoch": 0.499, + "grad_norm": 0.0, + "learning_rate": 2.942711850798959e-06, + "logits/chosen": -0.4471127986907959, + "logits/rejected": 1.7075560092926025, + "logps/chosen": -536.8842163085938, + "logps/rejected": -1839.9417724609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -36.184226989746094, + "rewards/margins": 116.0263900756836, + "rewards/rejected": -152.2106170654297, + "step": 4990 + }, + { + "epoch": 0.5, + "grad_norm": 0.0, + "learning_rate": 2.9341204441673267e-06, + "logits/chosen": -0.24953731894493103, + "logits/rejected": 1.3373390436172485, + "logps/chosen": -591.9629516601562, + "logps/rejected": -1578.786865234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -28.433740615844727, + "rewards/margins": 102.1045913696289, + "rewards/rejected": -130.538330078125, + "step": 5000 + }, + { + "epoch": 0.501, + "grad_norm": 1.4258596039984361e-11, + "learning_rate": 2.9255237479150815e-06, + "logits/chosen": -0.03982694074511528, + "logits/rejected": 1.6737682819366455, + "logps/chosen": -937.9222412109375, + "logps/rejected": -2133.9873046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -60.917579650878906, + "rewards/margins": 112.63969421386719, + "rewards/rejected": -173.55728149414062, + "step": 5010 + }, + { + "epoch": 0.502, + "grad_norm": 0.0, + "learning_rate": 2.9169218667902562e-06, + "logits/chosen": -0.39063578844070435, + "logits/rejected": 1.115846872329712, + "logps/chosen": -773.4056396484375, + "logps/rejected": -2109.5830078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -41.8221321105957, + "rewards/margins": 122.40623474121094, + "rewards/rejected": -164.22836303710938, + "step": 5020 + }, + { + "epoch": 0.503, + "grad_norm": 0.0, + "learning_rate": 2.908314905604056e-06, + "logits/chosen": -0.6067632436752319, + "logits/rejected": 1.213060975074768, + "logps/chosen": -385.57440185546875, + "logps/rejected": -1478.2047119140625, + "loss": 0.031, + "rewards/accuracies": 1.0, + "rewards/chosen": -21.669397354125977, + "rewards/margins": 100.73478698730469, + "rewards/rejected": -122.4041976928711, + "step": 5030 + }, + { + "epoch": 0.504, + "grad_norm": 0.0, + "learning_rate": 2.8997029692295875e-06, + "logits/chosen": -0.15307000279426575, + "logits/rejected": 1.7413864135742188, + "logps/chosen": -553.080322265625, + "logps/rejected": -1663.1246337890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -24.34833526611328, + "rewards/margins": 105.29701232910156, + "rewards/rejected": -129.6453399658203, + "step": 5040 + }, + { + "epoch": 0.505, + "grad_norm": 0.0, + "learning_rate": 2.8910861626005774e-06, + "logits/chosen": -0.13748657703399658, + "logits/rejected": 1.384549617767334, + "logps/chosen": -447.5091247558594, + "logps/rejected": -1429.4466552734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -25.847366333007812, + "rewards/margins": 87.37092590332031, + "rewards/rejected": -113.21829986572266, + "step": 5050 + }, + { + "epoch": 0.506, + "grad_norm": 2.4198081518989056e-05, + "learning_rate": 2.8824645907100957e-06, + "logits/chosen": -0.11051235347986221, + "logits/rejected": 0.8797086477279663, + "logps/chosen": -620.237548828125, + "logps/rejected": -1389.697021484375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -25.004684448242188, + "rewards/margins": 79.93074035644531, + "rewards/rejected": -104.9354248046875, + "step": 5060 + }, + { + "epoch": 0.507, + "grad_norm": 7.882509645007474e-10, + "learning_rate": 2.8738383586092745e-06, + "logits/chosen": 0.07782775163650513, + "logits/rejected": 1.0023387670516968, + "logps/chosen": -517.6043090820312, + "logps/rejected": -1478.5277099609375, + "loss": 0.3021, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -30.25027847290039, + "rewards/margins": 93.49092102050781, + "rewards/rejected": -123.7411880493164, + "step": 5070 + }, + { + "epoch": 0.508, + "grad_norm": 6.824987797138249e-11, + "learning_rate": 2.8652075714060296e-06, + "logits/chosen": -1.0027350187301636, + "logits/rejected": 0.8647255897521973, + "logps/chosen": -311.21234130859375, + "logps/rejected": -1197.444091796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.196630477905273, + "rewards/margins": 74.7915267944336, + "rewards/rejected": -92.98814392089844, + "step": 5080 + }, + { + "epoch": 0.509, + "grad_norm": 2.4030850490281355e-19, + "learning_rate": 2.8565723342637797e-06, + "logits/chosen": -0.7676445841789246, + "logits/rejected": 0.6487428545951843, + "logps/chosen": -506.98919677734375, + "logps/rejected": -1381.9840087890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.320411682128906, + "rewards/margins": 81.42391204833984, + "rewards/rejected": -96.74433898925781, + "step": 5090 + }, + { + "epoch": 0.51, + "grad_norm": 0.0, + "learning_rate": 2.847932752400164e-06, + "logits/chosen": -0.817142128944397, + "logits/rejected": 0.9467900991439819, + "logps/chosen": -381.5802307128906, + "logps/rejected": -1384.46142578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.012475967407227, + "rewards/margins": 84.2886962890625, + "rewards/rejected": -103.30118560791016, + "step": 5100 + }, + { + "epoch": 0.511, + "grad_norm": 0.0, + "learning_rate": 2.8392889310857615e-06, + "logits/chosen": -0.8176937103271484, + "logits/rejected": 0.7632077932357788, + "logps/chosen": -248.266357421875, + "logps/rejected": -954.0487060546875, + "loss": 0.0361, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.92024040222168, + "rewards/margins": 60.94586181640625, + "rewards/rejected": -69.86610412597656, + "step": 5110 + }, + { + "epoch": 0.512, + "grad_norm": 1.8101504364408225e-17, + "learning_rate": 2.8306409756428067e-06, + "logits/chosen": -0.6258947253227234, + "logits/rejected": 0.6817538738250732, + "logps/chosen": -364.7867126464844, + "logps/rejected": -1302.393798828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.83382225036621, + "rewards/margins": 77.67640686035156, + "rewards/rejected": -98.51023864746094, + "step": 5120 + }, + { + "epoch": 0.513, + "grad_norm": 0.0, + "learning_rate": 2.8219889914439073e-06, + "logits/chosen": -1.2403560876846313, + "logits/rejected": 0.8351043462753296, + "logps/chosen": -279.90972900390625, + "logps/rejected": -1319.77099609375, + "loss": 0.1724, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -12.439977645874023, + "rewards/margins": 79.86945343017578, + "rewards/rejected": -92.30941772460938, + "step": 5130 + }, + { + "epoch": 0.514, + "grad_norm": 8.39083418450239e-20, + "learning_rate": 2.813333083910761e-06, + "logits/chosen": -0.9665325284004211, + "logits/rejected": 0.5111045241355896, + "logps/chosen": -145.3264923095703, + "logps/rejected": -1021.9627685546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.8399481773376465, + "rewards/margins": 69.44557189941406, + "rewards/rejected": -76.2855224609375, + "step": 5140 + }, + { + "epoch": 0.515, + "grad_norm": 0.0, + "learning_rate": 2.804673358512869e-06, + "logits/chosen": -0.45989829301834106, + "logits/rejected": 0.41274410486221313, + "logps/chosen": -561.388916015625, + "logps/rejected": -1412.365234375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.812267303466797, + "rewards/margins": 80.39027404785156, + "rewards/rejected": -101.2025375366211, + "step": 5150 + }, + { + "epoch": 0.516, + "grad_norm": 0.0, + "learning_rate": 2.7960099207662535e-06, + "logits/chosen": 0.02474859170615673, + "logits/rejected": 0.44673410058021545, + "logps/chosen": -635.7348022460938, + "logps/rejected": -1145.3740234375, + "loss": 0.9996, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.584806442260742, + "rewards/margins": 56.94877243041992, + "rewards/rejected": -79.53358459472656, + "step": 5160 + }, + { + "epoch": 0.517, + "grad_norm": 0.0, + "learning_rate": 2.7873428762321667e-06, + "logits/chosen": -0.7794687151908875, + "logits/rejected": 0.7619781494140625, + "logps/chosen": -375.510986328125, + "logps/rejected": -1299.2457275390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.562225341796875, + "rewards/margins": 83.94371032714844, + "rewards/rejected": -101.50593566894531, + "step": 5170 + }, + { + "epoch": 0.518, + "grad_norm": 0.0, + "learning_rate": 2.778672330515814e-06, + "logits/chosen": -0.6919055581092834, + "logits/rejected": 0.6613712310791016, + "logps/chosen": -451.2239685058594, + "logps/rejected": -1180.0167236328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -22.567163467407227, + "rewards/margins": 60.3519172668457, + "rewards/rejected": -82.91908264160156, + "step": 5180 + }, + { + "epoch": 0.519, + "grad_norm": 0.0, + "learning_rate": 2.769998389265057e-06, + "logits/chosen": -0.21784038841724396, + "logits/rejected": 0.3405402600765228, + "logps/chosen": -757.1791381835938, + "logps/rejected": -1300.2464599609375, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -23.873455047607422, + "rewards/margins": 60.86193084716797, + "rewards/rejected": -84.7353744506836, + "step": 5190 + }, + { + "epoch": 0.52, + "grad_norm": 0.0, + "learning_rate": 2.761321158169134e-06, + "logits/chosen": -0.7988853454589844, + "logits/rejected": 0.634006917476654, + "logps/chosen": -276.1188049316406, + "logps/rejected": -1188.635009765625, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.00566291809082, + "rewards/margins": 72.88716125488281, + "rewards/rejected": -86.89281463623047, + "step": 5200 + }, + { + "epoch": 0.521, + "grad_norm": 0.0, + "learning_rate": 2.752640742957366e-06, + "logits/chosen": -1.1302502155303955, + "logits/rejected": 0.5086938738822937, + "logps/chosen": -423.6642150878906, + "logps/rejected": -1387.6676025390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.596208572387695, + "rewards/margins": 78.29840850830078, + "rewards/rejected": -95.89461517333984, + "step": 5210 + }, + { + "epoch": 0.522, + "grad_norm": 0.0, + "learning_rate": 2.743957249397874e-06, + "logits/chosen": -1.0586451292037964, + "logits/rejected": 0.7228761315345764, + "logps/chosen": -252.70962524414062, + "logps/rejected": -1112.5032958984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.525865077972412, + "rewards/margins": 71.3156509399414, + "rewards/rejected": -78.84151458740234, + "step": 5220 + }, + { + "epoch": 0.523, + "grad_norm": 0.0, + "learning_rate": 2.7352707832962865e-06, + "logits/chosen": -0.5234454274177551, + "logits/rejected": 0.3423479497432709, + "logps/chosen": -307.30731201171875, + "logps/rejected": -1012.4879760742188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.181329727172852, + "rewards/margins": 61.77275848388672, + "rewards/rejected": -74.95409393310547, + "step": 5230 + }, + { + "epoch": 0.524, + "grad_norm": 1.0060982570359205e-16, + "learning_rate": 2.726581450494451e-06, + "logits/chosen": -0.9201523065567017, + "logits/rejected": 0.5635375380516052, + "logps/chosen": -228.01455688476562, + "logps/rejected": -932.2889404296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.266953468322754, + "rewards/margins": 57.03753662109375, + "rewards/rejected": -64.30448913574219, + "step": 5240 + }, + { + "epoch": 0.525, + "grad_norm": 0.0, + "learning_rate": 2.717889356869146e-06, + "logits/chosen": -0.9244860410690308, + "logits/rejected": 0.3733980357646942, + "logps/chosen": -389.0986328125, + "logps/rejected": -1089.4500732421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.316126823425293, + "rewards/margins": 57.2689323425293, + "rewards/rejected": -69.58506774902344, + "step": 5250 + }, + { + "epoch": 0.526, + "grad_norm": 0.0, + "learning_rate": 2.70919460833079e-06, + "logits/chosen": -1.0785424709320068, + "logits/rejected": 0.5775918960571289, + "logps/chosen": -389.9350891113281, + "logps/rejected": -1264.129638671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.48707103729248, + "rewards/margins": 67.63380432128906, + "rewards/rejected": -81.1208724975586, + "step": 5260 + }, + { + "epoch": 0.527, + "grad_norm": 7.135824320759349e-14, + "learning_rate": 2.700497310822147e-06, + "logits/chosen": -0.6644527316093445, + "logits/rejected": 0.04605517536401749, + "logps/chosen": -502.5401306152344, + "logps/rejected": -885.4474487304688, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.509904861450195, + "rewards/margins": 48.577606201171875, + "rewards/rejected": -58.0875129699707, + "step": 5270 + }, + { + "epoch": 0.528, + "grad_norm": 0.0, + "learning_rate": 2.6917975703170466e-06, + "logits/chosen": -1.0135033130645752, + "logits/rejected": 0.859094500541687, + "logps/chosen": -378.8349304199219, + "logps/rejected": -1455.221923828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.651538848876953, + "rewards/margins": 89.67914581298828, + "rewards/rejected": -106.33067321777344, + "step": 5280 + }, + { + "epoch": 0.529, + "grad_norm": 7.46895166230388e-05, + "learning_rate": 2.6830954928190795e-06, + "logits/chosen": -0.7778174877166748, + "logits/rejected": 0.6204961538314819, + "logps/chosen": -495.7356872558594, + "logps/rejected": -1281.3050537109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.955945014953613, + "rewards/margins": 75.40312957763672, + "rewards/rejected": -90.35907745361328, + "step": 5290 + }, + { + "epoch": 0.53, + "grad_norm": 1.8743726564813783e-18, + "learning_rate": 2.6743911843603134e-06, + "logits/chosen": -0.44399577379226685, + "logits/rejected": 0.5348590612411499, + "logps/chosen": -536.5358276367188, + "logps/rejected": -1139.5767822265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.537757873535156, + "rewards/margins": 64.67687225341797, + "rewards/rejected": -82.21463012695312, + "step": 5300 + }, + { + "epoch": 0.531, + "grad_norm": 0.0, + "learning_rate": 2.6656847510000013e-06, + "logits/chosen": -1.0946407318115234, + "logits/rejected": 0.8582640886306763, + "logps/chosen": -297.0832824707031, + "logps/rejected": -1163.2041015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.352200508117676, + "rewards/margins": 73.29154968261719, + "rewards/rejected": -86.64375305175781, + "step": 5310 + }, + { + "epoch": 0.532, + "grad_norm": 0.0, + "learning_rate": 2.6569762988232838e-06, + "logits/chosen": -0.6373372077941895, + "logits/rejected": 0.6762363314628601, + "logps/chosen": -314.57879638671875, + "logps/rejected": -1196.302978515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.09781265258789, + "rewards/margins": 71.00390625, + "rewards/rejected": -87.10172271728516, + "step": 5320 + }, + { + "epoch": 0.533, + "grad_norm": 0.0, + "learning_rate": 2.6482659339399047e-06, + "logits/chosen": -0.6710726022720337, + "logits/rejected": 0.7335922122001648, + "logps/chosen": -502.332763671875, + "logps/rejected": -1351.554931640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -21.287044525146484, + "rewards/margins": 76.317138671875, + "rewards/rejected": -97.60417938232422, + "step": 5330 + }, + { + "epoch": 0.534, + "grad_norm": 0.0, + "learning_rate": 2.63955376248291e-06, + "logits/chosen": -0.6070116758346558, + "logits/rejected": 0.26677125692367554, + "logps/chosen": -332.3114929199219, + "logps/rejected": -861.4298706054688, + "loss": 0.2173, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.19046688079834, + "rewards/margins": 51.89105987548828, + "rewards/rejected": -62.0815315246582, + "step": 5340 + }, + { + "epoch": 0.535, + "grad_norm": 0.0, + "learning_rate": 2.6308398906073603e-06, + "logits/chosen": -0.7868450880050659, + "logits/rejected": 0.4568649232387543, + "logps/chosen": -312.804931640625, + "logps/rejected": -983.685546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.068277359008789, + "rewards/margins": 56.0164909362793, + "rewards/rejected": -67.08477020263672, + "step": 5350 + }, + { + "epoch": 0.536, + "grad_norm": 0.0, + "learning_rate": 2.6221244244890336e-06, + "logits/chosen": -0.29687172174453735, + "logits/rejected": 0.35419854521751404, + "logps/chosen": -341.05596923828125, + "logps/rejected": -851.9397583007812, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.857192993164062, + "rewards/margins": 52.79615020751953, + "rewards/rejected": -63.653350830078125, + "step": 5360 + }, + { + "epoch": 0.537, + "grad_norm": 0.0, + "learning_rate": 2.613407470323134e-06, + "logits/chosen": -0.5562046766281128, + "logits/rejected": 0.6017817258834839, + "logps/chosen": -576.7545776367188, + "logps/rejected": -1128.655517578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.32354736328125, + "rewards/margins": 50.09172439575195, + "rewards/rejected": -66.41526794433594, + "step": 5370 + }, + { + "epoch": 0.538, + "grad_norm": 2.7073644212871053e-15, + "learning_rate": 2.604689134322999e-06, + "logits/chosen": -0.7671887278556824, + "logits/rejected": 0.18423447012901306, + "logps/chosen": -495.18218994140625, + "logps/rejected": -990.7804565429688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.015054702758789, + "rewards/margins": 44.199180603027344, + "rewards/rejected": -55.2142333984375, + "step": 5380 + }, + { + "epoch": 0.539, + "grad_norm": 0.0, + "learning_rate": 2.5959695227188e-06, + "logits/chosen": -0.8589827418327332, + "logits/rejected": 0.6271054148674011, + "logps/chosen": -410.236572265625, + "logps/rejected": -1313.5830078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.382259368896484, + "rewards/margins": 74.65422058105469, + "rewards/rejected": -92.0364761352539, + "step": 5390 + }, + { + "epoch": 0.54, + "grad_norm": 9.483873873250559e-05, + "learning_rate": 2.587248741756253e-06, + "logits/chosen": -0.36108261346817017, + "logits/rejected": 0.16383466124534607, + "logps/chosen": -565.5712890625, + "logps/rejected": -1062.492919921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.465934753417969, + "rewards/margins": 53.61336135864258, + "rewards/rejected": -65.07929229736328, + "step": 5400 + }, + { + "epoch": 0.541, + "grad_norm": 2.3672402471412723e-13, + "learning_rate": 2.578526897695321e-06, + "logits/chosen": -0.5892329216003418, + "logits/rejected": 0.7394314408302307, + "logps/chosen": -379.57183837890625, + "logps/rejected": -951.5711669921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.3280029296875, + "rewards/margins": 51.3201904296875, + "rewards/rejected": -65.648193359375, + "step": 5410 + }, + { + "epoch": 0.542, + "grad_norm": 0.0, + "learning_rate": 2.569804096808923e-06, + "logits/chosen": -0.6709790229797363, + "logits/rejected": 0.42003265023231506, + "logps/chosen": -464.9453125, + "logps/rejected": -999.1988525390625, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.656137466430664, + "rewards/margins": 50.31475830078125, + "rewards/rejected": -59.97089385986328, + "step": 5420 + }, + { + "epoch": 0.543, + "grad_norm": 5.099034326824965e-14, + "learning_rate": 2.5610804453816333e-06, + "logits/chosen": -0.7774316072463989, + "logits/rejected": 0.4784523844718933, + "logps/chosen": -458.3973693847656, + "logps/rejected": -1123.7091064453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.050821304321289, + "rewards/margins": 60.913475036621094, + "rewards/rejected": -75.96430206298828, + "step": 5430 + }, + { + "epoch": 0.544, + "grad_norm": 0.0, + "learning_rate": 2.5523560497083927e-06, + "logits/chosen": -0.5590888261795044, + "logits/rejected": 0.5929467082023621, + "logps/chosen": -469.6875915527344, + "logps/rejected": -1505.582763671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.421056747436523, + "rewards/margins": 93.0894775390625, + "rewards/rejected": -109.51053619384766, + "step": 5440 + }, + { + "epoch": 0.545, + "grad_norm": 6.712893271306715e-18, + "learning_rate": 2.543631016093209e-06, + "logits/chosen": -0.6673922538757324, + "logits/rejected": 0.9135753512382507, + "logps/chosen": -501.02728271484375, + "logps/rejected": -1274.625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.747516632080078, + "rewards/margins": 70.78562927246094, + "rewards/rejected": -89.53314971923828, + "step": 5450 + }, + { + "epoch": 0.546, + "grad_norm": 0.0, + "learning_rate": 2.5349054508478636e-06, + "logits/chosen": -0.7824907898902893, + "logits/rejected": 0.8109520077705383, + "logps/chosen": -499.3113708496094, + "logps/rejected": -1379.143798828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.124048233032227, + "rewards/margins": 87.3325424194336, + "rewards/rejected": -106.45658874511719, + "step": 5460 + }, + { + "epoch": 0.547, + "grad_norm": 4.859037795943949e-19, + "learning_rate": 2.526179460290615e-06, + "logits/chosen": -0.8277426958084106, + "logits/rejected": 1.123626470565796, + "logps/chosen": -290.3539733886719, + "logps/rejected": -1319.580322265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.491305351257324, + "rewards/margins": 89.3653335571289, + "rewards/rejected": -104.85664367675781, + "step": 5470 + }, + { + "epoch": 0.548, + "grad_norm": 0.16051889955997467, + "learning_rate": 2.517453150744904e-06, + "logits/chosen": -0.36050155758857727, + "logits/rejected": 1.1666429042816162, + "logps/chosen": -409.23846435546875, + "logps/rejected": -1217.471923828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.654441833496094, + "rewards/margins": 78.6314468383789, + "rewards/rejected": -94.28587341308594, + "step": 5480 + }, + { + "epoch": 0.549, + "grad_norm": 1.2967491197527487e-22, + "learning_rate": 2.5087266285380597e-06, + "logits/chosen": -0.4092417359352112, + "logits/rejected": 0.9499204754829407, + "logps/chosen": -409.16278076171875, + "logps/rejected": -1338.765869140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.687070846557617, + "rewards/margins": 84.14730834960938, + "rewards/rejected": -98.83438110351562, + "step": 5490 + }, + { + "epoch": 0.55, + "grad_norm": 1.0176245092833531e-21, + "learning_rate": 2.5e-06, + "logits/chosen": -0.47044605016708374, + "logits/rejected": 0.5489223599433899, + "logps/chosen": -455.0294494628906, + "logps/rejected": -1252.549072265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.840413093566895, + "rewards/margins": 80.813720703125, + "rewards/rejected": -95.65412902832031, + "step": 5500 + }, + { + "epoch": 0.551, + "grad_norm": 0.0, + "learning_rate": 2.4912733714619415e-06, + "logits/chosen": -1.1358522176742554, + "logits/rejected": 0.8391151428222656, + "logps/chosen": -336.40625, + "logps/rejected": -1572.833251953125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.205284118652344, + "rewards/margins": 105.4991455078125, + "rewards/rejected": -118.70442962646484, + "step": 5510 + }, + { + "epoch": 0.552, + "grad_norm": 0.0, + "learning_rate": 2.482546849255096e-06, + "logits/chosen": -0.43424397706985474, + "logits/rejected": 1.4264408349990845, + "logps/chosen": -546.3912353515625, + "logps/rejected": -1812.598388671875, + "loss": 0.59, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -33.54061508178711, + "rewards/margins": 116.97703552246094, + "rewards/rejected": -150.51763916015625, + "step": 5520 + }, + { + "epoch": 0.553, + "grad_norm": 0.0, + "learning_rate": 2.4738205397093863e-06, + "logits/chosen": 0.08370640128850937, + "logits/rejected": 1.2765973806381226, + "logps/chosen": -435.69342041015625, + "logps/rejected": -1340.457275390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -23.692970275878906, + "rewards/margins": 87.23201751708984, + "rewards/rejected": -110.92498779296875, + "step": 5530 + }, + { + "epoch": 0.554, + "grad_norm": 0.0, + "learning_rate": 2.4650945491521372e-06, + "logits/chosen": -0.7669180035591125, + "logits/rejected": 0.8787969350814819, + "logps/chosen": -640.8975830078125, + "logps/rejected": -1826.0234375, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -28.16714096069336, + "rewards/margins": 105.84178161621094, + "rewards/rejected": -134.00892639160156, + "step": 5540 + }, + { + "epoch": 0.555, + "grad_norm": 1.2562163765814094e-12, + "learning_rate": 2.4563689839067913e-06, + "logits/chosen": -0.3882649838924408, + "logits/rejected": 0.9912908673286438, + "logps/chosen": -418.70263671875, + "logps/rejected": -1501.5875244140625, + "loss": 0.1537, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -23.708744049072266, + "rewards/margins": 99.52375793457031, + "rewards/rejected": -123.23250579833984, + "step": 5550 + }, + { + "epoch": 0.556, + "grad_norm": 0.0, + "learning_rate": 2.447643950291608e-06, + "logits/chosen": -0.444000244140625, + "logits/rejected": 1.3806891441345215, + "logps/chosen": -306.3438415527344, + "logps/rejected": -1433.14501953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.685264587402344, + "rewards/margins": 95.08927154541016, + "rewards/rejected": -115.7745361328125, + "step": 5560 + }, + { + "epoch": 0.557, + "grad_norm": 2.19682027375889e-18, + "learning_rate": 2.4389195546183676e-06, + "logits/chosen": -0.7899306416511536, + "logits/rejected": 1.2382338047027588, + "logps/chosen": -407.1981506347656, + "logps/rejected": -1564.7291259765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.620243072509766, + "rewards/margins": 104.19229888916016, + "rewards/rejected": -121.81254577636719, + "step": 5570 + }, + { + "epoch": 0.558, + "grad_norm": 0.0, + "learning_rate": 2.4301959031910785e-06, + "logits/chosen": -0.41182246804237366, + "logits/rejected": 1.5325813293457031, + "logps/chosen": -453.8590393066406, + "logps/rejected": -1610.720458984375, + "loss": 0.0618, + "rewards/accuracies": 1.0, + "rewards/chosen": -25.81329917907715, + "rewards/margins": 103.739990234375, + "rewards/rejected": -129.55328369140625, + "step": 5580 + }, + { + "epoch": 0.559, + "grad_norm": 0.0, + "learning_rate": 2.4214731023046795e-06, + "logits/chosen": -0.6320607662200928, + "logits/rejected": 0.9761055707931519, + "logps/chosen": -472.55517578125, + "logps/rejected": -1443.104248046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -23.019092559814453, + "rewards/margins": 86.91942596435547, + "rewards/rejected": -109.93852233886719, + "step": 5590 + }, + { + "epoch": 0.56, + "grad_norm": 5.9338226318359375, + "learning_rate": 2.4127512582437486e-06, + "logits/chosen": -0.35688918828964233, + "logits/rejected": 1.3163071870803833, + "logps/chosen": -589.3690185546875, + "logps/rejected": -1790.47265625, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -34.09458541870117, + "rewards/margins": 107.3012466430664, + "rewards/rejected": -141.39584350585938, + "step": 5600 + }, + { + "epoch": 0.561, + "grad_norm": 0.0, + "learning_rate": 2.4040304772812002e-06, + "logits/chosen": -0.5669654607772827, + "logits/rejected": 0.7861363887786865, + "logps/chosen": -357.8940734863281, + "logps/rejected": -989.2740478515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.1118221282959, + "rewards/margins": 51.08353805541992, + "rewards/rejected": -71.19535827636719, + "step": 5610 + }, + { + "epoch": 0.562, + "grad_norm": 0.0, + "learning_rate": 2.3953108656770018e-06, + "logits/chosen": -0.6459758877754211, + "logits/rejected": 0.6650265455245972, + "logps/chosen": -463.2652282714844, + "logps/rejected": -1128.4453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.661130905151367, + "rewards/margins": 61.90986251831055, + "rewards/rejected": -77.57099914550781, + "step": 5620 + }, + { + "epoch": 0.563, + "grad_norm": 5.690169564331882e-05, + "learning_rate": 2.3865925296768658e-06, + "logits/chosen": -0.06742945313453674, + "logits/rejected": 0.5442739725112915, + "logps/chosen": -351.1674499511719, + "logps/rejected": -1076.4241943359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.4465274810791, + "rewards/margins": 67.57926177978516, + "rewards/rejected": -85.02578735351562, + "step": 5630 + }, + { + "epoch": 0.564, + "grad_norm": 9.043148738013525e-18, + "learning_rate": 2.377875575510967e-06, + "logits/chosen": -0.6333662271499634, + "logits/rejected": 0.7028582096099854, + "logps/chosen": -295.8404846191406, + "logps/rejected": -1131.8314208984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.302228927612305, + "rewards/margins": 71.73029327392578, + "rewards/rejected": -83.03252410888672, + "step": 5640 + }, + { + "epoch": 0.565, + "grad_norm": 2.721798864513403e-06, + "learning_rate": 2.3691601093926406e-06, + "logits/chosen": -0.7991350293159485, + "logits/rejected": 0.430896133184433, + "logps/chosen": -292.8297119140625, + "logps/rejected": -885.7141723632812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.243639945983887, + "rewards/margins": 50.072479248046875, + "rewards/rejected": -63.31612014770508, + "step": 5650 + }, + { + "epoch": 0.566, + "grad_norm": 0.0, + "learning_rate": 2.3604462375170905e-06, + "logits/chosen": -0.8914593458175659, + "logits/rejected": 0.5920891761779785, + "logps/chosen": -471.0685119628906, + "logps/rejected": -1146.7142333984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.243207931518555, + "rewards/margins": 60.8463134765625, + "rewards/rejected": -75.08952331542969, + "step": 5660 + }, + { + "epoch": 0.567, + "grad_norm": 0.0, + "learning_rate": 2.3517340660600965e-06, + "logits/chosen": -0.5996074676513672, + "logits/rejected": 0.47150737047195435, + "logps/chosen": -497.8310546875, + "logps/rejected": -1160.7313232421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.251384735107422, + "rewards/margins": 62.97553634643555, + "rewards/rejected": -79.22691345214844, + "step": 5670 + }, + { + "epoch": 0.568, + "grad_norm": 0.0, + "learning_rate": 2.3430237011767166e-06, + "logits/chosen": -0.8915193676948547, + "logits/rejected": 0.988507091999054, + "logps/chosen": -258.1877746582031, + "logps/rejected": -1035.653076171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.907968521118164, + "rewards/margins": 68.4066162109375, + "rewards/rejected": -82.31459045410156, + "step": 5680 + }, + { + "epoch": 0.569, + "grad_norm": 2.1966523043957533e-19, + "learning_rate": 2.3343152490000004e-06, + "logits/chosen": -0.6327385902404785, + "logits/rejected": 0.5575627088546753, + "logps/chosen": -420.05474853515625, + "logps/rejected": -978.6383056640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.766349792480469, + "rewards/margins": 51.23418426513672, + "rewards/rejected": -65.00053405761719, + "step": 5690 + }, + { + "epoch": 0.57, + "grad_norm": 153.5894012451172, + "learning_rate": 2.325608815639687e-06, + "logits/chosen": -0.7398956418037415, + "logits/rejected": 0.3631681799888611, + "logps/chosen": -445.54754638671875, + "logps/rejected": -1215.155517578125, + "loss": 0.0887, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -13.032880783081055, + "rewards/margins": 69.30997467041016, + "rewards/rejected": -82.34285736083984, + "step": 5700 + }, + { + "epoch": 0.571, + "grad_norm": 9.302183912041073e-09, + "learning_rate": 2.3169045071809217e-06, + "logits/chosen": -0.7493211627006531, + "logits/rejected": 0.3405657708644867, + "logps/chosen": -409.96435546875, + "logps/rejected": -1141.1806640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.728360176086426, + "rewards/margins": 65.49519348144531, + "rewards/rejected": -77.22355651855469, + "step": 5710 + }, + { + "epoch": 0.572, + "grad_norm": 1.2539364888652926e-06, + "learning_rate": 2.3082024296829538e-06, + "logits/chosen": -0.5696662664413452, + "logits/rejected": 0.18319830298423767, + "logps/chosen": -483.90655517578125, + "logps/rejected": -1233.9267578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.041364669799805, + "rewards/margins": 65.56819152832031, + "rewards/rejected": -77.60954284667969, + "step": 5720 + }, + { + "epoch": 0.573, + "grad_norm": 0.0, + "learning_rate": 2.2995026891778533e-06, + "logits/chosen": -0.6082257032394409, + "logits/rejected": 0.2994880676269531, + "logps/chosen": -288.6447448730469, + "logps/rejected": -933.8486328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.254521369934082, + "rewards/margins": 59.925621032714844, + "rewards/rejected": -69.18013763427734, + "step": 5730 + }, + { + "epoch": 0.574, + "grad_norm": 3.743392066509216e-23, + "learning_rate": 2.290805391669212e-06, + "logits/chosen": -1.0448625087738037, + "logits/rejected": 0.6118916273117065, + "logps/chosen": -322.68231201171875, + "logps/rejected": -1163.752685546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.992044448852539, + "rewards/margins": 70.45680236816406, + "rewards/rejected": -81.4488525390625, + "step": 5740 + }, + { + "epoch": 0.575, + "grad_norm": 9.589562413097884e-17, + "learning_rate": 2.2821106431308546e-06, + "logits/chosen": -0.5972896814346313, + "logits/rejected": -0.02794502303004265, + "logps/chosen": -602.9471435546875, + "logps/rejected": -1100.9271240234375, + "loss": 0.1444, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -13.757568359375, + "rewards/margins": 58.230491638183594, + "rewards/rejected": -71.9880599975586, + "step": 5750 + }, + { + "epoch": 0.576, + "grad_norm": 0.0, + "learning_rate": 2.2734185495055503e-06, + "logits/chosen": -0.9066916704177856, + "logits/rejected": 0.5560423135757446, + "logps/chosen": -319.40240478515625, + "logps/rejected": -1094.9283447265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.93016529083252, + "rewards/margins": 62.95295333862305, + "rewards/rejected": -74.88311004638672, + "step": 5760 + }, + { + "epoch": 0.577, + "grad_norm": 3.373019552554979e-08, + "learning_rate": 2.2647292167037143e-06, + "logits/chosen": -0.981410801410675, + "logits/rejected": 0.6274434328079224, + "logps/chosen": -214.78018188476562, + "logps/rejected": -852.8345947265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.489265441894531, + "rewards/margins": 49.72411346435547, + "rewards/rejected": -59.21337890625, + "step": 5770 + }, + { + "epoch": 0.578, + "grad_norm": 1.8989554303106886e-19, + "learning_rate": 2.256042750602127e-06, + "logits/chosen": -0.3405448794364929, + "logits/rejected": 0.5417619943618774, + "logps/chosen": -321.7869567871094, + "logps/rejected": -918.9954833984375, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.773573875427246, + "rewards/margins": 49.88673782348633, + "rewards/rejected": -59.660308837890625, + "step": 5780 + }, + { + "epoch": 0.579, + "grad_norm": 2.0045403156105073e-15, + "learning_rate": 2.2473592570426343e-06, + "logits/chosen": -0.8019183874130249, + "logits/rejected": 0.17649047076702118, + "logps/chosen": -328.5504455566406, + "logps/rejected": -991.7091674804688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.611047744750977, + "rewards/margins": 56.48290252685547, + "rewards/rejected": -70.09394836425781, + "step": 5790 + }, + { + "epoch": 0.58, + "grad_norm": 1.5212917503504286e-07, + "learning_rate": 2.238678841830867e-06, + "logits/chosen": -0.969714343547821, + "logits/rejected": 0.5986171364784241, + "logps/chosen": -305.9940185546875, + "logps/rejected": -1073.6085205078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.390681266784668, + "rewards/margins": 63.28633499145508, + "rewards/rejected": -76.67700958251953, + "step": 5800 + }, + { + "epoch": 0.581, + "grad_norm": 0.0, + "learning_rate": 2.230001610734943e-06, + "logits/chosen": -0.4790991246700287, + "logits/rejected": 0.2994881570339203, + "logps/chosen": -428.73291015625, + "logps/rejected": -1098.240478515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.178543090820312, + "rewards/margins": 66.76612091064453, + "rewards/rejected": -80.94465637207031, + "step": 5810 + }, + { + "epoch": 0.582, + "grad_norm": 0.0, + "learning_rate": 2.2213276694841866e-06, + "logits/chosen": -0.9775009155273438, + "logits/rejected": 0.6576396226882935, + "logps/chosen": -301.51812744140625, + "logps/rejected": -1096.399169921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.869229316711426, + "rewards/margins": 61.690216064453125, + "rewards/rejected": -76.5594482421875, + "step": 5820 + }, + { + "epoch": 0.583, + "grad_norm": 0.0, + "learning_rate": 2.212657123767834e-06, + "logits/chosen": -0.31216496229171753, + "logits/rejected": 0.3422687351703644, + "logps/chosen": -367.2973937988281, + "logps/rejected": -886.2732543945312, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.941743850708008, + "rewards/margins": 53.467002868652344, + "rewards/rejected": -70.40875244140625, + "step": 5830 + }, + { + "epoch": 0.584, + "grad_norm": 9.648000883725073e-15, + "learning_rate": 2.2039900792337477e-06, + "logits/chosen": -0.6409394145011902, + "logits/rejected": 0.2590024769306183, + "logps/chosen": -540.4464111328125, + "logps/rejected": -1060.7490234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.835675239562988, + "rewards/margins": 53.53327178955078, + "rewards/rejected": -64.36894226074219, + "step": 5840 + }, + { + "epoch": 0.585, + "grad_norm": 0.0, + "learning_rate": 2.195326641487132e-06, + "logits/chosen": -0.343078076839447, + "logits/rejected": 0.4754057824611664, + "logps/chosen": -322.4148864746094, + "logps/rejected": -1100.409423828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.861885070800781, + "rewards/margins": 70.69842529296875, + "rewards/rejected": -83.56031799316406, + "step": 5850 + }, + { + "epoch": 0.586, + "grad_norm": 2.893859733358337e-18, + "learning_rate": 2.186666916089239e-06, + "logits/chosen": -0.8028178215026855, + "logits/rejected": 0.560520350933075, + "logps/chosen": -427.0816345214844, + "logps/rejected": -1150.7421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.319122314453125, + "rewards/margins": 59.44419479370117, + "rewards/rejected": -76.76332092285156, + "step": 5860 + }, + { + "epoch": 0.587, + "grad_norm": 1.4526141574322526e-11, + "learning_rate": 2.1780110085560935e-06, + "logits/chosen": -0.47322821617126465, + "logits/rejected": 0.35947781801223755, + "logps/chosen": -366.51885986328125, + "logps/rejected": -961.4563598632812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.354814529418945, + "rewards/margins": 61.39038848876953, + "rewards/rejected": -71.74519348144531, + "step": 5870 + }, + { + "epoch": 0.588, + "grad_norm": 0.0, + "learning_rate": 2.1693590243571937e-06, + "logits/chosen": -0.7379357218742371, + "logits/rejected": 0.541722297668457, + "logps/chosen": -333.3455810546875, + "logps/rejected": -952.2578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.706256866455078, + "rewards/margins": 52.303131103515625, + "rewards/rejected": -65.00939178466797, + "step": 5880 + }, + { + "epoch": 0.589, + "grad_norm": 2.424898097498228e-16, + "learning_rate": 2.1607110689142393e-06, + "logits/chosen": -0.7496501207351685, + "logits/rejected": 0.1588805913925171, + "logps/chosen": -266.98760986328125, + "logps/rejected": -715.7047119140625, + "loss": 0.1048, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.766366958618164, + "rewards/margins": 40.335838317871094, + "rewards/rejected": -51.10220718383789, + "step": 5890 + }, + { + "epoch": 0.59, + "grad_norm": 1.9521148207028022e-14, + "learning_rate": 2.1520672475998374e-06, + "logits/chosen": -0.46376457810401917, + "logits/rejected": 0.2426864206790924, + "logps/chosen": -598.9649047851562, + "logps/rejected": -1084.9559326171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.458829879760742, + "rewards/margins": 56.31378936767578, + "rewards/rejected": -73.77261352539062, + "step": 5900 + }, + { + "epoch": 0.591, + "grad_norm": 0.0, + "learning_rate": 2.143427665736221e-06, + "logits/chosen": -1.3513346910476685, + "logits/rejected": 0.6467947363853455, + "logps/chosen": -178.78578186035156, + "logps/rejected": -986.7268676757812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.031607627868652, + "rewards/margins": 62.00954055786133, + "rewards/rejected": -71.04115295410156, + "step": 5910 + }, + { + "epoch": 0.592, + "grad_norm": 0.0, + "learning_rate": 2.134792428593971e-06, + "logits/chosen": -0.9142158627510071, + "logits/rejected": 0.142775759100914, + "logps/chosen": -418.8714294433594, + "logps/rejected": -1166.554931640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.114670753479004, + "rewards/margins": 68.04464721679688, + "rewards/rejected": -78.15931701660156, + "step": 5920 + }, + { + "epoch": 0.593, + "grad_norm": 4.327770424878341e-15, + "learning_rate": 2.1261616413907267e-06, + "logits/chosen": -0.7585387825965881, + "logits/rejected": 0.018490100279450417, + "logps/chosen": -398.62091064453125, + "logps/rejected": -756.716552734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.56114387512207, + "rewards/margins": 40.72282409667969, + "rewards/rejected": -51.283966064453125, + "step": 5930 + }, + { + "epoch": 0.594, + "grad_norm": 0.29131418466567993, + "learning_rate": 2.117535409289905e-06, + "logits/chosen": -0.8369476199150085, + "logits/rejected": 0.48800697922706604, + "logps/chosen": -322.8172302246094, + "logps/rejected": -1040.8592529296875, + "loss": 0.1809, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -14.917299270629883, + "rewards/margins": 56.43064498901367, + "rewards/rejected": -71.34794616699219, + "step": 5940 + }, + { + "epoch": 0.595, + "grad_norm": 6.851609413160986e-08, + "learning_rate": 2.1089138373994226e-06, + "logits/chosen": -0.4474567770957947, + "logits/rejected": -0.12609001994132996, + "logps/chosen": -425.7982482910156, + "logps/rejected": -696.9923706054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.972140312194824, + "rewards/margins": 30.715478897094727, + "rewards/rejected": -43.6876220703125, + "step": 5950 + }, + { + "epoch": 0.596, + "grad_norm": 3.2781477784737945e-05, + "learning_rate": 2.1002970307704134e-06, + "logits/chosen": -0.7989672422409058, + "logits/rejected": 0.2822956442832947, + "logps/chosen": -343.94207763671875, + "logps/rejected": -1046.2325439453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.518392562866211, + "rewards/margins": 62.300148010253906, + "rewards/rejected": -74.81852722167969, + "step": 5960 + }, + { + "epoch": 0.597, + "grad_norm": 0.0, + "learning_rate": 2.0916850943959453e-06, + "logits/chosen": -0.9798108339309692, + "logits/rejected": 0.031129514798521996, + "logps/chosen": -349.68011474609375, + "logps/rejected": -926.5896606445312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.097526550292969, + "rewards/margins": 52.179046630859375, + "rewards/rejected": -62.27656936645508, + "step": 5970 + }, + { + "epoch": 0.598, + "grad_norm": 2.6305224309908226e-06, + "learning_rate": 2.0830781332097446e-06, + "logits/chosen": -1.1097701787948608, + "logits/rejected": 0.24544647336006165, + "logps/chosen": -307.0548095703125, + "logps/rejected": -812.1460571289062, + "loss": 0.2396, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.199498176574707, + "rewards/margins": 41.038108825683594, + "rewards/rejected": -50.23760223388672, + "step": 5980 + }, + { + "epoch": 0.599, + "grad_norm": 3.837552151053636e-11, + "learning_rate": 2.0744762520849193e-06, + "logits/chosen": -0.8115105628967285, + "logits/rejected": 0.38198933005332947, + "logps/chosen": -315.76873779296875, + "logps/rejected": -777.8378295898438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.973611354827881, + "rewards/margins": 42.35847854614258, + "rewards/rejected": -50.33209228515625, + "step": 5990 + }, + { + "epoch": 0.6, + "grad_norm": 0.0, + "learning_rate": 2.0658795558326745e-06, + "logits/chosen": -1.053982138633728, + "logits/rejected": 0.04489628225564957, + "logps/chosen": -420.40087890625, + "logps/rejected": -1016.2745971679688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.601455688476562, + "rewards/margins": 55.01713180541992, + "rewards/rejected": -66.61858367919922, + "step": 6000 + }, + { + "epoch": 0.601, + "grad_norm": 0.0, + "learning_rate": 2.0572881492010423e-06, + "logits/chosen": -0.9520200490951538, + "logits/rejected": 0.4393930435180664, + "logps/chosen": -258.37884521484375, + "logps/rejected": -915.1923828125, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.035299301147461, + "rewards/margins": 54.08031463623047, + "rewards/rejected": -62.115623474121094, + "step": 6010 + }, + { + "epoch": 0.602, + "grad_norm": 0.0, + "learning_rate": 2.0487021368736002e-06, + "logits/chosen": -0.727254331111908, + "logits/rejected": 0.055363964289426804, + "logps/chosen": -608.6199951171875, + "logps/rejected": -1106.275146484375, + "loss": 1.1745, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -14.362531661987305, + "rewards/margins": 57.39153289794922, + "rewards/rejected": -71.75406646728516, + "step": 6020 + }, + { + "epoch": 0.603, + "grad_norm": 8.775121294461786e-21, + "learning_rate": 2.0401216234682e-06, + "logits/chosen": -1.1106388568878174, + "logits/rejected": 0.28149712085723877, + "logps/chosen": -506.61077880859375, + "logps/rejected": -1085.811279296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.73906135559082, + "rewards/margins": 53.52830123901367, + "rewards/rejected": -64.26737213134766, + "step": 6030 + }, + { + "epoch": 0.604, + "grad_norm": 0.0, + "learning_rate": 2.031546713535688e-06, + "logits/chosen": -0.8970220685005188, + "logits/rejected": 0.5340319275856018, + "logps/chosen": -187.3031463623047, + "logps/rejected": -888.166015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.198158264160156, + "rewards/margins": 56.56108856201172, + "rewards/rejected": -65.7592544555664, + "step": 6040 + }, + { + "epoch": 0.605, + "grad_norm": 0.0, + "learning_rate": 2.022977511558638e-06, + "logits/chosen": -0.49822598695755005, + "logits/rejected": -0.051455218344926834, + "logps/chosen": -539.9725341796875, + "logps/rejected": -912.2399291992188, + "loss": 0.0528, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.603445053100586, + "rewards/margins": 41.82299041748047, + "rewards/rejected": -54.42643356323242, + "step": 6050 + }, + { + "epoch": 0.606, + "grad_norm": 3.051496014185517e-16, + "learning_rate": 2.0144141219500707e-06, + "logits/chosen": -0.5907710790634155, + "logits/rejected": 0.0812007263302803, + "logps/chosen": -668.8988037109375, + "logps/rejected": -1045.5540771484375, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.167119979858398, + "rewards/margins": 42.041015625, + "rewards/rejected": -56.20813751220703, + "step": 6060 + }, + { + "epoch": 0.607, + "grad_norm": 5.917444961412447e-18, + "learning_rate": 2.0058566490521848e-06, + "logits/chosen": -0.28065013885498047, + "logits/rejected": 0.347043514251709, + "logps/chosen": -434.7425842285156, + "logps/rejected": -957.3238525390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.05737018585205, + "rewards/margins": 50.82740020751953, + "rewards/rejected": -64.88477325439453, + "step": 6070 + }, + { + "epoch": 0.608, + "grad_norm": 0.0, + "learning_rate": 1.997305197135089e-06, + "logits/chosen": -0.7074576616287231, + "logits/rejected": 0.348089337348938, + "logps/chosen": -308.53753662109375, + "logps/rejected": -1001.8761596679688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.252487182617188, + "rewards/margins": 67.42102813720703, + "rewards/rejected": -80.67351531982422, + "step": 6080 + }, + { + "epoch": 0.609, + "grad_norm": 35.3226432800293, + "learning_rate": 1.9887598703955244e-06, + "logits/chosen": -0.26775437593460083, + "logits/rejected": 0.3113982081413269, + "logps/chosen": -366.8798522949219, + "logps/rejected": -877.09228515625, + "loss": 0.2139, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -19.909229278564453, + "rewards/margins": 44.00845718383789, + "rewards/rejected": -63.917686462402344, + "step": 6090 + }, + { + "epoch": 0.61, + "grad_norm": 1.7081393386554248e-19, + "learning_rate": 1.9802207729556023e-06, + "logits/chosen": -0.3729914128780365, + "logits/rejected": 0.7869149446487427, + "logps/chosen": -341.86688232421875, + "logps/rejected": -942.552734375, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.829532623291016, + "rewards/margins": 51.61570358276367, + "rewards/rejected": -71.44523620605469, + "step": 6100 + }, + { + "epoch": 0.611, + "grad_norm": 0.0, + "learning_rate": 1.971688008861529e-06, + "logits/chosen": -0.7285288572311401, + "logits/rejected": 0.8685183525085449, + "logps/chosen": -676.9864501953125, + "logps/rejected": -1683.438232421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -29.139759063720703, + "rewards/margins": 93.76480865478516, + "rewards/rejected": -122.9045639038086, + "step": 6110 + }, + { + "epoch": 0.612, + "grad_norm": 0.0, + "learning_rate": 1.963161682082342e-06, + "logits/chosen": -0.40115728974342346, + "logits/rejected": 0.9319968223571777, + "logps/chosen": -672.72900390625, + "logps/rejected": -1428.954833984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -28.64020347595215, + "rewards/margins": 77.25892639160156, + "rewards/rejected": -105.8991470336914, + "step": 6120 + }, + { + "epoch": 0.613, + "grad_norm": 5.300155225440156e-18, + "learning_rate": 1.9546418965086444e-06, + "logits/chosen": -0.3054092228412628, + "logits/rejected": 0.3358805775642395, + "logps/chosen": -673.1195068359375, + "logps/rejected": -1441.669921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -27.96059226989746, + "rewards/margins": 78.94953918457031, + "rewards/rejected": -106.9101333618164, + "step": 6130 + }, + { + "epoch": 0.614, + "grad_norm": 0.0, + "learning_rate": 1.946128755951332e-06, + "logits/chosen": -0.7575784921646118, + "logits/rejected": 0.9253544807434082, + "logps/chosen": -581.2315063476562, + "logps/rejected": -1685.402587890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -32.97461700439453, + "rewards/margins": 97.02174377441406, + "rewards/rejected": -129.9963836669922, + "step": 6140 + }, + { + "epoch": 0.615, + "grad_norm": 0.0, + "learning_rate": 1.937622364140338e-06, + "logits/chosen": -0.3287307918071747, + "logits/rejected": 0.8551700711250305, + "logps/chosen": -698.66015625, + "logps/rejected": -1677.6588134765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -28.919662475585938, + "rewards/margins": 97.02134704589844, + "rewards/rejected": -125.94100189208984, + "step": 6150 + }, + { + "epoch": 0.616, + "grad_norm": 0.0, + "learning_rate": 1.9291228247233607e-06, + "logits/chosen": -0.10450273752212524, + "logits/rejected": 0.12423954159021378, + "logps/chosen": -672.6328125, + "logps/rejected": -1056.39404296875, + "loss": 1.1588, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -29.697463989257812, + "rewards/margins": 49.33916091918945, + "rewards/rejected": -79.03662109375, + "step": 6160 + }, + { + "epoch": 0.617, + "grad_norm": 8.932964556152001e-05, + "learning_rate": 1.9206302412646074e-06, + "logits/chosen": -0.8520253896713257, + "logits/rejected": 0.530659019947052, + "logps/chosen": -543.1141967773438, + "logps/rejected": -1338.0950927734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -23.23099136352539, + "rewards/margins": 75.07130432128906, + "rewards/rejected": -98.30229187011719, + "step": 6170 + }, + { + "epoch": 0.618, + "grad_norm": 0.0, + "learning_rate": 1.912144717243525e-06, + "logits/chosen": -0.46287283301353455, + "logits/rejected": 0.8936434984207153, + "logps/chosen": -479.36773681640625, + "logps/rejected": -1329.708251953125, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.504770278930664, + "rewards/margins": 79.99925231933594, + "rewards/rejected": -98.50402069091797, + "step": 6180 + }, + { + "epoch": 0.619, + "grad_norm": 1153.3616943359375, + "learning_rate": 1.9036663560535484e-06, + "logits/chosen": -0.21779179573059082, + "logits/rejected": 0.7919279336929321, + "logps/chosen": -481.73870849609375, + "logps/rejected": -1110.656494140625, + "loss": 1.2391, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -28.207775115966797, + "rewards/margins": 58.3253173828125, + "rewards/rejected": -86.53309631347656, + "step": 6190 + }, + { + "epoch": 0.62, + "grad_norm": 0.0, + "learning_rate": 1.895195261000831e-06, + "logits/chosen": -0.6195026636123657, + "logits/rejected": 0.8219470977783203, + "logps/chosen": -394.10955810546875, + "logps/rejected": -1159.2943115234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.551332473754883, + "rewards/margins": 62.34423065185547, + "rewards/rejected": -82.89556121826172, + "step": 6200 + }, + { + "epoch": 0.621, + "grad_norm": 1122.32861328125, + "learning_rate": 1.8867315353029937e-06, + "logits/chosen": -0.8880168199539185, + "logits/rejected": 1.2520583868026733, + "logps/chosen": -328.0982360839844, + "logps/rejected": -1303.611572265625, + "loss": 0.3969, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.13575553894043, + "rewards/margins": 83.49183654785156, + "rewards/rejected": -100.62757873535156, + "step": 6210 + }, + { + "epoch": 0.622, + "grad_norm": 1.964397094899521e-15, + "learning_rate": 1.8782752820878636e-06, + "logits/chosen": -0.16132107377052307, + "logits/rejected": 0.4336255192756653, + "logps/chosen": -609.0550537109375, + "logps/rejected": -1116.940673828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.98457145690918, + "rewards/margins": 60.426025390625, + "rewards/rejected": -77.41058349609375, + "step": 6220 + }, + { + "epoch": 0.623, + "grad_norm": 0.0, + "learning_rate": 1.8698266043922159e-06, + "logits/chosen": -0.5896113514900208, + "logits/rejected": 1.0377198457717896, + "logps/chosen": -258.1077575683594, + "logps/rejected": -1181.6407470703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.312314987182617, + "rewards/margins": 80.14900207519531, + "rewards/rejected": -93.46131896972656, + "step": 6230 + }, + { + "epoch": 0.624, + "grad_norm": 88.24200439453125, + "learning_rate": 1.8613856051605242e-06, + "logits/chosen": -0.3807418942451477, + "logits/rejected": 0.5020023584365845, + "logps/chosen": -382.841552734375, + "logps/rejected": -858.4488525390625, + "loss": 0.037, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.80988597869873, + "rewards/margins": 45.08965301513672, + "rewards/rejected": -60.8995361328125, + "step": 6240 + }, + { + "epoch": 0.625, + "grad_norm": 0.0, + "learning_rate": 1.852952387243698e-06, + "logits/chosen": -0.6610077023506165, + "logits/rejected": 0.6249849200248718, + "logps/chosen": -316.33349609375, + "logps/rejected": -1102.541015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.481925964355469, + "rewards/margins": 68.57567596435547, + "rewards/rejected": -81.05760192871094, + "step": 6250 + }, + { + "epoch": 0.626, + "grad_norm": 0.0, + "learning_rate": 1.8445270533978387e-06, + "logits/chosen": -1.2662389278411865, + "logits/rejected": 0.8339168429374695, + "logps/chosen": -370.40118408203125, + "logps/rejected": -1378.05859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.953084945678711, + "rewards/margins": 80.45343780517578, + "rewards/rejected": -92.4065170288086, + "step": 6260 + }, + { + "epoch": 0.627, + "grad_norm": 0.0, + "learning_rate": 1.836109706282978e-06, + "logits/chosen": -0.8804155588150024, + "logits/rejected": 0.7382038235664368, + "logps/chosen": -221.7541961669922, + "logps/rejected": -1014.7086181640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.514819145202637, + "rewards/margins": 65.65678405761719, + "rewards/rejected": -77.1716079711914, + "step": 6270 + }, + { + "epoch": 0.628, + "grad_norm": 6.518362027918556e-08, + "learning_rate": 1.827700448461836e-06, + "logits/chosen": -0.8720572590827942, + "logits/rejected": 0.5231325626373291, + "logps/chosen": -294.4517517089844, + "logps/rejected": -1028.52685546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.470726013183594, + "rewards/margins": 63.9245491027832, + "rewards/rejected": -76.39527893066406, + "step": 6280 + }, + { + "epoch": 0.629, + "grad_norm": 2.3427173562252272e-18, + "learning_rate": 1.8192993823985643e-06, + "logits/chosen": -0.9169327020645142, + "logits/rejected": 0.49997347593307495, + "logps/chosen": -261.94268798828125, + "logps/rejected": -907.6803588867188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.600554466247559, + "rewards/margins": 53.24702072143555, + "rewards/rejected": -64.84757995605469, + "step": 6290 + }, + { + "epoch": 0.63, + "grad_norm": 1.7803756079109385e-21, + "learning_rate": 1.8109066104575023e-06, + "logits/chosen": -0.5586016178131104, + "logits/rejected": 0.49877291917800903, + "logps/chosen": -348.76483154296875, + "logps/rejected": -890.5277099609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.734954833984375, + "rewards/margins": 52.39110565185547, + "rewards/rejected": -65.12606048583984, + "step": 6300 + }, + { + "epoch": 0.631, + "grad_norm": 8.788940242254739e-09, + "learning_rate": 1.8025222349019273e-06, + "logits/chosen": -0.5049250721931458, + "logits/rejected": 0.47886282205581665, + "logps/chosen": -478.8617248535156, + "logps/rejected": -1211.92578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.844632148742676, + "rewards/margins": 73.86564636230469, + "rewards/rejected": -88.71028137207031, + "step": 6310 + }, + { + "epoch": 0.632, + "grad_norm": 0.0, + "learning_rate": 1.7941463578928088e-06, + "logits/chosen": -0.4538189470767975, + "logits/rejected": 0.46021947264671326, + "logps/chosen": -531.1541748046875, + "logps/rejected": -1060.1229248046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.083524703979492, + "rewards/margins": 55.92713165283203, + "rewards/rejected": -74.01065826416016, + "step": 6320 + }, + { + "epoch": 0.633, + "grad_norm": 0.0, + "learning_rate": 1.7857790814875665e-06, + "logits/chosen": -0.7689892053604126, + "logits/rejected": 0.6113948225975037, + "logps/chosen": -419.75775146484375, + "logps/rejected": -1273.57421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.010692596435547, + "rewards/margins": 71.44483947753906, + "rewards/rejected": -90.45552825927734, + "step": 6330 + }, + { + "epoch": 0.634, + "grad_norm": 0.0, + "learning_rate": 1.7774205076388207e-06, + "logits/chosen": -0.762971043586731, + "logits/rejected": 0.8176964521408081, + "logps/chosen": -239.3108673095703, + "logps/rejected": -915.2222900390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.808510780334473, + "rewards/margins": 54.695709228515625, + "rewards/rejected": -66.50421905517578, + "step": 6340 + }, + { + "epoch": 0.635, + "grad_norm": 1.2885712408206018e-07, + "learning_rate": 1.7690707381931585e-06, + "logits/chosen": -1.4763569831848145, + "logits/rejected": 0.7502704858779907, + "logps/chosen": -228.9136962890625, + "logps/rejected": -1045.082275390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.950324058532715, + "rewards/margins": 59.335121154785156, + "rewards/rejected": -69.28544616699219, + "step": 6350 + }, + { + "epoch": 0.636, + "grad_norm": 0.0, + "learning_rate": 1.7607298748898844e-06, + "logits/chosen": -0.7422378659248352, + "logits/rejected": 0.5283973217010498, + "logps/chosen": -269.3170166015625, + "logps/rejected": -842.0760498046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.552061080932617, + "rewards/margins": 47.43568801879883, + "rewards/rejected": -62.98775100708008, + "step": 6360 + }, + { + "epoch": 0.637, + "grad_norm": 3.602982634465235e-10, + "learning_rate": 1.7523980193597837e-06, + "logits/chosen": -0.6213805079460144, + "logits/rejected": 0.7540527582168579, + "logps/chosen": -347.17779541015625, + "logps/rejected": -1026.532958984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.154919624328613, + "rewards/margins": 59.262908935546875, + "rewards/rejected": -71.41783142089844, + "step": 6370 + }, + { + "epoch": 0.638, + "grad_norm": 0.0022318889386951923, + "learning_rate": 1.744075273123889e-06, + "logits/chosen": -0.31907743215560913, + "logits/rejected": 0.8481258153915405, + "logps/chosen": -642.7422485351562, + "logps/rejected": -1179.421875, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.963634490966797, + "rewards/margins": 53.08282470703125, + "rewards/rejected": -74.04646301269531, + "step": 6380 + }, + { + "epoch": 0.639, + "grad_norm": 9.515637247343306e-14, + "learning_rate": 1.735761737592236e-06, + "logits/chosen": -0.6827441453933716, + "logits/rejected": 0.2889486253261566, + "logps/chosen": -539.0543823242188, + "logps/rejected": -1096.6243896484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.704416275024414, + "rewards/margins": 55.89228439331055, + "rewards/rejected": -73.59669494628906, + "step": 6390 + }, + { + "epoch": 0.64, + "grad_norm": 0.37831369042396545, + "learning_rate": 1.7274575140626318e-06, + "logits/chosen": -0.37383323907852173, + "logits/rejected": 0.634491503238678, + "logps/chosen": -344.0820007324219, + "logps/rejected": -1041.222900390625, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.306551933288574, + "rewards/margins": 65.38855743408203, + "rewards/rejected": -76.69511413574219, + "step": 6400 + }, + { + "epoch": 0.641, + "grad_norm": 1.802431224022169e-13, + "learning_rate": 1.7191627037194187e-06, + "logits/chosen": -0.9682666063308716, + "logits/rejected": 0.46219348907470703, + "logps/chosen": -399.88909912109375, + "logps/rejected": -1011.9031982421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.483869552612305, + "rewards/margins": 52.6249885559082, + "rewards/rejected": -66.10885620117188, + "step": 6410 + }, + { + "epoch": 0.642, + "grad_norm": 4.1179455407668736e-15, + "learning_rate": 1.7108774076322443e-06, + "logits/chosen": -0.9478727579116821, + "logits/rejected": 0.4891994893550873, + "logps/chosen": -316.1852111816406, + "logps/rejected": -1141.15185546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.764043807983398, + "rewards/margins": 72.55058288574219, + "rewards/rejected": -83.31462097167969, + "step": 6420 + }, + { + "epoch": 0.643, + "grad_norm": 575.969482421875, + "learning_rate": 1.702601726754825e-06, + "logits/chosen": -0.7723701000213623, + "logits/rejected": 0.13531820476055145, + "logps/chosen": -432.0208435058594, + "logps/rejected": -1143.351318359375, + "loss": 1.7103, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.985231399536133, + "rewards/margins": 61.39850997924805, + "rewards/rejected": -80.38373565673828, + "step": 6430 + }, + { + "epoch": 0.644, + "grad_norm": 0.0, + "learning_rate": 1.6943357619237227e-06, + "logits/chosen": -1.2825745344161987, + "logits/rejected": 0.7488567233085632, + "logps/chosen": -296.5671081542969, + "logps/rejected": -1154.734619140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.1475248336792, + "rewards/margins": 60.58649826049805, + "rewards/rejected": -73.73402404785156, + "step": 6440 + }, + { + "epoch": 0.645, + "grad_norm": 0.0, + "learning_rate": 1.686079613857109e-06, + "logits/chosen": -0.5981461405754089, + "logits/rejected": 0.599422037601471, + "logps/chosen": -424.4049377441406, + "logps/rejected": -1025.369140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.99231243133545, + "rewards/margins": 57.72393035888672, + "rewards/rejected": -72.71624755859375, + "step": 6450 + }, + { + "epoch": 0.646, + "grad_norm": 0.0, + "learning_rate": 1.677833383153542e-06, + "logits/chosen": -0.33491963148117065, + "logits/rejected": 0.2924351096153259, + "logps/chosen": -494.74298095703125, + "logps/rejected": -979.0546875, + "loss": 1.7072, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.02706527709961, + "rewards/margins": 47.13035202026367, + "rewards/rejected": -64.15740966796875, + "step": 6460 + }, + { + "epoch": 0.647, + "grad_norm": 0.0, + "learning_rate": 1.6695971702907425e-06, + "logits/chosen": -0.5291346311569214, + "logits/rejected": 0.04112546145915985, + "logps/chosen": -389.43548583984375, + "logps/rejected": -861.4542846679688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.868743896484375, + "rewards/margins": 45.284690856933594, + "rewards/rejected": -61.1534309387207, + "step": 6470 + }, + { + "epoch": 0.648, + "grad_norm": 3.206112458853383e-11, + "learning_rate": 1.661371075624363e-06, + "logits/chosen": -0.820167064666748, + "logits/rejected": 0.22958044707775116, + "logps/chosen": -268.8128967285156, + "logps/rejected": -816.1143188476562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.350539207458496, + "rewards/margins": 41.11225128173828, + "rewards/rejected": -51.462791442871094, + "step": 6480 + }, + { + "epoch": 0.649, + "grad_norm": 0.0, + "learning_rate": 1.6531551993867717e-06, + "logits/chosen": -1.0266730785369873, + "logits/rejected": 0.31012818217277527, + "logps/chosen": -227.1880340576172, + "logps/rejected": -852.8770751953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.547816276550293, + "rewards/margins": 48.10416793823242, + "rewards/rejected": -56.65198516845703, + "step": 6490 + }, + { + "epoch": 0.65, + "grad_norm": 9.873515782635005e-15, + "learning_rate": 1.6449496416858285e-06, + "logits/chosen": -1.2130448818206787, + "logits/rejected": 0.2985571026802063, + "logps/chosen": -328.6929626464844, + "logps/rejected": -840.7061767578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.523885726928711, + "rewards/margins": 38.90607833862305, + "rewards/rejected": -48.429962158203125, + "step": 6500 + }, + { + "epoch": 0.651, + "grad_norm": 0.0, + "learning_rate": 1.6367545025036634e-06, + "logits/chosen": -1.709118127822876, + "logits/rejected": 0.4337089955806732, + "logps/chosen": -177.02171325683594, + "logps/rejected": -972.3427734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.665238380432129, + "rewards/margins": 57.431732177734375, + "rewards/rejected": -67.09696960449219, + "step": 6510 + }, + { + "epoch": 0.652, + "grad_norm": 9.39294147491455, + "learning_rate": 1.6285698816954626e-06, + "logits/chosen": -0.7160784006118774, + "logits/rejected": 0.2711246609687805, + "logps/chosen": -312.7411804199219, + "logps/rejected": -796.49658203125, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.27977180480957, + "rewards/margins": 36.427284240722656, + "rewards/rejected": -47.707054138183594, + "step": 6520 + }, + { + "epoch": 0.653, + "grad_norm": 5.293955920339377e-23, + "learning_rate": 1.6203958789882457e-06, + "logits/chosen": -0.5094423294067383, + "logits/rejected": 0.31378093361854553, + "logps/chosen": -440.42694091796875, + "logps/rejected": -1072.122802734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.908445358276367, + "rewards/margins": 57.47730255126953, + "rewards/rejected": -71.38574981689453, + "step": 6530 + }, + { + "epoch": 0.654, + "grad_norm": 1.2480342009412587e-13, + "learning_rate": 1.612232593979658e-06, + "logits/chosen": -0.5389014482498169, + "logits/rejected": 0.4068358540534973, + "logps/chosen": -410.4971618652344, + "logps/rejected": -1073.8428955078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.45536994934082, + "rewards/margins": 61.50476837158203, + "rewards/rejected": -75.96012878417969, + "step": 6540 + }, + { + "epoch": 0.655, + "grad_norm": 0.0, + "learning_rate": 1.6040801261367494e-06, + "logits/chosen": -0.9499231576919556, + "logits/rejected": 0.3402765691280365, + "logps/chosen": -211.19735717773438, + "logps/rejected": -858.4918823242188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.90208911895752, + "rewards/margins": 53.628746032714844, + "rewards/rejected": -62.53083419799805, + "step": 6550 + }, + { + "epoch": 0.656, + "grad_norm": 3.516597623349375e-21, + "learning_rate": 1.5959385747947697e-06, + "logits/chosen": -1.0181282758712769, + "logits/rejected": 0.5531474351882935, + "logps/chosen": -224.2174835205078, + "logps/rejected": -1108.608154296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.008251190185547, + "rewards/margins": 72.00736236572266, + "rewards/rejected": -80.01561737060547, + "step": 6560 + }, + { + "epoch": 0.657, + "grad_norm": 5.693732811618002e-12, + "learning_rate": 1.5878080391559507e-06, + "logits/chosen": -0.7998801469802856, + "logits/rejected": 0.24991516768932343, + "logps/chosen": -464.068359375, + "logps/rejected": -992.8370361328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.504290580749512, + "rewards/margins": 48.961421966552734, + "rewards/rejected": -62.46571731567383, + "step": 6570 + }, + { + "epoch": 0.658, + "grad_norm": 3.429549756263497e-19, + "learning_rate": 1.5796886182883053e-06, + "logits/chosen": -0.6511567831039429, + "logits/rejected": 0.4151083827018738, + "logps/chosen": -389.8343811035156, + "logps/rejected": -1196.2972412109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.969945907592773, + "rewards/margins": 70.79234313964844, + "rewards/rejected": -84.76228332519531, + "step": 6580 + }, + { + "epoch": 0.659, + "grad_norm": 2.676518306543585e-05, + "learning_rate": 1.5715804111244138e-06, + "logits/chosen": -1.3965880870819092, + "logits/rejected": 0.4531164765357971, + "logps/chosen": -166.05572509765625, + "logps/rejected": -980.0531005859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.495244026184082, + "rewards/margins": 50.93138885498047, + "rewards/rejected": -59.42664337158203, + "step": 6590 + }, + { + "epoch": 0.66, + "grad_norm": 4.1763072999856377e-08, + "learning_rate": 1.56348351646022e-06, + "logits/chosen": -0.7084104418754578, + "logits/rejected": 0.251697301864624, + "logps/chosen": -414.48553466796875, + "logps/rejected": -966.2922973632812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.428361892700195, + "rewards/margins": 49.03688430786133, + "rewards/rejected": -59.465248107910156, + "step": 6600 + }, + { + "epoch": 0.661, + "grad_norm": 6.320921164903152e-17, + "learning_rate": 1.5553980329538326e-06, + "logits/chosen": -0.8995717763900757, + "logits/rejected": 0.3077837824821472, + "logps/chosen": -229.172607421875, + "logps/rejected": -759.6439208984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.536977767944336, + "rewards/margins": 41.12507247924805, + "rewards/rejected": -52.66204833984375, + "step": 6610 + }, + { + "epoch": 0.662, + "grad_norm": 1.0289334488600144e-11, + "learning_rate": 1.547324059124315e-06, + "logits/chosen": -0.8084653615951538, + "logits/rejected": 0.15149430930614471, + "logps/chosen": -355.364013671875, + "logps/rejected": -922.4557495117188, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.91248607635498, + "rewards/margins": 51.052677154541016, + "rewards/rejected": -64.96516418457031, + "step": 6620 + }, + { + "epoch": 0.663, + "grad_norm": 0.0, + "learning_rate": 1.539261693350491e-06, + "logits/chosen": -0.8480122685432434, + "logits/rejected": 0.20509465038776398, + "logps/chosen": -326.6919250488281, + "logps/rejected": -920.38330078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.641644477844238, + "rewards/margins": 53.87500762939453, + "rewards/rejected": -64.51664733886719, + "step": 6630 + }, + { + "epoch": 0.664, + "grad_norm": 2.461848956910131e-15, + "learning_rate": 1.5312110338697427e-06, + "logits/chosen": -0.5872770547866821, + "logits/rejected": 0.07159560918807983, + "logps/chosen": -389.7482604980469, + "logps/rejected": -811.3320922851562, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.820398330688477, + "rewards/margins": 40.80659866333008, + "rewards/rejected": -53.626991271972656, + "step": 6640 + }, + { + "epoch": 0.665, + "grad_norm": 0.0, + "learning_rate": 1.5231721787768162e-06, + "logits/chosen": -0.7103424072265625, + "logits/rejected": 0.08447039872407913, + "logps/chosen": -422.61962890625, + "logps/rejected": -948.6730346679688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.737173080444336, + "rewards/margins": 47.96685028076172, + "rewards/rejected": -63.70402145385742, + "step": 6650 + }, + { + "epoch": 0.666, + "grad_norm": 0.0, + "learning_rate": 1.5151452260226224e-06, + "logits/chosen": -0.9289888143539429, + "logits/rejected": 0.18887189030647278, + "logps/chosen": -377.4011535644531, + "logps/rejected": -859.4797973632812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.963301658630371, + "rewards/margins": 46.39201354980469, + "rewards/rejected": -54.355316162109375, + "step": 6660 + }, + { + "epoch": 0.667, + "grad_norm": 0.0, + "learning_rate": 1.5071302734130488e-06, + "logits/chosen": -0.9944952726364136, + "logits/rejected": 0.5345765352249146, + "logps/chosen": -303.9234619140625, + "logps/rejected": -991.6798706054688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.570622444152832, + "rewards/margins": 58.970664978027344, + "rewards/rejected": -66.54129028320312, + "step": 6670 + }, + { + "epoch": 0.668, + "grad_norm": 0.0, + "learning_rate": 1.4991274186077632e-06, + "logits/chosen": -1.0211021900177002, + "logits/rejected": 0.1890941858291626, + "logps/chosen": -625.1935424804688, + "logps/rejected": -1045.278564453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.653399467468262, + "rewards/margins": 45.128929138183594, + "rewards/rejected": -57.78232955932617, + "step": 6680 + }, + { + "epoch": 0.669, + "grad_norm": 0.0, + "learning_rate": 1.491136759119025e-06, + "logits/chosen": -1.015205979347229, + "logits/rejected": 0.25742340087890625, + "logps/chosen": -416.8662109375, + "logps/rejected": -1021.6949462890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.015602111816406, + "rewards/margins": 52.00904083251953, + "rewards/rejected": -63.02463912963867, + "step": 6690 + }, + { + "epoch": 0.67, + "grad_norm": 1.739340937945144e-08, + "learning_rate": 1.4831583923104997e-06, + "logits/chosen": -0.38597649335861206, + "logits/rejected": 0.28067824244499207, + "logps/chosen": -511.48779296875, + "logps/rejected": -923.99560546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.102460861206055, + "rewards/margins": 43.810855865478516, + "rewards/rejected": -57.9133186340332, + "step": 6700 + }, + { + "epoch": 0.671, + "grad_norm": 5.359721928721332e-22, + "learning_rate": 1.4751924153960681e-06, + "logits/chosen": -0.5853025317192078, + "logits/rejected": 0.1794218271970749, + "logps/chosen": -587.2357177734375, + "logps/rejected": -1080.541015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.21432876586914, + "rewards/margins": 53.93608856201172, + "rewards/rejected": -70.15042114257812, + "step": 6710 + }, + { + "epoch": 0.672, + "grad_norm": 0.0, + "learning_rate": 1.467238925438646e-06, + "logits/chosen": -1.0530178546905518, + "logits/rejected": 0.48365315794944763, + "logps/chosen": -196.48049926757812, + "logps/rejected": -989.3134765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.558575630187988, + "rewards/margins": 60.487815856933594, + "rewards/rejected": -70.04638671875, + "step": 6720 + }, + { + "epoch": 0.673, + "grad_norm": 1.5804560438148485e-10, + "learning_rate": 1.4592980193489975e-06, + "logits/chosen": -0.4733239710330963, + "logits/rejected": 0.3845910429954529, + "logps/chosen": -347.839599609375, + "logps/rejected": -834.8303833007812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.455202102661133, + "rewards/margins": 39.814720153808594, + "rewards/rejected": -49.26992416381836, + "step": 6730 + }, + { + "epoch": 0.674, + "grad_norm": 2.8186614800618866e-16, + "learning_rate": 1.4513697938845571e-06, + "logits/chosen": -1.0593464374542236, + "logits/rejected": 0.3638271391391754, + "logps/chosen": -243.1426544189453, + "logps/rejected": -907.37255859375, + "loss": 0.0217, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.289741516113281, + "rewards/margins": 49.817100524902344, + "rewards/rejected": -60.106834411621094, + "step": 6740 + }, + { + "epoch": 0.675, + "grad_norm": 2.792390985106863e-09, + "learning_rate": 1.443454345648252e-06, + "logits/chosen": -0.5483334064483643, + "logits/rejected": 0.18954530358314514, + "logps/chosen": -512.3756713867188, + "logps/rejected": -916.8468627929688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.741385459899902, + "rewards/margins": 44.06202697753906, + "rewards/rejected": -55.80341720581055, + "step": 6750 + }, + { + "epoch": 0.676, + "grad_norm": 0.0, + "learning_rate": 1.4355517710873184e-06, + "logits/chosen": -1.1610043048858643, + "logits/rejected": 0.23131489753723145, + "logps/chosen": -362.11016845703125, + "logps/rejected": -1002.7393798828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.117647171020508, + "rewards/margins": 51.933372497558594, + "rewards/rejected": -64.051025390625, + "step": 6760 + }, + { + "epoch": 0.677, + "grad_norm": 0.0, + "learning_rate": 1.4276621664921358e-06, + "logits/chosen": -1.1171985864639282, + "logits/rejected": 0.08405411243438721, + "logps/chosen": -434.3948669433594, + "logps/rejected": -875.7274169921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.467716217041016, + "rewards/margins": 38.010677337646484, + "rewards/rejected": -54.4783935546875, + "step": 6770 + }, + { + "epoch": 0.678, + "grad_norm": 1.0053239233383422e-15, + "learning_rate": 1.419785627995044e-06, + "logits/chosen": -0.7967459559440613, + "logits/rejected": 0.46557193994522095, + "logps/chosen": -384.51470947265625, + "logps/rejected": -1023.1734619140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.550901412963867, + "rewards/margins": 46.259498596191406, + "rewards/rejected": -60.81040573120117, + "step": 6780 + }, + { + "epoch": 0.679, + "grad_norm": 3.572149283217892e-18, + "learning_rate": 1.4119222515691817e-06, + "logits/chosen": -0.819512665271759, + "logits/rejected": 0.2668699324131012, + "logps/chosen": -322.56097412109375, + "logps/rejected": -923.2853393554688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.771075248718262, + "rewards/margins": 44.104061126708984, + "rewards/rejected": -55.8751335144043, + "step": 6790 + }, + { + "epoch": 0.68, + "grad_norm": 0.00208302098326385, + "learning_rate": 1.4040721330273063e-06, + "logits/chosen": -0.5939651727676392, + "logits/rejected": -0.12722672522068024, + "logps/chosen": -430.8482971191406, + "logps/rejected": -837.2296752929688, + "loss": 0.1543, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -16.117908477783203, + "rewards/margins": 43.303470611572266, + "rewards/rejected": -59.42137908935547, + "step": 6800 + }, + { + "epoch": 0.681, + "grad_norm": 1.484518757921549e-11, + "learning_rate": 1.3962353680206372e-06, + "logits/chosen": -0.5131450295448303, + "logits/rejected": -0.08215949684381485, + "logps/chosen": -521.3753662109375, + "logps/rejected": -982.7488403320312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.535924911499023, + "rewards/margins": 44.76746368408203, + "rewards/rejected": -62.30338668823242, + "step": 6810 + }, + { + "epoch": 0.682, + "grad_norm": 0.0, + "learning_rate": 1.388412052037682e-06, + "logits/chosen": -0.686394989490509, + "logits/rejected": 0.25932493805885315, + "logps/chosen": -337.4996032714844, + "logps/rejected": -921.4371948242188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.139752388000488, + "rewards/margins": 45.83702850341797, + "rewards/rejected": -58.976783752441406, + "step": 6820 + }, + { + "epoch": 0.683, + "grad_norm": 0.0, + "learning_rate": 1.380602280403076e-06, + "logits/chosen": -0.6851155161857605, + "logits/rejected": 0.1985010951757431, + "logps/chosen": -406.23663330078125, + "logps/rejected": -868.2559814453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.177343368530273, + "rewards/margins": 48.88190841674805, + "rewards/rejected": -59.05924606323242, + "step": 6830 + }, + { + "epoch": 0.684, + "grad_norm": 0.0, + "learning_rate": 1.3728061482764238e-06, + "logits/chosen": -0.675434947013855, + "logits/rejected": 0.35549676418304443, + "logps/chosen": -417.695556640625, + "logps/rejected": -1046.248779296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.115216255187988, + "rewards/margins": 50.35161209106445, + "rewards/rejected": -64.46682739257812, + "step": 6840 + }, + { + "epoch": 0.685, + "grad_norm": 1.7023156495543645e-10, + "learning_rate": 1.3650237506511333e-06, + "logits/chosen": -0.9004403948783875, + "logits/rejected": 0.2808682322502136, + "logps/chosen": -341.0504455566406, + "logps/rejected": -935.5436401367188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.120309829711914, + "rewards/margins": 53.2804069519043, + "rewards/rejected": -62.40071487426758, + "step": 6850 + }, + { + "epoch": 0.686, + "grad_norm": 0.0, + "learning_rate": 1.3572551823532654e-06, + "logits/chosen": -0.49469512701034546, + "logits/rejected": 0.2047506868839264, + "logps/chosen": -517.6826171875, + "logps/rejected": -975.38232421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.508434295654297, + "rewards/margins": 47.37997055053711, + "rewards/rejected": -64.88841247558594, + "step": 6860 + }, + { + "epoch": 0.687, + "grad_norm": 2.4329527314898353e-19, + "learning_rate": 1.349500538040371e-06, + "logits/chosen": -0.7579712271690369, + "logits/rejected": 0.23987862467765808, + "logps/chosen": -303.2074890136719, + "logps/rejected": -856.4743041992188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.821787357330322, + "rewards/margins": 51.322776794433594, + "rewards/rejected": -59.144569396972656, + "step": 6870 + }, + { + "epoch": 0.688, + "grad_norm": 0.0, + "learning_rate": 1.3417599122003464e-06, + "logits/chosen": -0.7172698378562927, + "logits/rejected": 0.41875559091567993, + "logps/chosen": -434.141845703125, + "logps/rejected": -1052.8280029296875, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.568477630615234, + "rewards/margins": 57.83243942260742, + "rewards/rejected": -68.40091705322266, + "step": 6880 + }, + { + "epoch": 0.689, + "grad_norm": 0.003907814156264067, + "learning_rate": 1.3340333991502723e-06, + "logits/chosen": -0.24279102683067322, + "logits/rejected": 0.33490872383117676, + "logps/chosen": -311.4708251953125, + "logps/rejected": -821.8943481445312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.199905395507812, + "rewards/margins": 48.722801208496094, + "rewards/rejected": -62.922706604003906, + "step": 6890 + }, + { + "epoch": 0.69, + "grad_norm": 0.0, + "learning_rate": 1.3263210930352737e-06, + "logits/chosen": -0.8461467623710632, + "logits/rejected": 0.21782417595386505, + "logps/chosen": -332.932373046875, + "logps/rejected": -1089.2677001953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.197572708129883, + "rewards/margins": 68.2454605102539, + "rewards/rejected": -77.44303894042969, + "step": 6900 + }, + { + "epoch": 0.691, + "grad_norm": 0.0, + "learning_rate": 1.3186230878273654e-06, + "logits/chosen": -1.0278829336166382, + "logits/rejected": 0.48746466636657715, + "logps/chosen": -175.5904541015625, + "logps/rejected": -934.48046875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.513206481933594, + "rewards/margins": 59.157508850097656, + "rewards/rejected": -67.67072296142578, + "step": 6910 + }, + { + "epoch": 0.692, + "grad_norm": 0.0, + "learning_rate": 1.3109394773243117e-06, + "logits/chosen": -0.322140634059906, + "logits/rejected": 0.26277634501457214, + "logps/chosen": -565.261962890625, + "logps/rejected": -993.6104736328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.960165023803711, + "rewards/margins": 47.8547248840332, + "rewards/rejected": -59.81489181518555, + "step": 6920 + }, + { + "epoch": 0.693, + "grad_norm": 0.0, + "learning_rate": 1.3032703551484832e-06, + "logits/chosen": -0.6050577759742737, + "logits/rejected": 0.24701687693595886, + "logps/chosen": -320.1561584472656, + "logps/rejected": -811.4998779296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.287206649780273, + "rewards/margins": 47.75560760498047, + "rewards/rejected": -57.042808532714844, + "step": 6930 + }, + { + "epoch": 0.694, + "grad_norm": 4.693359187513124e-06, + "learning_rate": 1.2956158147457116e-06, + "logits/chosen": -0.7445470094680786, + "logits/rejected": 0.4302369952201843, + "logps/chosen": -271.83868408203125, + "logps/rejected": -832.82177734375, + "loss": 0.1174, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.641332626342773, + "rewards/margins": 47.020240783691406, + "rewards/rejected": -56.66157150268555, + "step": 6940 + }, + { + "epoch": 0.695, + "grad_norm": 5.191566856410645e-07, + "learning_rate": 1.2879759493841577e-06, + "logits/chosen": -1.0439097881317139, + "logits/rejected": 0.6121958494186401, + "logps/chosen": -200.1259307861328, + "logps/rejected": -844.1336059570312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.755642890930176, + "rewards/margins": 49.18115997314453, + "rewards/rejected": -59.936805725097656, + "step": 6950 + }, + { + "epoch": 0.696, + "grad_norm": 2.7079682496378155e-08, + "learning_rate": 1.280350852153168e-06, + "logits/chosen": -0.7683631181716919, + "logits/rejected": 0.4195129871368408, + "logps/chosen": -410.979736328125, + "logps/rejected": -1058.7357177734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.451654434204102, + "rewards/margins": 56.14714813232422, + "rewards/rejected": -67.59880065917969, + "step": 6960 + }, + { + "epoch": 0.697, + "grad_norm": 0.0, + "learning_rate": 1.272740615962148e-06, + "logits/chosen": -0.3984186351299286, + "logits/rejected": 0.3945949673652649, + "logps/chosen": -437.71600341796875, + "logps/rejected": -1098.1243896484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.609410285949707, + "rewards/margins": 65.10707092285156, + "rewards/rejected": -79.71647644042969, + "step": 6970 + }, + { + "epoch": 0.698, + "grad_norm": 1.6409067843652786e-14, + "learning_rate": 1.2651453335394232e-06, + "logits/chosen": -0.6048688888549805, + "logits/rejected": 0.12000073492527008, + "logps/chosen": -758.637939453125, + "logps/rejected": -1092.565673828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.610742568969727, + "rewards/margins": 44.76811599731445, + "rewards/rejected": -58.37885665893555, + "step": 6980 + }, + { + "epoch": 0.699, + "grad_norm": 0.0, + "learning_rate": 1.2575650974311118e-06, + "logits/chosen": -1.0600610971450806, + "logits/rejected": 0.607258141040802, + "logps/chosen": -189.1847686767578, + "logps/rejected": -864.9268798828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.889070510864258, + "rewards/margins": 51.53076171875, + "rewards/rejected": -61.419837951660156, + "step": 6990 + }, + { + "epoch": 0.7, + "grad_norm": 0.0004449795524124056, + "learning_rate": 1.2500000000000007e-06, + "logits/chosen": -0.7913299798965454, + "logits/rejected": 0.18162783980369568, + "logps/chosen": -394.7737121582031, + "logps/rejected": -997.1735229492188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.63129997253418, + "rewards/margins": 58.36121368408203, + "rewards/rejected": -70.99250793457031, + "step": 7000 + }, + { + "epoch": 0.701, + "grad_norm": 2.3053121911420504e-12, + "learning_rate": 1.2424501334244124e-06, + "logits/chosen": -0.3424440026283264, + "logits/rejected": 0.23116068542003632, + "logps/chosen": -497.0257873535156, + "logps/rejected": -1031.158447265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.78152084350586, + "rewards/margins": 51.52674102783203, + "rewards/rejected": -70.30825805664062, + "step": 7010 + }, + { + "epoch": 0.702, + "grad_norm": 0.08845698833465576, + "learning_rate": 1.234915589697091e-06, + "logits/chosen": -0.4873952269554138, + "logits/rejected": 0.16965332627296448, + "logps/chosen": -439.7547912597656, + "logps/rejected": -884.2762451171875, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.531588554382324, + "rewards/margins": 38.97401809692383, + "rewards/rejected": -53.5056037902832, + "step": 7020 + }, + { + "epoch": 0.703, + "grad_norm": 0.0, + "learning_rate": 1.2273964606240718e-06, + "logits/chosen": -1.1222232580184937, + "logits/rejected": 0.5668286085128784, + "logps/chosen": -311.9913635253906, + "logps/rejected": -963.0960083007812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.711811065673828, + "rewards/margins": 50.60163497924805, + "rewards/rejected": -61.313446044921875, + "step": 7030 + }, + { + "epoch": 0.704, + "grad_norm": 0.0, + "learning_rate": 1.2198928378235717e-06, + "logits/chosen": -0.9157301187515259, + "logits/rejected": 0.8059916496276855, + "logps/chosen": -273.55029296875, + "logps/rejected": -903.5457763671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.198125839233398, + "rewards/margins": 51.39958572387695, + "rewards/rejected": -59.59770965576172, + "step": 7040 + }, + { + "epoch": 0.705, + "grad_norm": 9.500559383255822e-14, + "learning_rate": 1.2124048127248644e-06, + "logits/chosen": -0.3811189830303192, + "logits/rejected": 0.7395201921463013, + "logps/chosen": -226.52603149414062, + "logps/rejected": -791.3262329101562, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.798089981079102, + "rewards/margins": 49.31720733642578, + "rewards/rejected": -58.11529541015625, + "step": 7050 + }, + { + "epoch": 0.706, + "grad_norm": 1.8333838884395179e-16, + "learning_rate": 1.204932476567175e-06, + "logits/chosen": -0.2587122917175293, + "logits/rejected": 0.3872125744819641, + "logps/chosen": -462.1097717285156, + "logps/rejected": -999.1061401367188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.840702056884766, + "rewards/margins": 56.51240158081055, + "rewards/rejected": -76.35310363769531, + "step": 7060 + }, + { + "epoch": 0.707, + "grad_norm": 0.0, + "learning_rate": 1.19747592039856e-06, + "logits/chosen": -0.7832753658294678, + "logits/rejected": 0.3581964075565338, + "logps/chosen": -441.05780029296875, + "logps/rejected": -1143.239990234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.54179573059082, + "rewards/margins": 65.3556137084961, + "rewards/rejected": -76.89740753173828, + "step": 7070 + }, + { + "epoch": 0.708, + "grad_norm": 9.741703427059047e-17, + "learning_rate": 1.1900352350748026e-06, + "logits/chosen": -0.4372076094150543, + "logits/rejected": 0.41365084052085876, + "logps/chosen": -311.98260498046875, + "logps/rejected": -779.3128662109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.149206161499023, + "rewards/margins": 42.368614196777344, + "rewards/rejected": -55.51781463623047, + "step": 7080 + }, + { + "epoch": 0.709, + "grad_norm": 7.48366429589864e-13, + "learning_rate": 1.1826105112583061e-06, + "logits/chosen": -0.7754701972007751, + "logits/rejected": 0.3482648730278015, + "logps/chosen": -442.53985595703125, + "logps/rejected": -1012.1441650390625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.848672866821289, + "rewards/margins": 50.82096862792969, + "rewards/rejected": -60.66963577270508, + "step": 7090 + }, + { + "epoch": 0.71, + "grad_norm": 0.0, + "learning_rate": 1.1752018394169882e-06, + "logits/chosen": -0.7517408132553101, + "logits/rejected": 0.5923580527305603, + "logps/chosen": -251.5076904296875, + "logps/rejected": -1010.1513671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.077350616455078, + "rewards/margins": 63.697349548339844, + "rewards/rejected": -75.77470397949219, + "step": 7100 + }, + { + "epoch": 0.711, + "grad_norm": 2.079006303960973e-17, + "learning_rate": 1.1678093098231748e-06, + "logits/chosen": -0.6140622496604919, + "logits/rejected": 0.6913038492202759, + "logps/chosen": -387.5259704589844, + "logps/rejected": -1004.0714111328125, + "loss": 0.5563, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -14.7683744430542, + "rewards/margins": 54.26259231567383, + "rewards/rejected": -69.03096771240234, + "step": 7110 + }, + { + "epoch": 0.712, + "grad_norm": 0.0, + "learning_rate": 1.160433012552508e-06, + "logits/chosen": -0.9175033569335938, + "logits/rejected": 0.3144712746143341, + "logps/chosen": -406.3265686035156, + "logps/rejected": -1089.0546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.071568489074707, + "rewards/margins": 62.89362716674805, + "rewards/rejected": -70.96519470214844, + "step": 7120 + }, + { + "epoch": 0.713, + "grad_norm": 0.0, + "learning_rate": 1.1530730374828424e-06, + "logits/chosen": -0.9688884615898132, + "logits/rejected": 0.6511700749397278, + "logps/chosen": -416.6983947753906, + "logps/rejected": -1149.14794921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.32402229309082, + "rewards/margins": 55.059349060058594, + "rewards/rejected": -67.38337707519531, + "step": 7130 + }, + { + "epoch": 0.714, + "grad_norm": 0.0, + "learning_rate": 1.1457294742931508e-06, + "logits/chosen": -0.810795783996582, + "logits/rejected": -0.004343023989349604, + "logps/chosen": -714.1817016601562, + "logps/rejected": -1107.7537841796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.2621431350708, + "rewards/margins": 53.681312561035156, + "rewards/rejected": -67.94345092773438, + "step": 7140 + }, + { + "epoch": 0.715, + "grad_norm": 8.800793781812907e-21, + "learning_rate": 1.1384024124624324e-06, + "logits/chosen": -1.0921555757522583, + "logits/rejected": 0.08647129684686661, + "logps/chosen": -353.8780212402344, + "logps/rejected": -922.1368408203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.05639934539795, + "rewards/margins": 46.51305389404297, + "rewards/rejected": -55.56945037841797, + "step": 7150 + }, + { + "epoch": 0.716, + "grad_norm": 1.2604188919067383, + "learning_rate": 1.1310919412686248e-06, + "logits/chosen": -0.9001690149307251, + "logits/rejected": 0.18475279211997986, + "logps/chosen": -418.95562744140625, + "logps/rejected": -1113.604248046875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.656079292297363, + "rewards/margins": 58.632850646972656, + "rewards/rejected": -72.28893280029297, + "step": 7160 + }, + { + "epoch": 0.717, + "grad_norm": 0.11071855574846268, + "learning_rate": 1.1237981497875112e-06, + "logits/chosen": -0.7475544810295105, + "logits/rejected": 0.2684534192085266, + "logps/chosen": -382.6466064453125, + "logps/rejected": -833.0397338867188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.405658721923828, + "rewards/margins": 41.787742614746094, + "rewards/rejected": -53.19340133666992, + "step": 7170 + }, + { + "epoch": 0.718, + "grad_norm": 0.0, + "learning_rate": 1.11652112689164e-06, + "logits/chosen": -1.6065635681152344, + "logits/rejected": 0.68021160364151, + "logps/chosen": -204.63723754882812, + "logps/rejected": -1166.2694091796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.474165439605713, + "rewards/margins": 67.179443359375, + "rewards/rejected": -74.65361022949219, + "step": 7180 + }, + { + "epoch": 0.719, + "grad_norm": 1.3178431436389193e-11, + "learning_rate": 1.109260961249238e-06, + "logits/chosen": -1.1102924346923828, + "logits/rejected": 0.444924533367157, + "logps/chosen": -300.0912780761719, + "logps/rejected": -1036.2974853515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.845537185668945, + "rewards/margins": 54.732566833496094, + "rewards/rejected": -67.5781021118164, + "step": 7190 + }, + { + "epoch": 0.72, + "grad_norm": 0.0, + "learning_rate": 1.1020177413231334e-06, + "logits/chosen": -1.1612586975097656, + "logits/rejected": 0.28627079725265503, + "logps/chosen": -308.6979064941406, + "logps/rejected": -1037.414306640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.11671257019043, + "rewards/margins": 47.389488220214844, + "rewards/rejected": -60.506202697753906, + "step": 7200 + }, + { + "epoch": 0.721, + "grad_norm": 3.743392066509216e-23, + "learning_rate": 1.0947915553696742e-06, + "logits/chosen": -0.8225234150886536, + "logits/rejected": 0.5830526351928711, + "logps/chosen": -275.85394287109375, + "logps/rejected": -997.060546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.838689804077148, + "rewards/margins": 63.3530387878418, + "rewards/rejected": -72.19173431396484, + "step": 7210 + }, + { + "epoch": 0.722, + "grad_norm": 8.84302053805186e-09, + "learning_rate": 1.0875824914376555e-06, + "logits/chosen": -0.5271292328834534, + "logits/rejected": 0.6783641576766968, + "logps/chosen": -432.099853515625, + "logps/rejected": -1039.996826171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.017711639404297, + "rewards/margins": 54.85394287109375, + "rewards/rejected": -72.87165832519531, + "step": 7220 + }, + { + "epoch": 0.723, + "grad_norm": 1.3129018952895422e-05, + "learning_rate": 1.0803906373672477e-06, + "logits/chosen": -0.33937448263168335, + "logits/rejected": 0.41331392526626587, + "logps/chosen": -365.21868896484375, + "logps/rejected": -936.5850830078125, + "loss": 0.4843, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -15.739160537719727, + "rewards/margins": 46.56276321411133, + "rewards/rejected": -62.30192184448242, + "step": 7230 + }, + { + "epoch": 0.724, + "grad_norm": 0.0, + "learning_rate": 1.073216080788921e-06, + "logits/chosen": -0.733219563961029, + "logits/rejected": 0.18814750015735626, + "logps/chosen": -397.82293701171875, + "logps/rejected": -884.7576293945312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.14622688293457, + "rewards/margins": 47.09293746948242, + "rewards/rejected": -58.239173889160156, + "step": 7240 + }, + { + "epoch": 0.725, + "grad_norm": 1.6266512833904598e-15, + "learning_rate": 1.0660589091223854e-06, + "logits/chosen": -0.5925036072731018, + "logits/rejected": 0.1687505543231964, + "logps/chosen": -344.05572509765625, + "logps/rejected": -875.701171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.012567520141602, + "rewards/margins": 50.19481658935547, + "rewards/rejected": -60.2073860168457, + "step": 7250 + }, + { + "epoch": 0.726, + "grad_norm": 0.0, + "learning_rate": 1.0589192095755172e-06, + "logits/chosen": -0.7121064066886902, + "logits/rejected": 0.22998180985450745, + "logps/chosen": -286.26458740234375, + "logps/rejected": -935.8848876953125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.205058097839355, + "rewards/margins": 54.207542419433594, + "rewards/rejected": -62.41259765625, + "step": 7260 + }, + { + "epoch": 0.727, + "grad_norm": 5.209674054640345e-05, + "learning_rate": 1.0517970691433035e-06, + "logits/chosen": -1.3975350856781006, + "logits/rejected": 0.3783726990222931, + "logps/chosen": -350.1981506347656, + "logps/rejected": -978.7327880859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.47797966003418, + "rewards/margins": 48.890647888183594, + "rewards/rejected": -57.368629455566406, + "step": 7270 + }, + { + "epoch": 0.728, + "grad_norm": 1.089237144924482e-09, + "learning_rate": 1.0446925746067768e-06, + "logits/chosen": -0.7229653000831604, + "logits/rejected": 0.2919732630252838, + "logps/chosen": -379.4202880859375, + "logps/rejected": -898.99853515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.882861137390137, + "rewards/margins": 43.108375549316406, + "rewards/rejected": -54.991233825683594, + "step": 7280 + }, + { + "epoch": 0.729, + "grad_norm": 7.443757112923777e-06, + "learning_rate": 1.0376058125319614e-06, + "logits/chosen": -0.6685749292373657, + "logits/rejected": 0.09258606284856796, + "logps/chosen": -364.90899658203125, + "logps/rejected": -830.1349487304688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.9243803024292, + "rewards/margins": 41.93610763549805, + "rewards/rejected": -51.86049270629883, + "step": 7290 + }, + { + "epoch": 0.73, + "grad_norm": 4.272778311720238e-12, + "learning_rate": 1.0305368692688175e-06, + "logits/chosen": -0.7117483615875244, + "logits/rejected": 0.2935028672218323, + "logps/chosen": -331.0298767089844, + "logps/rejected": -861.7659301757812, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.721847534179688, + "rewards/margins": 47.882728576660156, + "rewards/rejected": -56.604576110839844, + "step": 7300 + }, + { + "epoch": 0.731, + "grad_norm": 1.561633285822126e-17, + "learning_rate": 1.0234858309501864e-06, + "logits/chosen": -0.9831579923629761, + "logits/rejected": 0.2982550263404846, + "logps/chosen": -528.8619384765625, + "logps/rejected": -1077.370361328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.656841278076172, + "rewards/margins": 53.95241165161133, + "rewards/rejected": -62.6092529296875, + "step": 7310 + }, + { + "epoch": 0.732, + "grad_norm": 2.661149880168437e-17, + "learning_rate": 1.0164527834907468e-06, + "logits/chosen": -0.7919927835464478, + "logits/rejected": 0.5363516211509705, + "logps/chosen": -300.376708984375, + "logps/rejected": -838.0631103515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.472113132476807, + "rewards/margins": 43.48785400390625, + "rewards/rejected": -50.9599723815918, + "step": 7320 + }, + { + "epoch": 0.733, + "grad_norm": 3.8212088161642394e-18, + "learning_rate": 1.0094378125859602e-06, + "logits/chosen": -1.0631580352783203, + "logits/rejected": 0.5589848160743713, + "logps/chosen": -170.1670379638672, + "logps/rejected": -833.8010864257812, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.077153205871582, + "rewards/margins": 53.96528244018555, + "rewards/rejected": -63.04243850708008, + "step": 7330 + }, + { + "epoch": 0.734, + "grad_norm": 1.1333886134029091e-14, + "learning_rate": 1.0024410037110356e-06, + "logits/chosen": -1.00998055934906, + "logits/rejected": 0.4892210066318512, + "logps/chosen": -373.8369445800781, + "logps/rejected": -1075.8577880859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.881095886230469, + "rewards/margins": 54.31298828125, + "rewards/rejected": -69.19407653808594, + "step": 7340 + }, + { + "epoch": 0.735, + "grad_norm": 0.0, + "learning_rate": 9.95462442119879e-07, + "logits/chosen": -0.7339566946029663, + "logits/rejected": 0.4608619809150696, + "logps/chosen": -290.23321533203125, + "logps/rejected": -881.2384643554688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.089037895202637, + "rewards/margins": 49.113502502441406, + "rewards/rejected": -62.202537536621094, + "step": 7350 + }, + { + "epoch": 0.736, + "grad_norm": 1.6385482201310323e-18, + "learning_rate": 9.88502212844063e-07, + "logits/chosen": -0.5577305555343628, + "logits/rejected": -0.04514486715197563, + "logps/chosen": -494.9710388183594, + "logps/rejected": -983.4933471679688, + "loss": 0.0176, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.71878433227539, + "rewards/margins": 48.584686279296875, + "rewards/rejected": -67.30347442626953, + "step": 7360 + }, + { + "epoch": 0.737, + "grad_norm": 0.0, + "learning_rate": 9.815604006917839e-07, + "logits/chosen": -0.7128852605819702, + "logits/rejected": 0.5136991739273071, + "logps/chosen": -336.8421325683594, + "logps/rejected": -1104.3736572265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.214262008666992, + "rewards/margins": 62.831642150878906, + "rewards/rejected": -76.0458984375, + "step": 7370 + }, + { + "epoch": 0.738, + "grad_norm": 0.0, + "learning_rate": 9.746370902468311e-07, + "logits/chosen": -0.5148681998252869, + "logits/rejected": 0.2315702885389328, + "logps/chosen": -495.29150390625, + "logps/rejected": -932.6849365234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.574322700500488, + "rewards/margins": 42.701656341552734, + "rewards/rejected": -57.27598190307617, + "step": 7380 + }, + { + "epoch": 0.739, + "grad_norm": 5.64483789572412e-13, + "learning_rate": 9.677323658675594e-07, + "logits/chosen": -0.8095144033432007, + "logits/rejected": 0.015424412675201893, + "logps/chosen": -367.74444580078125, + "logps/rejected": -735.0386962890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.414527893066406, + "rewards/margins": 37.90756607055664, + "rewards/rejected": -50.32209014892578, + "step": 7390 + }, + { + "epoch": 0.74, + "grad_norm": 0.002067842520773411, + "learning_rate": 9.608463116858544e-07, + "logits/chosen": -0.8038798570632935, + "logits/rejected": 0.47396841645240784, + "logps/chosen": -428.75592041015625, + "logps/rejected": -1115.0914306640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.412130355834961, + "rewards/margins": 59.50236892700195, + "rewards/rejected": -72.91450500488281, + "step": 7400 + }, + { + "epoch": 0.741, + "grad_norm": 2.0158783812668233e-22, + "learning_rate": 9.53979011606115e-07, + "logits/chosen": -0.7873549461364746, + "logits/rejected": 0.4778427481651306, + "logps/chosen": -234.4102020263672, + "logps/rejected": -925.4586791992188, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.518302917480469, + "rewards/margins": 56.09989547729492, + "rewards/rejected": -64.61819458007812, + "step": 7410 + }, + { + "epoch": 0.742, + "grad_norm": 0.0, + "learning_rate": 9.471305493042243e-07, + "logits/chosen": -0.8623729944229126, + "logits/rejected": 0.5287893414497375, + "logps/chosen": -196.85728454589844, + "logps/rejected": -967.7962036132812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.566311836242676, + "rewards/margins": 59.91844940185547, + "rewards/rejected": -66.48475646972656, + "step": 7420 + }, + { + "epoch": 0.743, + "grad_norm": 7.9055621148254245e-22, + "learning_rate": 9.403010082265351e-07, + "logits/chosen": -0.5145904421806335, + "logits/rejected": 0.5459538698196411, + "logps/chosen": -408.49102783203125, + "logps/rejected": -997.2546997070312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.392236709594727, + "rewards/margins": 58.52399444580078, + "rewards/rejected": -68.91622924804688, + "step": 7430 + }, + { + "epoch": 0.744, + "grad_norm": 1.038647681790984e-15, + "learning_rate": 9.334904715888496e-07, + "logits/chosen": -0.9317490458488464, + "logits/rejected": 0.5168638229370117, + "logps/chosen": -442.51861572265625, + "logps/rejected": -1209.481689453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.806231498718262, + "rewards/margins": 61.112640380859375, + "rewards/rejected": -74.91886901855469, + "step": 7440 + }, + { + "epoch": 0.745, + "grad_norm": 0.0, + "learning_rate": 9.266990223754069e-07, + "logits/chosen": -0.9047843813896179, + "logits/rejected": 0.7249792814254761, + "logps/chosen": -312.70367431640625, + "logps/rejected": -1056.26220703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.898077011108398, + "rewards/margins": 59.7227897644043, + "rewards/rejected": -69.62086486816406, + "step": 7450 + }, + { + "epoch": 0.746, + "grad_norm": 0.0, + "learning_rate": 9.199267433378728e-07, + "logits/chosen": -0.8632529377937317, + "logits/rejected": 0.6839910745620728, + "logps/chosen": -316.83856201171875, + "logps/rejected": -1040.1484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.796337127685547, + "rewards/margins": 60.876007080078125, + "rewards/rejected": -70.6723403930664, + "step": 7460 + }, + { + "epoch": 0.747, + "grad_norm": 0.0, + "learning_rate": 9.131737169943314e-07, + "logits/chosen": -1.0358017683029175, + "logits/rejected": 0.4787723422050476, + "logps/chosen": -299.0080871582031, + "logps/rejected": -1036.6314697265625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.263322830200195, + "rewards/margins": 56.86602783203125, + "rewards/rejected": -72.1293716430664, + "step": 7470 + }, + { + "epoch": 0.748, + "grad_norm": 0.0, + "learning_rate": 9.064400256282757e-07, + "logits/chosen": -0.7976016998291016, + "logits/rejected": 0.7549166679382324, + "logps/chosen": -206.1776123046875, + "logps/rejected": -975.0432739257812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.270216941833496, + "rewards/margins": 60.506622314453125, + "rewards/rejected": -68.77684020996094, + "step": 7480 + }, + { + "epoch": 0.749, + "grad_norm": 8.384397318299565e-17, + "learning_rate": 8.99725751287611e-07, + "logits/chosen": -0.6219117045402527, + "logits/rejected": 0.44370508193969727, + "logps/chosen": -420.1907653808594, + "logps/rejected": -1133.765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.35955810546875, + "rewards/margins": 68.6363296508789, + "rewards/rejected": -81.99589538574219, + "step": 7490 + }, + { + "epoch": 0.75, + "grad_norm": 2.094380985929404e-18, + "learning_rate": 8.930309757836517e-07, + "logits/chosen": -0.6963815689086914, + "logits/rejected": 0.6372653245925903, + "logps/chosen": -433.8370666503906, + "logps/rejected": -1238.472900390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.223983764648438, + "rewards/margins": 69.96954345703125, + "rewards/rejected": -87.19352722167969, + "step": 7500 + }, + { + "epoch": 0.751, + "grad_norm": 0.0, + "learning_rate": 8.863557806901233e-07, + "logits/chosen": -0.146893709897995, + "logits/rejected": 0.3493548035621643, + "logps/chosen": -653.1065673828125, + "logps/rejected": -1275.319091796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -22.061296463012695, + "rewards/margins": 67.54277038574219, + "rewards/rejected": -89.60407257080078, + "step": 7510 + }, + { + "epoch": 0.752, + "grad_norm": 0.0, + "learning_rate": 8.797002473421729e-07, + "logits/chosen": -0.355530321598053, + "logits/rejected": 0.8727057576179504, + "logps/chosen": -294.88006591796875, + "logps/rejected": -957.97900390625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.46592903137207, + "rewards/margins": 60.088661193847656, + "rewards/rejected": -71.5545883178711, + "step": 7520 + }, + { + "epoch": 0.753, + "grad_norm": 0.004240179434418678, + "learning_rate": 8.73064456835373e-07, + "logits/chosen": -1.0234708786010742, + "logits/rejected": 0.3270181119441986, + "logps/chosen": -342.8228759765625, + "logps/rejected": -1176.275634765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.1382474899292, + "rewards/margins": 66.42398834228516, + "rewards/rejected": -81.56224060058594, + "step": 7530 + }, + { + "epoch": 0.754, + "grad_norm": 2.278529131322614e-14, + "learning_rate": 8.664484900247363e-07, + "logits/chosen": -0.9598628878593445, + "logits/rejected": 0.6580491065979004, + "logps/chosen": -283.4799499511719, + "logps/rejected": -1075.259033203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.367246627807617, + "rewards/margins": 62.03153610229492, + "rewards/rejected": -75.39878845214844, + "step": 7540 + }, + { + "epoch": 0.755, + "grad_norm": 0.0, + "learning_rate": 8.598524275237321e-07, + "logits/chosen": -0.837388813495636, + "logits/rejected": 1.1218478679656982, + "logps/chosen": -289.286865234375, + "logps/rejected": -1147.4429931640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.133901596069336, + "rewards/margins": 73.49677276611328, + "rewards/rejected": -89.63066864013672, + "step": 7550 + }, + { + "epoch": 0.756, + "grad_norm": 2.807277161829369e-20, + "learning_rate": 8.532763497032987e-07, + "logits/chosen": -0.38878798484802246, + "logits/rejected": 0.5794919729232788, + "logps/chosen": -512.4061889648438, + "logps/rejected": -1225.756591796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.27451515197754, + "rewards/margins": 65.82911682128906, + "rewards/rejected": -82.10364532470703, + "step": 7560 + }, + { + "epoch": 0.757, + "grad_norm": 0.0, + "learning_rate": 8.467203366908708e-07, + "logits/chosen": -0.7684077024459839, + "logits/rejected": 0.4302092492580414, + "logps/chosen": -290.90289306640625, + "logps/rejected": -995.5157470703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.721793174743652, + "rewards/margins": 61.515533447265625, + "rewards/rejected": -71.2373275756836, + "step": 7570 + }, + { + "epoch": 0.758, + "grad_norm": 0.0, + "learning_rate": 8.40184468369396e-07, + "logits/chosen": -0.5130096077919006, + "logits/rejected": 0.5950255393981934, + "logps/chosen": -315.585205078125, + "logps/rejected": -1058.7269287109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.12911605834961, + "rewards/margins": 60.01392364501953, + "rewards/rejected": -78.14305114746094, + "step": 7580 + }, + { + "epoch": 0.759, + "grad_norm": 0.0, + "learning_rate": 8.336688243763691e-07, + "logits/chosen": -1.0224621295928955, + "logits/rejected": 0.558266818523407, + "logps/chosen": -430.31793212890625, + "logps/rejected": -1285.520263671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.064857482910156, + "rewards/margins": 72.1447525024414, + "rewards/rejected": -86.20960235595703, + "step": 7590 + }, + { + "epoch": 0.76, + "grad_norm": 0.0, + "learning_rate": 8.271734841028553e-07, + "logits/chosen": -0.5809733867645264, + "logits/rejected": 0.902326762676239, + "logps/chosen": -500.68280029296875, + "logps/rejected": -1293.6524658203125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.30832290649414, + "rewards/margins": 71.17332458496094, + "rewards/rejected": -91.48163604736328, + "step": 7600 + }, + { + "epoch": 0.761, + "grad_norm": 0.0, + "learning_rate": 8.206985266925249e-07, + "logits/chosen": -1.140430212020874, + "logits/rejected": 1.0953062772750854, + "logps/chosen": -450.14691162109375, + "logps/rejected": -1547.922607421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.620083808898926, + "rewards/margins": 95.6705322265625, + "rewards/rejected": -110.2906265258789, + "step": 7610 + }, + { + "epoch": 0.762, + "grad_norm": 1.6068162069854874e-16, + "learning_rate": 8.142440310406923e-07, + "logits/chosen": -0.014678800478577614, + "logits/rejected": 0.8266127705574036, + "logps/chosen": -457.71075439453125, + "logps/rejected": -953.9225463867188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.222761154174805, + "rewards/margins": 48.474849700927734, + "rewards/rejected": -67.69761657714844, + "step": 7620 + }, + { + "epoch": 0.763, + "grad_norm": 0.0, + "learning_rate": 8.078100757933486e-07, + "logits/chosen": -0.9533795118331909, + "logits/rejected": 0.5999480485916138, + "logps/chosen": -447.9500427246094, + "logps/rejected": -1231.653564453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.345884323120117, + "rewards/margins": 77.30724334716797, + "rewards/rejected": -90.65313720703125, + "step": 7630 + }, + { + "epoch": 0.764, + "grad_norm": 0.0, + "learning_rate": 8.013967393462094e-07, + "logits/chosen": -0.3110244870185852, + "logits/rejected": 0.5950175523757935, + "logps/chosen": -463.6453552246094, + "logps/rejected": -975.3029174804688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.173898696899414, + "rewards/margins": 49.042076110839844, + "rewards/rejected": -68.21597290039062, + "step": 7640 + }, + { + "epoch": 0.765, + "grad_norm": 0.0, + "learning_rate": 7.950040998437541e-07, + "logits/chosen": -1.0438666343688965, + "logits/rejected": 0.9437441825866699, + "logps/chosen": -346.1753234863281, + "logps/rejected": -1227.1187744140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.079136848449707, + "rewards/margins": 74.47779083251953, + "rewards/rejected": -87.55693054199219, + "step": 7650 + }, + { + "epoch": 0.766, + "grad_norm": 2.848948995975536e-17, + "learning_rate": 7.886322351782782e-07, + "logits/chosen": -0.37577199935913086, + "logits/rejected": 0.2696637511253357, + "logps/chosen": -501.11114501953125, + "logps/rejected": -861.0324096679688, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.849464416503906, + "rewards/margins": 40.464820861816406, + "rewards/rejected": -53.31428909301758, + "step": 7660 + }, + { + "epoch": 0.767, + "grad_norm": 1.937213681569433e-13, + "learning_rate": 7.822812229889429e-07, + "logits/chosen": -0.6278411149978638, + "logits/rejected": 0.5903183817863464, + "logps/chosen": -349.5834045410156, + "logps/rejected": -1101.945556640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.024063110351562, + "rewards/margins": 65.76557922363281, + "rewards/rejected": -82.78965759277344, + "step": 7670 + }, + { + "epoch": 0.768, + "grad_norm": 0.0, + "learning_rate": 7.759511406608255e-07, + "logits/chosen": -0.25402379035949707, + "logits/rejected": 0.7397147417068481, + "logps/chosen": -428.45892333984375, + "logps/rejected": -1207.371337890625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.313592910766602, + "rewards/margins": 73.54627990722656, + "rewards/rejected": -88.85987091064453, + "step": 7680 + }, + { + "epoch": 0.769, + "grad_norm": 0.0, + "learning_rate": 7.696420653239834e-07, + "logits/chosen": -0.5498959422111511, + "logits/rejected": 1.0125188827514648, + "logps/chosen": -429.74560546875, + "logps/rejected": -1378.413330078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.181983947753906, + "rewards/margins": 90.03840637207031, + "rewards/rejected": -106.22039794921875, + "step": 7690 + }, + { + "epoch": 0.77, + "grad_norm": 2.008980715118014e-07, + "learning_rate": 7.633540738525066e-07, + "logits/chosen": -0.6189178228378296, + "logits/rejected": 0.47738155722618103, + "logps/chosen": -677.8727416992188, + "logps/rejected": -1215.9979248046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -25.010875701904297, + "rewards/margins": 50.56502151489258, + "rewards/rejected": -75.57588958740234, + "step": 7700 + }, + { + "epoch": 0.771, + "grad_norm": 8.869028939606787e-16, + "learning_rate": 7.57087242863589e-07, + "logits/chosen": -0.6059373617172241, + "logits/rejected": 0.5790004730224609, + "logps/chosen": -365.8356018066406, + "logps/rejected": -1020.9505615234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.18605613708496, + "rewards/margins": 53.259193420410156, + "rewards/rejected": -69.44524383544922, + "step": 7710 + }, + { + "epoch": 0.772, + "grad_norm": 0.0, + "learning_rate": 7.508416487165864e-07, + "logits/chosen": -0.5776602625846863, + "logits/rejected": 0.7393280267715454, + "logps/chosen": -298.1356506347656, + "logps/rejected": -1209.6595458984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.489774703979492, + "rewards/margins": 76.7217025756836, + "rewards/rejected": -89.21147918701172, + "step": 7720 + }, + { + "epoch": 0.773, + "grad_norm": 9.048666859879866e-18, + "learning_rate": 7.446173675120943e-07, + "logits/chosen": -0.45593494176864624, + "logits/rejected": 0.6403728127479553, + "logps/chosen": -461.8550720214844, + "logps/rejected": -1142.08642578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.827007293701172, + "rewards/margins": 60.17173385620117, + "rewards/rejected": -79.99874114990234, + "step": 7730 + }, + { + "epoch": 0.774, + "grad_norm": 3.2634135614775815e-22, + "learning_rate": 7.384144750910133e-07, + "logits/chosen": -0.45879751443862915, + "logits/rejected": 0.7685213088989258, + "logps/chosen": -507.66021728515625, + "logps/rejected": -1147.2984619140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.178197860717773, + "rewards/margins": 65.04210662841797, + "rewards/rejected": -80.22030639648438, + "step": 7740 + }, + { + "epoch": 0.775, + "grad_norm": 0.0, + "learning_rate": 7.322330470336314e-07, + "logits/chosen": -0.6126856803894043, + "logits/rejected": 0.41923293471336365, + "logps/chosen": -483.3583984375, + "logps/rejected": -1216.6903076171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.74558448791504, + "rewards/margins": 65.92410278320312, + "rewards/rejected": -82.66969299316406, + "step": 7750 + }, + { + "epoch": 0.776, + "grad_norm": 0.0, + "learning_rate": 7.260731586586983e-07, + "logits/chosen": -0.5855390429496765, + "logits/rejected": 1.0094908475875854, + "logps/chosen": -311.06524658203125, + "logps/rejected": -1248.450439453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.760951042175293, + "rewards/margins": 84.68740844726562, + "rewards/rejected": -94.44835662841797, + "step": 7760 + }, + { + "epoch": 0.777, + "grad_norm": 0.0, + "learning_rate": 7.199348850225091e-07, + "logits/chosen": -0.4212276339530945, + "logits/rejected": 0.845160961151123, + "logps/chosen": -426.73419189453125, + "logps/rejected": -1159.7073974609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.290374755859375, + "rewards/margins": 66.98060607910156, + "rewards/rejected": -85.27098083496094, + "step": 7770 + }, + { + "epoch": 0.778, + "grad_norm": 0.0, + "learning_rate": 7.138183009179922e-07, + "logits/chosen": -0.6795636415481567, + "logits/rejected": 0.7848717570304871, + "logps/chosen": -327.3196105957031, + "logps/rejected": -966.79541015625, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.527212142944336, + "rewards/margins": 55.81372833251953, + "rewards/rejected": -71.34093475341797, + "step": 7780 + }, + { + "epoch": 0.779, + "grad_norm": 0.0, + "learning_rate": 7.077234808737932e-07, + "logits/chosen": -0.38788530230522156, + "logits/rejected": 0.15900078415870667, + "logps/chosen": -410.41839599609375, + "logps/rejected": -959.5905151367188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.295787811279297, + "rewards/margins": 53.696380615234375, + "rewards/rejected": -70.9921646118164, + "step": 7790 + }, + { + "epoch": 0.78, + "grad_norm": 2.8316051218189864e-13, + "learning_rate": 7.016504991533727e-07, + "logits/chosen": -0.39930716156959534, + "logits/rejected": 0.008740996941924095, + "logps/chosen": -597.921142578125, + "logps/rejected": -835.9523315429688, + "loss": 0.4145, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.053123474121094, + "rewards/margins": 34.08021926879883, + "rewards/rejected": -55.133338928222656, + "step": 7800 + }, + { + "epoch": 0.781, + "grad_norm": 0.0, + "learning_rate": 6.955994297540947e-07, + "logits/chosen": -0.7211162447929382, + "logits/rejected": 0.7454463243484497, + "logps/chosen": -369.17742919921875, + "logps/rejected": -1076.2313232421875, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.157571792602539, + "rewards/margins": 62.682777404785156, + "rewards/rejected": -75.84034729003906, + "step": 7810 + }, + { + "epoch": 0.782, + "grad_norm": 1.4056424946110474e-21, + "learning_rate": 6.895703464063319e-07, + "logits/chosen": -0.04733237624168396, + "logits/rejected": 0.5720465183258057, + "logps/chosen": -285.1549987792969, + "logps/rejected": -922.0696411132812, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.364412307739258, + "rewards/margins": 59.26238250732422, + "rewards/rejected": -75.62680053710938, + "step": 7820 + }, + { + "epoch": 0.783, + "grad_norm": 1.2598473581162394e-18, + "learning_rate": 6.835633225725604e-07, + "logits/chosen": -0.9407356381416321, + "logits/rejected": 0.767217755317688, + "logps/chosen": -441.0362854003906, + "logps/rejected": -1393.307373046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.530881881713867, + "rewards/margins": 87.36241149902344, + "rewards/rejected": -104.8932876586914, + "step": 7830 + }, + { + "epoch": 0.784, + "grad_norm": 0.0, + "learning_rate": 6.775784314464717e-07, + "logits/chosen": -1.044764757156372, + "logits/rejected": 1.2633781433105469, + "logps/chosen": -280.1641845703125, + "logps/rejected": -1321.974365234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.51031494140625, + "rewards/margins": 84.8749008178711, + "rewards/rejected": -104.38520812988281, + "step": 7840 + }, + { + "epoch": 0.785, + "grad_norm": 0.0, + "learning_rate": 6.716157459520739e-07, + "logits/chosen": -0.7862663865089417, + "logits/rejected": 0.907000720500946, + "logps/chosen": -588.1331176757812, + "logps/rejected": -1509.89501953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.144685745239258, + "rewards/margins": 90.22511291503906, + "rewards/rejected": -107.36979675292969, + "step": 7850 + }, + { + "epoch": 0.786, + "grad_norm": 0.0, + "learning_rate": 6.656753387428089e-07, + "logits/chosen": -0.7701439261436462, + "logits/rejected": 1.0488814115524292, + "logps/chosen": -291.90594482421875, + "logps/rejected": -1312.0511474609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.93437385559082, + "rewards/margins": 87.22757720947266, + "rewards/rejected": -101.16195678710938, + "step": 7860 + }, + { + "epoch": 0.787, + "grad_norm": 0.0, + "learning_rate": 6.597572822006643e-07, + "logits/chosen": -1.2947529554367065, + "logits/rejected": 1.1008408069610596, + "logps/chosen": -359.7368469238281, + "logps/rejected": -1567.0120849609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.875795364379883, + "rewards/margins": 97.81360626220703, + "rewards/rejected": -112.68940734863281, + "step": 7870 + }, + { + "epoch": 0.788, + "grad_norm": 3.3539895861166484e-20, + "learning_rate": 6.538616484352902e-07, + "logits/chosen": -0.8063453435897827, + "logits/rejected": 0.6678211092948914, + "logps/chosen": -332.2959289550781, + "logps/rejected": -1273.5902099609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.185787200927734, + "rewards/margins": 73.93902587890625, + "rewards/rejected": -91.12480926513672, + "step": 7880 + }, + { + "epoch": 0.789, + "grad_norm": 0.0, + "learning_rate": 6.479885092831248e-07, + "logits/chosen": -0.667913019657135, + "logits/rejected": 0.936458945274353, + "logps/chosen": -701.1113891601562, + "logps/rejected": -1564.383544921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -28.8167667388916, + "rewards/margins": 81.56040954589844, + "rewards/rejected": -110.3771743774414, + "step": 7890 + }, + { + "epoch": 0.79, + "grad_norm": 0.0, + "learning_rate": 6.421379363065142e-07, + "logits/chosen": -0.3396407663822174, + "logits/rejected": 0.5185993313789368, + "logps/chosen": -385.85736083984375, + "logps/rejected": -1270.9588623046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.686500549316406, + "rewards/margins": 87.16119384765625, + "rewards/rejected": -104.84769439697266, + "step": 7900 + }, + { + "epoch": 0.791, + "grad_norm": 2.754020567152793e-09, + "learning_rate": 6.363100007928447e-07, + "logits/chosen": -0.6791292428970337, + "logits/rejected": 0.6307904124259949, + "logps/chosen": -474.0777282714844, + "logps/rejected": -1043.2554931640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.698204040527344, + "rewards/margins": 49.53090286254883, + "rewards/rejected": -65.22911071777344, + "step": 7910 + }, + { + "epoch": 0.792, + "grad_norm": 0.0, + "learning_rate": 6.305047737536707e-07, + "logits/chosen": -0.8387699127197266, + "logits/rejected": 0.6438099145889282, + "logps/chosen": -380.71185302734375, + "logps/rejected": -1177.8743896484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.04198455810547, + "rewards/margins": 71.42205047607422, + "rewards/rejected": -91.46404266357422, + "step": 7920 + }, + { + "epoch": 0.793, + "grad_norm": 0.0, + "learning_rate": 6.247223259238513e-07, + "logits/chosen": -0.8732248544692993, + "logits/rejected": 1.2813969850540161, + "logps/chosen": -402.7436218261719, + "logps/rejected": -1218.497314453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.730911254882812, + "rewards/margins": 68.49177551269531, + "rewards/rejected": -87.2226791381836, + "step": 7930 + }, + { + "epoch": 0.794, + "grad_norm": 0.0, + "learning_rate": 6.189627277606894e-07, + "logits/chosen": -0.6855721473693848, + "logits/rejected": 0.9724220037460327, + "logps/chosen": -359.7814636230469, + "logps/rejected": -1313.208251953125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.863734245300293, + "rewards/margins": 77.40478515625, + "rewards/rejected": -93.2685317993164, + "step": 7940 + }, + { + "epoch": 0.795, + "grad_norm": 0.0, + "learning_rate": 6.1322604944307e-07, + "logits/chosen": -0.5548166632652283, + "logits/rejected": 0.7209320664405823, + "logps/chosen": -434.252685546875, + "logps/rejected": -1383.366943359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.734821319580078, + "rewards/margins": 82.97564697265625, + "rewards/rejected": -102.7104721069336, + "step": 7950 + }, + { + "epoch": 0.796, + "grad_norm": 0.0, + "learning_rate": 6.075123608706093e-07, + "logits/chosen": -0.6931466460227966, + "logits/rejected": 0.5858211517333984, + "logps/chosen": -304.97662353515625, + "logps/rejected": -1122.347900390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.999123573303223, + "rewards/margins": 73.74942779541016, + "rewards/rejected": -85.74855041503906, + "step": 7960 + }, + { + "epoch": 0.797, + "grad_norm": 0.0, + "learning_rate": 6.01821731662798e-07, + "logits/chosen": -0.5279142260551453, + "logits/rejected": 1.04735267162323, + "logps/chosen": -446.77001953125, + "logps/rejected": -1314.6702880859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.26654052734375, + "rewards/margins": 85.6667251586914, + "rewards/rejected": -102.93327331542969, + "step": 7970 + }, + { + "epoch": 0.798, + "grad_norm": 0.0, + "learning_rate": 5.961542311581586e-07, + "logits/chosen": -0.6521563529968262, + "logits/rejected": 0.8276292681694031, + "logps/chosen": -385.0362243652344, + "logps/rejected": -1391.097412109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.053596496582031, + "rewards/margins": 85.08977508544922, + "rewards/rejected": -100.14338684082031, + "step": 7980 + }, + { + "epoch": 0.799, + "grad_norm": 0.0, + "learning_rate": 5.905099284133953e-07, + "logits/chosen": -0.7128938436508179, + "logits/rejected": 0.9515337944030762, + "logps/chosen": -321.2559814453125, + "logps/rejected": -1249.6884765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.714630126953125, + "rewards/margins": 77.06490325927734, + "rewards/rejected": -91.779541015625, + "step": 7990 + }, + { + "epoch": 0.8, + "grad_norm": 5.697309621690086e-14, + "learning_rate": 5.848888922025553e-07, + "logits/chosen": -0.6935831904411316, + "logits/rejected": 1.0498263835906982, + "logps/chosen": -458.8016662597656, + "logps/rejected": -1300.8131103515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.941858291625977, + "rewards/margins": 80.65689086914062, + "rewards/rejected": -99.5987548828125, + "step": 8000 + }, + { + "epoch": 0.801, + "grad_norm": 0.0, + "learning_rate": 5.792911910161922e-07, + "logits/chosen": -0.32794898748397827, + "logits/rejected": 0.5728577375411987, + "logps/chosen": -395.8527526855469, + "logps/rejected": -1102.51953125, + "loss": 0.3056, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -15.5410795211792, + "rewards/margins": 65.24958801269531, + "rewards/rejected": -80.79066467285156, + "step": 8010 + }, + { + "epoch": 0.802, + "grad_norm": 2.4850881160414627e-19, + "learning_rate": 5.737168930605272e-07, + "logits/chosen": -0.7031986713409424, + "logits/rejected": 0.6816826462745667, + "logps/chosen": -258.4869384765625, + "logps/rejected": -1057.6505126953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.595638275146484, + "rewards/margins": 67.13885498046875, + "rewards/rejected": -77.7344970703125, + "step": 8020 + }, + { + "epoch": 0.803, + "grad_norm": 1.8630241446770945e-11, + "learning_rate": 5.681660662566225e-07, + "logits/chosen": -1.1092045307159424, + "logits/rejected": 1.042096734046936, + "logps/chosen": -316.0285339355469, + "logps/rejected": -1484.916259765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.377889633178711, + "rewards/margins": 94.16575622558594, + "rewards/rejected": -107.54365539550781, + "step": 8030 + }, + { + "epoch": 0.804, + "grad_norm": 0.0, + "learning_rate": 5.626387782395515e-07, + "logits/chosen": -0.5211082696914673, + "logits/rejected": 0.8625160455703735, + "logps/chosen": -279.7019348144531, + "logps/rejected": -1177.743896484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.55776309967041, + "rewards/margins": 74.29469299316406, + "rewards/rejected": -86.85245513916016, + "step": 8040 + }, + { + "epoch": 0.805, + "grad_norm": 1.746179100280054e-18, + "learning_rate": 5.571350963575728e-07, + "logits/chosen": -0.792542576789856, + "logits/rejected": 0.5339727401733398, + "logps/chosen": -340.4499816894531, + "logps/rejected": -1162.2421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.852516174316406, + "rewards/margins": 57.709815979003906, + "rewards/rejected": -74.56233215332031, + "step": 8050 + }, + { + "epoch": 0.806, + "grad_norm": 0.0, + "learning_rate": 5.516550876713142e-07, + "logits/chosen": -0.9227225184440613, + "logits/rejected": 0.7504435777664185, + "logps/chosen": -334.015625, + "logps/rejected": -1463.6187744140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.994755744934082, + "rewards/margins": 93.83234405517578, + "rewards/rejected": -106.82710266113281, + "step": 8060 + }, + { + "epoch": 0.807, + "grad_norm": 3.1763735522036263e-22, + "learning_rate": 5.461988189529529e-07, + "logits/chosen": -0.7259895205497742, + "logits/rejected": 0.7027822136878967, + "logps/chosen": -424.6468200683594, + "logps/rejected": -1181.9427490234375, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.932525634765625, + "rewards/margins": 67.4690170288086, + "rewards/rejected": -84.40153503417969, + "step": 8070 + }, + { + "epoch": 0.808, + "grad_norm": 0.0, + "learning_rate": 5.407663566854008e-07, + "logits/chosen": -0.6267033815383911, + "logits/rejected": 0.7194575071334839, + "logps/chosen": -426.80389404296875, + "logps/rejected": -1407.231201171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.166587829589844, + "rewards/margins": 86.96299743652344, + "rewards/rejected": -104.12960052490234, + "step": 8080 + }, + { + "epoch": 0.809, + "grad_norm": 4.3124832933603033e-17, + "learning_rate": 5.353577670614951e-07, + "logits/chosen": -0.5230453014373779, + "logits/rejected": 1.0515474081039429, + "logps/chosen": -363.8052062988281, + "logps/rejected": -1169.013671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.506147384643555, + "rewards/margins": 74.06648254394531, + "rewards/rejected": -92.57263946533203, + "step": 8090 + }, + { + "epoch": 0.81, + "grad_norm": 7.021869350865018e-06, + "learning_rate": 5.299731159831953e-07, + "logits/chosen": -1.17167329788208, + "logits/rejected": 0.774400532245636, + "logps/chosen": -375.32366943359375, + "logps/rejected": -1388.881103515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -22.649917602539062, + "rewards/margins": 77.11151123046875, + "rewards/rejected": -99.76142883300781, + "step": 8100 + }, + { + "epoch": 0.811, + "grad_norm": 8.09361074207104e-18, + "learning_rate": 5.24612469060774e-07, + "logits/chosen": -0.42869800329208374, + "logits/rejected": 0.7785958051681519, + "logps/chosen": -322.61175537109375, + "logps/rejected": -896.5892333984375, + "loss": 0.6231, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -14.81994915008545, + "rewards/margins": 45.00872039794922, + "rewards/rejected": -59.82868194580078, + "step": 8110 + }, + { + "epoch": 0.812, + "grad_norm": 0.0, + "learning_rate": 5.192758916120236e-07, + "logits/chosen": -0.291696161031723, + "logits/rejected": 0.6524218320846558, + "logps/chosen": -639.3327026367188, + "logps/rejected": -1339.4818115234375, + "loss": 0.0581, + "rewards/accuracies": 1.0, + "rewards/chosen": -23.298053741455078, + "rewards/margins": 75.19126892089844, + "rewards/rejected": -98.48933410644531, + "step": 8120 + }, + { + "epoch": 0.813, + "grad_norm": 0.0, + "learning_rate": 5.139634486614544e-07, + "logits/chosen": -0.8535143136978149, + "logits/rejected": 0.38042861223220825, + "logps/chosen": -614.6556396484375, + "logps/rejected": -1314.8524169921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.462032318115234, + "rewards/margins": 69.82005310058594, + "rewards/rejected": -87.2820816040039, + "step": 8130 + }, + { + "epoch": 0.814, + "grad_norm": 1.0093644127718106e-17, + "learning_rate": 5.086752049395097e-07, + "logits/chosen": -0.5521525740623474, + "logits/rejected": 0.7084673047065735, + "logps/chosen": -460.20452880859375, + "logps/rejected": -1026.403564453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.651652336120605, + "rewards/margins": 60.66973876953125, + "rewards/rejected": -71.32139587402344, + "step": 8140 + }, + { + "epoch": 0.815, + "grad_norm": 0.0, + "learning_rate": 5.034112248817685e-07, + "logits/chosen": -0.33179759979248047, + "logits/rejected": 0.34709256887435913, + "logps/chosen": -472.8624572753906, + "logps/rejected": -989.8903198242188, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.012979507446289, + "rewards/margins": 56.96765899658203, + "rewards/rejected": -71.98063659667969, + "step": 8150 + }, + { + "epoch": 0.816, + "grad_norm": 4.864253169706096e-19, + "learning_rate": 4.981715726281666e-07, + "logits/chosen": -0.70337975025177, + "logits/rejected": 0.5703068971633911, + "logps/chosen": -290.66314697265625, + "logps/rejected": -938.6148681640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.626321792602539, + "rewards/margins": 58.3425178527832, + "rewards/rejected": -67.96883392333984, + "step": 8160 + }, + { + "epoch": 0.817, + "grad_norm": 0.0, + "learning_rate": 4.929563120222142e-07, + "logits/chosen": -0.40683525800704956, + "logits/rejected": 0.7035341262817383, + "logps/chosen": -352.96405029296875, + "logps/rejected": -1056.8221435546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.423770904541016, + "rewards/margins": 65.76141357421875, + "rewards/rejected": -78.18519592285156, + "step": 8170 + }, + { + "epoch": 0.818, + "grad_norm": 1.2423618353016648e-17, + "learning_rate": 4.87765506610215e-07, + "logits/chosen": -0.3892587721347809, + "logits/rejected": 0.3781268894672394, + "logps/chosen": -630.7948608398438, + "logps/rejected": -1205.042724609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.79370403289795, + "rewards/margins": 66.3752212524414, + "rewards/rejected": -82.16891479492188, + "step": 8180 + }, + { + "epoch": 0.819, + "grad_norm": 0.0, + "learning_rate": 4.825992196404958e-07, + "logits/chosen": -0.958387017250061, + "logits/rejected": 0.9771413803100586, + "logps/chosen": -246.85403442382812, + "logps/rejected": -1029.256103515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.022276878356934, + "rewards/margins": 64.25834655761719, + "rewards/rejected": -76.28062438964844, + "step": 8190 + }, + { + "epoch": 0.82, + "grad_norm": 0.0, + "learning_rate": 4.774575140626317e-07, + "logits/chosen": -1.1014697551727295, + "logits/rejected": 0.7440928220748901, + "logps/chosen": -451.4864807128906, + "logps/rejected": -1390.8321533203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.898099899291992, + "rewards/margins": 77.3144760131836, + "rewards/rejected": -97.21258544921875, + "step": 8200 + }, + { + "epoch": 0.821, + "grad_norm": 0.0, + "learning_rate": 4.7234045252668393e-07, + "logits/chosen": -0.47737008333206177, + "logits/rejected": 0.47926831245422363, + "logps/chosen": -418.79327392578125, + "logps/rejected": -1029.6893310546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -23.025524139404297, + "rewards/margins": 53.46759796142578, + "rewards/rejected": -76.49312591552734, + "step": 8210 + }, + { + "epoch": 0.822, + "grad_norm": 3.9256209502687434e-14, + "learning_rate": 4.672480973824312e-07, + "logits/chosen": -0.9004614949226379, + "logits/rejected": 0.6222228407859802, + "logps/chosen": -295.7037048339844, + "logps/rejected": -1047.0849609375, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.201852798461914, + "rewards/margins": 66.40901947021484, + "rewards/rejected": -79.61087036132812, + "step": 8220 + }, + { + "epoch": 0.823, + "grad_norm": 0.0, + "learning_rate": 4.6218051067861423e-07, + "logits/chosen": -1.2087002992630005, + "logits/rejected": 0.8243007659912109, + "logps/chosen": -274.3550109863281, + "logps/rejected": -1274.161376953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.670193672180176, + "rewards/margins": 78.62632751464844, + "rewards/rejected": -87.2965316772461, + "step": 8230 + }, + { + "epoch": 0.824, + "grad_norm": 0.0, + "learning_rate": 4.5713775416217884e-07, + "logits/chosen": -0.5355249643325806, + "logits/rejected": 0.35910564661026, + "logps/chosen": -438.91094970703125, + "logps/rejected": -999.2205200195312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.278973579406738, + "rewards/margins": 58.531837463378906, + "rewards/rejected": -73.81080627441406, + "step": 8240 + }, + { + "epoch": 0.825, + "grad_norm": 0.0, + "learning_rate": 4.5211988927752026e-07, + "logits/chosen": -0.2851003408432007, + "logits/rejected": 0.439518541097641, + "logps/chosen": -613.4663696289062, + "logps/rejected": -1385.641845703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.915315628051758, + "rewards/margins": 81.16056823730469, + "rewards/rejected": -95.07588195800781, + "step": 8250 + }, + { + "epoch": 0.826, + "grad_norm": 0.0, + "learning_rate": 4.4712697716573994e-07, + "logits/chosen": -0.9923319816589355, + "logits/rejected": 0.5478132963180542, + "logps/chosen": -262.02734375, + "logps/rejected": -1090.7371826171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.208616256713867, + "rewards/margins": 70.6222152709961, + "rewards/rejected": -80.83082580566406, + "step": 8260 + }, + { + "epoch": 0.827, + "grad_norm": 3.422428153051528e-16, + "learning_rate": 4.421590786638952e-07, + "logits/chosen": -1.0225954055786133, + "logits/rejected": 0.7992622256278992, + "logps/chosen": -465.78814697265625, + "logps/rejected": -1263.924072265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.851648330688477, + "rewards/margins": 67.95602416992188, + "rewards/rejected": -82.80766296386719, + "step": 8270 + }, + { + "epoch": 0.828, + "grad_norm": 9.069101298121805e-16, + "learning_rate": 4.372162543042624e-07, + "logits/chosen": -0.723979115486145, + "logits/rejected": 0.7619308233261108, + "logps/chosen": -454.51763916015625, + "logps/rejected": -1181.912841796875, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.952386856079102, + "rewards/margins": 69.74320983886719, + "rewards/rejected": -85.69558715820312, + "step": 8280 + }, + { + "epoch": 0.829, + "grad_norm": 0.0, + "learning_rate": 4.3229856431359516e-07, + "logits/chosen": -0.6105092167854309, + "logits/rejected": 0.9017621278762817, + "logps/chosen": -449.3888244628906, + "logps/rejected": -1334.6583251953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.543269157409668, + "rewards/margins": 81.08878326416016, + "rewards/rejected": -96.6320571899414, + "step": 8290 + }, + { + "epoch": 0.83, + "grad_norm": 0.0, + "learning_rate": 4.27406068612396e-07, + "logits/chosen": -0.769940197467804, + "logits/rejected": 0.6583008766174316, + "logps/chosen": -528.0979614257812, + "logps/rejected": -1267.00048828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.954166412353516, + "rewards/margins": 64.58262634277344, + "rewards/rejected": -81.53678894042969, + "step": 8300 + }, + { + "epoch": 0.831, + "grad_norm": 0.0, + "learning_rate": 4.225388268141797e-07, + "logits/chosen": -0.6798024773597717, + "logits/rejected": 0.6938012838363647, + "logps/chosen": -287.25360107421875, + "logps/rejected": -1024.869384765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.446317672729492, + "rewards/margins": 64.9866714477539, + "rewards/rejected": -82.43299102783203, + "step": 8310 + }, + { + "epoch": 0.832, + "grad_norm": 0.0, + "learning_rate": 4.1769689822475147e-07, + "logits/chosen": -0.9560664296150208, + "logits/rejected": 0.6266263127326965, + "logps/chosen": -274.5146789550781, + "logps/rejected": -1223.1676025390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.361339569091797, + "rewards/margins": 76.72416687011719, + "rewards/rejected": -89.08551025390625, + "step": 8320 + }, + { + "epoch": 0.833, + "grad_norm": 1.053383248683648e-11, + "learning_rate": 4.12880341841484e-07, + "logits/chosen": -0.6871368288993835, + "logits/rejected": 0.6744714379310608, + "logps/chosen": -611.6934814453125, + "logps/rejected": -1321.7281494140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.083663940429688, + "rewards/margins": 68.05198669433594, + "rewards/rejected": -88.1356430053711, + "step": 8330 + }, + { + "epoch": 0.834, + "grad_norm": 0.0, + "learning_rate": 4.0808921635259595e-07, + "logits/chosen": -0.8540255427360535, + "logits/rejected": 0.6868570446968079, + "logps/chosen": -434.99517822265625, + "logps/rejected": -1390.0816650390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.621461868286133, + "rewards/margins": 85.2814712524414, + "rewards/rejected": -104.9029312133789, + "step": 8340 + }, + { + "epoch": 0.835, + "grad_norm": 5.242558923731424e-20, + "learning_rate": 4.033235801364402e-07, + "logits/chosen": -0.18468718230724335, + "logits/rejected": 0.8583782315254211, + "logps/chosen": -509.08551025390625, + "logps/rejected": -1399.2689208984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.07135772705078, + "rewards/margins": 88.06640625, + "rewards/rejected": -106.13775634765625, + "step": 8350 + }, + { + "epoch": 0.836, + "grad_norm": 6.893914701322501e-07, + "learning_rate": 3.9858349126078945e-07, + "logits/chosen": -0.6687838435173035, + "logits/rejected": 0.7791027426719666, + "logps/chosen": -348.71868896484375, + "logps/rejected": -1039.650390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.388275146484375, + "rewards/margins": 58.85942459106445, + "rewards/rejected": -73.24771118164062, + "step": 8360 + }, + { + "epoch": 0.837, + "grad_norm": 1.6544232650517188e-08, + "learning_rate": 3.938690074821314e-07, + "logits/chosen": -0.4394214153289795, + "logits/rejected": 0.9431339502334595, + "logps/chosen": -399.00872802734375, + "logps/rejected": -1156.00146484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.371219635009766, + "rewards/margins": 69.17243194580078, + "rewards/rejected": -85.54365539550781, + "step": 8370 + }, + { + "epoch": 0.838, + "grad_norm": 0.0, + "learning_rate": 3.891801862449629e-07, + "logits/chosen": -0.7251814603805542, + "logits/rejected": 0.5887377262115479, + "logps/chosen": -369.7410583496094, + "logps/rejected": -1277.55712890625, + "loss": 0.2999, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -13.028943061828613, + "rewards/margins": 79.9480209350586, + "rewards/rejected": -92.97696685791016, + "step": 8380 + }, + { + "epoch": 0.839, + "grad_norm": 2.708196821822914e-19, + "learning_rate": 3.8451708468109026e-07, + "logits/chosen": -0.7329934239387512, + "logits/rejected": 0.7062281370162964, + "logps/chosen": -432.52197265625, + "logps/rejected": -1181.098876953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.970266342163086, + "rewards/margins": 67.001220703125, + "rewards/rejected": -83.97148895263672, + "step": 8390 + }, + { + "epoch": 0.84, + "grad_norm": 0.0, + "learning_rate": 3.798797596089351e-07, + "logits/chosen": -1.2603354454040527, + "logits/rejected": 0.7970761060714722, + "logps/chosen": -318.9318542480469, + "logps/rejected": -1589.118408203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.669502258300781, + "rewards/margins": 99.53646087646484, + "rewards/rejected": -113.2059555053711, + "step": 8400 + }, + { + "epoch": 0.841, + "grad_norm": 0.0, + "learning_rate": 3.7526826753284065e-07, + "logits/chosen": -0.8607357740402222, + "logits/rejected": 0.9649609327316284, + "logps/chosen": -422.95477294921875, + "logps/rejected": -1409.5218505859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -22.955909729003906, + "rewards/margins": 81.52864074707031, + "rewards/rejected": -104.48453521728516, + "step": 8410 + }, + { + "epoch": 0.842, + "grad_norm": 0.0, + "learning_rate": 3.7068266464238085e-07, + "logits/chosen": -0.9515706300735474, + "logits/rejected": 0.6792925596237183, + "logps/chosen": -400.7596130371094, + "logps/rejected": -1421.045654296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.31169319152832, + "rewards/margins": 90.6186294555664, + "rewards/rejected": -102.93033599853516, + "step": 8420 + }, + { + "epoch": 0.843, + "grad_norm": 6.503150795644785e-10, + "learning_rate": 3.661230068116811e-07, + "logits/chosen": -0.9090437889099121, + "logits/rejected": 0.6916595697402954, + "logps/chosen": -412.43756103515625, + "logps/rejected": -1149.2764892578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.873706817626953, + "rewards/margins": 64.63572692871094, + "rewards/rejected": -81.5094223022461, + "step": 8430 + }, + { + "epoch": 0.844, + "grad_norm": 0.0, + "learning_rate": 3.615893495987335e-07, + "logits/chosen": -0.7333610653877258, + "logits/rejected": 0.6454871892929077, + "logps/chosen": -484.1697692871094, + "logps/rejected": -1221.397705078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.366098403930664, + "rewards/margins": 64.28791809082031, + "rewards/rejected": -81.6540298461914, + "step": 8440 + }, + { + "epoch": 0.845, + "grad_norm": 0.0, + "learning_rate": 3.5708174824471947e-07, + "logits/chosen": -0.8094123601913452, + "logits/rejected": 0.7109456062316895, + "logps/chosen": -383.9529724121094, + "logps/rejected": -1309.468017578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.436595916748047, + "rewards/margins": 81.26898193359375, + "rewards/rejected": -97.70558166503906, + "step": 8450 + }, + { + "epoch": 0.846, + "grad_norm": 0.0, + "learning_rate": 3.5260025767333894e-07, + "logits/chosen": -0.6134425401687622, + "logits/rejected": 0.8118023872375488, + "logps/chosen": -523.8150634765625, + "logps/rejected": -1320.671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -23.67328643798828, + "rewards/margins": 70.70545196533203, + "rewards/rejected": -94.37873840332031, + "step": 8460 + }, + { + "epoch": 0.847, + "grad_norm": 0.0, + "learning_rate": 3.481449324901412e-07, + "logits/chosen": -0.7447524666786194, + "logits/rejected": 1.0092899799346924, + "logps/chosen": -383.1685791015625, + "logps/rejected": -1373.7222900390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.77007484436035, + "rewards/margins": 84.83432006835938, + "rewards/rejected": -103.6043930053711, + "step": 8470 + }, + { + "epoch": 0.848, + "grad_norm": 2.2618764500270672e-11, + "learning_rate": 3.4371582698185636e-07, + "logits/chosen": -0.6285834312438965, + "logits/rejected": 0.22365979850292206, + "logps/chosen": -504.1358947753906, + "logps/rejected": -983.326171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.448890686035156, + "rewards/margins": 49.23127746582031, + "rewards/rejected": -67.68016052246094, + "step": 8480 + }, + { + "epoch": 0.849, + "grad_norm": 3.920732488671419e-11, + "learning_rate": 3.393129951157384e-07, + "logits/chosen": -0.7750714421272278, + "logits/rejected": 0.47617942094802856, + "logps/chosen": -256.8757019042969, + "logps/rejected": -1155.247314453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.384836196899414, + "rewards/margins": 75.5903549194336, + "rewards/rejected": -87.97518157958984, + "step": 8490 + }, + { + "epoch": 0.85, + "grad_norm": 0.0, + "learning_rate": 3.3493649053890325e-07, + "logits/chosen": -0.6838713884353638, + "logits/rejected": 0.6836413741111755, + "logps/chosen": -524.8140869140625, + "logps/rejected": -1271.4749755859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.020029067993164, + "rewards/margins": 75.66142272949219, + "rewards/rejected": -88.68144989013672, + "step": 8500 + }, + { + "epoch": 0.851, + "grad_norm": 0.0, + "learning_rate": 3.3058636657767927e-07, + "logits/chosen": -0.3944496810436249, + "logits/rejected": 0.7332874536514282, + "logps/chosen": -472.6421813964844, + "logps/rejected": -1221.7069091796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.113519668579102, + "rewards/margins": 76.74790954589844, + "rewards/rejected": -87.8614273071289, + "step": 8510 + }, + { + "epoch": 0.852, + "grad_norm": 6.442988373333725e-19, + "learning_rate": 3.262626762369525e-07, + "logits/chosen": -0.8572524785995483, + "logits/rejected": 0.6225007772445679, + "logps/chosen": -361.4649658203125, + "logps/rejected": -1145.927001953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.924433708190918, + "rewards/margins": 66.92786407470703, + "rewards/rejected": -82.852294921875, + "step": 8520 + }, + { + "epoch": 0.853, + "grad_norm": 8.605229723235297e-11, + "learning_rate": 3.219654721995266e-07, + "logits/chosen": -0.5762674808502197, + "logits/rejected": 0.42858409881591797, + "logps/chosen": -341.80096435546875, + "logps/rejected": -897.2291259765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.997732162475586, + "rewards/margins": 48.20355987548828, + "rewards/rejected": -66.20128631591797, + "step": 8530 + }, + { + "epoch": 0.854, + "grad_norm": 1.694723438472095e-20, + "learning_rate": 3.176948068254762e-07, + "logits/chosen": -0.9069119691848755, + "logits/rejected": 0.4635804295539856, + "logps/chosen": -293.68377685546875, + "logps/rejected": -1079.21826171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.914752006530762, + "rewards/margins": 67.25696563720703, + "rewards/rejected": -79.17171478271484, + "step": 8540 + }, + { + "epoch": 0.855, + "grad_norm": 0.0, + "learning_rate": 3.134507321515107e-07, + "logits/chosen": -1.0798920392990112, + "logits/rejected": 0.9683173894882202, + "logps/chosen": -320.7782287597656, + "logps/rejected": -1481.5472412109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.124308586120605, + "rewards/margins": 96.64144134521484, + "rewards/rejected": -109.76573181152344, + "step": 8550 + }, + { + "epoch": 0.856, + "grad_norm": 0.0, + "learning_rate": 3.0923329989034134e-07, + "logits/chosen": -0.7146934866905212, + "logits/rejected": 0.7739596366882324, + "logps/chosen": -322.8594665527344, + "logps/rejected": -1208.615478515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.003753662109375, + "rewards/margins": 80.2951889038086, + "rewards/rejected": -90.2989501953125, + "step": 8560 + }, + { + "epoch": 0.857, + "grad_norm": 0.0, + "learning_rate": 3.050425614300487e-07, + "logits/chosen": -0.807713508605957, + "logits/rejected": 0.5876402854919434, + "logps/chosen": -404.4460754394531, + "logps/rejected": -1126.020751953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.31097412109375, + "rewards/margins": 68.10762786865234, + "rewards/rejected": -84.4186019897461, + "step": 8570 + }, + { + "epoch": 0.858, + "grad_norm": 1.0998899002950119e-14, + "learning_rate": 3.0087856783345916e-07, + "logits/chosen": -0.5385525226593018, + "logits/rejected": 0.42952489852905273, + "logps/chosen": -628.0839233398438, + "logps/rejected": -1292.969482421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.087167739868164, + "rewards/margins": 66.9642105102539, + "rewards/rejected": -81.05137634277344, + "step": 8580 + }, + { + "epoch": 0.859, + "grad_norm": 0.0, + "learning_rate": 2.967413698375196e-07, + "logits/chosen": -1.0018390417099, + "logits/rejected": 0.8285702466964722, + "logps/chosen": -506.5467224121094, + "logps/rejected": -1452.560302734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.048295974731445, + "rewards/margins": 94.05846405029297, + "rewards/rejected": -109.10675048828125, + "step": 8590 + }, + { + "epoch": 0.86, + "grad_norm": 0.0, + "learning_rate": 2.9263101785268253e-07, + "logits/chosen": -0.26270899176597595, + "logits/rejected": 0.7826281785964966, + "logps/chosen": -354.89642333984375, + "logps/rejected": -1114.381591796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.83215045928955, + "rewards/margins": 69.8828353881836, + "rewards/rejected": -85.71498107910156, + "step": 8600 + }, + { + "epoch": 0.861, + "grad_norm": 0.0, + "learning_rate": 2.8854756196229017e-07, + "logits/chosen": -0.2683184742927551, + "logits/rejected": 0.3286227285861969, + "logps/chosen": -564.1398315429688, + "logps/rejected": -1226.2554931640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -30.448040008544922, + "rewards/margins": 64.25608825683594, + "rewards/rejected": -94.70413208007812, + "step": 8610 + }, + { + "epoch": 0.862, + "grad_norm": 0.0, + "learning_rate": 2.844910519219632e-07, + "logits/chosen": -0.8165909051895142, + "logits/rejected": 0.6119577288627625, + "logps/chosen": -444.7513122558594, + "logps/rejected": -1322.9306640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.9857177734375, + "rewards/margins": 77.33323669433594, + "rewards/rejected": -98.3189468383789, + "step": 8620 + }, + { + "epoch": 0.863, + "grad_norm": 0.0, + "learning_rate": 2.8046153715899695e-07, + "logits/chosen": -0.3791617751121521, + "logits/rejected": 0.6677260398864746, + "logps/chosen": -353.25006103515625, + "logps/rejected": -1090.166259765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.440378189086914, + "rewards/margins": 68.6875, + "rewards/rejected": -86.12787628173828, + "step": 8630 + }, + { + "epoch": 0.864, + "grad_norm": 0.0, + "learning_rate": 2.7645906677175594e-07, + "logits/chosen": -0.7198137044906616, + "logits/rejected": 0.9997223615646362, + "logps/chosen": -322.55755615234375, + "logps/rejected": -1218.536865234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.987421035766602, + "rewards/margins": 70.57664489746094, + "rewards/rejected": -84.56407165527344, + "step": 8640 + }, + { + "epoch": 0.865, + "grad_norm": 0.0, + "learning_rate": 2.7248368952908055e-07, + "logits/chosen": -0.34666958451271057, + "logits/rejected": 0.5319896340370178, + "logps/chosen": -393.0049743652344, + "logps/rejected": -1030.311767578125, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.975300788879395, + "rewards/margins": 60.349082946777344, + "rewards/rejected": -76.32437896728516, + "step": 8650 + }, + { + "epoch": 0.866, + "grad_norm": 0.0, + "learning_rate": 2.6853545386968607e-07, + "logits/chosen": -0.4963590204715729, + "logits/rejected": 0.9394834637641907, + "logps/chosen": -427.63958740234375, + "logps/rejected": -1305.783447265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.72681427001953, + "rewards/margins": 79.85389709472656, + "rewards/rejected": -97.58070373535156, + "step": 8660 + }, + { + "epoch": 0.867, + "grad_norm": 0.0, + "learning_rate": 2.6461440790157974e-07, + "logits/chosen": -0.7823432683944702, + "logits/rejected": 0.6521458625793457, + "logps/chosen": -349.2547607421875, + "logps/rejected": -1170.160400390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.58772850036621, + "rewards/margins": 69.0348892211914, + "rewards/rejected": -86.62260437011719, + "step": 8670 + }, + { + "epoch": 0.868, + "grad_norm": 0.0, + "learning_rate": 2.6072059940146775e-07, + "logits/chosen": -1.124406099319458, + "logits/rejected": 0.8629137277603149, + "logps/chosen": -329.8213806152344, + "logps/rejected": -1356.882080078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.49382495880127, + "rewards/margins": 81.21879577636719, + "rewards/rejected": -96.7126235961914, + "step": 8680 + }, + { + "epoch": 0.869, + "grad_norm": 0.0, + "learning_rate": 2.568540758141791e-07, + "logits/chosen": -1.092452883720398, + "logits/rejected": 0.8541895747184753, + "logps/chosen": -219.02163696289062, + "logps/rejected": -1051.510986328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.051870346069336, + "rewards/margins": 67.15811157226562, + "rewards/rejected": -79.20997619628906, + "step": 8690 + }, + { + "epoch": 0.87, + "grad_norm": 0.0, + "learning_rate": 2.53014884252083e-07, + "logits/chosen": -0.841894268989563, + "logits/rejected": 0.8022940754890442, + "logps/chosen": -382.36529541015625, + "logps/rejected": -1325.561279296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.09293556213379, + "rewards/margins": 80.35551452636719, + "rewards/rejected": -97.44845581054688, + "step": 8700 + }, + { + "epoch": 0.871, + "grad_norm": 5.792997215066887e-20, + "learning_rate": 2.492030714945162e-07, + "logits/chosen": -0.6809241771697998, + "logits/rejected": 0.5929954648017883, + "logps/chosen": -248.1226043701172, + "logps/rejected": -1119.2655029296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.076974868774414, + "rewards/margins": 76.9704360961914, + "rewards/rejected": -88.04740905761719, + "step": 8710 + }, + { + "epoch": 0.872, + "grad_norm": 1.9078297465225963e-16, + "learning_rate": 2.454186839872158e-07, + "logits/chosen": -0.5456374883651733, + "logits/rejected": 0.8816617727279663, + "logps/chosen": -416.0848083496094, + "logps/rejected": -1127.365966796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.77010726928711, + "rewards/margins": 67.05216979980469, + "rewards/rejected": -83.822265625, + "step": 8720 + }, + { + "epoch": 0.873, + "grad_norm": 0.0, + "learning_rate": 2.416617678417482e-07, + "logits/chosen": -1.4353996515274048, + "logits/rejected": 0.6446617841720581, + "logps/chosen": -272.59814453125, + "logps/rejected": -1227.3382568359375, + "loss": 0.4983, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -13.279558181762695, + "rewards/margins": 69.13825225830078, + "rewards/rejected": -82.41781616210938, + "step": 8730 + }, + { + "epoch": 0.874, + "grad_norm": 0.0, + "learning_rate": 2.3793236883495164e-07, + "logits/chosen": -0.7536391019821167, + "logits/rejected": 0.7938761711120605, + "logps/chosen": -371.52349853515625, + "logps/rejected": -1249.294677734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.000296592712402, + "rewards/margins": 82.83332061767578, + "rewards/rejected": -91.83361053466797, + "step": 8740 + }, + { + "epoch": 0.875, + "grad_norm": 0.0, + "learning_rate": 2.3423053240837518e-07, + "logits/chosen": -0.8754490613937378, + "logits/rejected": 0.880510151386261, + "logps/chosen": -293.3321228027344, + "logps/rejected": -993.5623168945312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.815601348876953, + "rewards/margins": 66.59302520751953, + "rewards/rejected": -75.40861511230469, + "step": 8750 + }, + { + "epoch": 0.876, + "grad_norm": 0.0, + "learning_rate": 2.3055630366772857e-07, + "logits/chosen": -0.5658458471298218, + "logits/rejected": 0.7149707078933716, + "logps/chosen": -543.1046752929688, + "logps/rejected": -1117.0491943359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.547900199890137, + "rewards/margins": 62.61817169189453, + "rewards/rejected": -76.16607666015625, + "step": 8760 + }, + { + "epoch": 0.877, + "grad_norm": 0.0, + "learning_rate": 2.269097273823287e-07, + "logits/chosen": -0.766351044178009, + "logits/rejected": 0.5960129499435425, + "logps/chosen": -310.980224609375, + "logps/rejected": -1305.391357421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.58404541015625, + "rewards/margins": 84.26557922363281, + "rewards/rejected": -98.84963989257812, + "step": 8770 + }, + { + "epoch": 0.878, + "grad_norm": 5.707560185288728e-19, + "learning_rate": 2.2329084798455747e-07, + "logits/chosen": -0.7957559823989868, + "logits/rejected": 0.573180079460144, + "logps/chosen": -346.6117248535156, + "logps/rejected": -1181.4512939453125, + "loss": 0.7611, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -13.210107803344727, + "rewards/margins": 76.22367858886719, + "rewards/rejected": -89.43378448486328, + "step": 8780 + }, + { + "epoch": 0.879, + "grad_norm": 2.2693656578486737e-13, + "learning_rate": 2.1969970956931762e-07, + "logits/chosen": -1.0760763883590698, + "logits/rejected": 0.9111539125442505, + "logps/chosen": -238.1721649169922, + "logps/rejected": -1140.282958984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.755216598510742, + "rewards/margins": 67.73008728027344, + "rewards/rejected": -80.48530578613281, + "step": 8790 + }, + { + "epoch": 0.88, + "grad_norm": 0.0, + "learning_rate": 2.1613635589349756e-07, + "logits/chosen": -0.5612384080886841, + "logits/rejected": 0.535578727722168, + "logps/chosen": -456.2445373535156, + "logps/rejected": -1187.373779296875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -23.860172271728516, + "rewards/margins": 63.09345626831055, + "rewards/rejected": -86.9536361694336, + "step": 8800 + }, + { + "epoch": 0.881, + "grad_norm": 0.0, + "learning_rate": 2.1260083037543817e-07, + "logits/chosen": -0.7693039178848267, + "logits/rejected": 0.8871771097183228, + "logps/chosen": -213.7225341796875, + "logps/rejected": -999.7437744140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.34305477142334, + "rewards/margins": 63.86836624145508, + "rewards/rejected": -73.21142578125, + "step": 8810 + }, + { + "epoch": 0.882, + "grad_norm": 0.0, + "learning_rate": 2.0909317609440093e-07, + "logits/chosen": -0.8072845339775085, + "logits/rejected": 1.0142757892608643, + "logps/chosen": -285.8123474121094, + "logps/rejected": -1325.4332275390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.264997482299805, + "rewards/margins": 89.09748840332031, + "rewards/rejected": -101.36248779296875, + "step": 8820 + }, + { + "epoch": 0.883, + "grad_norm": 1.6345581241948418e-15, + "learning_rate": 2.0561343579004716e-07, + "logits/chosen": -0.7068424820899963, + "logits/rejected": 0.4062952399253845, + "logps/chosen": -512.4677734375, + "logps/rejected": -1109.9837646484375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.541139602661133, + "rewards/margins": 57.4853630065918, + "rewards/rejected": -70.02650451660156, + "step": 8830 + }, + { + "epoch": 0.884, + "grad_norm": 5.385317886075214e-11, + "learning_rate": 2.0216165186191406e-07, + "logits/chosen": -0.7862073183059692, + "logits/rejected": 0.6577833890914917, + "logps/chosen": -203.6630859375, + "logps/rejected": -947.8547973632812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.80967903137207, + "rewards/margins": 59.02329635620117, + "rewards/rejected": -68.8329849243164, + "step": 8840 + }, + { + "epoch": 0.885, + "grad_norm": 0.0, + "learning_rate": 1.9873786636889908e-07, + "logits/chosen": -0.8603401184082031, + "logits/rejected": 0.7639999985694885, + "logps/chosen": -337.18145751953125, + "logps/rejected": -1197.7193603515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.979072570800781, + "rewards/margins": 70.65501403808594, + "rewards/rejected": -84.63409423828125, + "step": 8850 + }, + { + "epoch": 0.886, + "grad_norm": 5.867188956898417e-10, + "learning_rate": 1.95342121028749e-07, + "logits/chosen": -0.3125002980232239, + "logits/rejected": -0.0011765360832214355, + "logps/chosen": -694.6043090820312, + "logps/rejected": -1111.1990966796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.125167846679688, + "rewards/margins": 62.53199005126953, + "rewards/rejected": -76.65715026855469, + "step": 8860 + }, + { + "epoch": 0.887, + "grad_norm": 0.000368534674635157, + "learning_rate": 1.9197445721754777e-07, + "logits/chosen": -1.0669571161270142, + "logits/rejected": 0.7461029887199402, + "logps/chosen": -293.17388916015625, + "logps/rejected": -1265.558349609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.657773971557617, + "rewards/margins": 76.96913146972656, + "rewards/rejected": -89.62691497802734, + "step": 8870 + }, + { + "epoch": 0.888, + "grad_norm": 6.757189956331666e-20, + "learning_rate": 1.8863491596921745e-07, + "logits/chosen": -0.6019073128700256, + "logits/rejected": 0.6984752416610718, + "logps/chosen": -375.70751953125, + "logps/rejected": -1243.9033203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.6553955078125, + "rewards/margins": 80.62157440185547, + "rewards/rejected": -96.27696990966797, + "step": 8880 + }, + { + "epoch": 0.889, + "grad_norm": 0.0, + "learning_rate": 1.8532353797501318e-07, + "logits/chosen": -0.36620140075683594, + "logits/rejected": 0.5800802111625671, + "logps/chosen": -458.1512756347656, + "logps/rejected": -996.4847412109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.343849182128906, + "rewards/margins": 56.926979064941406, + "rewards/rejected": -71.27082824707031, + "step": 8890 + }, + { + "epoch": 0.89, + "grad_norm": 0.0, + "learning_rate": 1.8204036358303173e-07, + "logits/chosen": -0.6936923265457153, + "logits/rejected": 0.49889129400253296, + "logps/chosen": -288.4562683105469, + "logps/rejected": -958.9161987304688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.709049224853516, + "rewards/margins": 58.544578552246094, + "rewards/rejected": -75.25361633300781, + "step": 8900 + }, + { + "epoch": 0.891, + "grad_norm": 0.0, + "learning_rate": 1.787854327977162e-07, + "logits/chosen": -0.9275741577148438, + "logits/rejected": 0.6485947370529175, + "logps/chosen": -330.7060546875, + "logps/rejected": -1296.8360595703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.509262084960938, + "rewards/margins": 77.84185791015625, + "rewards/rejected": -93.35111236572266, + "step": 8910 + }, + { + "epoch": 0.892, + "grad_norm": 0.0, + "learning_rate": 1.7555878527937164e-07, + "logits/chosen": -1.2011134624481201, + "logits/rejected": 0.6075594425201416, + "logps/chosen": -312.7022705078125, + "logps/rejected": -1189.131103515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.9640474319458, + "rewards/margins": 70.05687713623047, + "rewards/rejected": -83.02093505859375, + "step": 8920 + }, + { + "epoch": 0.893, + "grad_norm": 7.314049539630663e-14, + "learning_rate": 1.7236046034367959e-07, + "logits/chosen": -0.8679525256156921, + "logits/rejected": 0.5001112222671509, + "logps/chosen": -436.89337158203125, + "logps/rejected": -974.1031494140625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.955209732055664, + "rewards/margins": 54.162025451660156, + "rewards/rejected": -68.11723327636719, + "step": 8930 + }, + { + "epoch": 0.894, + "grad_norm": 6.464930812910552e-09, + "learning_rate": 1.6919049696121957e-07, + "logits/chosen": -0.7242705225944519, + "logits/rejected": 0.9949488639831543, + "logps/chosen": -402.5562744140625, + "logps/rejected": -1276.095947265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.125904083251953, + "rewards/margins": 72.64402770996094, + "rewards/rejected": -88.76992797851562, + "step": 8940 + }, + { + "epoch": 0.895, + "grad_norm": 0.0, + "learning_rate": 1.6604893375699594e-07, + "logits/chosen": -0.8151917457580566, + "logits/rejected": 0.576383650302887, + "logps/chosen": -461.36309814453125, + "logps/rejected": -1293.8941650390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.251383781433105, + "rewards/margins": 72.68366241455078, + "rewards/rejected": -87.93505096435547, + "step": 8950 + }, + { + "epoch": 0.896, + "grad_norm": 1.5773185534310555e-18, + "learning_rate": 1.629358090099639e-07, + "logits/chosen": -0.7810664772987366, + "logits/rejected": 0.6394819021224976, + "logps/chosen": -414.66259765625, + "logps/rejected": -1331.1300048828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.047119140625, + "rewards/margins": 71.95145416259766, + "rewards/rejected": -88.99857330322266, + "step": 8960 + }, + { + "epoch": 0.897, + "grad_norm": 0.0, + "learning_rate": 1.5985116065256683e-07, + "logits/chosen": -0.6492348909378052, + "logits/rejected": 0.7947621941566467, + "logps/chosen": -443.70635986328125, + "logps/rejected": -1371.603271484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.04248332977295, + "rewards/margins": 90.69773864746094, + "rewards/rejected": -104.740234375, + "step": 8970 + }, + { + "epoch": 0.898, + "grad_norm": 1.7848679588031597e-16, + "learning_rate": 1.567950262702714e-07, + "logits/chosen": -0.4879019260406494, + "logits/rejected": 0.915458083152771, + "logps/chosen": -289.3120422363281, + "logps/rejected": -1085.024658203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.671075820922852, + "rewards/margins": 72.11126708984375, + "rewards/rejected": -85.78233337402344, + "step": 8980 + }, + { + "epoch": 0.899, + "grad_norm": 0.0, + "learning_rate": 1.5376744310111019e-07, + "logits/chosen": -0.7717766761779785, + "logits/rejected": 0.8272747993469238, + "logps/chosen": -352.9714050292969, + "logps/rejected": -1321.5255126953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.448582649230957, + "rewards/margins": 82.11119079589844, + "rewards/rejected": -96.55977630615234, + "step": 8990 + }, + { + "epoch": 0.9, + "grad_norm": 0.0, + "learning_rate": 1.507684480352292e-07, + "logits/chosen": -0.7919676303863525, + "logits/rejected": 0.6880122423171997, + "logps/chosen": -229.85098266601562, + "logps/rejected": -1107.984130859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.7733154296875, + "rewards/margins": 74.07725524902344, + "rewards/rejected": -82.85057067871094, + "step": 9000 + }, + { + "epoch": 0.901, + "grad_norm": 0.0, + "learning_rate": 1.4779807761443638e-07, + "logits/chosen": -0.7188352346420288, + "logits/rejected": -0.2085111141204834, + "logps/chosen": -403.20623779296875, + "logps/rejected": -886.162109375, + "loss": 0.6933, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -13.076815605163574, + "rewards/margins": 49.329856872558594, + "rewards/rejected": -62.40666961669922, + "step": 9010 + }, + { + "epoch": 0.902, + "grad_norm": 0.0, + "learning_rate": 1.4485636803175828e-07, + "logits/chosen": -1.0808542966842651, + "logits/rejected": 0.4395454525947571, + "logps/chosen": -299.3458557128906, + "logps/rejected": -1083.4163818359375, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.202362060546875, + "rewards/margins": 69.3813705444336, + "rewards/rejected": -80.58373260498047, + "step": 9020 + }, + { + "epoch": 0.903, + "grad_norm": 0.0, + "learning_rate": 1.419433551309976e-07, + "logits/chosen": -0.831488311290741, + "logits/rejected": 0.843484103679657, + "logps/chosen": -359.81561279296875, + "logps/rejected": -1077.2774658203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.43194580078125, + "rewards/margins": 61.063232421875, + "rewards/rejected": -74.49516296386719, + "step": 9030 + }, + { + "epoch": 0.904, + "grad_norm": 2.340053706362255e-10, + "learning_rate": 1.3905907440629752e-07, + "logits/chosen": -0.6054459810256958, + "logits/rejected": 0.43269747495651245, + "logps/chosen": -384.861083984375, + "logps/rejected": -831.53076171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.977055549621582, + "rewards/margins": 45.349388122558594, + "rewards/rejected": -60.326438903808594, + "step": 9040 + }, + { + "epoch": 0.905, + "grad_norm": 3.8065961743351227e-17, + "learning_rate": 1.362035610017079e-07, + "logits/chosen": -0.4957023561000824, + "logits/rejected": 0.2642466425895691, + "logps/chosen": -495.8026428222656, + "logps/rejected": -1130.28759765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.497753143310547, + "rewards/margins": 60.88861846923828, + "rewards/rejected": -77.3863754272461, + "step": 9050 + }, + { + "epoch": 0.906, + "grad_norm": 2.3743223557877146e-17, + "learning_rate": 1.3337684971075932e-07, + "logits/chosen": -0.7933815717697144, + "logits/rejected": 0.6370527744293213, + "logps/chosen": -235.04129028320312, + "logps/rejected": -1101.3182373046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.383734703063965, + "rewards/margins": 78.91749572753906, + "rewards/rejected": -91.30121612548828, + "step": 9060 + }, + { + "epoch": 0.907, + "grad_norm": 0.0, + "learning_rate": 1.305789749760361e-07, + "logits/chosen": -0.6855477094650269, + "logits/rejected": 0.779290497303009, + "logps/chosen": -388.5061950683594, + "logps/rejected": -1220.8798828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.840558052062988, + "rewards/margins": 76.36566162109375, + "rewards/rejected": -88.20621490478516, + "step": 9070 + }, + { + "epoch": 0.908, + "grad_norm": 1.1141914078643608e-13, + "learning_rate": 1.278099708887587e-07, + "logits/chosen": -0.7415876984596252, + "logits/rejected": 0.4227770268917084, + "logps/chosen": -351.1391296386719, + "logps/rejected": -1024.048095703125, + "loss": 0.1366, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -13.448495864868164, + "rewards/margins": 56.8670768737793, + "rewards/rejected": -70.3155746459961, + "step": 9080 + }, + { + "epoch": 0.909, + "grad_norm": 0.0, + "learning_rate": 1.2506987118836912e-07, + "logits/chosen": -0.6299499273300171, + "logits/rejected": 0.7337206602096558, + "logps/chosen": -351.2576599121094, + "logps/rejected": -1170.62255859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.964860916137695, + "rewards/margins": 70.13188934326172, + "rewards/rejected": -85.09674835205078, + "step": 9090 + }, + { + "epoch": 0.91, + "grad_norm": 7.363024984640906e-14, + "learning_rate": 1.223587092621162e-07, + "logits/chosen": -0.21692593395709991, + "logits/rejected": 0.2910541892051697, + "logps/chosen": -529.19873046875, + "logps/rejected": -1077.606201171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.683971405029297, + "rewards/margins": 62.53369140625, + "rewards/rejected": -82.21766662597656, + "step": 9100 + }, + { + "epoch": 0.911, + "grad_norm": 3.743392066509216e-23, + "learning_rate": 1.1967651814465353e-07, + "logits/chosen": -0.5983961820602417, + "logits/rejected": 0.39085835218429565, + "logps/chosen": -580.361572265625, + "logps/rejected": -1208.87451171875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.13195514678955, + "rewards/margins": 64.27543640136719, + "rewards/rejected": -79.40740203857422, + "step": 9110 + }, + { + "epoch": 0.912, + "grad_norm": 5.2141750120031276e-17, + "learning_rate": 1.1702333051763271e-07, + "logits/chosen": -0.702211856842041, + "logits/rejected": 0.6391454935073853, + "logps/chosen": -380.1633605957031, + "logps/rejected": -1212.1474609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.09587574005127, + "rewards/margins": 75.67768096923828, + "rewards/rejected": -84.77355194091797, + "step": 9120 + }, + { + "epoch": 0.913, + "grad_norm": 0.0, + "learning_rate": 1.1439917870930795e-07, + "logits/chosen": -0.5169418454170227, + "logits/rejected": 0.09019921720027924, + "logps/chosen": -513.6130981445312, + "logps/rejected": -1079.996337890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.596075057983398, + "rewards/margins": 60.556312561035156, + "rewards/rejected": -75.15238952636719, + "step": 9130 + }, + { + "epoch": 0.914, + "grad_norm": 0.0, + "learning_rate": 1.1180409469414094e-07, + "logits/chosen": -0.6889699697494507, + "logits/rejected": 0.548774778842926, + "logps/chosen": -328.4002685546875, + "logps/rejected": -948.4647216796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.28093147277832, + "rewards/margins": 53.268798828125, + "rewards/rejected": -65.54973602294922, + "step": 9140 + }, + { + "epoch": 0.915, + "grad_norm": 0.0, + "learning_rate": 1.0923811009241142e-07, + "logits/chosen": -0.7122704386711121, + "logits/rejected": 0.8708732724189758, + "logps/chosen": -335.0247497558594, + "logps/rejected": -1252.97265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.75001049041748, + "rewards/margins": 78.19270324707031, + "rewards/rejected": -89.94271850585938, + "step": 9150 + }, + { + "epoch": 0.916, + "grad_norm": 3.938313189073678e-18, + "learning_rate": 1.067012561698319e-07, + "logits/chosen": -0.7977027297019958, + "logits/rejected": 0.4715547561645508, + "logps/chosen": -382.7290344238281, + "logps/rejected": -1013.48046875, + "loss": 0.0247, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.897479057312012, + "rewards/margins": 54.91753005981445, + "rewards/rejected": -70.81500244140625, + "step": 9160 + }, + { + "epoch": 0.917, + "grad_norm": 0.0, + "learning_rate": 1.041935638371669e-07, + "logits/chosen": -0.5972322225570679, + "logits/rejected": 0.8759559392929077, + "logps/chosen": -480.037353515625, + "logps/rejected": -1522.079345703125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.036136627197266, + "rewards/margins": 93.04458618164062, + "rewards/rejected": -112.0807113647461, + "step": 9170 + }, + { + "epoch": 0.918, + "grad_norm": 0.0, + "learning_rate": 1.0171506364985622e-07, + "logits/chosen": -0.6566218733787537, + "logits/rejected": 0.7978938817977905, + "logps/chosen": -257.0160827636719, + "logps/rejected": -1240.74462890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.77754020690918, + "rewards/margins": 84.71733093261719, + "rewards/rejected": -99.49488067626953, + "step": 9180 + }, + { + "epoch": 0.919, + "grad_norm": 0.0, + "learning_rate": 9.926578580764234e-08, + "logits/chosen": -0.6492460370063782, + "logits/rejected": 0.5774198770523071, + "logps/chosen": -416.5879821777344, + "logps/rejected": -1290.6796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.999757766723633, + "rewards/margins": 77.02781677246094, + "rewards/rejected": -93.02757263183594, + "step": 9190 + }, + { + "epoch": 0.92, + "grad_norm": 0.0, + "learning_rate": 9.684576015420277e-08, + "logits/chosen": -0.8115363121032715, + "logits/rejected": 0.26468801498413086, + "logps/chosen": -296.90667724609375, + "logps/rejected": -1148.5294189453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.467525482177734, + "rewards/margins": 76.24222564697266, + "rewards/rejected": -88.70976257324219, + "step": 9200 + }, + { + "epoch": 0.921, + "grad_norm": 0.0, + "learning_rate": 9.445501617678654e-08, + "logits/chosen": -0.8983446955680847, + "logits/rejected": 0.6710523962974548, + "logps/chosen": -518.798828125, + "logps/rejected": -1462.599365234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.77184009552002, + "rewards/margins": 87.92976379394531, + "rewards/rejected": -100.70159912109375, + "step": 9210 + }, + { + "epoch": 0.922, + "grad_norm": 0.0, + "learning_rate": 9.209358300585474e-08, + "logits/chosen": -0.9570168256759644, + "logits/rejected": 0.8016937375068665, + "logps/chosen": -448.8213806152344, + "logps/rejected": -1629.7894287109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.019401550292969, + "rewards/margins": 96.5383529663086, + "rewards/rejected": -111.55775451660156, + "step": 9220 + }, + { + "epoch": 0.923, + "grad_norm": 0.0, + "learning_rate": 8.9761489414725e-08, + "logits/chosen": -0.8167027235031128, + "logits/rejected": 0.7627574801445007, + "logps/chosen": -385.311767578125, + "logps/rejected": -1270.958251953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.449134826660156, + "rewards/margins": 77.73607635498047, + "rewards/rejected": -91.18521118164062, + "step": 9230 + }, + { + "epoch": 0.924, + "grad_norm": 3.164236744665942e-19, + "learning_rate": 8.745876381922147e-08, + "logits/chosen": -0.8649279475212097, + "logits/rejected": 0.11709287017583847, + "logps/chosen": -489.70123291015625, + "logps/rejected": -1005.6638793945312, + "loss": 0.8828, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.700468063354492, + "rewards/margins": 52.23369216918945, + "rewards/rejected": -69.93415832519531, + "step": 9240 + }, + { + "epoch": 0.925, + "grad_norm": 0.0, + "learning_rate": 8.518543427732951e-08, + "logits/chosen": -1.1081702709197998, + "logits/rejected": 0.8128757476806641, + "logps/chosen": -323.5766296386719, + "logps/rejected": -1218.4962158203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.58479118347168, + "rewards/margins": 69.68755340576172, + "rewards/rejected": -82.27234649658203, + "step": 9250 + }, + { + "epoch": 0.926, + "grad_norm": 0.0, + "learning_rate": 8.294152848885156e-08, + "logits/chosen": -0.5787280201911926, + "logits/rejected": 0.5253779888153076, + "logps/chosen": -272.241943359375, + "logps/rejected": -1050.86962890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.380825996398926, + "rewards/margins": 66.82584381103516, + "rewards/rejected": -81.2066650390625, + "step": 9260 + }, + { + "epoch": 0.927, + "grad_norm": 1.3496992307596107e-22, + "learning_rate": 8.072707379507217e-08, + "logits/chosen": -0.6485394239425659, + "logits/rejected": 0.680305004119873, + "logps/chosen": -477.92919921875, + "logps/rejected": -1094.7174072265625, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.70506477355957, + "rewards/margins": 63.1876106262207, + "rewards/rejected": -74.8926773071289, + "step": 9270 + }, + { + "epoch": 0.928, + "grad_norm": 0.0, + "learning_rate": 7.854209717842231e-08, + "logits/chosen": -1.0379279851913452, + "logits/rejected": 0.70207679271698, + "logps/chosen": -348.22869873046875, + "logps/rejected": -1431.361083984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.762044906616211, + "rewards/margins": 92.30265808105469, + "rewards/rejected": -107.06471252441406, + "step": 9280 + }, + { + "epoch": 0.929, + "grad_norm": 0.0, + "learning_rate": 7.638662526215284e-08, + "logits/chosen": -0.8047307729721069, + "logits/rejected": 0.7297149896621704, + "logps/chosen": -355.40277099609375, + "logps/rejected": -1222.614990234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.765396118164062, + "rewards/margins": 76.424560546875, + "rewards/rejected": -89.18995666503906, + "step": 9290 + }, + { + "epoch": 0.93, + "grad_norm": 0.0, + "learning_rate": 7.426068431000883e-08, + "logits/chosen": -0.7463569641113281, + "logits/rejected": 0.5908794403076172, + "logps/chosen": -397.95501708984375, + "logps/rejected": -1326.8583984375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.535120010375977, + "rewards/margins": 80.03962707519531, + "rewards/rejected": -93.57474517822266, + "step": 9300 + }, + { + "epoch": 0.931, + "grad_norm": 4.476295639648952e-20, + "learning_rate": 7.216430022591009e-08, + "logits/chosen": -0.44624462723731995, + "logits/rejected": 0.4880369305610657, + "logps/chosen": -513.1954956054688, + "logps/rejected": -1160.2886962890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.831964492797852, + "rewards/margins": 70.88438415527344, + "rewards/rejected": -81.71633911132812, + "step": 9310 + }, + { + "epoch": 0.932, + "grad_norm": 0.0, + "learning_rate": 7.009749855363457e-08, + "logits/chosen": -1.1946498155593872, + "logits/rejected": 0.6728850603103638, + "logps/chosen": -326.53936767578125, + "logps/rejected": -1230.041748046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.130026817321777, + "rewards/margins": 72.26325225830078, + "rewards/rejected": -83.39328002929688, + "step": 9320 + }, + { + "epoch": 0.933, + "grad_norm": 0.0, + "learning_rate": 6.806030447650879e-08, + "logits/chosen": -0.31252819299697876, + "logits/rejected": 0.6711708307266235, + "logps/chosen": -412.03631591796875, + "logps/rejected": -1164.569580078125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.800756454467773, + "rewards/margins": 69.16845703125, + "rewards/rejected": -83.96920776367188, + "step": 9330 + }, + { + "epoch": 0.934, + "grad_norm": 0.00047836932935751975, + "learning_rate": 6.605274281709929e-08, + "logits/chosen": -0.17928871512413025, + "logits/rejected": 0.322933167219162, + "logps/chosen": -420.1446838378906, + "logps/rejected": -962.8092651367188, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.62596893310547, + "rewards/margins": 52.55628204345703, + "rewards/rejected": -69.1822509765625, + "step": 9340 + }, + { + "epoch": 0.935, + "grad_norm": 0.0, + "learning_rate": 6.407483803691216e-08, + "logits/chosen": -1.3535900115966797, + "logits/rejected": 0.7449843287467957, + "logps/chosen": -324.685546875, + "logps/rejected": -1163.7113037109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.4141263961792, + "rewards/margins": 67.3392333984375, + "rewards/rejected": -80.75336456298828, + "step": 9350 + }, + { + "epoch": 0.936, + "grad_norm": 5.005370894650696e-06, + "learning_rate": 6.212661423609184e-08, + "logits/chosen": -0.7937058806419373, + "logits/rejected": 0.8053327798843384, + "logps/chosen": -362.6256103515625, + "logps/rejected": -1156.7620849609375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.173696517944336, + "rewards/margins": 70.54964447021484, + "rewards/rejected": -83.72335052490234, + "step": 9360 + }, + { + "epoch": 0.937, + "grad_norm": 0.0, + "learning_rate": 6.020809515313141e-08, + "logits/chosen": -0.5480384230613708, + "logits/rejected": 0.6836110949516296, + "logps/chosen": -542.61962890625, + "logps/rejected": -1206.66552734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.34486961364746, + "rewards/margins": 60.3333854675293, + "rewards/rejected": -79.67825317382812, + "step": 9370 + }, + { + "epoch": 0.938, + "grad_norm": 0.0, + "learning_rate": 5.83193041645802e-08, + "logits/chosen": -0.6533951759338379, + "logits/rejected": 0.6204730868339539, + "logps/chosen": -371.08489990234375, + "logps/rejected": -1113.1427001953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.826315879821777, + "rewards/margins": 66.26902770996094, + "rewards/rejected": -78.09534454345703, + "step": 9380 + }, + { + "epoch": 0.939, + "grad_norm": 0.0, + "learning_rate": 5.6460264284760316e-08, + "logits/chosen": -0.37336036562919617, + "logits/rejected": 1.1612873077392578, + "logps/chosen": -526.3222045898438, + "logps/rejected": -1342.774169921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -22.058244705200195, + "rewards/margins": 78.78479766845703, + "rewards/rejected": -100.84303283691406, + "step": 9390 + }, + { + "epoch": 0.94, + "grad_norm": 0.0, + "learning_rate": 5.463099816548578e-08, + "logits/chosen": -0.535962700843811, + "logits/rejected": 0.4035312533378601, + "logps/chosen": -367.12335205078125, + "logps/rejected": -1092.9337158203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.50265121459961, + "rewards/margins": 63.38078689575195, + "rewards/rejected": -80.88343048095703, + "step": 9400 + }, + { + "epoch": 0.941, + "grad_norm": 4.522228316673297e-21, + "learning_rate": 5.283152809578751e-08, + "logits/chosen": -1.1403727531433105, + "logits/rejected": 0.8676480054855347, + "logps/chosen": -311.7748107910156, + "logps/rejected": -1319.3763427734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.405827522277832, + "rewards/margins": 82.8596420288086, + "rewards/rejected": -97.26547241210938, + "step": 9410 + }, + { + "epoch": 0.942, + "grad_norm": 0.0, + "learning_rate": 5.106187600163987e-08, + "logits/chosen": -0.6772249341011047, + "logits/rejected": 0.4636703133583069, + "logps/chosen": -499.16021728515625, + "logps/rejected": -1222.0574951171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.858734130859375, + "rewards/margins": 63.798866271972656, + "rewards/rejected": -84.65760040283203, + "step": 9420 + }, + { + "epoch": 0.943, + "grad_norm": 0.0, + "learning_rate": 4.932206344569562e-08, + "logits/chosen": -1.084229826927185, + "logits/rejected": 0.500342071056366, + "logps/chosen": -337.5640563964844, + "logps/rejected": -1246.141357421875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.161239624023438, + "rewards/margins": 69.31696319580078, + "rewards/rejected": -86.47819519042969, + "step": 9430 + }, + { + "epoch": 0.944, + "grad_norm": 0.0, + "learning_rate": 4.761211162702117e-08, + "logits/chosen": -0.7408514618873596, + "logits/rejected": 0.5778809785842896, + "logps/chosen": -296.14483642578125, + "logps/rejected": -1266.046142578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.5400447845459, + "rewards/margins": 83.34046173095703, + "rewards/rejected": -99.88050842285156, + "step": 9440 + }, + { + "epoch": 0.945, + "grad_norm": 1.6371098564433791e-18, + "learning_rate": 4.593204138084006e-08, + "logits/chosen": -0.6246211528778076, + "logits/rejected": 0.469452440738678, + "logps/chosen": -369.33599853515625, + "logps/rejected": -1046.853515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.994848251342773, + "rewards/margins": 59.64809036254883, + "rewards/rejected": -77.64293670654297, + "step": 9450 + }, + { + "epoch": 0.946, + "grad_norm": 0.0, + "learning_rate": 4.428187317827848e-08, + "logits/chosen": -0.5617231726646423, + "logits/rejected": 0.5635863542556763, + "logps/chosen": -419.0165100097656, + "logps/rejected": -1149.50341796875, + "loss": 0.0865, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.520648956298828, + "rewards/margins": 63.31085205078125, + "rewards/rejected": -83.83150482177734, + "step": 9460 + }, + { + "epoch": 0.947, + "grad_norm": 0.0, + "learning_rate": 4.26616271261146e-08, + "logits/chosen": -0.5824400782585144, + "logits/rejected": 0.41373205184936523, + "logps/chosen": -272.3680419921875, + "logps/rejected": -848.8267822265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.654070854187012, + "rewards/margins": 44.72352981567383, + "rewards/rejected": -60.377593994140625, + "step": 9470 + }, + { + "epoch": 0.948, + "grad_norm": 1.2813891132657006e-15, + "learning_rate": 4.1071322966535487e-08, + "logits/chosen": -0.7137543559074402, + "logits/rejected": 0.5696569085121155, + "logps/chosen": -308.44830322265625, + "logps/rejected": -1056.5924072265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.067167282104492, + "rewards/margins": 62.045066833496094, + "rewards/rejected": -77.11224365234375, + "step": 9480 + }, + { + "epoch": 0.949, + "grad_norm": 2.2281272382684847e-09, + "learning_rate": 3.95109800768953e-08, + "logits/chosen": -0.41896852850914, + "logits/rejected": 0.4885942041873932, + "logps/chosen": -361.94439697265625, + "logps/rejected": -972.2320556640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.41004753112793, + "rewards/margins": 54.427696228027344, + "rewards/rejected": -66.83775329589844, + "step": 9490 + }, + { + "epoch": 0.95, + "grad_norm": 2.754418360062516e-10, + "learning_rate": 3.798061746947995e-08, + "logits/chosen": -0.7670290470123291, + "logits/rejected": 0.12368150055408478, + "logps/chosen": -377.0394287109375, + "logps/rejected": -937.1624755859375, + "loss": 0.1833, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -15.164652824401855, + "rewards/margins": 53.78802490234375, + "rewards/rejected": -68.95268249511719, + "step": 9500 + }, + { + "epoch": 0.951, + "grad_norm": 0.0, + "learning_rate": 3.648025379127479e-08, + "logits/chosen": -0.6356409192085266, + "logits/rejected": 0.4663293957710266, + "logps/chosen": -452.65240478515625, + "logps/rejected": -1188.810791015625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.409733772277832, + "rewards/margins": 72.15763854980469, + "rewards/rejected": -83.56737518310547, + "step": 9510 + }, + { + "epoch": 0.952, + "grad_norm": 0.0, + "learning_rate": 3.5009907323737826e-08, + "logits/chosen": -0.8558729887008667, + "logits/rejected": 0.4594835340976715, + "logps/chosen": -386.46319580078125, + "logps/rejected": -1148.697998046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.500055313110352, + "rewards/margins": 66.51141357421875, + "rewards/rejected": -79.01146697998047, + "step": 9520 + }, + { + "epoch": 0.953, + "grad_norm": 0.0, + "learning_rate": 3.3569595982576584e-08, + "logits/chosen": -0.13464701175689697, + "logits/rejected": 0.6260603666305542, + "logps/chosen": -449.32867431640625, + "logps/rejected": -979.0067138671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.389322280883789, + "rewards/margins": 54.79975509643555, + "rewards/rejected": -67.18907165527344, + "step": 9530 + }, + { + "epoch": 0.954, + "grad_norm": 0.0, + "learning_rate": 3.2159337317530234e-08, + "logits/chosen": -0.9340829849243164, + "logits/rejected": 0.6286576986312866, + "logps/chosen": -289.25457763671875, + "logps/rejected": -977.1263427734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.772436141967773, + "rewards/margins": 60.88823699951172, + "rewards/rejected": -71.66067504882812, + "step": 9540 + }, + { + "epoch": 0.955, + "grad_norm": 0.0, + "learning_rate": 3.077914851215585e-08, + "logits/chosen": -0.2953462600708008, + "logits/rejected": 0.702400803565979, + "logps/chosen": -394.4523010253906, + "logps/rejected": -1066.02001953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.709041595458984, + "rewards/margins": 65.71452331542969, + "rewards/rejected": -82.4235610961914, + "step": 9550 + }, + { + "epoch": 0.956, + "grad_norm": 0.0, + "learning_rate": 2.9429046383618042e-08, + "logits/chosen": -0.6391351222991943, + "logits/rejected": 0.7442789673805237, + "logps/chosen": -252.177490234375, + "logps/rejected": -1095.124267578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.858415603637695, + "rewards/margins": 69.86174774169922, + "rewards/rejected": -80.72015380859375, + "step": 9560 + }, + { + "epoch": 0.957, + "grad_norm": 0.0, + "learning_rate": 2.810904738248549e-08, + "logits/chosen": -0.6452876329421997, + "logits/rejected": 0.4415220618247986, + "logps/chosen": -365.7802429199219, + "logps/rejected": -1228.2845458984375, + "loss": 0.2902, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -13.759931564331055, + "rewards/margins": 76.06869506835938, + "rewards/rejected": -89.82862091064453, + "step": 9570 + }, + { + "epoch": 0.958, + "grad_norm": 0.0, + "learning_rate": 2.681916759252917e-08, + "logits/chosen": -0.8312497138977051, + "logits/rejected": 0.8043157458305359, + "logps/chosen": -297.5036926269531, + "logps/rejected": -1181.446044921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.166455268859863, + "rewards/margins": 72.9308853149414, + "rewards/rejected": -87.09734344482422, + "step": 9580 + }, + { + "epoch": 0.959, + "grad_norm": 8.871064730355953e-17, + "learning_rate": 2.555942273052753e-08, + "logits/chosen": -0.5505019426345825, + "logits/rejected": 0.7471826076507568, + "logps/chosen": -363.4208679199219, + "logps/rejected": -1077.77783203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.896112442016602, + "rewards/margins": 64.6919174194336, + "rewards/rejected": -77.58802032470703, + "step": 9590 + }, + { + "epoch": 0.96, + "grad_norm": 0.0, + "learning_rate": 2.4329828146074096e-08, + "logits/chosen": -0.6976087689399719, + "logits/rejected": 0.6933576464653015, + "logps/chosen": -284.07476806640625, + "logps/rejected": -1206.358642578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.92124080657959, + "rewards/margins": 80.40800476074219, + "rewards/rejected": -91.32923889160156, + "step": 9600 + }, + { + "epoch": 0.961, + "grad_norm": 1.819727244059747e-18, + "learning_rate": 2.313039882139101e-08, + "logits/chosen": -1.001491904258728, + "logits/rejected": 0.6855885982513428, + "logps/chosen": -151.57177734375, + "logps/rejected": -759.6219482421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.934805870056152, + "rewards/margins": 45.081241607666016, + "rewards/rejected": -52.01605224609375, + "step": 9610 + }, + { + "epoch": 0.962, + "grad_norm": 0.0, + "learning_rate": 2.1961149371145795e-08, + "logits/chosen": -0.08522322028875351, + "logits/rejected": 0.5817008018493652, + "logps/chosen": -430.6034240722656, + "logps/rejected": -980.8968505859375, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.8110294342041, + "rewards/margins": 53.823448181152344, + "rewards/rejected": -71.63447570800781, + "step": 9620 + }, + { + "epoch": 0.963, + "grad_norm": 0.0, + "learning_rate": 2.082209404227403e-08, + "logits/chosen": -0.5273114442825317, + "logits/rejected": 0.6189590692520142, + "logps/chosen": -364.1752014160156, + "logps/rejected": -1239.5257568359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.691871643066406, + "rewards/margins": 82.79158020019531, + "rewards/rejected": -94.48345947265625, + "step": 9630 + }, + { + "epoch": 0.964, + "grad_norm": 0.0, + "learning_rate": 1.9713246713805588e-08, + "logits/chosen": -0.6452358365058899, + "logits/rejected": 0.6586201190948486, + "logps/chosen": -361.86376953125, + "logps/rejected": -1184.059326171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.489391326904297, + "rewards/margins": 72.41313171386719, + "rewards/rejected": -88.90251159667969, + "step": 9640 + }, + { + "epoch": 0.965, + "grad_norm": 5.003034098116643e-10, + "learning_rate": 1.8634620896695044e-08, + "logits/chosen": -0.3010661005973816, + "logits/rejected": 0.596214234828949, + "logps/chosen": -331.79730224609375, + "logps/rejected": -967.5947265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.522335052490234, + "rewards/margins": 57.9662971496582, + "rewards/rejected": -77.48863220214844, + "step": 9650 + }, + { + "epoch": 0.966, + "grad_norm": 0.0, + "learning_rate": 1.7586229733657646e-08, + "logits/chosen": -0.9829071760177612, + "logits/rejected": 0.6873185634613037, + "logps/chosen": -311.52313232421875, + "logps/rejected": -1215.0775146484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.806241989135742, + "rewards/margins": 69.84815979003906, + "rewards/rejected": -85.65440368652344, + "step": 9660 + }, + { + "epoch": 0.967, + "grad_norm": 0.0, + "learning_rate": 1.6568085999008886e-08, + "logits/chosen": -0.9289532899856567, + "logits/rejected": 0.6419546604156494, + "logps/chosen": -428.91400146484375, + "logps/rejected": -1263.3416748046875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.84474754333496, + "rewards/margins": 70.19862365722656, + "rewards/rejected": -87.04336547851562, + "step": 9670 + }, + { + "epoch": 0.968, + "grad_norm": 0.0, + "learning_rate": 1.5580202098509078e-08, + "logits/chosen": -0.8172246813774109, + "logits/rejected": 0.6110397577285767, + "logps/chosen": -309.43719482421875, + "logps/rejected": -1129.2818603515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.678646087646484, + "rewards/margins": 72.56365966796875, + "rewards/rejected": -84.24231719970703, + "step": 9680 + }, + { + "epoch": 0.969, + "grad_norm": 0.0, + "learning_rate": 1.4622590069211517e-08, + "logits/chosen": -0.9688760042190552, + "logits/rejected": 0.6006779074668884, + "logps/chosen": -274.1756591796875, + "logps/rejected": -1105.856201171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.810132026672363, + "rewards/margins": 67.08036804199219, + "rewards/rejected": -76.89048767089844, + "step": 9690 + }, + { + "epoch": 0.97, + "grad_norm": 14.400453567504883, + "learning_rate": 1.3695261579316776e-08, + "logits/chosen": -0.7687379121780396, + "logits/rejected": 0.6138051748275757, + "logps/chosen": -404.05914306640625, + "logps/rejected": -1301.695556640625, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.873037338256836, + "rewards/margins": 77.2698745727539, + "rewards/rejected": -92.14290618896484, + "step": 9700 + }, + { + "epoch": 0.971, + "grad_norm": 0.0, + "learning_rate": 1.2798227928029483e-08, + "logits/chosen": -0.6523748636245728, + "logits/rejected": 0.47052305936813354, + "logps/chosen": -480.67681884765625, + "logps/rejected": -1377.9613037109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.083057403564453, + "rewards/margins": 82.21842956542969, + "rewards/rejected": -100.3014907836914, + "step": 9710 + }, + { + "epoch": 0.972, + "grad_norm": 0.0, + "learning_rate": 1.193150004542204e-08, + "logits/chosen": -0.5860597491264343, + "logits/rejected": 1.2617757320404053, + "logps/chosen": -317.54974365234375, + "logps/rejected": -1091.484375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.348005294799805, + "rewards/margins": 67.207275390625, + "rewards/rejected": -83.5552749633789, + "step": 9720 + }, + { + "epoch": 0.973, + "grad_norm": 0.0, + "learning_rate": 1.109508849230001e-08, + "logits/chosen": -0.4087650179862976, + "logits/rejected": 0.43272989988327026, + "logps/chosen": -370.06488037109375, + "logps/rejected": -1022.3318481445312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.40913200378418, + "rewards/margins": 60.79473876953125, + "rewards/rejected": -74.20386505126953, + "step": 9730 + }, + { + "epoch": 0.974, + "grad_norm": 0.0, + "learning_rate": 1.0289003460074165e-08, + "logits/chosen": -0.6892791986465454, + "logits/rejected": 0.5428653359413147, + "logps/chosen": -627.8890380859375, + "logps/rejected": -1332.8411865234375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.19067668914795, + "rewards/margins": 69.77383422851562, + "rewards/rejected": -84.96450805664062, + "step": 9740 + }, + { + "epoch": 0.975, + "grad_norm": 0.0, + "learning_rate": 9.513254770636138e-09, + "logits/chosen": -0.8619287610054016, + "logits/rejected": 0.5513601899147034, + "logps/chosen": -349.90069580078125, + "logps/rejected": -1181.427001953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.454333305358887, + "rewards/margins": 68.1052474975586, + "rewards/rejected": -81.55958557128906, + "step": 9750 + }, + { + "epoch": 0.976, + "grad_norm": 0.0, + "learning_rate": 8.767851876239075e-09, + "logits/chosen": -0.4201585352420807, + "logits/rejected": 0.7723037004470825, + "logps/chosen": -571.9500732421875, + "logps/rejected": -1227.5380859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.813634872436523, + "rewards/margins": 63.92681121826172, + "rewards/rejected": -82.74044799804688, + "step": 9760 + }, + { + "epoch": 0.977, + "grad_norm": 0.0, + "learning_rate": 8.052803859382174e-09, + "logits/chosen": -0.6666244864463806, + "logits/rejected": 0.718521773815155, + "logps/chosen": -256.37188720703125, + "logps/rejected": -1023.0693359375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.587654113769531, + "rewards/margins": 69.85871124267578, + "rewards/rejected": -80.44636535644531, + "step": 9770 + }, + { + "epoch": 0.978, + "grad_norm": 5.396772849053377e-06, + "learning_rate": 7.368119432699383e-09, + "logits/chosen": -0.6672367453575134, + "logits/rejected": 0.411163330078125, + "logps/chosen": -361.1486511230469, + "logps/rejected": -951.3388671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.695712089538574, + "rewards/margins": 52.358123779296875, + "rewards/rejected": -67.05384063720703, + "step": 9780 + }, + { + "epoch": 0.979, + "grad_norm": 4.77047155948609e-16, + "learning_rate": 6.7138069388547614e-09, + "logits/chosen": -1.2189319133758545, + "logits/rejected": 0.6601986885070801, + "logps/chosen": -294.63934326171875, + "logps/rejected": -1052.442626953125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.933565139770508, + "rewards/margins": 63.85259246826172, + "rewards/rejected": -73.7861557006836, + "step": 9790 + }, + { + "epoch": 0.98, + "grad_norm": 8.221260281435824e-15, + "learning_rate": 6.089874350439507e-09, + "logits/chosen": -0.8612964749336243, + "logits/rejected": 0.48667994141578674, + "logps/chosen": -276.33428955078125, + "logps/rejected": -1090.6768798828125, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.823007583618164, + "rewards/margins": 67.98606872558594, + "rewards/rejected": -78.80908203125, + "step": 9800 + }, + { + "epoch": 0.981, + "grad_norm": 8.672072620090093e-19, + "learning_rate": 5.4963292698750896e-09, + "logits/chosen": -0.344230979681015, + "logits/rejected": -0.045682210475206375, + "logps/chosen": -622.1007080078125, + "logps/rejected": -998.2276611328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -15.51715087890625, + "rewards/margins": 42.443077087402344, + "rewards/rejected": -57.960227966308594, + "step": 9810 + }, + { + "epoch": 0.982, + "grad_norm": 3.64466545797206e-15, + "learning_rate": 4.933178929321103e-09, + "logits/chosen": -0.5035933256149292, + "logits/rejected": 0.6916291117668152, + "logps/chosen": -372.07379150390625, + "logps/rejected": -1095.00732421875, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.06197738647461, + "rewards/margins": 60.83811569213867, + "rewards/rejected": -77.90009307861328, + "step": 9820 + }, + { + "epoch": 0.983, + "grad_norm": 0.0, + "learning_rate": 4.400430190586724e-09, + "logits/chosen": -0.6599819660186768, + "logits/rejected": 0.5303076505661011, + "logps/chosen": -485.1739807128906, + "logps/rejected": -1263.5828857421875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.154559135437012, + "rewards/margins": 69.51759338378906, + "rewards/rejected": -83.67214965820312, + "step": 9830 + }, + { + "epoch": 0.984, + "grad_norm": 3.5732080050365767e-07, + "learning_rate": 3.8980895450474455e-09, + "logits/chosen": -0.7814174294471741, + "logits/rejected": 0.642800509929657, + "logps/chosen": -427.40643310546875, + "logps/rejected": -1239.892578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.07494354248047, + "rewards/margins": 76.21549987792969, + "rewards/rejected": -92.29043579101562, + "step": 9840 + }, + { + "epoch": 0.985, + "grad_norm": 0.0, + "learning_rate": 3.4261631135654174e-09, + "logits/chosen": -0.8822159767150879, + "logits/rejected": 0.41710543632507324, + "logps/chosen": -504.44342041015625, + "logps/rejected": -1304.072265625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.102876663208008, + "rewards/margins": 72.31712341308594, + "rewards/rejected": -86.41999816894531, + "step": 9850 + }, + { + "epoch": 0.986, + "grad_norm": 0.0, + "learning_rate": 2.984656646415063e-09, + "logits/chosen": -0.38282960653305054, + "logits/rejected": 0.1189902052283287, + "logps/chosen": -471.0121154785156, + "logps/rejected": -1003.5185546875, + "loss": 0.0235, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.816166877746582, + "rewards/margins": 56.61848831176758, + "rewards/rejected": -68.43465423583984, + "step": 9860 + }, + { + "epoch": 0.987, + "grad_norm": 6.396711269288306e-22, + "learning_rate": 2.573575523213412e-09, + "logits/chosen": -0.5950853824615479, + "logits/rejected": 0.6149926781654358, + "logps/chosen": -238.8068084716797, + "logps/rejected": -932.0408935546875, + "loss": 0.0283, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.022882461547852, + "rewards/margins": 59.13804244995117, + "rewards/rejected": -70.16092681884766, + "step": 9870 + }, + { + "epoch": 0.988, + "grad_norm": 0.0, + "learning_rate": 2.192924752854042e-09, + "logits/chosen": -0.7473016977310181, + "logits/rejected": 0.5859171748161316, + "logps/chosen": -331.48406982421875, + "logps/rejected": -1262.8597412109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.349139213562012, + "rewards/margins": 80.0912094116211, + "rewards/rejected": -93.44035339355469, + "step": 9880 + }, + { + "epoch": 0.989, + "grad_norm": 0.0, + "learning_rate": 1.842708973447127e-09, + "logits/chosen": -0.7945876121520996, + "logits/rejected": 0.9833480715751648, + "logps/chosen": -267.7002258300781, + "logps/rejected": -1046.1787109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -13.28888988494873, + "rewards/margins": 64.49101257324219, + "rewards/rejected": -77.7799072265625, + "step": 9890 + }, + { + "epoch": 0.99, + "grad_norm": 1.0340225176748083e-13, + "learning_rate": 1.5229324522605949e-09, + "logits/chosen": -0.530807375907898, + "logits/rejected": 0.39602330327033997, + "logps/chosen": -627.578857421875, + "logps/rejected": -1317.6207275390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.532297134399414, + "rewards/margins": 70.44889831542969, + "rewards/rejected": -87.98119354248047, + "step": 9900 + }, + { + "epoch": 0.991, + "grad_norm": 0.0, + "learning_rate": 1.2335990856710001e-09, + "logits/chosen": -0.5323010683059692, + "logits/rejected": 0.8327838182449341, + "logps/chosen": -462.7493591308594, + "logps/rejected": -1313.9512939453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.25348472595215, + "rewards/margins": 77.57730865478516, + "rewards/rejected": -94.8307876586914, + "step": 9910 + }, + { + "epoch": 0.992, + "grad_norm": 0.0, + "learning_rate": 9.747123991141193e-10, + "logits/chosen": -0.5736783742904663, + "logits/rejected": 0.5615373849868774, + "logps/chosen": -391.64105224609375, + "logps/rejected": -1180.13134765625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.440969467163086, + "rewards/margins": 73.5404052734375, + "rewards/rejected": -87.98137664794922, + "step": 9920 + }, + { + "epoch": 0.993, + "grad_norm": 0.0, + "learning_rate": 7.462755470422078e-10, + "logits/chosen": -0.8006995916366577, + "logits/rejected": 0.5760239362716675, + "logps/chosen": -287.2140197753906, + "logps/rejected": -910.4788208007812, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.046598434448242, + "rewards/margins": 51.611488342285156, + "rewards/rejected": -61.6580924987793, + "step": 9930 + }, + { + "epoch": 0.994, + "grad_norm": 0.0, + "learning_rate": 5.48291312886251e-10, + "logits/chosen": -0.19751985371112823, + "logits/rejected": 0.23999378085136414, + "logps/chosen": -447.41473388671875, + "logps/rejected": -837.7698974609375, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -16.676952362060547, + "rewards/margins": 47.939002990722656, + "rewards/rejected": -64.61595153808594, + "step": 9940 + }, + { + "epoch": 0.995, + "grad_norm": 0.0, + "learning_rate": 3.8076210902182607e-10, + "logits/chosen": -0.6151745915412903, + "logits/rejected": 0.7026041150093079, + "logps/chosen": -376.2629089355469, + "logps/rejected": -1227.098388671875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.793085098266602, + "rewards/margins": 81.91486358642578, + "rewards/rejected": -94.70793914794922, + "step": 9950 + }, + { + "epoch": 0.996, + "grad_norm": 5.696053004267121e-11, + "learning_rate": 2.43689976739403e-10, + "logits/chosen": -0.647061824798584, + "logits/rejected": 0.48720255494117737, + "logps/chosen": -303.76898193359375, + "logps/rejected": -941.9822387695312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -12.006851196289062, + "rewards/margins": 54.89765548706055, + "rewards/rejected": -66.90450286865234, + "step": 9960 + }, + { + "epoch": 0.997, + "grad_norm": 0.0, + "learning_rate": 1.3707658621964216e-10, + "logits/chosen": -0.9006298780441284, + "logits/rejected": 0.7990083694458008, + "logps/chosen": -274.3728942871094, + "logps/rejected": -1247.52880859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.891749382019043, + "rewards/margins": 87.26151275634766, + "rewards/rejected": -96.15326690673828, + "step": 9970 + }, + { + "epoch": 0.998, + "grad_norm": 0.0, + "learning_rate": 6.092323651313293e-11, + "logits/chosen": -0.8165764808654785, + "logits/rejected": 0.2772344648838043, + "logps/chosen": -351.58526611328125, + "logps/rejected": -1041.4344482421875, + "loss": 1.2084, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -12.576261520385742, + "rewards/margins": 58.36811065673828, + "rewards/rejected": -70.94436645507812, + "step": 9980 + }, + { + "epoch": 0.999, + "grad_norm": 0.0, + "learning_rate": 1.5230855524017708e-11, + "logits/chosen": -1.0131959915161133, + "logits/rejected": 0.66839599609375, + "logps/chosen": -294.83953857421875, + "logps/rejected": -1126.336669921875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -14.798008918762207, + "rewards/margins": 65.11726379394531, + "rewards/rejected": -79.91526794433594, + "step": 9990 + }, + { + "epoch": 1.0, + "grad_norm": 0.0, + "learning_rate": 0.0, + "logits/chosen": -0.6975020170211792, + "logits/rejected": 0.882247805595398, + "logps/chosen": -402.93658447265625, + "logps/rejected": -1386.987548828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.254676818847656, + "rewards/margins": 91.36619567871094, + "rewards/rejected": -109.6208724975586, + "step": 10000 + }, + { + "epoch": 1.0, + "step": 10000, + "total_flos": 5.747405857487585e+17, + "train_loss": 0.08356396047416255, + "train_runtime": 17143.627, + "train_samples_per_second": 0.583, + "train_steps_per_second": 0.583 + } + ], + "logging_steps": 10, + "max_steps": 10000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.747405857487585e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}