diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -3,18 +3,18 @@ "best_model_checkpoint": null, "epoch": 2.9992254066615027, "eval_steps": 100, - "global_step": 726, + "global_step": 2904, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, - "learning_rate": 6.84931506849315e-09, - "logits/chosen": -2.34800124168396, - "logits/rejected": -2.4178409576416016, - "logps/chosen": -271.47698974609375, - "logps/rejected": -208.94898986816406, + "learning_rate": 1.7182130584192438e-09, + "logits/chosen": -2.293531894683838, + "logits/rejected": -2.2362442016601562, + "logps/chosen": -280.74072265625, + "logps/rejected": -204.830322265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -23,1073 +23,4125 @@ "step": 1 }, { - "epoch": 0.04, - "learning_rate": 6.84931506849315e-08, - "logits/chosen": -2.4227354526519775, - "logits/rejected": -2.355938196182251, - "logps/chosen": -293.4179992675781, - "logps/rejected": -226.30238342285156, - "loss": 0.6928, - "rewards/accuracies": 0.4496527910232544, - "rewards/chosen": 0.0005499552935361862, - "rewards/margins": 0.00035988984745927155, - "rewards/rejected": 0.00019006534421350807, + "epoch": 0.01, + "learning_rate": 1.718213058419244e-08, + "logits/chosen": -2.411555290222168, + "logits/rejected": -2.3393168449401855, + "logps/chosen": -294.2322998046875, + "logps/rejected": -213.8911895751953, + "loss": 0.6946, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.005316631868481636, + "rewards/margins": 0.0028615635819733143, + "rewards/rejected": 0.002455067355185747, "step": 10 }, { - "epoch": 0.08, - "learning_rate": 1.36986301369863e-07, - "logits/chosen": -2.431461811065674, - "logits/rejected": -2.4046578407287598, - "logps/chosen": -278.50103759765625, - "logps/rejected": -216.76806640625, - "loss": 0.6931, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": 0.004533737897872925, - "rewards/margins": 0.002973187016323209, - "rewards/rejected": 0.0015605507651343942, + "epoch": 0.02, + "learning_rate": 3.436426116838488e-08, + "logits/chosen": -2.4150137901306152, + "logits/rejected": -2.3802390098571777, + "logps/chosen": -279.42938232421875, + "logps/rejected": -237.62747192382812, + "loss": 0.6943, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.0023494327906519175, + "rewards/margins": 0.0011181762674823403, + "rewards/rejected": 0.0012312561739236116, "step": 20 }, { - "epoch": 0.12, - "learning_rate": 2.054794520547945e-07, - "logits/chosen": -2.389216899871826, - "logits/rejected": -2.3487401008605957, - "logps/chosen": -252.997314453125, - "logps/rejected": -207.15963745117188, - "loss": 0.692, - "rewards/accuracies": 0.53125, - "rewards/chosen": 0.0029909531585872173, - "rewards/margins": 0.00379578466527164, - "rewards/rejected": -0.000804831855930388, + "epoch": 0.03, + "learning_rate": 5.154639175257731e-08, + "logits/chosen": -2.461092472076416, + "logits/rejected": -2.39383602142334, + "logps/chosen": -301.07952880859375, + "logps/rejected": -215.763427734375, + "loss": 0.6943, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.0003540778416208923, + "rewards/margins": -0.001285408972762525, + "rewards/rejected": 0.0009313317714259028, "step": 30 }, { - "epoch": 0.17, - "learning_rate": 2.73972602739726e-07, - "logits/chosen": -2.4600839614868164, - "logits/rejected": -2.4145703315734863, - "logps/chosen": -283.77569580078125, - "logps/rejected": -216.48251342773438, - "loss": 0.6909, - "rewards/accuracies": 0.520312488079071, - "rewards/chosen": 0.0019030813127756119, - "rewards/margins": 0.005452433601021767, - "rewards/rejected": -0.0035493518225848675, + "epoch": 0.04, + "learning_rate": 6.872852233676976e-08, + "logits/chosen": -2.3856747150421143, + "logits/rejected": -2.3453280925750732, + "logps/chosen": -291.4425354003906, + "logps/rejected": -231.8385772705078, + "loss": 0.6934, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": 0.0012902533635497093, + "rewards/margins": -0.00038409550325013697, + "rewards/rejected": 0.001674349419772625, "step": 40 }, { - "epoch": 0.21, - "learning_rate": 3.424657534246575e-07, - "logits/chosen": -2.436392307281494, - "logits/rejected": -2.3965039253234863, - "logps/chosen": -267.26617431640625, - "logps/rejected": -223.65829467773438, - "loss": 0.6898, - "rewards/accuracies": 0.5703125, - "rewards/chosen": 0.0054888492450118065, - "rewards/margins": 0.009707379154860973, - "rewards/rejected": -0.004218529909849167, + "epoch": 0.05, + "learning_rate": 8.59106529209622e-08, + "logits/chosen": -2.443054437637329, + "logits/rejected": -2.383383274078369, + "logps/chosen": -299.1965026855469, + "logps/rejected": -220.2180938720703, + "loss": 0.6948, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.005029269959777594, + "rewards/margins": 0.004818198271095753, + "rewards/rejected": 0.0002110706700477749, "step": 50 }, { - "epoch": 0.25, - "learning_rate": 4.10958904109589e-07, - "logits/chosen": -2.4120891094207764, - "logits/rejected": -2.388906478881836, - "logps/chosen": -266.8415832519531, - "logps/rejected": -214.7452850341797, - "loss": 0.6867, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.006448288913816214, - "rewards/margins": 0.012911155819892883, - "rewards/rejected": -0.006462865974754095, + "epoch": 0.06, + "learning_rate": 1.0309278350515462e-07, + "logits/chosen": -2.435997247695923, + "logits/rejected": -2.4249629974365234, + "logps/chosen": -272.54656982421875, + "logps/rejected": -227.5023193359375, + "loss": 0.6908, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": 0.0016036666929721832, + "rewards/margins": 0.0018250759458169341, + "rewards/rejected": -0.00022140909277368337, "step": 60 }, { - "epoch": 0.29, - "learning_rate": 4.794520547945205e-07, - "logits/chosen": -2.3903424739837646, - "logits/rejected": -2.397461414337158, - "logps/chosen": -254.0715789794922, - "logps/rejected": -214.6645050048828, - "loss": 0.6826, - "rewards/accuracies": 0.5843750238418579, - "rewards/chosen": 0.00976890604943037, - "rewards/margins": 0.019091714173555374, - "rewards/rejected": -0.00932280719280243, + "epoch": 0.07, + "learning_rate": 1.202749140893471e-07, + "logits/chosen": -2.4656193256378174, + "logits/rejected": -2.420733690261841, + "logps/chosen": -292.0702209472656, + "logps/rejected": -206.99124145507812, + "loss": 0.6908, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.00455916253849864, + "rewards/margins": 0.008069148287177086, + "rewards/rejected": -0.003509984817355871, "step": 70 }, { - "epoch": 0.33, - "learning_rate": 4.946401225114854e-07, - "logits/chosen": -2.4295127391815186, - "logits/rejected": -2.3777496814727783, - "logps/chosen": -265.0728759765625, - "logps/rejected": -218.78775024414062, - "loss": 0.6796, - "rewards/accuracies": 0.6328125, - "rewards/chosen": 0.01282783318310976, - "rewards/margins": 0.030523013323545456, - "rewards/rejected": -0.017695177346467972, + "epoch": 0.08, + "learning_rate": 1.3745704467353952e-07, + "logits/chosen": -2.381108283996582, + "logits/rejected": -2.3896584510803223, + "logps/chosen": -250.19076538085938, + "logps/rejected": -212.47366333007812, + "loss": 0.6914, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.006951277144253254, + "rewards/margins": 0.008491529151797295, + "rewards/rejected": -0.0015402527060359716, "step": 80 }, { - "epoch": 0.37, - "learning_rate": 4.869831546707504e-07, - "logits/chosen": -2.4791455268859863, - "logits/rejected": -2.4226763248443604, - "logps/chosen": -271.70550537109375, - "logps/rejected": -224.4741973876953, - "loss": 0.6724, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.025005927309393883, - "rewards/margins": 0.04857773706316948, - "rewards/rejected": -0.023571809753775597, + "epoch": 0.09, + "learning_rate": 1.5463917525773197e-07, + "logits/chosen": -2.3331446647644043, + "logits/rejected": -2.254476547241211, + "logps/chosen": -241.5772247314453, + "logps/rejected": -185.46815490722656, + "loss": 0.6898, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.008401724509894848, + "rewards/margins": 0.014180210418999195, + "rewards/rejected": -0.005778484977781773, "step": 90 }, { - "epoch": 0.41, - "learning_rate": 4.793261868300153e-07, - "logits/chosen": -2.417412042617798, - "logits/rejected": -2.403857707977295, - "logps/chosen": -273.85968017578125, - "logps/rejected": -227.78335571289062, - "loss": 0.6681, - "rewards/accuracies": 0.6390625238418579, - "rewards/chosen": 0.028032511472702026, - "rewards/margins": 0.058635223656892776, - "rewards/rejected": -0.030602704733610153, + "epoch": 0.1, + "learning_rate": 1.718213058419244e-07, + "logits/chosen": -2.4016473293304443, + "logits/rejected": -2.3915467262268066, + "logps/chosen": -259.73956298828125, + "logps/rejected": -221.9446258544922, + "loss": 0.6896, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0046272119507193565, + "rewards/margins": 0.00822476390749216, + "rewards/rejected": -0.0035975512582808733, "step": 100 }, { - "epoch": 0.45, - "learning_rate": 4.7166921898928023e-07, - "logits/chosen": -2.4506094455718994, - "logits/rejected": -2.390385866165161, - "logps/chosen": -252.6039276123047, - "logps/rejected": -222.64761352539062, - "loss": 0.6624, - "rewards/accuracies": 0.6578124761581421, - "rewards/chosen": 0.024289341643452644, - "rewards/margins": 0.06630216538906097, - "rewards/rejected": -0.04201282188296318, + "epoch": 0.11, + "learning_rate": 1.8900343642611682e-07, + "logits/chosen": -2.441370725631714, + "logits/rejected": -2.3111448287963867, + "logps/chosen": -252.11367797851562, + "logps/rejected": -210.3745574951172, + "loss": 0.6912, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.0009224863606505096, + "rewards/margins": 0.001503048581071198, + "rewards/rejected": -0.0005805626278743148, "step": 110 }, { - "epoch": 0.5, - "learning_rate": 4.640122511485451e-07, - "logits/chosen": -2.4121220111846924, - "logits/rejected": -2.4016165733337402, - "logps/chosen": -256.88958740234375, - "logps/rejected": -219.8152618408203, - "loss": 0.6584, - "rewards/accuracies": 0.6265624761581421, - "rewards/chosen": 0.031187813729047775, - "rewards/margins": 0.08014924824237823, - "rewards/rejected": -0.04896143823862076, + "epoch": 0.12, + "learning_rate": 2.0618556701030925e-07, + "logits/chosen": -2.3808603286743164, + "logits/rejected": -2.437734842300415, + "logps/chosen": -258.5278015136719, + "logps/rejected": -210.9561309814453, + "loss": 0.6901, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.001118434825912118, + "rewards/margins": 0.00486636720597744, + "rewards/rejected": -0.0037479314487427473, "step": 120 }, { - "epoch": 0.54, - "learning_rate": 4.563552833078101e-07, - "logits/chosen": -2.456305980682373, - "logits/rejected": -2.394624948501587, - "logps/chosen": -265.97003173828125, - "logps/rejected": -225.9492950439453, - "loss": 0.6492, - "rewards/accuracies": 0.682812511920929, - "rewards/chosen": 0.02872040495276451, - "rewards/margins": 0.09756486117839813, - "rewards/rejected": -0.06884445250034332, + "epoch": 0.13, + "learning_rate": 2.2336769759450173e-07, + "logits/chosen": -2.447282075881958, + "logits/rejected": -2.431652784347534, + "logps/chosen": -279.3333435058594, + "logps/rejected": -213.950439453125, + "loss": 0.688, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.005058329086750746, + "rewards/margins": 0.012270588427782059, + "rewards/rejected": -0.0072122602723538876, "step": 130 }, { - "epoch": 0.58, - "learning_rate": 4.4869831546707505e-07, - "logits/chosen": -2.460322380065918, - "logits/rejected": -2.4006354808807373, - "logps/chosen": -270.3127746582031, - "logps/rejected": -230.38339233398438, - "loss": 0.6493, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": 0.03178644925355911, - "rewards/margins": 0.11458346992731094, - "rewards/rejected": -0.08279702812433243, + "epoch": 0.14, + "learning_rate": 2.405498281786942e-07, + "logits/chosen": -2.574840545654297, + "logits/rejected": -2.400458812713623, + "logps/chosen": -267.6883544921875, + "logps/rejected": -203.22642517089844, + "loss": 0.6891, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.002968291286379099, + "rewards/margins": 0.005260258913040161, + "rewards/rejected": -0.0022919676266610622, "step": 140 }, { - "epoch": 0.62, - "learning_rate": 4.4104134762633994e-07, - "logits/chosen": -2.484839916229248, - "logits/rejected": -2.4318273067474365, - "logps/chosen": -265.9433288574219, - "logps/rejected": -226.5823974609375, - "loss": 0.6432, - "rewards/accuracies": 0.703125, - "rewards/chosen": 0.03730004280805588, - "rewards/margins": 0.12808530032634735, - "rewards/rejected": -0.09078525006771088, + "epoch": 0.15, + "learning_rate": 2.5773195876288655e-07, + "logits/chosen": -2.357297420501709, + "logits/rejected": -2.391117811203003, + "logps/chosen": -280.30828857421875, + "logps/rejected": -214.0823974609375, + "loss": 0.6878, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.013498497195541859, + "rewards/margins": 0.025946879759430885, + "rewards/rejected": -0.012448383495211601, "step": 150 }, { - "epoch": 0.66, - "learning_rate": 4.333843797856049e-07, - "logits/chosen": -2.44659161567688, - "logits/rejected": -2.402761936187744, - "logps/chosen": -279.3069152832031, - "logps/rejected": -239.0000457763672, - "loss": 0.6334, - "rewards/accuracies": 0.6890624761581421, - "rewards/chosen": 0.04617582634091377, - "rewards/margins": 0.15378056466579437, - "rewards/rejected": -0.1076047420501709, + "epoch": 0.17, + "learning_rate": 2.7491408934707903e-07, + "logits/chosen": -2.460391044616699, + "logits/rejected": -2.435685873031616, + "logps/chosen": -307.55450439453125, + "logps/rejected": -234.9291534423828, + "loss": 0.6834, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.007904368452727795, + "rewards/margins": 0.025973070412874222, + "rewards/rejected": -0.01806870475411415, "step": 160 }, { - "epoch": 0.7, - "learning_rate": 4.257274119448698e-07, - "logits/chosen": -2.47914457321167, - "logits/rejected": -2.4519801139831543, - "logps/chosen": -270.73345947265625, - "logps/rejected": -239.12258911132812, - "loss": 0.6333, - "rewards/accuracies": 0.660937488079071, - "rewards/chosen": 0.02273646369576454, - "rewards/margins": 0.1370951235294342, - "rewards/rejected": -0.11435866355895996, + "epoch": 0.18, + "learning_rate": 2.9209621993127146e-07, + "logits/chosen": -2.417241334915161, + "logits/rejected": -2.4194204807281494, + "logps/chosen": -284.2513732910156, + "logps/rejected": -220.6437530517578, + "loss": 0.6872, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.006491424981504679, + "rewards/margins": 0.01496223546564579, + "rewards/rejected": -0.008470811881124973, "step": 170 }, { - "epoch": 0.74, - "learning_rate": 4.180704441041347e-07, - "logits/chosen": -2.435364246368408, - "logits/rejected": -2.388087272644043, - "logps/chosen": -256.7988586425781, - "logps/rejected": -226.1533966064453, - "loss": 0.6362, - "rewards/accuracies": 0.6640625, - "rewards/chosen": 0.014816783368587494, - "rewards/margins": 0.14678119122982025, - "rewards/rejected": -0.13196441531181335, + "epoch": 0.19, + "learning_rate": 3.0927835051546394e-07, + "logits/chosen": -2.426492214202881, + "logits/rejected": -2.427013635635376, + "logps/chosen": -261.0791320800781, + "logps/rejected": -236.6595916748047, + "loss": 0.6821, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.007498173974454403, + "rewards/margins": 0.020344991236925125, + "rewards/rejected": -0.012846815399825573, "step": 180 }, { - "epoch": 0.78, - "learning_rate": 4.1041347626339966e-07, - "logits/chosen": -2.447838306427002, - "logits/rejected": -2.4064364433288574, - "logps/chosen": -261.65130615234375, - "logps/rejected": -213.19497680664062, - "loss": 0.6274, - "rewards/accuracies": 0.653124988079071, - "rewards/chosen": 0.017185209318995476, - "rewards/margins": 0.16870170831680298, - "rewards/rejected": -0.15151652693748474, + "epoch": 0.2, + "learning_rate": 3.2646048109965636e-07, + "logits/chosen": -2.479682445526123, + "logits/rejected": -2.3931996822357178, + "logps/chosen": -261.3951721191406, + "logps/rejected": -213.754150390625, + "loss": 0.6816, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.011733494699001312, + "rewards/margins": 0.023404525592923164, + "rewards/rejected": -0.011671033687889576, "step": 190 }, { - "epoch": 0.83, - "learning_rate": 4.027565084226646e-07, - "logits/chosen": -2.471174716949463, - "logits/rejected": -2.4145748615264893, - "logps/chosen": -262.2930603027344, - "logps/rejected": -218.0220489501953, - "loss": 0.6186, - "rewards/accuracies": 0.6968749761581421, - "rewards/chosen": 0.027294564992189407, - "rewards/margins": 0.19551445543766022, - "rewards/rejected": -0.16821987926959991, + "epoch": 0.21, + "learning_rate": 3.436426116838488e-07, + "logits/chosen": -2.420584201812744, + "logits/rejected": -2.3466110229492188, + "logps/chosen": -262.15338134765625, + "logps/rejected": -223.8980255126953, + "loss": 0.6793, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.01480065006762743, + "rewards/margins": 0.030916428193449974, + "rewards/rejected": -0.01611577905714512, "step": 200 }, { - "epoch": 0.87, - "learning_rate": 3.9509954058192954e-07, - "logits/chosen": -2.4748778343200684, - "logits/rejected": -2.4354960918426514, - "logps/chosen": -283.8720703125, - "logps/rejected": -231.70785522460938, - "loss": 0.6158, - "rewards/accuracies": 0.6640625, - "rewards/chosen": 0.028042469173669815, - "rewards/margins": 0.21716149151325226, - "rewards/rejected": -0.18911901116371155, + "epoch": 0.22, + "learning_rate": 3.608247422680412e-07, + "logits/chosen": -2.4468910694122314, + "logits/rejected": -2.367849826812744, + "logps/chosen": -276.70526123046875, + "logps/rejected": -203.1634979248047, + "loss": 0.6773, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0047667198814451694, + "rewards/margins": 0.023584634065628052, + "rewards/rejected": -0.01881791278719902, "step": 210 }, { - "epoch": 0.91, - "learning_rate": 3.874425727411945e-07, - "logits/chosen": -2.4220242500305176, - "logits/rejected": -2.4089436531066895, - "logps/chosen": -276.76727294921875, - "logps/rejected": -229.3019561767578, - "loss": 0.6139, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": 0.01833787001669407, - "rewards/margins": 0.22341260313987732, - "rewards/rejected": -0.205074742436409, + "epoch": 0.23, + "learning_rate": 3.7800687285223364e-07, + "logits/chosen": -2.443112850189209, + "logits/rejected": -2.4011592864990234, + "logps/chosen": -248.66348266601562, + "logps/rejected": -211.2028350830078, + "loss": 0.6725, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.01790793612599373, + "rewards/margins": 0.04252880811691284, + "rewards/rejected": -0.024620870128273964, "step": 220 }, { - "epoch": 0.95, - "learning_rate": 3.797856049004594e-07, - "logits/chosen": -2.4443860054016113, - "logits/rejected": -2.388517141342163, - "logps/chosen": -264.1051025390625, - "logps/rejected": -228.3242950439453, - "loss": 0.612, - "rewards/accuracies": 0.671875, - "rewards/chosen": 0.02690083347260952, - "rewards/margins": 0.2188110649585724, - "rewards/rejected": -0.19191020727157593, + "epoch": 0.24, + "learning_rate": 3.9518900343642607e-07, + "logits/chosen": -2.451524257659912, + "logits/rejected": -2.444117546081543, + "logps/chosen": -261.34912109375, + "logps/rejected": -210.658447265625, + "loss": 0.6754, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.027729609981179237, + "rewards/margins": 0.043738484382629395, + "rewards/rejected": -0.016008879989385605, "step": 230 }, { - "epoch": 0.99, - "learning_rate": 3.7212863705972436e-07, - "logits/chosen": -2.463099479675293, - "logits/rejected": -2.418534755706787, - "logps/chosen": -271.4806213378906, - "logps/rejected": -222.47207641601562, - "loss": 0.6123, - "rewards/accuracies": 0.6953125, - "rewards/chosen": 0.020688241347670555, - "rewards/margins": 0.21067467331886292, - "rewards/rejected": -0.1899864375591278, + "epoch": 0.25, + "learning_rate": 4.123711340206185e-07, + "logits/chosen": -2.3075928688049316, + "logits/rejected": -2.343151092529297, + "logps/chosen": -280.16119384765625, + "logps/rejected": -234.6321563720703, + "loss": 0.6685, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.024107476696372032, + "rewards/margins": 0.05809453874826431, + "rewards/rejected": -0.03398705646395683, "step": 240 }, { - "epoch": 1.0, - "eval_logits/chosen": -2.143613815307617, - "eval_logits/rejected": -2.023483991622925, - "eval_logps/chosen": -264.629150390625, - "eval_logps/rejected": -221.70089721679688, - "eval_loss": 0.6065443754196167, - "eval_rewards/accuracies": 0.6660000085830688, - "eval_rewards/chosen": 0.00029431533766910434, - "eval_rewards/margins": 0.2420923262834549, - "eval_rewards/rejected": -0.2417980283498764, - "eval_runtime": 601.9951, - "eval_samples_per_second": 3.322, - "eval_steps_per_second": 0.208, - "step": 242 - }, - { - "epoch": 1.03, - "learning_rate": 3.6447166921898925e-07, - "logits/chosen": -2.4006145000457764, - "logits/rejected": -2.347562789916992, - "logps/chosen": -257.4266052246094, - "logps/rejected": -210.4308319091797, - "loss": 0.6169, - "rewards/accuracies": 0.6703125238418579, - "rewards/chosen": 0.007514593191444874, - "rewards/margins": 0.2254376858472824, - "rewards/rejected": -0.21792307496070862, + "epoch": 0.26, + "learning_rate": 4.2955326460481097e-07, + "logits/chosen": -2.4088199138641357, + "logits/rejected": -2.380805492401123, + "logps/chosen": -267.1762390136719, + "logps/rejected": -210.53866577148438, + "loss": 0.6682, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.02717725932598114, + "rewards/margins": 0.06459168344736099, + "rewards/rejected": -0.03741442412137985, "step": 250 }, { - "epoch": 1.07, - "learning_rate": 3.568147013782542e-07, - "logits/chosen": -2.4155120849609375, - "logits/rejected": -2.3756167888641357, - "logps/chosen": -261.9485778808594, - "logps/rejected": -226.1790313720703, - "loss": 0.5988, - "rewards/accuracies": 0.729687511920929, - "rewards/chosen": 0.02337762340903282, - "rewards/margins": 0.2698908746242523, - "rewards/rejected": -0.2465132772922516, + "epoch": 0.27, + "learning_rate": 4.4673539518900345e-07, + "logits/chosen": -2.3903017044067383, + "logits/rejected": -2.4178988933563232, + "logps/chosen": -261.8699951171875, + "logps/rejected": -215.27633666992188, + "loss": 0.6653, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.01536791305989027, + "rewards/margins": 0.03926190733909607, + "rewards/rejected": -0.023893997073173523, "step": 260 }, { - "epoch": 1.12, - "learning_rate": 3.4915773353751913e-07, - "logits/chosen": -2.429805278778076, - "logits/rejected": -2.366258144378662, - "logps/chosen": -278.4192810058594, - "logps/rejected": -236.13729858398438, - "loss": 0.5918, - "rewards/accuracies": 0.754687488079071, - "rewards/chosen": 0.02766202948987484, - "rewards/margins": 0.34231704473495483, - "rewards/rejected": -0.31465503573417664, + "epoch": 0.28, + "learning_rate": 4.639175257731959e-07, + "logits/chosen": -2.3510518074035645, + "logits/rejected": -2.374760389328003, + "logps/chosen": -221.19140625, + "logps/rejected": -214.09078979492188, + "loss": 0.6619, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.025187481194734573, + "rewards/margins": 0.06599839776754379, + "rewards/rejected": -0.04081092029809952, "step": 270 }, { - "epoch": 1.16, - "learning_rate": 3.41500765696784e-07, - "logits/chosen": -2.4361157417297363, - "logits/rejected": -2.3966078758239746, - "logps/chosen": -251.09671020507812, - "logps/rejected": -225.8013458251953, - "loss": 0.6033, - "rewards/accuracies": 0.7046874761581421, - "rewards/chosen": 0.009674707427620888, - "rewards/margins": 0.24585363268852234, - "rewards/rejected": -0.236178919672966, + "epoch": 0.29, + "learning_rate": 4.810996563573884e-07, + "logits/chosen": -2.4117298126220703, + "logits/rejected": -2.4171319007873535, + "logps/chosen": -265.48126220703125, + "logps/rejected": -219.87637329101562, + "loss": 0.6574, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.02810204029083252, + "rewards/margins": 0.07569292932748795, + "rewards/rejected": -0.047590889036655426, "step": 280 }, { - "epoch": 1.2, - "learning_rate": 3.33843797856049e-07, - "logits/chosen": -2.408876895904541, - "logits/rejected": -2.394926071166992, - "logps/chosen": -283.1513671875, - "logps/rejected": -228.31497192382812, - "loss": 0.5912, - "rewards/accuracies": 0.7203124761581421, - "rewards/chosen": 0.0072790393605828285, - "rewards/margins": 0.3071500062942505, - "rewards/rejected": -0.29987096786499023, + "epoch": 0.3, + "learning_rate": 4.982817869415807e-07, + "logits/chosen": -2.4645633697509766, + "logits/rejected": -2.3363564014434814, + "logps/chosen": -299.02349853515625, + "logps/rejected": -233.5424346923828, + "loss": 0.6555, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.042131923139095306, + "rewards/margins": 0.09228460490703583, + "rewards/rejected": -0.05015267804265022, "step": 290 }, { - "epoch": 1.24, - "learning_rate": 3.2618683001531396e-07, - "logits/chosen": -2.4088923931121826, - "logits/rejected": -2.3377671241760254, - "logps/chosen": -261.37591552734375, - "logps/rejected": -227.7656707763672, - "loss": 0.5887, - "rewards/accuracies": 0.7046874761581421, - "rewards/chosen": 0.010361125692725182, - "rewards/margins": 0.30056527256965637, - "rewards/rejected": -0.29020413756370544, + "epoch": 0.31, + "learning_rate": 4.982778415614236e-07, + "logits/chosen": -2.388867139816284, + "logits/rejected": -2.3494858741760254, + "logps/chosen": -236.2740020751953, + "logps/rejected": -220.2272186279297, + "loss": 0.6511, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.020576827228069305, + "rewards/margins": 0.09741847962141037, + "rewards/rejected": -0.07684165239334106, "step": 300 }, { - "epoch": 1.28, - "learning_rate": 3.1852986217457885e-07, - "logits/chosen": -2.457613468170166, - "logits/rejected": -2.3739213943481445, - "logps/chosen": -261.27642822265625, - "logps/rejected": -228.55892944335938, - "loss": 0.5963, - "rewards/accuracies": 0.7046874761581421, - "rewards/chosen": 0.00804700143635273, - "rewards/margins": 0.3003460764884949, - "rewards/rejected": -0.2922991216182709, + "epoch": 0.32, + "learning_rate": 4.963643321852277e-07, + "logits/chosen": -2.4252231121063232, + "logits/rejected": -2.3302061557769775, + "logps/chosen": -285.35650634765625, + "logps/rejected": -231.33602905273438, + "loss": 0.6499, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.015449454076588154, + "rewards/margins": 0.10541415214538574, + "rewards/rejected": -0.08996469527482986, "step": 310 }, { - "epoch": 1.32, - "learning_rate": 3.108728943338438e-07, - "logits/chosen": -2.444277763366699, - "logits/rejected": -2.4352035522460938, - "logps/chosen": -252.82119750976562, - "logps/rejected": -237.87826538085938, - "loss": 0.5957, - "rewards/accuracies": 0.6796875, - "rewards/chosen": -0.017001762986183167, - "rewards/margins": 0.2523428499698639, - "rewards/rejected": -0.26934462785720825, + "epoch": 0.33, + "learning_rate": 4.944508228090318e-07, + "logits/chosen": -2.437065839767456, + "logits/rejected": -2.4959487915039062, + "logps/chosen": -238.969482421875, + "logps/rejected": -192.4582977294922, + "loss": 0.6367, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.03995648771524429, + "rewards/margins": 0.13508270680904388, + "rewards/rejected": -0.09512621909379959, "step": 320 }, { - "epoch": 1.36, - "learning_rate": 3.0321592649310873e-07, - "logits/chosen": -2.424394130706787, - "logits/rejected": -2.357229232788086, - "logps/chosen": -253.7223358154297, - "logps/rejected": -224.33401489257812, - "loss": 0.5855, - "rewards/accuracies": 0.6953125, - "rewards/chosen": -0.0046251388266682625, - "rewards/margins": 0.3039107620716095, - "rewards/rejected": -0.3085358738899231, + "epoch": 0.34, + "learning_rate": 4.925373134328357e-07, + "logits/chosen": -2.461618423461914, + "logits/rejected": -2.4392247200012207, + "logps/chosen": -252.608642578125, + "logps/rejected": -222.25125122070312, + "loss": 0.6428, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0467003658413887, + "rewards/margins": 0.106337770819664, + "rewards/rejected": -0.05963738635182381, "step": 330 }, { - "epoch": 1.4, - "learning_rate": 2.955589586523736e-07, - "logits/chosen": -2.4427707195281982, - "logits/rejected": -2.3827157020568848, - "logps/chosen": -265.708984375, - "logps/rejected": -226.42001342773438, - "loss": 0.5926, - "rewards/accuracies": 0.7046874761581421, - "rewards/chosen": -0.01725461333990097, - "rewards/margins": 0.3239946961402893, - "rewards/rejected": -0.3412492871284485, + "epoch": 0.35, + "learning_rate": 4.906238040566398e-07, + "logits/chosen": -2.4741828441619873, + "logits/rejected": -2.355389356613159, + "logps/chosen": -271.66387939453125, + "logps/rejected": -231.6305694580078, + "loss": 0.6431, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.04233894124627113, + "rewards/margins": 0.15629062056541443, + "rewards/rejected": -0.11395169794559479, "step": 340 }, { - "epoch": 1.45, - "learning_rate": 2.8790199081163856e-07, - "logits/chosen": -2.439563035964966, - "logits/rejected": -2.369589328765869, - "logps/chosen": -266.0650329589844, - "logps/rejected": -225.261474609375, - "loss": 0.5934, - "rewards/accuracies": 0.6640625, - "rewards/chosen": -0.026776671409606934, - "rewards/margins": 0.28162795305252075, - "rewards/rejected": -0.3084046244621277, + "epoch": 0.36, + "learning_rate": 4.887102946804438e-07, + "logits/chosen": -2.5339910984039307, + "logits/rejected": -2.424262523651123, + "logps/chosen": -289.12408447265625, + "logps/rejected": -223.707275390625, + "loss": 0.6293, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.05516533926129341, + "rewards/margins": 0.20021691918373108, + "rewards/rejected": -0.14505159854888916, "step": 350 }, { - "epoch": 1.49, - "learning_rate": 2.802450229709035e-07, - "logits/chosen": -2.3998467922210693, - "logits/rejected": -2.349297523498535, - "logps/chosen": -254.8881378173828, - "logps/rejected": -213.3016815185547, - "loss": 0.5844, - "rewards/accuracies": 0.676562488079071, - "rewards/chosen": -0.03948161005973816, - "rewards/margins": 0.2969672679901123, - "rewards/rejected": -0.3364488482475281, + "epoch": 0.37, + "learning_rate": 4.867967853042479e-07, + "logits/chosen": -2.440347194671631, + "logits/rejected": -2.469924211502075, + "logps/chosen": -272.43304443359375, + "logps/rejected": -223.846435546875, + "loss": 0.6225, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.05506114289164543, + "rewards/margins": 0.1845804899930954, + "rewards/rejected": -0.12951937317848206, "step": 360 }, { - "epoch": 1.53, - "learning_rate": 2.725880551301684e-07, - "logits/chosen": -2.462137222290039, - "logits/rejected": -2.406816005706787, - "logps/chosen": -274.73101806640625, - "logps/rejected": -232.8585968017578, - "loss": 0.5927, - "rewards/accuracies": 0.692187488079071, - "rewards/chosen": -0.022728387266397476, - "rewards/margins": 0.31267625093460083, - "rewards/rejected": -0.3354046940803528, + "epoch": 0.38, + "learning_rate": 4.84883275928052e-07, + "logits/chosen": -2.4580020904541016, + "logits/rejected": -2.422905206680298, + "logps/chosen": -274.3728332519531, + "logps/rejected": -228.1702117919922, + "loss": 0.622, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.04404681175947189, + "rewards/margins": 0.1968606859445572, + "rewards/rejected": -0.1528138816356659, "step": 370 }, { - "epoch": 1.57, - "learning_rate": 2.649310872894334e-07, - "logits/chosen": -2.448570728302002, - "logits/rejected": -2.4157679080963135, - "logps/chosen": -274.98968505859375, - "logps/rejected": -223.1295623779297, - "loss": 0.582, - "rewards/accuracies": 0.7281249761581421, - "rewards/chosen": -0.015962181612849236, - "rewards/margins": 0.37291616201400757, - "rewards/rejected": -0.38887840509414673, + "epoch": 0.39, + "learning_rate": 4.82969766551856e-07, + "logits/chosen": -2.3649065494537354, + "logits/rejected": -2.3759725093841553, + "logps/chosen": -258.05328369140625, + "logps/rejected": -228.05404663085938, + "loss": 0.6335, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.04060830920934677, + "rewards/margins": 0.17422744631767273, + "rewards/rejected": -0.13361915946006775, "step": 380 }, { - "epoch": 1.61, - "learning_rate": 2.572741194486983e-07, - "logits/chosen": -2.4490561485290527, - "logits/rejected": -2.393211841583252, - "logps/chosen": -273.8176574707031, - "logps/rejected": -208.3898162841797, - "loss": 0.58, - "rewards/accuracies": 0.6968749761581421, - "rewards/chosen": -0.02870849333703518, - "rewards/margins": 0.3663511276245117, - "rewards/rejected": -0.39505964517593384, + "epoch": 0.4, + "learning_rate": 4.810562571756601e-07, + "logits/chosen": -2.4177675247192383, + "logits/rejected": -2.406047821044922, + "logps/chosen": -255.5844268798828, + "logps/rejected": -219.80984497070312, + "loss": 0.6251, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.02088143676519394, + "rewards/margins": 0.18797752261161804, + "rewards/rejected": -0.1670960783958435, "step": 390 }, { - "epoch": 1.65, - "learning_rate": 2.496171516079632e-07, - "logits/chosen": -2.465646266937256, - "logits/rejected": -2.399637222290039, - "logps/chosen": -293.25164794921875, - "logps/rejected": -239.49093627929688, - "loss": 0.5819, - "rewards/accuracies": 0.7328125238418579, - "rewards/chosen": 0.001335096312686801, - "rewards/margins": 0.3930011987686157, - "rewards/rejected": -0.39166611433029175, + "epoch": 0.41, + "learning_rate": 4.791427477994642e-07, + "logits/chosen": -2.418471574783325, + "logits/rejected": -2.4022955894470215, + "logps/chosen": -306.84869384765625, + "logps/rejected": -239.74612426757812, + "loss": 0.62, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.06454652547836304, + "rewards/margins": 0.1981131136417389, + "rewards/rejected": -0.13356655836105347, "step": 400 }, { - "epoch": 1.69, - "learning_rate": 2.4196018376722816e-07, - "logits/chosen": -2.4297971725463867, - "logits/rejected": -2.400850296020508, - "logps/chosen": -278.59759521484375, - "logps/rejected": -228.43798828125, - "loss": 0.5868, - "rewards/accuracies": 0.6796875, - "rewards/chosen": -0.027199868112802505, - "rewards/margins": 0.3639487326145172, - "rewards/rejected": -0.3911486268043518, + "epoch": 0.42, + "learning_rate": 4.772292384232682e-07, + "logits/chosen": -2.4177165031433105, + "logits/rejected": -2.379561185836792, + "logps/chosen": -246.2447967529297, + "logps/rejected": -245.50753784179688, + "loss": 0.6214, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.025952398777008057, + "rewards/margins": 0.16164085268974304, + "rewards/rejected": -0.1875932663679123, "step": 410 }, { - "epoch": 1.74, - "learning_rate": 2.343032159264931e-07, - "logits/chosen": -2.4039382934570312, - "logits/rejected": -2.333247661590576, - "logps/chosen": -268.90826416015625, - "logps/rejected": -224.38961791992188, - "loss": 0.5785, - "rewards/accuracies": 0.729687511920929, - "rewards/chosen": -0.022465692833065987, - "rewards/margins": 0.4325350821018219, - "rewards/rejected": -0.45500072836875916, + "epoch": 0.43, + "learning_rate": 4.753157290470723e-07, + "logits/chosen": -2.488081216812134, + "logits/rejected": -2.4657857418060303, + "logps/chosen": -256.111083984375, + "logps/rejected": -225.01748657226562, + "loss": 0.6292, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.001765048480592668, + "rewards/margins": 0.16318608820438385, + "rewards/rejected": -0.1614210307598114, "step": 420 }, { - "epoch": 1.78, - "learning_rate": 2.26646248085758e-07, - "logits/chosen": -2.383586883544922, - "logits/rejected": -2.335700273513794, - "logps/chosen": -259.71624755859375, - "logps/rejected": -217.78988647460938, - "loss": 0.5739, - "rewards/accuracies": 0.7171875238418579, - "rewards/chosen": -0.042887382209300995, - "rewards/margins": 0.38991934061050415, - "rewards/rejected": -0.43280667066574097, + "epoch": 0.44, + "learning_rate": 4.7340221967087635e-07, + "logits/chosen": -2.411403179168701, + "logits/rejected": -2.3677725791931152, + "logps/chosen": -251.43051147460938, + "logps/rejected": -224.96240234375, + "loss": 0.6157, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.036291979253292084, + "rewards/margins": 0.2256316840648651, + "rewards/rejected": -0.18933971226215363, "step": 430 }, { - "epoch": 1.82, - "learning_rate": 2.1898928024502298e-07, - "logits/chosen": -2.4449055194854736, - "logits/rejected": -2.387500047683716, - "logps/chosen": -263.4512939453125, - "logps/rejected": -221.47128295898438, - "loss": 0.5744, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.019188478589057922, - "rewards/margins": 0.3951462507247925, - "rewards/rejected": -0.4143346846103668, + "epoch": 0.45, + "learning_rate": 4.714887102946804e-07, + "logits/chosen": -2.4702084064483643, + "logits/rejected": -2.3358662128448486, + "logps/chosen": -257.2681884765625, + "logps/rejected": -200.87564086914062, + "loss": 0.6008, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.021172259002923965, + "rewards/margins": 0.22813072800636292, + "rewards/rejected": -0.20695844292640686, "step": 440 }, { - "epoch": 1.86, - "learning_rate": 2.113323124042879e-07, - "logits/chosen": -2.428199291229248, - "logits/rejected": -2.391083240509033, - "logps/chosen": -271.2774658203125, - "logps/rejected": -231.4476776123047, - "loss": 0.5749, - "rewards/accuracies": 0.721875011920929, - "rewards/chosen": -0.019557902589440346, - "rewards/margins": 0.4145377278327942, - "rewards/rejected": -0.4340956211090088, + "epoch": 0.46, + "learning_rate": 4.6957520091848447e-07, + "logits/chosen": -2.426776885986328, + "logits/rejected": -2.3923580646514893, + "logps/chosen": -228.310791015625, + "logps/rejected": -204.06149291992188, + "loss": 0.6346, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.00926109217107296, + "rewards/margins": 0.17081685364246368, + "rewards/rejected": -0.16155575215816498, "step": 450 }, { - "epoch": 1.9, - "learning_rate": 2.036753445635528e-07, - "logits/chosen": -2.4364569187164307, - "logits/rejected": -2.415283203125, - "logps/chosen": -284.0417785644531, - "logps/rejected": -238.28244018554688, - "loss": 0.5686, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.022761384025216103, - "rewards/margins": 0.4516824781894684, - "rewards/rejected": -0.4744439125061035, + "epoch": 0.48, + "learning_rate": 4.6766169154228853e-07, + "logits/chosen": -2.3557937145233154, + "logits/rejected": -2.3138818740844727, + "logps/chosen": -268.3694152832031, + "logps/rejected": -239.6737518310547, + "loss": 0.5994, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.07315589487552643, + "rewards/margins": 0.2832576632499695, + "rewards/rejected": -0.21010179817676544, "step": 460 }, { - "epoch": 1.94, - "learning_rate": 1.9601837672281775e-07, - "logits/chosen": -2.4187042713165283, - "logits/rejected": -2.3860645294189453, - "logps/chosen": -270.18487548828125, - "logps/rejected": -236.39755249023438, - "loss": 0.5765, - "rewards/accuracies": 0.7171875238418579, - "rewards/chosen": -0.06018770858645439, - "rewards/margins": 0.3767903447151184, - "rewards/rejected": -0.4369780421257019, + "epoch": 0.49, + "learning_rate": 4.657481821660926e-07, + "logits/chosen": -2.4533188343048096, + "logits/rejected": -2.4328866004943848, + "logps/chosen": -283.4711608886719, + "logps/rejected": -211.56640625, + "loss": 0.5987, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.0002489805337972939, + "rewards/margins": 0.27414873242378235, + "rewards/rejected": -0.27439773082733154, "step": 470 }, { - "epoch": 1.98, - "learning_rate": 1.883614088820827e-07, - "logits/chosen": -2.438513994216919, - "logits/rejected": -2.3922152519226074, - "logps/chosen": -268.73028564453125, - "logps/rejected": -224.90036010742188, - "loss": 0.5841, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.046872757375240326, - "rewards/margins": 0.38516464829444885, - "rewards/rejected": -0.4320374131202698, + "epoch": 0.5, + "learning_rate": 4.6383467278989666e-07, + "logits/chosen": -2.3915047645568848, + "logits/rejected": -2.4537439346313477, + "logps/chosen": -248.0155029296875, + "logps/rejected": -230.64767456054688, + "loss": 0.6144, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.018264885991811752, + "rewards/margins": 0.20035696029663086, + "rewards/rejected": -0.21862182021141052, "step": 480 }, { - "epoch": 2.0, - "eval_logits/chosen": -2.1272027492523193, - "eval_logits/rejected": -2.0063846111297607, - "eval_logps/chosen": -265.2384338378906, - "eval_logps/rejected": -223.98130798339844, - "eval_loss": 0.5733410716056824, - "eval_rewards/accuracies": 0.7039999961853027, - "eval_rewards/chosen": -0.060633424669504166, - "eval_rewards/margins": 0.40920668840408325, - "eval_rewards/rejected": -0.46984007954597473, - "eval_runtime": 600.9449, - "eval_samples_per_second": 3.328, - "eval_steps_per_second": 0.208, - "step": 484 - }, - { - "epoch": 2.02, - "learning_rate": 1.807044410413476e-07, - "logits/chosen": -2.41025972366333, - "logits/rejected": -2.3768038749694824, - "logps/chosen": -257.9417419433594, - "logps/rejected": -236.35751342773438, - "loss": 0.5774, - "rewards/accuracies": 0.707812488079071, - "rewards/chosen": -0.0522170290350914, - "rewards/margins": 0.39746755361557007, - "rewards/rejected": -0.4496845602989197, + "epoch": 0.51, + "learning_rate": 4.6192116341370067e-07, + "logits/chosen": -2.5227386951446533, + "logits/rejected": -2.4403090476989746, + "logps/chosen": -277.166748046875, + "logps/rejected": -230.45849609375, + "loss": 0.5927, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.04206278175115585, + "rewards/margins": 0.3326976001262665, + "rewards/rejected": -0.29063481092453003, "step": 490 }, { - "epoch": 2.07, - "learning_rate": 1.7304747320061255e-07, - "logits/chosen": -2.4075393676757812, - "logits/rejected": -2.403886556625366, - "logps/chosen": -263.6119689941406, - "logps/rejected": -230.6759796142578, - "loss": 0.5768, - "rewards/accuracies": 0.7046874761581421, - "rewards/chosen": -0.023285437375307083, - "rewards/margins": 0.3978777825832367, - "rewards/rejected": -0.42116326093673706, + "epoch": 0.52, + "learning_rate": 4.6000765403750473e-07, + "logits/chosen": -2.4241671562194824, + "logits/rejected": -2.336174488067627, + "logps/chosen": -255.9370574951172, + "logps/rejected": -211.1270751953125, + "loss": 0.5983, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.01858004741370678, + "rewards/margins": 0.2676704525947571, + "rewards/rejected": -0.24909043312072754, "step": 500 }, { - "epoch": 2.11, - "learning_rate": 1.6539050535987747e-07, - "logits/chosen": -2.479966402053833, - "logits/rejected": -2.3705756664276123, - "logps/chosen": -270.12005615234375, - "logps/rejected": -225.0802459716797, - "loss": 0.5698, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.03792610391974449, - "rewards/margins": 0.43896961212158203, - "rewards/rejected": -0.4768957197666168, + "epoch": 0.53, + "learning_rate": 4.580941446613088e-07, + "logits/chosen": -2.4633803367614746, + "logits/rejected": -2.4187140464782715, + "logps/chosen": -257.75225830078125, + "logps/rejected": -238.34164428710938, + "loss": 0.5972, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.019993241876363754, + "rewards/margins": 0.240191251039505, + "rewards/rejected": -0.26018446683883667, "step": 510 }, { - "epoch": 2.15, - "learning_rate": 1.5773353751914243e-07, - "logits/chosen": -2.486560821533203, - "logits/rejected": -2.425957679748535, - "logps/chosen": -284.8712158203125, - "logps/rejected": -229.9952392578125, - "loss": 0.5714, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.024752041324973106, - "rewards/margins": 0.4690770208835602, - "rewards/rejected": -0.49382907152175903, + "epoch": 0.54, + "learning_rate": 4.5618063528511285e-07, + "logits/chosen": -2.387589454650879, + "logits/rejected": -2.358363628387451, + "logps/chosen": -274.30670166015625, + "logps/rejected": -232.08969116210938, + "loss": 0.6006, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.05401581525802612, + "rewards/margins": 0.24342355132102966, + "rewards/rejected": -0.29743942618370056, "step": 520 }, { - "epoch": 2.19, - "learning_rate": 1.5007656967840735e-07, - "logits/chosen": -2.383312702178955, - "logits/rejected": -2.342586040496826, - "logps/chosen": -254.04122924804688, - "logps/rejected": -230.623046875, - "loss": 0.5678, - "rewards/accuracies": 0.7109375, - "rewards/chosen": -0.04037480801343918, - "rewards/margins": 0.4469054341316223, - "rewards/rejected": -0.4872801899909973, + "epoch": 0.55, + "learning_rate": 4.542671259089169e-07, + "logits/chosen": -2.4646763801574707, + "logits/rejected": -2.407026767730713, + "logps/chosen": -273.80029296875, + "logps/rejected": -233.99826049804688, + "loss": 0.6071, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.009464024566113949, + "rewards/margins": 0.2972859740257263, + "rewards/rejected": -0.30674999952316284, "step": 530 }, { - "epoch": 2.23, - "learning_rate": 1.4241960183767226e-07, - "logits/chosen": -2.429459810256958, - "logits/rejected": -2.373309850692749, - "logps/chosen": -282.88043212890625, - "logps/rejected": -235.9270477294922, - "loss": 0.5726, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.0583997443318367, - "rewards/margins": 0.42233991622924805, - "rewards/rejected": -0.48073965311050415, + "epoch": 0.56, + "learning_rate": 4.52353616532721e-07, + "logits/chosen": -2.4378743171691895, + "logits/rejected": -2.415499448776245, + "logps/chosen": -266.07171630859375, + "logps/rejected": -235.11093139648438, + "loss": 0.6005, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.009244078770279884, + "rewards/margins": 0.26727497577667236, + "rewards/rejected": -0.2765190303325653, "step": 540 }, { - "epoch": 2.27, - "learning_rate": 1.347626339969372e-07, - "logits/chosen": -2.423274517059326, - "logits/rejected": -2.3877205848693848, - "logps/chosen": -270.8163757324219, - "logps/rejected": -242.1139678955078, - "loss": 0.5754, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.015940938144922256, - "rewards/margins": 0.41127151250839233, - "rewards/rejected": -0.4272124767303467, + "epoch": 0.57, + "learning_rate": 4.5044010715652504e-07, + "logits/chosen": -2.439612627029419, + "logits/rejected": -2.3910512924194336, + "logps/chosen": -249.3505401611328, + "logps/rejected": -228.9892120361328, + "loss": 0.6001, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.029611006379127502, + "rewards/margins": 0.2741519510746002, + "rewards/rejected": -0.30376294255256653, "step": 550 }, { - "epoch": 2.31, - "learning_rate": 1.2710566615620215e-07, - "logits/chosen": -2.3739233016967773, - "logits/rejected": -2.328768253326416, - "logps/chosen": -274.34735107421875, - "logps/rejected": -225.1607208251953, - "loss": 0.5587, - "rewards/accuracies": 0.7109375, - "rewards/chosen": -0.035870593041181564, - "rewards/margins": 0.445881187915802, - "rewards/rejected": -0.48175176978111267, + "epoch": 0.58, + "learning_rate": 4.485265977803291e-07, + "logits/chosen": -2.46055269241333, + "logits/rejected": -2.3553805351257324, + "logps/chosen": -293.1697692871094, + "logps/rejected": -232.68115234375, + "loss": 0.5852, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.061341024935245514, + "rewards/margins": 0.43009573221206665, + "rewards/rejected": -0.36875468492507935, "step": 560 }, { - "epoch": 2.35, - "learning_rate": 1.1944869831546706e-07, - "logits/chosen": -2.399712085723877, - "logits/rejected": -2.379559278488159, - "logps/chosen": -266.9978332519531, - "logps/rejected": -238.74691772460938, - "loss": 0.5713, - "rewards/accuracies": 0.692187488079071, - "rewards/chosen": -0.07225313037633896, - "rewards/margins": 0.4113592207431793, - "rewards/rejected": -0.4836123585700989, + "epoch": 0.59, + "learning_rate": 4.4661308840413316e-07, + "logits/chosen": -2.509950637817383, + "logits/rejected": -2.377487897872925, + "logps/chosen": -285.7837829589844, + "logps/rejected": -236.22866821289062, + "loss": 0.6017, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.0002725020167417824, + "rewards/margins": 0.3595966100692749, + "rewards/rejected": -0.3593241274356842, "step": 570 }, { - "epoch": 2.4, - "learning_rate": 1.11791730474732e-07, - "logits/chosen": -2.413283586502075, - "logits/rejected": -2.3746399879455566, - "logps/chosen": -262.73028564453125, - "logps/rejected": -226.4913787841797, - "loss": 0.5653, - "rewards/accuracies": 0.745312511920929, - "rewards/chosen": -0.06956593692302704, - "rewards/margins": 0.4847482740879059, - "rewards/rejected": -0.554314136505127, + "epoch": 0.6, + "learning_rate": 4.446995790279372e-07, + "logits/chosen": -2.4554500579833984, + "logits/rejected": -2.4359169006347656, + "logps/chosen": -283.475341796875, + "logps/rejected": -230.9565887451172, + "loss": 0.5871, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.004635247401893139, + "rewards/margins": 0.32853394746780396, + "rewards/rejected": -0.32389870285987854, "step": 580 }, { - "epoch": 2.44, - "learning_rate": 1.0413476263399694e-07, - "logits/chosen": -2.451280355453491, - "logits/rejected": -2.4071993827819824, - "logps/chosen": -269.5108947753906, - "logps/rejected": -219.07644653320312, - "loss": 0.5672, - "rewards/accuracies": 0.7281249761581421, - "rewards/chosen": -0.07316569238901138, - "rewards/margins": 0.4291161596775055, - "rewards/rejected": -0.5022818446159363, + "epoch": 0.61, + "learning_rate": 4.4278606965174123e-07, + "logits/chosen": -2.472580671310425, + "logits/rejected": -2.470784902572632, + "logps/chosen": -262.70196533203125, + "logps/rejected": -241.69784545898438, + "loss": 0.5868, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.021802525967359543, + "rewards/margins": 0.33116960525512695, + "rewards/rejected": -0.3529720902442932, "step": 590 }, { - "epoch": 2.48, - "learning_rate": 9.647779479326186e-08, - "logits/chosen": -2.393155097961426, - "logits/rejected": -2.3883702754974365, - "logps/chosen": -251.05142211914062, - "logps/rejected": -224.50564575195312, - "loss": 0.5763, - "rewards/accuracies": 0.6781250238418579, - "rewards/chosen": -0.06612516194581985, - "rewards/margins": 0.3570297658443451, - "rewards/rejected": -0.4231549799442291, + "epoch": 0.62, + "learning_rate": 4.408725602755453e-07, + "logits/chosen": -2.4604482650756836, + "logits/rejected": -2.4069600105285645, + "logps/chosen": -233.8094482421875, + "logps/rejected": -207.21536254882812, + "loss": 0.5901, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03362672030925751, + "rewards/margins": 0.2702116370201111, + "rewards/rejected": -0.3038383424282074, "step": 600 }, { - "epoch": 2.52, - "learning_rate": 8.88208269525268e-08, - "logits/chosen": -2.3813252449035645, - "logits/rejected": -2.39128041267395, - "logps/chosen": -260.7297058105469, - "logps/rejected": -223.4501495361328, - "loss": 0.5656, - "rewards/accuracies": 0.7109375, - "rewards/chosen": -0.07072791457176208, - "rewards/margins": 0.41946038603782654, - "rewards/rejected": -0.49018827080726624, + "epoch": 0.63, + "learning_rate": 4.3895905089934936e-07, + "logits/chosen": -2.476191282272339, + "logits/rejected": -2.3460450172424316, + "logps/chosen": -276.6174621582031, + "logps/rejected": -231.13705444335938, + "loss": 0.5887, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.0008199826115742326, + "rewards/margins": 0.3593784272670746, + "rewards/rejected": -0.360198438167572, "step": 610 }, { - "epoch": 2.56, - "learning_rate": 8.116385911179173e-08, - "logits/chosen": -2.371321201324463, - "logits/rejected": -2.3626811504364014, - "logps/chosen": -279.528564453125, - "logps/rejected": -219.47598266601562, - "loss": 0.5688, - "rewards/accuracies": 0.723437488079071, - "rewards/chosen": -0.020319191738963127, - "rewards/margins": 0.4656984210014343, - "rewards/rejected": -0.4860176146030426, + "epoch": 0.64, + "learning_rate": 4.370455415231534e-07, + "logits/chosen": -2.418025493621826, + "logits/rejected": -2.426182270050049, + "logps/chosen": -276.7029724121094, + "logps/rejected": -260.8800354003906, + "loss": 0.5695, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.050284016877412796, + "rewards/margins": 0.325679212808609, + "rewards/rejected": -0.37596315145492554, "step": 620 }, { - "epoch": 2.6, - "learning_rate": 7.350689127105667e-08, - "logits/chosen": -2.4530327320098877, - "logits/rejected": -2.3786721229553223, - "logps/chosen": -272.66961669921875, - "logps/rejected": -232.73129272460938, - "loss": 0.5604, - "rewards/accuracies": 0.7015625238418579, - "rewards/chosen": -0.04867444932460785, - "rewards/margins": 0.4719608724117279, - "rewards/rejected": -0.5206353068351746, + "epoch": 0.65, + "learning_rate": 4.351320321469575e-07, + "logits/chosen": -2.4381816387176514, + "logits/rejected": -2.4007935523986816, + "logps/chosen": -298.5264587402344, + "logps/rejected": -234.97250366210938, + "loss": 0.5822, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.040602535009384155, + "rewards/margins": 0.45465603470802307, + "rewards/rejected": -0.4140535295009613, "step": 630 }, { - "epoch": 2.64, - "learning_rate": 6.584992343032159e-08, - "logits/chosen": -2.3559648990631104, - "logits/rejected": -2.342261552810669, - "logps/chosen": -263.22344970703125, - "logps/rejected": -229.0948028564453, - "loss": 0.5758, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.0527966246008873, - "rewards/margins": 0.4219232201576233, - "rewards/rejected": -0.4747198522090912, + "epoch": 0.66, + "learning_rate": 4.3321852277076154e-07, + "logits/chosen": -2.4025540351867676, + "logits/rejected": -2.3902947902679443, + "logps/chosen": -267.2010192871094, + "logps/rejected": -240.09646606445312, + "loss": 0.5675, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.013172095641493797, + "rewards/margins": 0.4019550383090973, + "rewards/rejected": -0.38878297805786133, "step": 640 }, { - "epoch": 2.69, - "learning_rate": 5.819295558958652e-08, - "logits/chosen": -2.3997654914855957, - "logits/rejected": -2.3493194580078125, - "logps/chosen": -288.34991455078125, - "logps/rejected": -222.42855834960938, - "loss": 0.5643, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.02724199928343296, - "rewards/margins": 0.5219191908836365, - "rewards/rejected": -0.5491611361503601, + "epoch": 0.67, + "learning_rate": 4.313050133945656e-07, + "logits/chosen": -2.507800340652466, + "logits/rejected": -2.4536452293395996, + "logps/chosen": -291.55218505859375, + "logps/rejected": -242.00558471679688, + "loss": 0.5795, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.00018588601960800588, + "rewards/margins": 0.46111243963241577, + "rewards/rejected": -0.4609266221523285, "step": 650 }, { - "epoch": 2.73, - "learning_rate": 5.0535987748851455e-08, - "logits/chosen": -2.46504282951355, - "logits/rejected": -2.412127733230591, - "logps/chosen": -275.7881774902344, - "logps/rejected": -231.99331665039062, - "loss": 0.5616, - "rewards/accuracies": 0.7109375, - "rewards/chosen": -0.07443860173225403, - "rewards/margins": 0.41263580322265625, - "rewards/rejected": -0.48707443475723267, + "epoch": 0.68, + "learning_rate": 4.2939150401836967e-07, + "logits/chosen": -2.4589836597442627, + "logits/rejected": -2.4613184928894043, + "logps/chosen": -245.21237182617188, + "logps/rejected": -246.0800323486328, + "loss": 0.5688, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.06368513405323029, + "rewards/margins": 0.32503411173820496, + "rewards/rejected": -0.38871926069259644, "step": 660 }, { - "epoch": 2.77, - "learning_rate": 4.287901990811638e-08, - "logits/chosen": -2.4429898262023926, - "logits/rejected": -2.356980800628662, - "logps/chosen": -268.50396728515625, - "logps/rejected": -244.95095825195312, - "loss": 0.5638, - "rewards/accuracies": 0.7015625238418579, - "rewards/chosen": -0.04781431332230568, - "rewards/margins": 0.4686294496059418, - "rewards/rejected": -0.5164437294006348, + "epoch": 0.69, + "learning_rate": 4.2747799464217373e-07, + "logits/chosen": -2.397803783416748, + "logits/rejected": -2.3992929458618164, + "logps/chosen": -262.5749816894531, + "logps/rejected": -231.0945281982422, + "loss": 0.574, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.10684232413768768, + "rewards/margins": 0.3226371705532074, + "rewards/rejected": -0.4294795095920563, "step": 670 }, { - "epoch": 2.81, - "learning_rate": 3.522205206738132e-08, - "logits/chosen": -2.407198905944824, - "logits/rejected": -2.3732337951660156, - "logps/chosen": -275.3802185058594, - "logps/rejected": -230.15750122070312, - "loss": 0.5689, - "rewards/accuracies": 0.7109375, - "rewards/chosen": -0.0815020427107811, - "rewards/margins": 0.4394635558128357, - "rewards/rejected": -0.5209656953811646, + "epoch": 0.7, + "learning_rate": 4.255644852659778e-07, + "logits/chosen": -2.4963364601135254, + "logits/rejected": -2.4439988136291504, + "logps/chosen": -287.1022644042969, + "logps/rejected": -249.372802734375, + "loss": 0.6135, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08951963484287262, + "rewards/margins": 0.2950531840324402, + "rewards/rejected": -0.384572833776474, "step": 680 }, { - "epoch": 2.85, - "learning_rate": 2.7565084226646246e-08, - "logits/chosen": -2.3888936042785645, - "logits/rejected": -2.376246929168701, - "logps/chosen": -264.2990417480469, - "logps/rejected": -236.7455291748047, - "loss": 0.5745, - "rewards/accuracies": 0.714062511920929, - "rewards/chosen": -0.04266131669282913, - "rewards/margins": 0.4692384600639343, - "rewards/rejected": -0.5118998289108276, + "epoch": 0.71, + "learning_rate": 4.236509758897818e-07, + "logits/chosen": -2.3793249130249023, + "logits/rejected": -2.3877103328704834, + "logps/chosen": -260.2186584472656, + "logps/rejected": -218.5549774169922, + "loss": 0.6012, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.07342827320098877, + "rewards/margins": 0.3570996820926666, + "rewards/rejected": -0.4305279850959778, "step": 690 }, { - "epoch": 2.89, - "learning_rate": 1.9908116385911178e-08, - "logits/chosen": -2.4064769744873047, - "logits/rejected": -2.388720989227295, - "logps/chosen": -260.6070556640625, - "logps/rejected": -227.61105346679688, - "loss": 0.5712, + "epoch": 0.72, + "learning_rate": 4.2173746651358586e-07, + "logits/chosen": -2.4117445945739746, + "logits/rejected": -2.3893179893493652, + "logps/chosen": -260.35223388671875, + "logps/rejected": -246.88528442382812, + "loss": 0.5919, "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.09105347096920013, - "rewards/margins": 0.38188761472702026, - "rewards/rejected": -0.4729411005973816, + "rewards/chosen": -0.11649465560913086, + "rewards/margins": 0.276920884847641, + "rewards/rejected": -0.39341551065444946, "step": 700 }, { - "epoch": 2.93, - "learning_rate": 1.225114854517611e-08, - "logits/chosen": -2.43709659576416, - "logits/rejected": -2.371903657913208, - "logps/chosen": -280.87945556640625, - "logps/rejected": -221.88931274414062, - "loss": 0.5684, - "rewards/accuracies": 0.7171875238418579, - "rewards/chosen": -0.06613589823246002, - "rewards/margins": 0.4434446394443512, - "rewards/rejected": -0.5095804929733276, + "epoch": 0.73, + "learning_rate": 4.198239571373899e-07, + "logits/chosen": -2.4213128089904785, + "logits/rejected": -2.353787660598755, + "logps/chosen": -243.3746795654297, + "logps/rejected": -195.84048461914062, + "loss": 0.5849, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.056009601801633835, + "rewards/margins": 0.43854936957359314, + "rewards/rejected": -0.4945589601993561, "step": 710 }, { - "epoch": 2.97, - "learning_rate": 4.594180704441042e-09, - "logits/chosen": -2.400611639022827, - "logits/rejected": -2.3782918453216553, - "logps/chosen": -257.98419189453125, - "logps/rejected": -225.0382843017578, - "loss": 0.567, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.05685793235898018, - "rewards/margins": 0.41515034437179565, - "rewards/rejected": -0.47200828790664673, + "epoch": 0.74, + "learning_rate": 4.17910447761194e-07, + "logits/chosen": -2.46687388420105, + "logits/rejected": -2.3652591705322266, + "logps/chosen": -267.1708679199219, + "logps/rejected": -255.4759521484375, + "loss": 0.5977, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.08689933270215988, + "rewards/margins": 0.33677542209625244, + "rewards/rejected": -0.42367473244667053, "step": 720 }, + { + "epoch": 0.75, + "learning_rate": 4.1599693838499805e-07, + "logits/chosen": -2.4511024951934814, + "logits/rejected": -2.4267566204071045, + "logps/chosen": -295.463134765625, + "logps/rejected": -215.197265625, + "loss": 0.5815, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.06298734992742538, + "rewards/margins": 0.3604966402053833, + "rewards/rejected": -0.4234839975833893, + "step": 730 + }, + { + "epoch": 0.76, + "learning_rate": 4.140834290088021e-07, + "logits/chosen": -2.4394567012786865, + "logits/rejected": -2.4174628257751465, + "logps/chosen": -277.1340026855469, + "logps/rejected": -221.7968292236328, + "loss": 0.5643, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.0872669368982315, + "rewards/margins": 0.4404314458370209, + "rewards/rejected": -0.5276983976364136, + "step": 740 + }, + { + "epoch": 0.77, + "learning_rate": 4.121699196326062e-07, + "logits/chosen": -2.4356467723846436, + "logits/rejected": -2.309382915496826, + "logps/chosen": -218.226318359375, + "logps/rejected": -185.0907440185547, + "loss": 0.5819, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.055658143013715744, + "rewards/margins": 0.3867154121398926, + "rewards/rejected": -0.44237351417541504, + "step": 750 + }, + { + "epoch": 0.78, + "learning_rate": 4.1025641025641024e-07, + "logits/chosen": -2.398838520050049, + "logits/rejected": -2.4070441722869873, + "logps/chosen": -259.14996337890625, + "logps/rejected": -243.26882934570312, + "loss": 0.5757, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06217324733734131, + "rewards/margins": 0.4077116549015045, + "rewards/rejected": -0.4698849320411682, + "step": 760 + }, + { + "epoch": 0.8, + "learning_rate": 4.083429008802143e-07, + "logits/chosen": -2.5257222652435303, + "logits/rejected": -2.471179485321045, + "logps/chosen": -274.23980712890625, + "logps/rejected": -213.1348876953125, + "loss": 0.5549, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.041472114622592926, + "rewards/margins": 0.4583619236946106, + "rewards/rejected": -0.4998340606689453, + "step": 770 + }, + { + "epoch": 0.81, + "learning_rate": 4.0642939150401836e-07, + "logits/chosen": -2.488083600997925, + "logits/rejected": -2.3540916442871094, + "logps/chosen": -290.1893310546875, + "logps/rejected": -217.73001098632812, + "loss": 0.5702, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.0671161487698555, + "rewards/margins": 0.4410739541053772, + "rewards/rejected": -0.5081900954246521, + "step": 780 + }, + { + "epoch": 0.82, + "learning_rate": 4.0451588212782237e-07, + "logits/chosen": -2.4297478199005127, + "logits/rejected": -2.3958935737609863, + "logps/chosen": -255.67984008789062, + "logps/rejected": -227.6651153564453, + "loss": 0.5417, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.0655713826417923, + "rewards/margins": 0.5016980171203613, + "rewards/rejected": -0.5672693252563477, + "step": 790 + }, + { + "epoch": 0.83, + "learning_rate": 4.0260237275162643e-07, + "logits/chosen": -2.368698835372925, + "logits/rejected": -2.366753578186035, + "logps/chosen": -232.51876831054688, + "logps/rejected": -227.71176147460938, + "loss": 0.574, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.06221083551645279, + "rewards/margins": 0.45073550939559937, + "rewards/rejected": -0.5129462480545044, + "step": 800 + }, + { + "epoch": 0.84, + "learning_rate": 4.006888633754305e-07, + "logits/chosen": -2.523768186569214, + "logits/rejected": -2.472125291824341, + "logps/chosen": -277.5516662597656, + "logps/rejected": -229.77294921875, + "loss": 0.5448, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.022703617811203003, + "rewards/margins": 0.5766944289207458, + "rewards/rejected": -0.5993980169296265, + "step": 810 + }, + { + "epoch": 0.85, + "learning_rate": 3.9877535399923456e-07, + "logits/chosen": -2.407522201538086, + "logits/rejected": -2.370136260986328, + "logps/chosen": -267.1480712890625, + "logps/rejected": -224.8208770751953, + "loss": 0.5611, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.05944644287228584, + "rewards/margins": 0.5258339643478394, + "rewards/rejected": -0.5852803587913513, + "step": 820 + }, + { + "epoch": 0.86, + "learning_rate": 3.968618446230386e-07, + "logits/chosen": -2.4421579837799072, + "logits/rejected": -2.4140655994415283, + "logps/chosen": -319.13446044921875, + "logps/rejected": -247.4228973388672, + "loss": 0.5424, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0017402932280674577, + "rewards/margins": 0.6056521534919739, + "rewards/rejected": -0.607392430305481, + "step": 830 + }, + { + "epoch": 0.87, + "learning_rate": 3.949483352468427e-07, + "logits/chosen": -2.4426145553588867, + "logits/rejected": -2.408177614212036, + "logps/chosen": -275.4983215332031, + "logps/rejected": -240.1235809326172, + "loss": 0.6003, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.188354954123497, + "rewards/margins": 0.3069398105144501, + "rewards/rejected": -0.49529480934143066, + "step": 840 + }, + { + "epoch": 0.88, + "learning_rate": 3.9303482587064674e-07, + "logits/chosen": -2.4034695625305176, + "logits/rejected": -2.420605421066284, + "logps/chosen": -282.95208740234375, + "logps/rejected": -238.9861297607422, + "loss": 0.5644, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.07752462476491928, + "rewards/margins": 0.49765148758888245, + "rewards/rejected": -0.5751761198043823, + "step": 850 + }, + { + "epoch": 0.89, + "learning_rate": 3.911213164944508e-07, + "logits/chosen": -2.448660373687744, + "logits/rejected": -2.3693861961364746, + "logps/chosen": -291.59942626953125, + "logps/rejected": -245.4176483154297, + "loss": 0.564, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.07620221376419067, + "rewards/margins": 0.5494655966758728, + "rewards/rejected": -0.6256678700447083, + "step": 860 + }, + { + "epoch": 0.9, + "learning_rate": 3.8920780711825487e-07, + "logits/chosen": -2.332820177078247, + "logits/rejected": -2.3668315410614014, + "logps/chosen": -270.84954833984375, + "logps/rejected": -226.4775390625, + "loss": 0.5639, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.06978223472833633, + "rewards/margins": 0.528697669506073, + "rewards/rejected": -0.5984798669815063, + "step": 870 + }, + { + "epoch": 0.91, + "learning_rate": 3.8729429774205893e-07, + "logits/chosen": -2.414726495742798, + "logits/rejected": -2.3955094814300537, + "logps/chosen": -265.5794982910156, + "logps/rejected": -221.8883056640625, + "loss": 0.5595, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.09429865330457687, + "rewards/margins": 0.48285895586013794, + "rewards/rejected": -0.5771576166152954, + "step": 880 + }, + { + "epoch": 0.92, + "learning_rate": 3.8538078836586294e-07, + "logits/chosen": -2.4266517162323, + "logits/rejected": -2.3529062271118164, + "logps/chosen": -255.7076873779297, + "logps/rejected": -249.5386199951172, + "loss": 0.5809, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1304847002029419, + "rewards/margins": 0.40244507789611816, + "rewards/rejected": -0.5329297780990601, + "step": 890 + }, + { + "epoch": 0.93, + "learning_rate": 3.83467278989667e-07, + "logits/chosen": -2.407705545425415, + "logits/rejected": -2.356353282928467, + "logps/chosen": -281.96405029296875, + "logps/rejected": -222.4244384765625, + "loss": 0.5463, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": 0.0021881351713091135, + "rewards/margins": 0.5727441310882568, + "rewards/rejected": -0.5705560445785522, + "step": 900 + }, + { + "epoch": 0.94, + "learning_rate": 3.8155376961347106e-07, + "logits/chosen": -2.420719861984253, + "logits/rejected": -2.4088523387908936, + "logps/chosen": -250.53616333007812, + "logps/rejected": -203.44956970214844, + "loss": 0.5655, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.07977491617202759, + "rewards/margins": 0.5309610366821289, + "rewards/rejected": -0.6107359528541565, + "step": 910 + }, + { + "epoch": 0.95, + "learning_rate": 3.796402602372751e-07, + "logits/chosen": -2.426771640777588, + "logits/rejected": -2.343543291091919, + "logps/chosen": -272.2907409667969, + "logps/rejected": -252.50698852539062, + "loss": 0.5557, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.0921485498547554, + "rewards/margins": 0.42352181673049927, + "rewards/rejected": -0.5156703591346741, + "step": 920 + }, + { + "epoch": 0.96, + "learning_rate": 3.777267508610792e-07, + "logits/chosen": -2.493851661682129, + "logits/rejected": -2.429084539413452, + "logps/chosen": -270.8280944824219, + "logps/rejected": -260.43084716796875, + "loss": 0.5676, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.10896792262792587, + "rewards/margins": 0.5754088163375854, + "rewards/rejected": -0.6843767166137695, + "step": 930 + }, + { + "epoch": 0.97, + "learning_rate": 3.7581324148488325e-07, + "logits/chosen": -2.4216136932373047, + "logits/rejected": -2.37446665763855, + "logps/chosen": -299.2573547363281, + "logps/rejected": -224.32192993164062, + "loss": 0.5703, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.09940418601036072, + "rewards/margins": 0.46459144353866577, + "rewards/rejected": -0.5639955401420593, + "step": 940 + }, + { + "epoch": 0.98, + "learning_rate": 3.738997321086873e-07, + "logits/chosen": -2.4052574634552, + "logits/rejected": -2.359984874725342, + "logps/chosen": -257.97174072265625, + "logps/rejected": -224.43533325195312, + "loss": 0.569, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.055308748036623, + "rewards/margins": 0.41939839720726013, + "rewards/rejected": -0.47470712661743164, + "step": 950 + }, + { + "epoch": 0.99, + "learning_rate": 3.7198622273249137e-07, + "logits/chosen": -2.4320504665374756, + "logits/rejected": -2.4140851497650146, + "logps/chosen": -262.2895202636719, + "logps/rejected": -195.298583984375, + "loss": 0.5516, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.0959898829460144, + "rewards/margins": 0.4007217288017273, + "rewards/rejected": -0.4967115819454193, + "step": 960 + }, + { + "epoch": 1.0, + "eval_logits/chosen": -2.11159610748291, + "eval_logits/rejected": -1.9903388023376465, + "eval_logps/chosen": -265.77178955078125, + "eval_logps/rejected": -225.71365356445312, + "eval_loss": 0.5546568632125854, + "eval_rewards/accuracies": 0.7160000205039978, + "eval_rewards/chosen": -0.11396687477827072, + "eval_rewards/margins": 0.5291071534156799, + "eval_rewards/rejected": -0.6430740356445312, + "eval_runtime": 602.672, + "eval_samples_per_second": 3.319, + "eval_steps_per_second": 0.207, + "step": 968 + }, + { + "epoch": 1.0, + "learning_rate": 3.7007271335629544e-07, + "logits/chosen": -2.3931021690368652, + "logits/rejected": -2.2940726280212402, + "logps/chosen": -265.32965087890625, + "logps/rejected": -216.5413055419922, + "loss": 0.5557, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06538908183574677, + "rewards/margins": 0.636194109916687, + "rewards/rejected": -0.7015832662582397, + "step": 970 + }, + { + "epoch": 1.01, + "learning_rate": 3.681592039800995e-07, + "logits/chosen": -2.3853671550750732, + "logits/rejected": -2.3550171852111816, + "logps/chosen": -242.0519561767578, + "logps/rejected": -217.999755859375, + "loss": 0.5501, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.10068665444850922, + "rewards/margins": 0.456498384475708, + "rewards/rejected": -0.5571850538253784, + "step": 980 + }, + { + "epoch": 1.02, + "learning_rate": 3.662456946039035e-07, + "logits/chosen": -2.339399814605713, + "logits/rejected": -2.3183839321136475, + "logps/chosen": -231.6448974609375, + "logps/rejected": -201.71688842773438, + "loss": 0.5999, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.19742384552955627, + "rewards/margins": 0.38226670026779175, + "rewards/rejected": -0.5796905159950256, + "step": 990 + }, + { + "epoch": 1.03, + "learning_rate": 3.6433218522770757e-07, + "logits/chosen": -2.3831546306610107, + "logits/rejected": -2.319021701812744, + "logps/chosen": -295.373291015625, + "logps/rejected": -220.7757110595703, + "loss": 0.5809, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.07577923685312271, + "rewards/margins": 0.4884832501411438, + "rewards/rejected": -0.5642624497413635, + "step": 1000 + }, + { + "epoch": 1.04, + "learning_rate": 3.6241867585151163e-07, + "logits/chosen": -2.4494717121124268, + "logits/rejected": -2.387357711791992, + "logps/chosen": -248.5869140625, + "logps/rejected": -225.55770874023438, + "loss": 0.5584, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13781091570854187, + "rewards/margins": 0.47680073976516724, + "rewards/rejected": -0.6146116256713867, + "step": 1010 + }, + { + "epoch": 1.05, + "learning_rate": 3.605051664753157e-07, + "logits/chosen": -2.4245686531066895, + "logits/rejected": -2.430293560028076, + "logps/chosen": -251.4219207763672, + "logps/rejected": -202.93777465820312, + "loss": 0.5474, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.09323601424694061, + "rewards/margins": 0.5475600957870483, + "rewards/rejected": -0.6407961249351501, + "step": 1020 + }, + { + "epoch": 1.06, + "learning_rate": 3.5859165709911975e-07, + "logits/chosen": -2.3130688667297363, + "logits/rejected": -2.319304943084717, + "logps/chosen": -292.4891052246094, + "logps/rejected": -256.63336181640625, + "loss": 0.5382, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.045892782509326935, + "rewards/margins": 0.6422568559646606, + "rewards/rejected": -0.688149631023407, + "step": 1030 + }, + { + "epoch": 1.07, + "learning_rate": 3.566781477229238e-07, + "logits/chosen": -2.3673741817474365, + "logits/rejected": -2.2591471672058105, + "logps/chosen": -259.9045104980469, + "logps/rejected": -235.18899536132812, + "loss": 0.5544, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.09036435186862946, + "rewards/margins": 0.5122971534729004, + "rewards/rejected": -0.6026615500450134, + "step": 1040 + }, + { + "epoch": 1.08, + "learning_rate": 3.547646383467279e-07, + "logits/chosen": -2.3258612155914307, + "logits/rejected": -2.3310484886169434, + "logps/chosen": -256.3821105957031, + "logps/rejected": -235.92697143554688, + "loss": 0.5302, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.04464683681726456, + "rewards/margins": 0.6154407858848572, + "rewards/rejected": -0.6600876450538635, + "step": 1050 + }, + { + "epoch": 1.09, + "learning_rate": 3.5285112897053194e-07, + "logits/chosen": -2.401237726211548, + "logits/rejected": -2.2862296104431152, + "logps/chosen": -275.3665466308594, + "logps/rejected": -235.6302032470703, + "loss": 0.5405, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.1668693572282791, + "rewards/margins": 0.7094846367835999, + "rewards/rejected": -0.8763540387153625, + "step": 1060 + }, + { + "epoch": 1.11, + "learning_rate": 3.50937619594336e-07, + "logits/chosen": -2.427232265472412, + "logits/rejected": -2.3744897842407227, + "logps/chosen": -291.2286682128906, + "logps/rejected": -261.8435363769531, + "loss": 0.5492, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0822446197271347, + "rewards/margins": 0.643202543258667, + "rewards/rejected": -0.7254471778869629, + "step": 1070 + }, + { + "epoch": 1.12, + "learning_rate": 3.4902411021814007e-07, + "logits/chosen": -2.4489409923553467, + "logits/rejected": -2.3586411476135254, + "logps/chosen": -294.7502746582031, + "logps/rejected": -229.1472625732422, + "loss": 0.5273, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.000645542168058455, + "rewards/margins": 0.7959606051445007, + "rewards/rejected": -0.796606183052063, + "step": 1080 + }, + { + "epoch": 1.13, + "learning_rate": 3.4711060084194413e-07, + "logits/chosen": -2.3935751914978027, + "logits/rejected": -2.4032771587371826, + "logps/chosen": -272.8512878417969, + "logps/rejected": -228.17575073242188, + "loss": 0.5702, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.05014977604150772, + "rewards/margins": 0.5901867747306824, + "rewards/rejected": -0.6403365135192871, + "step": 1090 + }, + { + "epoch": 1.14, + "learning_rate": 3.4519709146574814e-07, + "logits/chosen": -2.474759578704834, + "logits/rejected": -2.3402395248413086, + "logps/chosen": -267.2554931640625, + "logps/rejected": -247.7689666748047, + "loss": 0.5598, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.08641926199197769, + "rewards/margins": 0.5510476231575012, + "rewards/rejected": -0.6374668478965759, + "step": 1100 + }, + { + "epoch": 1.15, + "learning_rate": 3.432835820895522e-07, + "logits/chosen": -2.408102512359619, + "logits/rejected": -2.3299994468688965, + "logps/chosen": -249.98458862304688, + "logps/rejected": -230.2236328125, + "loss": 0.5427, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.10258053243160248, + "rewards/margins": 0.5458223223686218, + "rewards/rejected": -0.6484029293060303, + "step": 1110 + }, + { + "epoch": 1.16, + "learning_rate": 3.4137007271335626e-07, + "logits/chosen": -2.349371910095215, + "logits/rejected": -2.3972179889678955, + "logps/chosen": -218.875, + "logps/rejected": -212.8376922607422, + "loss": 0.5422, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.180108904838562, + "rewards/margins": 0.4184727072715759, + "rewards/rejected": -0.5985815525054932, + "step": 1120 + }, + { + "epoch": 1.17, + "learning_rate": 3.394565633371603e-07, + "logits/chosen": -2.358839511871338, + "logits/rejected": -2.299063205718994, + "logps/chosen": -293.2892150878906, + "logps/rejected": -245.3092498779297, + "loss": 0.5364, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.14460349082946777, + "rewards/margins": 0.6029571294784546, + "rewards/rejected": -0.7475606203079224, + "step": 1130 + }, + { + "epoch": 1.18, + "learning_rate": 3.375430539609644e-07, + "logits/chosen": -2.2807836532592773, + "logits/rejected": -2.345672130584717, + "logps/chosen": -239.72476196289062, + "logps/rejected": -211.0787353515625, + "loss": 0.5132, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.13182897865772247, + "rewards/margins": 0.6086454391479492, + "rewards/rejected": -0.7404743432998657, + "step": 1140 + }, + { + "epoch": 1.19, + "learning_rate": 3.3562954458476845e-07, + "logits/chosen": -2.441943407058716, + "logits/rejected": -2.4284987449645996, + "logps/chosen": -306.66900634765625, + "logps/rejected": -233.69290161132812, + "loss": 0.5589, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.15195252001285553, + "rewards/margins": 0.5843501687049866, + "rewards/rejected": -0.7363026738166809, + "step": 1150 + }, + { + "epoch": 1.2, + "learning_rate": 3.337160352085725e-07, + "logits/chosen": -2.4269309043884277, + "logits/rejected": -2.3844103813171387, + "logps/chosen": -298.20721435546875, + "logps/rejected": -241.6977996826172, + "loss": 0.5365, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.07097179442644119, + "rewards/margins": 0.7560560703277588, + "rewards/rejected": -0.8270279169082642, + "step": 1160 + }, + { + "epoch": 1.21, + "learning_rate": 3.3180252583237657e-07, + "logits/chosen": -2.336698532104492, + "logits/rejected": -2.2902188301086426, + "logps/chosen": -278.00860595703125, + "logps/rejected": -231.4420928955078, + "loss": 0.5528, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06676442921161652, + "rewards/margins": 0.5907629132270813, + "rewards/rejected": -0.6575273275375366, + "step": 1170 + }, + { + "epoch": 1.22, + "learning_rate": 3.2988901645618063e-07, + "logits/chosen": -2.368316650390625, + "logits/rejected": -2.2780606746673584, + "logps/chosen": -261.25482177734375, + "logps/rejected": -225.92269897460938, + "loss": 0.5388, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.1373300552368164, + "rewards/margins": 0.5785337686538696, + "rewards/rejected": -0.715863823890686, + "step": 1180 + }, + { + "epoch": 1.23, + "learning_rate": 3.279755070799847e-07, + "logits/chosen": -2.402346134185791, + "logits/rejected": -2.3095052242279053, + "logps/chosen": -259.71075439453125, + "logps/rejected": -257.037353515625, + "loss": 0.5278, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.15283453464508057, + "rewards/margins": 0.526613175868988, + "rewards/rejected": -0.6794477105140686, + "step": 1190 + }, + { + "epoch": 1.24, + "learning_rate": 3.260619977037887e-07, + "logits/chosen": -2.3968770503997803, + "logits/rejected": -2.340967893600464, + "logps/chosen": -251.2769775390625, + "logps/rejected": -214.03146362304688, + "loss": 0.5209, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.07637427002191544, + "rewards/margins": 0.7686988711357117, + "rewards/rejected": -0.8450730443000793, + "step": 1200 + }, + { + "epoch": 1.25, + "learning_rate": 3.2414848832759277e-07, + "logits/chosen": -2.4229166507720947, + "logits/rejected": -2.2702252864837646, + "logps/chosen": -261.66217041015625, + "logps/rejected": -227.0311279296875, + "loss": 0.534, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.14070875942707062, + "rewards/margins": 0.5856814384460449, + "rewards/rejected": -0.7263902425765991, + "step": 1210 + }, + { + "epoch": 1.26, + "learning_rate": 3.2223497895139683e-07, + "logits/chosen": -2.3875441551208496, + "logits/rejected": -2.3234646320343018, + "logps/chosen": -263.7579040527344, + "logps/rejected": -219.34719848632812, + "loss": 0.5537, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11774899810552597, + "rewards/margins": 0.5334910154342651, + "rewards/rejected": -0.6512399911880493, + "step": 1220 + }, + { + "epoch": 1.27, + "learning_rate": 3.203214695752009e-07, + "logits/chosen": -2.4645657539367676, + "logits/rejected": -2.385596990585327, + "logps/chosen": -262.8403015136719, + "logps/rejected": -230.195556640625, + "loss": 0.5689, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.15035703778266907, + "rewards/margins": 0.5768887400627136, + "rewards/rejected": -0.7272458076477051, + "step": 1230 + }, + { + "epoch": 1.28, + "learning_rate": 3.1840796019900495e-07, + "logits/chosen": -2.426673173904419, + "logits/rejected": -2.3877110481262207, + "logps/chosen": -262.39178466796875, + "logps/rejected": -254.9650115966797, + "loss": 0.5457, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.11363549530506134, + "rewards/margins": 0.6810011267662048, + "rewards/rejected": -0.7946366667747498, + "step": 1240 + }, + { + "epoch": 1.29, + "learning_rate": 3.16494450822809e-07, + "logits/chosen": -2.3528525829315186, + "logits/rejected": -2.3542237281799316, + "logps/chosen": -236.56967163085938, + "logps/rejected": -233.4956512451172, + "loss": 0.5372, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.18625274300575256, + "rewards/margins": 0.48403066396713257, + "rewards/rejected": -0.6702834367752075, + "step": 1250 + }, + { + "epoch": 1.3, + "learning_rate": 3.145809414466131e-07, + "logits/chosen": -2.48598313331604, + "logits/rejected": -2.4767704010009766, + "logps/chosen": -272.78662109375, + "logps/rejected": -266.92413330078125, + "loss": 0.5373, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.14394977688789368, + "rewards/margins": 0.49152618646621704, + "rewards/rejected": -0.6354759335517883, + "step": 1260 + }, + { + "epoch": 1.31, + "learning_rate": 3.1266743207041714e-07, + "logits/chosen": -2.4058761596679688, + "logits/rejected": -2.403879165649414, + "logps/chosen": -247.5891571044922, + "logps/rejected": -226.5963592529297, + "loss": 0.5451, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.14257661998271942, + "rewards/margins": 0.6169610023498535, + "rewards/rejected": -0.7595376968383789, + "step": 1270 + }, + { + "epoch": 1.32, + "learning_rate": 3.107539226942212e-07, + "logits/chosen": -2.392540216445923, + "logits/rejected": -2.370917558670044, + "logps/chosen": -259.8185729980469, + "logps/rejected": -240.8582000732422, + "loss": 0.5764, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.14314888417720795, + "rewards/margins": 0.5050605535507202, + "rewards/rejected": -0.6482094526290894, + "step": 1280 + }, + { + "epoch": 1.33, + "learning_rate": 3.0884041331802526e-07, + "logits/chosen": -2.3564376831054688, + "logits/rejected": -2.2619478702545166, + "logps/chosen": -243.14151000976562, + "logps/rejected": -216.67678833007812, + "loss": 0.5529, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.17953407764434814, + "rewards/margins": 0.5211489200592041, + "rewards/rejected": -0.7006829977035522, + "step": 1290 + }, + { + "epoch": 1.34, + "learning_rate": 3.0692690394182927e-07, + "logits/chosen": -2.435981035232544, + "logits/rejected": -2.3964314460754395, + "logps/chosen": -277.2529296875, + "logps/rejected": -233.1074676513672, + "loss": 0.5528, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.05609757825732231, + "rewards/margins": 0.6833099722862244, + "rewards/rejected": -0.7394075393676758, + "step": 1300 + }, + { + "epoch": 1.35, + "learning_rate": 3.0501339456563334e-07, + "logits/chosen": -2.414605140686035, + "logits/rejected": -2.270141363143921, + "logps/chosen": -269.33770751953125, + "logps/rejected": -243.461669921875, + "loss": 0.5338, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.1670106053352356, + "rewards/margins": 0.5737408995628357, + "rewards/rejected": -0.7407516241073608, + "step": 1310 + }, + { + "epoch": 1.36, + "learning_rate": 3.030998851894374e-07, + "logits/chosen": -2.3524057865142822, + "logits/rejected": -2.3626418113708496, + "logps/chosen": -229.76596069335938, + "logps/rejected": -221.08889770507812, + "loss": 0.5137, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.07673561573028564, + "rewards/margins": 0.6764390468597412, + "rewards/rejected": -0.7531746029853821, + "step": 1320 + }, + { + "epoch": 1.37, + "learning_rate": 3.0118637581324146e-07, + "logits/chosen": -2.422010898590088, + "logits/rejected": -2.3673031330108643, + "logps/chosen": -257.2743835449219, + "logps/rejected": -223.86349487304688, + "loss": 0.5407, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.09269039332866669, + "rewards/margins": 0.7229348421096802, + "rewards/rejected": -0.8156253099441528, + "step": 1330 + }, + { + "epoch": 1.38, + "learning_rate": 2.992728664370455e-07, + "logits/chosen": -2.4492976665496826, + "logits/rejected": -2.3872756958007812, + "logps/chosen": -239.734375, + "logps/rejected": -225.47982788085938, + "loss": 0.5593, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.21187777817249298, + "rewards/margins": 0.5356577634811401, + "rewards/rejected": -0.7475355267524719, + "step": 1340 + }, + { + "epoch": 1.39, + "learning_rate": 2.973593570608496e-07, + "logits/chosen": -2.3429622650146484, + "logits/rejected": -2.2622060775756836, + "logps/chosen": -261.2279052734375, + "logps/rejected": -220.1876983642578, + "loss": 0.552, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.11654462665319443, + "rewards/margins": 0.6566459536552429, + "rewards/rejected": -0.7731907367706299, + "step": 1350 + }, + { + "epoch": 1.4, + "learning_rate": 2.9544584768465365e-07, + "logits/chosen": -2.415435314178467, + "logits/rejected": -2.3692467212677, + "logps/chosen": -310.05035400390625, + "logps/rejected": -254.33297729492188, + "loss": 0.553, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.1930284947156906, + "rewards/margins": 0.6540157198905945, + "rewards/rejected": -0.8470442891120911, + "step": 1360 + }, + { + "epoch": 1.41, + "learning_rate": 2.935323383084577e-07, + "logits/chosen": -2.382997989654541, + "logits/rejected": -2.331480026245117, + "logps/chosen": -290.0738830566406, + "logps/rejected": -242.3267822265625, + "loss": 0.5599, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09350194036960602, + "rewards/margins": 0.5290672183036804, + "rewards/rejected": -0.6225691437721252, + "step": 1370 + }, + { + "epoch": 1.43, + "learning_rate": 2.9161882893226177e-07, + "logits/chosen": -2.358276128768921, + "logits/rejected": -2.2521064281463623, + "logps/chosen": -240.8756561279297, + "logps/rejected": -204.3744659423828, + "loss": 0.5443, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.21055755019187927, + "rewards/margins": 0.43394798040390015, + "rewards/rejected": -0.6445055603981018, + "step": 1380 + }, + { + "epoch": 1.44, + "learning_rate": 2.8970531955606583e-07, + "logits/chosen": -2.453087568283081, + "logits/rejected": -2.353877305984497, + "logps/chosen": -270.0442810058594, + "logps/rejected": -231.3137664794922, + "loss": 0.5677, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.15533334016799927, + "rewards/margins": 0.6584349870681763, + "rewards/rejected": -0.8137682676315308, + "step": 1390 + }, + { + "epoch": 1.45, + "learning_rate": 2.8779181017986984e-07, + "logits/chosen": -2.4245669841766357, + "logits/rejected": -2.402334213256836, + "logps/chosen": -269.0023498535156, + "logps/rejected": -239.24465942382812, + "loss": 0.5534, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.22131021320819855, + "rewards/margins": 0.5528482794761658, + "rewards/rejected": -0.7741583585739136, + "step": 1400 + }, + { + "epoch": 1.46, + "learning_rate": 2.858783008036739e-07, + "logits/chosen": -2.314622402191162, + "logits/rejected": -2.2626185417175293, + "logps/chosen": -278.54620361328125, + "logps/rejected": -220.9359893798828, + "loss": 0.5297, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1343500018119812, + "rewards/margins": 0.6538098454475403, + "rewards/rejected": -0.7881597280502319, + "step": 1410 + }, + { + "epoch": 1.47, + "learning_rate": 2.8396479142747797e-07, + "logits/chosen": -2.3986763954162598, + "logits/rejected": -2.3692593574523926, + "logps/chosen": -230.23330688476562, + "logps/rejected": -201.45968627929688, + "loss": 0.5397, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.16032734513282776, + "rewards/margins": 0.5956254005432129, + "rewards/rejected": -0.7559527158737183, + "step": 1420 + }, + { + "epoch": 1.48, + "learning_rate": 2.8205128205128203e-07, + "logits/chosen": -2.3531031608581543, + "logits/rejected": -2.3053078651428223, + "logps/chosen": -260.5692443847656, + "logps/rejected": -229.7131805419922, + "loss": 0.5422, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.27216142416000366, + "rewards/margins": 0.5033277869224548, + "rewards/rejected": -0.7754892110824585, + "step": 1430 + }, + { + "epoch": 1.49, + "learning_rate": 2.801377726750861e-07, + "logits/chosen": -2.386915683746338, + "logits/rejected": -2.315340757369995, + "logps/chosen": -256.1626281738281, + "logps/rejected": -217.6416473388672, + "loss": 0.5438, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.18697381019592285, + "rewards/margins": 0.49360641837120056, + "rewards/rejected": -0.6805802583694458, + "step": 1440 + }, + { + "epoch": 1.5, + "learning_rate": 2.7822426329889015e-07, + "logits/chosen": -2.428358554840088, + "logits/rejected": -2.3230361938476562, + "logps/chosen": -294.2705993652344, + "logps/rejected": -235.54171752929688, + "loss": 0.5623, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.146778404712677, + "rewards/margins": 0.6038089394569397, + "rewards/rejected": -0.7505873441696167, + "step": 1450 + }, + { + "epoch": 1.51, + "learning_rate": 2.763107539226942e-07, + "logits/chosen": -2.3813319206237793, + "logits/rejected": -2.3310248851776123, + "logps/chosen": -280.82110595703125, + "logps/rejected": -242.4944305419922, + "loss": 0.5582, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.12899862229824066, + "rewards/margins": 0.6421502828598022, + "rewards/rejected": -0.7711488604545593, + "step": 1460 + }, + { + "epoch": 1.52, + "learning_rate": 2.743972445464983e-07, + "logits/chosen": -2.4226574897766113, + "logits/rejected": -2.411165475845337, + "logps/chosen": -254.8177947998047, + "logps/rejected": -232.544921875, + "loss": 0.5599, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1896516978740692, + "rewards/margins": 0.4945623278617859, + "rewards/rejected": -0.6842139959335327, + "step": 1470 + }, + { + "epoch": 1.53, + "learning_rate": 2.7248373517030234e-07, + "logits/chosen": -2.475400924682617, + "logits/rejected": -2.419384479522705, + "logps/chosen": -274.40484619140625, + "logps/rejected": -237.3146209716797, + "loss": 0.5407, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.16451963782310486, + "rewards/margins": 0.6172757744789124, + "rewards/rejected": -0.7817953824996948, + "step": 1480 + }, + { + "epoch": 1.54, + "learning_rate": 2.705702257941064e-07, + "logits/chosen": -2.433335304260254, + "logits/rejected": -2.435105562210083, + "logps/chosen": -311.65618896484375, + "logps/rejected": -246.72998046875, + "loss": 0.5362, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.16845114529132843, + "rewards/margins": 0.6784511804580688, + "rewards/rejected": -0.8469023704528809, + "step": 1490 + }, + { + "epoch": 1.55, + "learning_rate": 2.686567164179104e-07, + "logits/chosen": -2.3490521907806396, + "logits/rejected": -2.275282144546509, + "logps/chosen": -259.00787353515625, + "logps/rejected": -219.07669067382812, + "loss": 0.5453, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.16528849303722382, + "rewards/margins": 0.5990539193153381, + "rewards/rejected": -0.7643424272537231, + "step": 1500 + }, + { + "epoch": 1.56, + "learning_rate": 2.6674320704171447e-07, + "logits/chosen": -2.4171411991119385, + "logits/rejected": -2.3591837882995605, + "logps/chosen": -259.59521484375, + "logps/rejected": -206.38058471679688, + "loss": 0.5221, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.10826051235198975, + "rewards/margins": 0.7560392022132874, + "rewards/rejected": -0.8642997741699219, + "step": 1510 + }, + { + "epoch": 1.57, + "learning_rate": 2.6482969766551853e-07, + "logits/chosen": -2.4498002529144287, + "logits/rejected": -2.447680950164795, + "logps/chosen": -274.2942810058594, + "logps/rejected": -237.7410430908203, + "loss": 0.5422, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.0813172236084938, + "rewards/margins": 0.7396507859230042, + "rewards/rejected": -0.8209678530693054, + "step": 1520 + }, + { + "epoch": 1.58, + "learning_rate": 2.629161882893226e-07, + "logits/chosen": -2.446469783782959, + "logits/rejected": -2.3727550506591797, + "logps/chosen": -280.4594421386719, + "logps/rejected": -214.17269897460938, + "loss": 0.5184, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.1613025665283203, + "rewards/margins": 0.7477348446846008, + "rewards/rejected": -0.9090374708175659, + "step": 1530 + }, + { + "epoch": 1.59, + "learning_rate": 2.6100267891312666e-07, + "logits/chosen": -2.3352808952331543, + "logits/rejected": -2.3035221099853516, + "logps/chosen": -269.15008544921875, + "logps/rejected": -208.6403350830078, + "loss": 0.5564, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.1250370442867279, + "rewards/margins": 0.6833819150924683, + "rewards/rejected": -0.8084190487861633, + "step": 1540 + }, + { + "epoch": 1.6, + "learning_rate": 2.590891695369307e-07, + "logits/chosen": -2.4506659507751465, + "logits/rejected": -2.362159490585327, + "logps/chosen": -256.0645751953125, + "logps/rejected": -210.37814331054688, + "loss": 0.5268, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.16175204515457153, + "rewards/margins": 0.5650383830070496, + "rewards/rejected": -0.7267904281616211, + "step": 1550 + }, + { + "epoch": 1.61, + "learning_rate": 2.571756601607348e-07, + "logits/chosen": -2.418126106262207, + "logits/rejected": -2.3879923820495605, + "logps/chosen": -294.83013916015625, + "logps/rejected": -217.4688720703125, + "loss": 0.5443, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.19011719524860382, + "rewards/margins": 0.65594881772995, + "rewards/rejected": -0.8460659980773926, + "step": 1560 + }, + { + "epoch": 1.62, + "learning_rate": 2.5526215078453884e-07, + "logits/chosen": -2.4299476146698, + "logits/rejected": -2.3945670127868652, + "logps/chosen": -301.6930847167969, + "logps/rejected": -237.73892211914062, + "loss": 0.5348, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.1024274080991745, + "rewards/margins": 0.7538636326789856, + "rewards/rejected": -0.8562909960746765, + "step": 1570 + }, + { + "epoch": 1.63, + "learning_rate": 2.533486414083429e-07, + "logits/chosen": -2.4256393909454346, + "logits/rejected": -2.3724522590637207, + "logps/chosen": -306.9393615722656, + "logps/rejected": -249.2686004638672, + "loss": 0.516, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.08279596269130707, + "rewards/margins": 0.7269363403320312, + "rewards/rejected": -0.8097323179244995, + "step": 1580 + }, + { + "epoch": 1.64, + "learning_rate": 2.5143513203214697e-07, + "logits/chosen": -2.3666234016418457, + "logits/rejected": -2.3259263038635254, + "logps/chosen": -275.7200012207031, + "logps/rejected": -225.26425170898438, + "loss": 0.5617, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.20898135006427765, + "rewards/margins": 0.559633195400238, + "rewards/rejected": -0.7686145305633545, + "step": 1590 + }, + { + "epoch": 1.65, + "learning_rate": 2.49521622655951e-07, + "logits/chosen": -2.4929592609405518, + "logits/rejected": -2.355666160583496, + "logps/chosen": -293.1372375488281, + "logps/rejected": -262.8376159667969, + "loss": 0.5547, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.04876649007201195, + "rewards/margins": 0.7978218197822571, + "rewards/rejected": -0.8465882539749146, + "step": 1600 + }, + { + "epoch": 1.66, + "learning_rate": 2.4760811327975504e-07, + "logits/chosen": -2.4678738117218018, + "logits/rejected": -2.3763632774353027, + "logps/chosen": -273.61614990234375, + "logps/rejected": -253.130126953125, + "loss": 0.5343, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1364731788635254, + "rewards/margins": 0.5863515734672546, + "rewards/rejected": -0.72282475233078, + "step": 1610 + }, + { + "epoch": 1.67, + "learning_rate": 2.456946039035591e-07, + "logits/chosen": -2.4671170711517334, + "logits/rejected": -2.4366583824157715, + "logps/chosen": -290.646484375, + "logps/rejected": -257.22418212890625, + "loss": 0.5646, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.10060103982686996, + "rewards/margins": 0.7575327754020691, + "rewards/rejected": -0.8581337928771973, + "step": 1620 + }, + { + "epoch": 1.68, + "learning_rate": 2.4378109452736316e-07, + "logits/chosen": -2.3189241886138916, + "logits/rejected": -2.3253917694091797, + "logps/chosen": -272.53802490234375, + "logps/rejected": -198.298095703125, + "loss": 0.5301, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.10420193523168564, + "rewards/margins": 0.7064443826675415, + "rewards/rejected": -0.8106463551521301, + "step": 1630 + }, + { + "epoch": 1.69, + "learning_rate": 2.418675851511672e-07, + "logits/chosen": -2.3077940940856934, + "logits/rejected": -2.3118600845336914, + "logps/chosen": -281.74365234375, + "logps/rejected": -221.730712890625, + "loss": 0.5622, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.18291965126991272, + "rewards/margins": 0.653192937374115, + "rewards/rejected": -0.8361126184463501, + "step": 1640 + }, + { + "epoch": 1.7, + "learning_rate": 2.399540757749713e-07, + "logits/chosen": -2.373295783996582, + "logits/rejected": -2.2997496128082275, + "logps/chosen": -250.97915649414062, + "logps/rejected": -216.4395751953125, + "loss": 0.5263, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.07915716618299484, + "rewards/margins": 0.9101131558418274, + "rewards/rejected": -0.9892703294754028, + "step": 1650 + }, + { + "epoch": 1.71, + "learning_rate": 2.3804056639877535e-07, + "logits/chosen": -2.3339505195617676, + "logits/rejected": -2.2506117820739746, + "logps/chosen": -269.00439453125, + "logps/rejected": -230.38687133789062, + "loss": 0.5566, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.12828503549098969, + "rewards/margins": 0.7284771203994751, + "rewards/rejected": -0.8567621111869812, + "step": 1660 + }, + { + "epoch": 1.72, + "learning_rate": 2.361270570225794e-07, + "logits/chosen": -2.3134007453918457, + "logits/rejected": -2.2540464401245117, + "logps/chosen": -263.0733337402344, + "logps/rejected": -223.71591186523438, + "loss": 0.5048, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.10775689035654068, + "rewards/margins": 0.7968653440475464, + "rewards/rejected": -0.9046221971511841, + "step": 1670 + }, + { + "epoch": 1.74, + "learning_rate": 2.3421354764638345e-07, + "logits/chosen": -2.438647747039795, + "logits/rejected": -2.3648548126220703, + "logps/chosen": -296.474365234375, + "logps/rejected": -244.5324249267578, + "loss": 0.5559, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.16449935734272003, + "rewards/margins": 0.6564738750457764, + "rewards/rejected": -0.8209732174873352, + "step": 1680 + }, + { + "epoch": 1.75, + "learning_rate": 2.323000382701875e-07, + "logits/chosen": -2.3885536193847656, + "logits/rejected": -2.274456739425659, + "logps/chosen": -289.57586669921875, + "logps/rejected": -226.74734497070312, + "loss": 0.5188, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.11683692783117294, + "rewards/margins": 0.7003597021102905, + "rewards/rejected": -0.8171966671943665, + "step": 1690 + }, + { + "epoch": 1.76, + "learning_rate": 2.3038652889399157e-07, + "logits/chosen": -2.3566040992736816, + "logits/rejected": -2.3693103790283203, + "logps/chosen": -259.44281005859375, + "logps/rejected": -235.6457061767578, + "loss": 0.5218, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.07110615074634552, + "rewards/margins": 0.7475603818893433, + "rewards/rejected": -0.8186665773391724, + "step": 1700 + }, + { + "epoch": 1.77, + "learning_rate": 2.2847301951779563e-07, + "logits/chosen": -2.3193366527557373, + "logits/rejected": -2.2707600593566895, + "logps/chosen": -230.35177612304688, + "logps/rejected": -218.6141357421875, + "loss": 0.5379, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.2341269999742508, + "rewards/margins": 0.5890001654624939, + "rewards/rejected": -0.8231271505355835, + "step": 1710 + }, + { + "epoch": 1.78, + "learning_rate": 2.265595101415997e-07, + "logits/chosen": -2.303278923034668, + "logits/rejected": -2.260132312774658, + "logps/chosen": -263.609130859375, + "logps/rejected": -207.05221557617188, + "loss": 0.5417, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.16092923283576965, + "rewards/margins": 0.8012853860855103, + "rewards/rejected": -0.9622145891189575, + "step": 1720 + }, + { + "epoch": 1.79, + "learning_rate": 2.2464600076540373e-07, + "logits/chosen": -2.3414905071258545, + "logits/rejected": -2.3637521266937256, + "logps/chosen": -272.04595947265625, + "logps/rejected": -233.1014862060547, + "loss": 0.5402, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.15146999061107635, + "rewards/margins": 0.6853641271591187, + "rewards/rejected": -0.8368341326713562, + "step": 1730 + }, + { + "epoch": 1.8, + "learning_rate": 2.227324913892078e-07, + "logits/chosen": -2.45133113861084, + "logits/rejected": -2.3326609134674072, + "logps/chosen": -262.7754821777344, + "logps/rejected": -233.98037719726562, + "loss": 0.5238, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.09686625003814697, + "rewards/margins": 0.6828486919403076, + "rewards/rejected": -0.7797149419784546, + "step": 1740 + }, + { + "epoch": 1.81, + "learning_rate": 2.2081898201301186e-07, + "logits/chosen": -2.4001007080078125, + "logits/rejected": -2.3274292945861816, + "logps/chosen": -251.53952026367188, + "logps/rejected": -219.2271270751953, + "loss": 0.535, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.14206108450889587, + "rewards/margins": 0.6470782160758972, + "rewards/rejected": -0.7891392707824707, + "step": 1750 + }, + { + "epoch": 1.82, + "learning_rate": 2.1890547263681592e-07, + "logits/chosen": -2.4295260906219482, + "logits/rejected": -2.36590313911438, + "logps/chosen": -271.3697814941406, + "logps/rejected": -215.4159698486328, + "loss": 0.54, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.07891818135976791, + "rewards/margins": 0.7567145824432373, + "rewards/rejected": -0.8356328010559082, + "step": 1760 + }, + { + "epoch": 1.83, + "learning_rate": 2.1699196326061998e-07, + "logits/chosen": -2.371452569961548, + "logits/rejected": -2.3672077655792236, + "logps/chosen": -265.51470947265625, + "logps/rejected": -241.41165161132812, + "loss": 0.5586, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1335269808769226, + "rewards/margins": 0.708950400352478, + "rewards/rejected": -0.8424774408340454, + "step": 1770 + }, + { + "epoch": 1.84, + "learning_rate": 2.1507845388442402e-07, + "logits/chosen": -2.395923614501953, + "logits/rejected": -2.2845911979675293, + "logps/chosen": -287.7729187011719, + "logps/rejected": -233.2380828857422, + "loss": 0.5312, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.05316392332315445, + "rewards/margins": 0.7853686809539795, + "rewards/rejected": -0.8385326266288757, + "step": 1780 + }, + { + "epoch": 1.85, + "learning_rate": 2.1316494450822808e-07, + "logits/chosen": -2.331926107406616, + "logits/rejected": -2.3294596672058105, + "logps/chosen": -269.21484375, + "logps/rejected": -242.924072265625, + "loss": 0.5469, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.12467104196548462, + "rewards/margins": 0.7131480574607849, + "rewards/rejected": -0.8378192186355591, + "step": 1790 + }, + { + "epoch": 1.86, + "learning_rate": 2.1125143513203214e-07, + "logits/chosen": -2.4469873905181885, + "logits/rejected": -2.4214444160461426, + "logps/chosen": -266.3757629394531, + "logps/rejected": -224.87142944335938, + "loss": 0.5127, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.14371030032634735, + "rewards/margins": 0.739290714263916, + "rewards/rejected": -0.8830010294914246, + "step": 1800 + }, + { + "epoch": 1.87, + "learning_rate": 2.093379257558362e-07, + "logits/chosen": -2.4263453483581543, + "logits/rejected": -2.430192232131958, + "logps/chosen": -274.2438659667969, + "logps/rejected": -237.7062530517578, + "loss": 0.5516, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.19841806590557098, + "rewards/margins": 0.6394414901733398, + "rewards/rejected": -0.8378594517707825, + "step": 1810 + }, + { + "epoch": 1.88, + "learning_rate": 2.0742441637964026e-07, + "logits/chosen": -2.3289737701416016, + "logits/rejected": -2.3154170513153076, + "logps/chosen": -299.1728820800781, + "logps/rejected": -238.6165771484375, + "loss": 0.5113, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.08980865776538849, + "rewards/margins": 0.9050415754318237, + "rewards/rejected": -0.9948502779006958, + "step": 1820 + }, + { + "epoch": 1.89, + "learning_rate": 2.055109070034443e-07, + "logits/chosen": -2.4139952659606934, + "logits/rejected": -2.394843101501465, + "logps/chosen": -287.22265625, + "logps/rejected": -256.1412353515625, + "loss": 0.5224, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1007101982831955, + "rewards/margins": 0.7850233912467957, + "rewards/rejected": -0.8857336044311523, + "step": 1830 + }, + { + "epoch": 1.9, + "learning_rate": 2.0359739762724836e-07, + "logits/chosen": -2.415116310119629, + "logits/rejected": -2.3596065044403076, + "logps/chosen": -279.6313781738281, + "logps/rejected": -237.9734649658203, + "loss": 0.531, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.11246969550848007, + "rewards/margins": 0.7976399660110474, + "rewards/rejected": -0.9101096391677856, + "step": 1840 + }, + { + "epoch": 1.91, + "learning_rate": 2.0168388825105242e-07, + "logits/chosen": -2.3417422771453857, + "logits/rejected": -2.2845070362091064, + "logps/chosen": -260.4609680175781, + "logps/rejected": -265.02410888671875, + "loss": 0.5148, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.14557930827140808, + "rewards/margins": 0.8025070428848267, + "rewards/rejected": -0.9480863809585571, + "step": 1850 + }, + { + "epoch": 1.92, + "learning_rate": 1.997703788748565e-07, + "logits/chosen": -2.347747564315796, + "logits/rejected": -2.301016330718994, + "logps/chosen": -278.42144775390625, + "logps/rejected": -228.9193878173828, + "loss": 0.535, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23046691715717316, + "rewards/margins": 0.6747422218322754, + "rewards/rejected": -0.9052090644836426, + "step": 1860 + }, + { + "epoch": 1.93, + "learning_rate": 1.9785686949866055e-07, + "logits/chosen": -2.374584913253784, + "logits/rejected": -2.449509382247925, + "logps/chosen": -275.76531982421875, + "logps/rejected": -233.2628936767578, + "loss": 0.5633, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.12146653980016708, + "rewards/margins": 0.5764524340629578, + "rewards/rejected": -0.6979190111160278, + "step": 1870 + }, + { + "epoch": 1.94, + "learning_rate": 1.9594336012246458e-07, + "logits/chosen": -2.4476470947265625, + "logits/rejected": -2.3424344062805176, + "logps/chosen": -270.17877197265625, + "logps/rejected": -235.03125, + "loss": 0.5134, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.15192639827728271, + "rewards/margins": 0.7095075845718384, + "rewards/rejected": -0.8614339828491211, + "step": 1880 + }, + { + "epoch": 1.95, + "learning_rate": 1.9402985074626865e-07, + "logits/chosen": -2.378629446029663, + "logits/rejected": -2.394209861755371, + "logps/chosen": -269.76898193359375, + "logps/rejected": -228.36172485351562, + "loss": 0.5524, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.18615347146987915, + "rewards/margins": 0.6800569295883179, + "rewards/rejected": -0.8662103414535522, + "step": 1890 + }, + { + "epoch": 1.96, + "learning_rate": 1.921163413700727e-07, + "logits/chosen": -2.3340277671813965, + "logits/rejected": -2.29093861579895, + "logps/chosen": -280.81298828125, + "logps/rejected": -209.24929809570312, + "loss": 0.5775, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.2276122123003006, + "rewards/margins": 0.6620305180549622, + "rewards/rejected": -0.8896427154541016, + "step": 1900 + }, + { + "epoch": 1.97, + "learning_rate": 1.9020283199387677e-07, + "logits/chosen": -2.444658041000366, + "logits/rejected": -2.381373405456543, + "logps/chosen": -262.42974853515625, + "logps/rejected": -233.9261474609375, + "loss": 0.5526, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.06337813287973404, + "rewards/margins": 0.7153123617172241, + "rewards/rejected": -0.7786905169487, + "step": 1910 + }, + { + "epoch": 1.98, + "learning_rate": 1.8828932261768083e-07, + "logits/chosen": -2.431870222091675, + "logits/rejected": -2.3378379344940186, + "logps/chosen": -266.4524230957031, + "logps/rejected": -244.3126678466797, + "loss": 0.5322, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.16464334726333618, + "rewards/margins": 0.6538031101226807, + "rewards/rejected": -0.8184464573860168, + "step": 1920 + }, + { + "epoch": 1.99, + "learning_rate": 1.8637581324148487e-07, + "logits/chosen": -2.4436469078063965, + "logits/rejected": -2.416304349899292, + "logps/chosen": -265.97393798828125, + "logps/rejected": -240.1495361328125, + "loss": 0.5443, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.11401587724685669, + "rewards/margins": 0.8116496801376343, + "rewards/rejected": -0.9256657361984253, + "step": 1930 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -2.0740063190460205, + "eval_logits/rejected": -1.9495693445205688, + "eval_logps/chosen": -266.1383361816406, + "eval_logps/rejected": -227.92555236816406, + "eval_loss": 0.530714750289917, + "eval_rewards/accuracies": 0.7419999837875366, + "eval_rewards/chosen": -0.1506224423646927, + "eval_rewards/margins": 0.7136407494544983, + "eval_rewards/rejected": -0.8642632961273193, + "eval_runtime": 601.1247, + "eval_samples_per_second": 3.327, + "eval_steps_per_second": 0.208, + "step": 1936 + }, + { + "epoch": 2.0, + "learning_rate": 1.8446230386528893e-07, + "logits/chosen": -2.3547306060791016, + "logits/rejected": -2.309804677963257, + "logps/chosen": -239.15170288085938, + "logps/rejected": -227.03646850585938, + "loss": 0.5441, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.20203718543052673, + "rewards/margins": 0.5975244641304016, + "rewards/rejected": -0.799561619758606, + "step": 1940 + }, + { + "epoch": 2.01, + "learning_rate": 1.82548794489093e-07, + "logits/chosen": -2.30757474899292, + "logits/rejected": -2.3320353031158447, + "logps/chosen": -243.75341796875, + "logps/rejected": -234.9235076904297, + "loss": 0.5412, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.19783169031143188, + "rewards/margins": 0.6387797594070435, + "rewards/rejected": -0.8366113901138306, + "step": 1950 + }, + { + "epoch": 2.02, + "learning_rate": 1.8063528511289706e-07, + "logits/chosen": -2.358428478240967, + "logits/rejected": -2.277144432067871, + "logps/chosen": -286.93426513671875, + "logps/rejected": -259.59027099609375, + "loss": 0.5317, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.09961538016796112, + "rewards/margins": 0.7642472982406616, + "rewards/rejected": -0.8638626337051392, + "step": 1960 + }, + { + "epoch": 2.03, + "learning_rate": 1.7872177573670112e-07, + "logits/chosen": -2.4061942100524902, + "logits/rejected": -2.4171481132507324, + "logps/chosen": -252.14535522460938, + "logps/rejected": -230.7240753173828, + "loss": 0.5397, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.09236567467451096, + "rewards/margins": 0.6671853065490723, + "rewards/rejected": -0.7595510482788086, + "step": 1970 + }, + { + "epoch": 2.04, + "learning_rate": 1.7680826636050515e-07, + "logits/chosen": -2.3134634494781494, + "logits/rejected": -2.2795519828796387, + "logps/chosen": -278.4905090332031, + "logps/rejected": -234.19482421875, + "loss": 0.5249, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.07433702796697617, + "rewards/margins": 0.7174164652824402, + "rewards/rejected": -0.7917534708976746, + "step": 1980 + }, + { + "epoch": 2.06, + "learning_rate": 1.7489475698430921e-07, + "logits/chosen": -2.3330302238464355, + "logits/rejected": -2.3669610023498535, + "logps/chosen": -249.8580322265625, + "logps/rejected": -249.130615234375, + "loss": 0.5555, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.14169186353683472, + "rewards/margins": 0.6893213391304016, + "rewards/rejected": -0.8310132026672363, + "step": 1990 + }, + { + "epoch": 2.07, + "learning_rate": 1.7298124760811328e-07, + "logits/chosen": -2.404921054840088, + "logits/rejected": -2.3844478130340576, + "logps/chosen": -277.40667724609375, + "logps/rejected": -224.07015991210938, + "loss": 0.5389, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.13002373278141022, + "rewards/margins": 0.7138842344284058, + "rewards/rejected": -0.8439079523086548, + "step": 2000 + }, + { + "epoch": 2.08, + "learning_rate": 1.7106773823191734e-07, + "logits/chosen": -2.4383153915405273, + "logits/rejected": -2.3728132247924805, + "logps/chosen": -270.46270751953125, + "logps/rejected": -240.8298797607422, + "loss": 0.53, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.1426972895860672, + "rewards/margins": 0.8270591497421265, + "rewards/rejected": -0.9697564244270325, + "step": 2010 + }, + { + "epoch": 2.09, + "learning_rate": 1.691542288557214e-07, + "logits/chosen": -2.418792247772217, + "logits/rejected": -2.321077346801758, + "logps/chosen": -297.71197509765625, + "logps/rejected": -230.908935546875, + "loss": 0.543, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.18689216673374176, + "rewards/margins": 0.7213765382766724, + "rewards/rejected": -0.9082688093185425, + "step": 2020 + }, + { + "epoch": 2.1, + "learning_rate": 1.6724071947952544e-07, + "logits/chosen": -2.4345123767852783, + "logits/rejected": -2.274202823638916, + "logps/chosen": -240.6513671875, + "logps/rejected": -232.0623779296875, + "loss": 0.5363, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.16003349423408508, + "rewards/margins": 0.6547808647155762, + "rewards/rejected": -0.8148144483566284, + "step": 2030 + }, + { + "epoch": 2.11, + "learning_rate": 1.653272101033295e-07, + "logits/chosen": -2.4675419330596924, + "logits/rejected": -2.341759204864502, + "logps/chosen": -276.0350036621094, + "logps/rejected": -212.1591339111328, + "loss": 0.514, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.10016350448131561, + "rewards/margins": 0.6785081624984741, + "rewards/rejected": -0.7786716222763062, + "step": 2040 + }, + { + "epoch": 2.12, + "learning_rate": 1.6341370072713356e-07, + "logits/chosen": -2.470984935760498, + "logits/rejected": -2.4148213863372803, + "logps/chosen": -302.93951416015625, + "logps/rejected": -248.380859375, + "loss": 0.5384, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.08969398587942123, + "rewards/margins": 0.8469365835189819, + "rewards/rejected": -0.9366306066513062, + "step": 2050 + }, + { + "epoch": 2.13, + "learning_rate": 1.6150019135093762e-07, + "logits/chosen": -2.452855348587036, + "logits/rejected": -2.3725485801696777, + "logps/chosen": -304.5735168457031, + "logps/rejected": -232.25656127929688, + "loss": 0.5285, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.132027268409729, + "rewards/margins": 0.7218815088272095, + "rewards/rejected": -0.8539088368415833, + "step": 2060 + }, + { + "epoch": 2.14, + "learning_rate": 1.5958668197474169e-07, + "logits/chosen": -2.461775541305542, + "logits/rejected": -2.3796803951263428, + "logps/chosen": -272.814453125, + "logps/rejected": -222.52627563476562, + "loss": 0.5367, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.105324387550354, + "rewards/margins": 0.8349732160568237, + "rewards/rejected": -0.9402976036071777, + "step": 2070 + }, + { + "epoch": 2.15, + "learning_rate": 1.5767317259854572e-07, + "logits/chosen": -2.4011590480804443, + "logits/rejected": -2.3761496543884277, + "logps/chosen": -262.963623046875, + "logps/rejected": -233.1734619140625, + "loss": 0.5336, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.15259674191474915, + "rewards/margins": 0.7275049090385437, + "rewards/rejected": -0.8801015615463257, + "step": 2080 + }, + { + "epoch": 2.16, + "learning_rate": 1.5575966322234978e-07, + "logits/chosen": -2.326183319091797, + "logits/rejected": -2.2559120655059814, + "logps/chosen": -259.02337646484375, + "logps/rejected": -247.8983154296875, + "loss": 0.5301, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.1156080812215805, + "rewards/margins": 0.7456148862838745, + "rewards/rejected": -0.861223042011261, + "step": 2090 + }, + { + "epoch": 2.17, + "learning_rate": 1.5384615384615385e-07, + "logits/chosen": -2.3786349296569824, + "logits/rejected": -2.2938733100891113, + "logps/chosen": -238.00930786132812, + "logps/rejected": -218.44833374023438, + "loss": 0.5195, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.17429223656654358, + "rewards/margins": 0.7917351126670837, + "rewards/rejected": -0.9660272598266602, + "step": 2100 + }, + { + "epoch": 2.18, + "learning_rate": 1.519326444699579e-07, + "logits/chosen": -2.3865954875946045, + "logits/rejected": -2.3452370166778564, + "logps/chosen": -279.7684631347656, + "logps/rejected": -252.83035278320312, + "loss": 0.5121, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.11177567392587662, + "rewards/margins": 0.7900134921073914, + "rewards/rejected": -0.9017891883850098, + "step": 2110 + }, + { + "epoch": 2.19, + "learning_rate": 1.5001913509376197e-07, + "logits/chosen": -2.2664945125579834, + "logits/rejected": -2.2997994422912598, + "logps/chosen": -242.1395721435547, + "logps/rejected": -219.6393585205078, + "loss": 0.5355, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.03740229830145836, + "rewards/margins": 0.8150936365127563, + "rewards/rejected": -0.8524959683418274, + "step": 2120 + }, + { + "epoch": 2.2, + "learning_rate": 1.4810562571756603e-07, + "logits/chosen": -2.40497088432312, + "logits/rejected": -2.382087230682373, + "logps/chosen": -260.3258972167969, + "logps/rejected": -238.91629028320312, + "loss": 0.5401, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.19490735232830048, + "rewards/margins": 0.6507551074028015, + "rewards/rejected": -0.8456624746322632, + "step": 2130 + }, + { + "epoch": 2.21, + "learning_rate": 1.4619211634137007e-07, + "logits/chosen": -2.4172415733337402, + "logits/rejected": -2.3459484577178955, + "logps/chosen": -296.9003601074219, + "logps/rejected": -244.6107177734375, + "loss": 0.5205, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.08288192749023438, + "rewards/margins": 0.7755564451217651, + "rewards/rejected": -0.8584383726119995, + "step": 2140 + }, + { + "epoch": 2.22, + "learning_rate": 1.4427860696517413e-07, + "logits/chosen": -2.3107922077178955, + "logits/rejected": -2.272061824798584, + "logps/chosen": -276.7105407714844, + "logps/rejected": -244.1803436279297, + "loss": 0.547, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.24030157923698425, + "rewards/margins": 0.5712770819664001, + "rewards/rejected": -0.8115787506103516, + "step": 2150 + }, + { + "epoch": 2.23, + "learning_rate": 1.423650975889782e-07, + "logits/chosen": -2.4137063026428223, + "logits/rejected": -2.317960262298584, + "logps/chosen": -301.14422607421875, + "logps/rejected": -230.6436004638672, + "loss": 0.5227, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.0714234784245491, + "rewards/margins": 0.8001400232315063, + "rewards/rejected": -0.8715635538101196, + "step": 2160 + }, + { + "epoch": 2.24, + "learning_rate": 1.4045158821278225e-07, + "logits/chosen": -2.437873363494873, + "logits/rejected": -2.3846659660339355, + "logps/chosen": -281.734619140625, + "logps/rejected": -252.9493408203125, + "loss": 0.5234, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.0724717229604721, + "rewards/margins": 0.8193691968917847, + "rewards/rejected": -0.891840934753418, + "step": 2170 + }, + { + "epoch": 2.25, + "learning_rate": 1.3853807883658632e-07, + "logits/chosen": -2.3811519145965576, + "logits/rejected": -2.3631045818328857, + "logps/chosen": -286.5116271972656, + "logps/rejected": -263.0299072265625, + "loss": 0.5568, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.0669230967760086, + "rewards/margins": 0.6814225912094116, + "rewards/rejected": -0.748345673084259, + "step": 2180 + }, + { + "epoch": 2.26, + "learning_rate": 1.3662456946039035e-07, + "logits/chosen": -2.3424034118652344, + "logits/rejected": -2.3057944774627686, + "logps/chosen": -244.4705352783203, + "logps/rejected": -206.46615600585938, + "loss": 0.5341, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.1323441118001938, + "rewards/margins": 0.6189785599708557, + "rewards/rejected": -0.7513227462768555, + "step": 2190 + }, + { + "epoch": 2.27, + "learning_rate": 1.3471106008419441e-07, + "logits/chosen": -2.3643927574157715, + "logits/rejected": -2.329315662384033, + "logps/chosen": -273.6473083496094, + "logps/rejected": -260.7103576660156, + "loss": 0.5286, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.10188891738653183, + "rewards/margins": 0.6854437589645386, + "rewards/rejected": -0.7873327732086182, + "step": 2200 + }, + { + "epoch": 2.28, + "learning_rate": 1.3279755070799848e-07, + "logits/chosen": -2.328723430633545, + "logits/rejected": -2.2633702754974365, + "logps/chosen": -284.9334411621094, + "logps/rejected": -232.8155517578125, + "loss": 0.5415, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.1077471598982811, + "rewards/margins": 0.7453306913375854, + "rewards/rejected": -0.8530778884887695, + "step": 2210 + }, + { + "epoch": 2.29, + "learning_rate": 1.3088404133180254e-07, + "logits/chosen": -2.299356460571289, + "logits/rejected": -2.2545862197875977, + "logps/chosen": -306.23406982421875, + "logps/rejected": -226.7987823486328, + "loss": 0.4804, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -0.07135553658008575, + "rewards/margins": 0.9380094408988953, + "rewards/rejected": -1.0093649625778198, + "step": 2220 + }, + { + "epoch": 2.3, + "learning_rate": 1.289705319556066e-07, + "logits/chosen": -2.314143419265747, + "logits/rejected": -2.2511839866638184, + "logps/chosen": -252.2981719970703, + "logps/rejected": -221.84194946289062, + "loss": 0.5264, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.14519211649894714, + "rewards/margins": 0.6200018525123596, + "rewards/rejected": -0.7651939988136292, + "step": 2230 + }, + { + "epoch": 2.31, + "learning_rate": 1.2705702257941064e-07, + "logits/chosen": -2.3738441467285156, + "logits/rejected": -2.3623602390289307, + "logps/chosen": -257.61328125, + "logps/rejected": -234.92190551757812, + "loss": 0.5131, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.18813610076904297, + "rewards/margins": 0.68475741147995, + "rewards/rejected": -0.8728936314582825, + "step": 2240 + }, + { + "epoch": 2.32, + "learning_rate": 1.251435132032147e-07, + "logits/chosen": -2.346224546432495, + "logits/rejected": -2.337629795074463, + "logps/chosen": -273.5932312011719, + "logps/rejected": -259.9046325683594, + "loss": 0.5298, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.07904721796512604, + "rewards/margins": 0.7541002035140991, + "rewards/rejected": -0.833147406578064, + "step": 2250 + }, + { + "epoch": 2.33, + "learning_rate": 1.2323000382701873e-07, + "logits/chosen": -2.3779919147491455, + "logits/rejected": -2.3776283264160156, + "logps/chosen": -270.1576232910156, + "logps/rejected": -229.17239379882812, + "loss": 0.5543, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.20323677361011505, + "rewards/margins": 0.6380544900894165, + "rewards/rejected": -0.8412912487983704, + "step": 2260 + }, + { + "epoch": 2.34, + "learning_rate": 1.213164944508228e-07, + "logits/chosen": -2.450532913208008, + "logits/rejected": -2.3503329753875732, + "logps/chosen": -266.23175048828125, + "logps/rejected": -269.7557067871094, + "loss": 0.509, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.25909024477005005, + "rewards/margins": 0.6517874002456665, + "rewards/rejected": -0.9108778238296509, + "step": 2270 + }, + { + "epoch": 2.35, + "learning_rate": 1.1940298507462686e-07, + "logits/chosen": -2.244485378265381, + "logits/rejected": -2.2757506370544434, + "logps/chosen": -261.5260314941406, + "logps/rejected": -211.95291137695312, + "loss": 0.5175, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.09936892986297607, + "rewards/margins": 0.829565167427063, + "rewards/rejected": -0.9289340972900391, + "step": 2280 + }, + { + "epoch": 2.37, + "learning_rate": 1.1748947569843092e-07, + "logits/chosen": -2.3904809951782227, + "logits/rejected": -2.307392120361328, + "logps/chosen": -263.42333984375, + "logps/rejected": -225.5657196044922, + "loss": 0.5077, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.11825220286846161, + "rewards/margins": 0.8992505073547363, + "rewards/rejected": -1.017502784729004, + "step": 2290 + }, + { + "epoch": 2.38, + "learning_rate": 1.1557596632223497e-07, + "logits/chosen": -2.360848903656006, + "logits/rejected": -2.3750827312469482, + "logps/chosen": -272.0371398925781, + "logps/rejected": -224.0919647216797, + "loss": 0.5255, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1476692408323288, + "rewards/margins": 0.835718035697937, + "rewards/rejected": -0.983387291431427, + "step": 2300 + }, + { + "epoch": 2.39, + "learning_rate": 1.1366245694603903e-07, + "logits/chosen": -2.4289305210113525, + "logits/rejected": -2.2458267211914062, + "logps/chosen": -265.1494140625, + "logps/rejected": -249.3345489501953, + "loss": 0.556, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.24553117156028748, + "rewards/margins": 0.6707764863967896, + "rewards/rejected": -0.9163076281547546, + "step": 2310 + }, + { + "epoch": 2.4, + "learning_rate": 1.1174894756984308e-07, + "logits/chosen": -2.2975571155548096, + "logits/rejected": -2.393068790435791, + "logps/chosen": -254.4977569580078, + "logps/rejected": -224.1728973388672, + "loss": 0.5205, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.18545860052108765, + "rewards/margins": 0.8345575332641602, + "rewards/rejected": -1.020016074180603, + "step": 2320 + }, + { + "epoch": 2.41, + "learning_rate": 1.0983543819364714e-07, + "logits/chosen": -2.464052200317383, + "logits/rejected": -2.4497199058532715, + "logps/chosen": -278.8863525390625, + "logps/rejected": -231.97512817382812, + "loss": 0.477, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.08197133243083954, + "rewards/margins": 0.8996642231941223, + "rewards/rejected": -0.9816356897354126, + "step": 2330 + }, + { + "epoch": 2.42, + "learning_rate": 1.079219288174512e-07, + "logits/chosen": -2.333491086959839, + "logits/rejected": -2.31835675239563, + "logps/chosen": -247.50698852539062, + "logps/rejected": -209.3056182861328, + "loss": 0.5348, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.20645038783550262, + "rewards/margins": 0.5899510979652405, + "rewards/rejected": -0.7964013814926147, + "step": 2340 + }, + { + "epoch": 2.43, + "learning_rate": 1.0600841944125525e-07, + "logits/chosen": -2.43719744682312, + "logits/rejected": -2.396315574645996, + "logps/chosen": -271.6535949707031, + "logps/rejected": -221.7565155029297, + "loss": 0.5527, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.2088332623243332, + "rewards/margins": 0.6539155840873718, + "rewards/rejected": -0.8627488017082214, + "step": 2350 + }, + { + "epoch": 2.44, + "learning_rate": 1.0409491006505931e-07, + "logits/chosen": -2.400672435760498, + "logits/rejected": -2.2855820655822754, + "logps/chosen": -283.90576171875, + "logps/rejected": -229.03689575195312, + "loss": 0.5436, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.18632188439369202, + "rewards/margins": 0.7588584423065186, + "rewards/rejected": -0.9451802968978882, + "step": 2360 + }, + { + "epoch": 2.45, + "learning_rate": 1.0218140068886336e-07, + "logits/chosen": -2.3788506984710693, + "logits/rejected": -2.282285690307617, + "logps/chosen": -260.7487487792969, + "logps/rejected": -221.43505859375, + "loss": 0.5533, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.21385586261749268, + "rewards/margins": 0.5428717732429504, + "rewards/rejected": -0.7567275762557983, + "step": 2370 + }, + { + "epoch": 2.46, + "learning_rate": 1.0026789131266743e-07, + "logits/chosen": -2.296046018600464, + "logits/rejected": -2.3289477825164795, + "logps/chosen": -252.4747772216797, + "logps/rejected": -242.341796875, + "loss": 0.5406, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.17420583963394165, + "rewards/margins": 0.6258620023727417, + "rewards/rejected": -0.8000679016113281, + "step": 2380 + }, + { + "epoch": 2.47, + "learning_rate": 9.835438193647149e-08, + "logits/chosen": -2.3245911598205566, + "logits/rejected": -2.3727688789367676, + "logps/chosen": -255.92984008789062, + "logps/rejected": -230.47940063476562, + "loss": 0.5594, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.15971948206424713, + "rewards/margins": 0.6732539534568787, + "rewards/rejected": -0.8329733610153198, + "step": 2390 + }, + { + "epoch": 2.48, + "learning_rate": 9.644087256027554e-08, + "logits/chosen": -2.3938796520233154, + "logits/rejected": -2.388028383255005, + "logps/chosen": -239.3699188232422, + "logps/rejected": -218.7798614501953, + "loss": 0.539, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.14847150444984436, + "rewards/margins": 0.6557341814041138, + "rewards/rejected": -0.804205596446991, + "step": 2400 + }, + { + "epoch": 2.49, + "learning_rate": 9.45273631840796e-08, + "logits/chosen": -2.395660877227783, + "logits/rejected": -2.4043540954589844, + "logps/chosen": -278.68353271484375, + "logps/rejected": -231.53103637695312, + "loss": 0.5324, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.08287270367145538, + "rewards/margins": 0.7413903474807739, + "rewards/rejected": -0.8242629766464233, + "step": 2410 + }, + { + "epoch": 2.5, + "learning_rate": 9.261385380788366e-08, + "logits/chosen": -2.2760956287384033, + "logits/rejected": -2.2844595909118652, + "logps/chosen": -239.07382202148438, + "logps/rejected": -222.31161499023438, + "loss": 0.5253, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.2278144657611847, + "rewards/margins": 0.6441665887832642, + "rewards/rejected": -0.871981143951416, + "step": 2420 + }, + { + "epoch": 2.51, + "learning_rate": 9.070034443168771e-08, + "logits/chosen": -2.3876235485076904, + "logits/rejected": -2.3704121112823486, + "logps/chosen": -261.8553161621094, + "logps/rejected": -227.60379028320312, + "loss": 0.5577, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.24931149184703827, + "rewards/margins": 0.6487377285957336, + "rewards/rejected": -0.8980492353439331, + "step": 2430 + }, + { + "epoch": 2.52, + "learning_rate": 8.878683505549177e-08, + "logits/chosen": -2.2906646728515625, + "logits/rejected": -2.32999587059021, + "logps/chosen": -267.1397399902344, + "logps/rejected": -226.90048217773438, + "loss": 0.5109, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.10627348721027374, + "rewards/margins": 0.714811384677887, + "rewards/rejected": -0.8210847973823547, + "step": 2440 + }, + { + "epoch": 2.53, + "learning_rate": 8.687332567929582e-08, + "logits/chosen": -2.3202567100524902, + "logits/rejected": -2.349112033843994, + "logps/chosen": -291.82147216796875, + "logps/rejected": -246.85574340820312, + "loss": 0.5351, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.050464726984500885, + "rewards/margins": 0.8968712091445923, + "rewards/rejected": -0.9473358988761902, + "step": 2450 + }, + { + "epoch": 2.54, + "learning_rate": 8.495981630309988e-08, + "logits/chosen": -2.270942211151123, + "logits/rejected": -2.2897868156433105, + "logps/chosen": -300.76312255859375, + "logps/rejected": -218.22640991210938, + "loss": 0.5467, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.13602502644062042, + "rewards/margins": 0.5781577825546265, + "rewards/rejected": -0.7141829133033752, + "step": 2460 + }, + { + "epoch": 2.55, + "learning_rate": 8.304630692690395e-08, + "logits/chosen": -2.317321300506592, + "logits/rejected": -2.2713263034820557, + "logps/chosen": -262.05743408203125, + "logps/rejected": -205.5304412841797, + "loss": 0.5247, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1445184201002121, + "rewards/margins": 0.7234494090080261, + "rewards/rejected": -0.8679677248001099, + "step": 2470 + }, + { + "epoch": 2.56, + "learning_rate": 8.1132797550708e-08, + "logits/chosen": -2.3970394134521484, + "logits/rejected": -2.3608124256134033, + "logps/chosen": -267.4720458984375, + "logps/rejected": -221.71359252929688, + "loss": 0.5264, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.15024690330028534, + "rewards/margins": 0.7065707445144653, + "rewards/rejected": -0.8568177223205566, + "step": 2480 + }, + { + "epoch": 2.57, + "learning_rate": 7.921928817451206e-08, + "logits/chosen": -2.416393280029297, + "logits/rejected": -2.3220162391662598, + "logps/chosen": -277.0873107910156, + "logps/rejected": -227.70947265625, + "loss": 0.5077, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.17723213136196136, + "rewards/margins": 0.8048456311225891, + "rewards/rejected": -0.9820777177810669, + "step": 2490 + }, + { + "epoch": 2.58, + "learning_rate": 7.73057787983161e-08, + "logits/chosen": -2.395048141479492, + "logits/rejected": -2.3008649349212646, + "logps/chosen": -292.8017883300781, + "logps/rejected": -266.5408020019531, + "loss": 0.516, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.08461178839206696, + "rewards/margins": 0.7476651072502136, + "rewards/rejected": -0.8322768211364746, + "step": 2500 + }, + { + "epoch": 2.59, + "learning_rate": 7.539226942212017e-08, + "logits/chosen": -2.357027053833008, + "logits/rejected": -2.313039541244507, + "logps/chosen": -244.0641326904297, + "logps/rejected": -220.01608276367188, + "loss": 0.5179, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.11702857911586761, + "rewards/margins": 0.8708831071853638, + "rewards/rejected": -0.9879117012023926, + "step": 2510 + }, + { + "epoch": 2.6, + "learning_rate": 7.347876004592423e-08, + "logits/chosen": -2.470712900161743, + "logits/rejected": -2.401108980178833, + "logps/chosen": -280.0924072265625, + "logps/rejected": -232.7683563232422, + "loss": 0.5404, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.1525488644838333, + "rewards/margins": 0.7386666536331177, + "rewards/rejected": -0.8912155032157898, + "step": 2520 + }, + { + "epoch": 2.61, + "learning_rate": 7.156525066972828e-08, + "logits/chosen": -2.299323081970215, + "logits/rejected": -2.362274646759033, + "logps/chosen": -242.1494598388672, + "logps/rejected": -227.5293426513672, + "loss": 0.5408, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.07852064073085785, + "rewards/margins": 0.7071703672409058, + "rewards/rejected": -0.7856910824775696, + "step": 2530 + }, + { + "epoch": 2.62, + "learning_rate": 6.965174129353234e-08, + "logits/chosen": -2.3532567024230957, + "logits/rejected": -2.292245864868164, + "logps/chosen": -302.1033630371094, + "logps/rejected": -258.0881042480469, + "loss": 0.53, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.11693718284368515, + "rewards/margins": 0.8197237253189087, + "rewards/rejected": -0.936660885810852, + "step": 2540 + }, + { + "epoch": 2.63, + "learning_rate": 6.773823191733639e-08, + "logits/chosen": -2.285585403442383, + "logits/rejected": -2.2158350944519043, + "logps/chosen": -259.6685485839844, + "logps/rejected": -222.1896514892578, + "loss": 0.5305, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.23936796188354492, + "rewards/margins": 0.7124180197715759, + "rewards/rejected": -0.9517859220504761, + "step": 2550 + }, + { + "epoch": 2.64, + "learning_rate": 6.582472254114045e-08, + "logits/chosen": -2.305962085723877, + "logits/rejected": -2.3173716068267822, + "logps/chosen": -252.51766967773438, + "logps/rejected": -223.96084594726562, + "loss": 0.565, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.13089394569396973, + "rewards/margins": 0.6327255368232727, + "rewards/rejected": -0.7636195421218872, + "step": 2560 + }, + { + "epoch": 2.65, + "learning_rate": 6.391121316494451e-08, + "logits/chosen": -2.325913667678833, + "logits/rejected": -2.310243606567383, + "logps/chosen": -294.99847412109375, + "logps/rejected": -239.6224822998047, + "loss": 0.5248, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.10585550963878632, + "rewards/margins": 0.8481132388114929, + "rewards/rejected": -0.9539687037467957, + "step": 2570 + }, + { + "epoch": 2.66, + "learning_rate": 6.199770378874856e-08, + "logits/chosen": -2.3760478496551514, + "logits/rejected": -2.2878143787384033, + "logps/chosen": -272.9644775390625, + "logps/rejected": -225.4691619873047, + "loss": 0.5297, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.07944142073392868, + "rewards/margins": 0.8911903500556946, + "rewards/rejected": -0.9706317782402039, + "step": 2580 + }, + { + "epoch": 2.67, + "learning_rate": 6.008419441255262e-08, + "logits/chosen": -2.3183536529541016, + "logits/rejected": -2.2597270011901855, + "logps/chosen": -299.25775146484375, + "logps/rejected": -217.822509765625, + "loss": 0.5237, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.07527098059654236, + "rewards/margins": 0.823529839515686, + "rewards/rejected": -0.8988008499145508, + "step": 2590 + }, + { + "epoch": 2.69, + "learning_rate": 5.817068503635668e-08, + "logits/chosen": -2.4004032611846924, + "logits/rejected": -2.3618216514587402, + "logps/chosen": -288.94622802734375, + "logps/rejected": -222.6877899169922, + "loss": 0.5125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12513655424118042, + "rewards/margins": 0.836874783039093, + "rewards/rejected": -0.9620113372802734, + "step": 2600 + }, + { + "epoch": 2.7, + "learning_rate": 5.6257175660160735e-08, + "logits/chosen": -2.420538902282715, + "logits/rejected": -2.3739724159240723, + "logps/chosen": -295.19580078125, + "logps/rejected": -238.79727172851562, + "loss": 0.5186, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.18515029549598694, + "rewards/margins": 0.6977055668830872, + "rewards/rejected": -0.8828557729721069, + "step": 2610 + }, + { + "epoch": 2.71, + "learning_rate": 5.4343666283964784e-08, + "logits/chosen": -2.4274239540100098, + "logits/rejected": -2.376018762588501, + "logps/chosen": -263.94525146484375, + "logps/rejected": -249.42178344726562, + "loss": 0.5219, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.0742453932762146, + "rewards/margins": 0.7254621386528015, + "rewards/rejected": -0.7997074127197266, + "step": 2620 + }, + { + "epoch": 2.72, + "learning_rate": 5.243015690776884e-08, + "logits/chosen": -2.4671757221221924, + "logits/rejected": -2.4064314365386963, + "logps/chosen": -284.95330810546875, + "logps/rejected": -235.897705078125, + "loss": 0.5148, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.15317563712596893, + "rewards/margins": 0.7068864107131958, + "rewards/rejected": -0.8600620031356812, + "step": 2630 + }, + { + "epoch": 2.73, + "learning_rate": 5.05166475315729e-08, + "logits/chosen": -2.3765015602111816, + "logits/rejected": -2.3243587017059326, + "logps/chosen": -263.02545166015625, + "logps/rejected": -218.81405639648438, + "loss": 0.5407, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.28189724683761597, + "rewards/margins": 0.6195310354232788, + "rewards/rejected": -0.9014283418655396, + "step": 2640 + }, + { + "epoch": 2.74, + "learning_rate": 4.860313815537696e-08, + "logits/chosen": -2.406059741973877, + "logits/rejected": -2.3650240898132324, + "logps/chosen": -284.5108337402344, + "logps/rejected": -266.8323059082031, + "loss": 0.5351, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.12149874866008759, + "rewards/margins": 0.7909864187240601, + "rewards/rejected": -0.9124851226806641, + "step": 2650 + }, + { + "epoch": 2.75, + "learning_rate": 4.668962877918101e-08, + "logits/chosen": -2.3801705837249756, + "logits/rejected": -2.27915620803833, + "logps/chosen": -288.0900573730469, + "logps/rejected": -278.1248474121094, + "loss": 0.5227, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.12331392616033554, + "rewards/margins": 0.8058657646179199, + "rewards/rejected": -0.9291796684265137, + "step": 2660 + }, + { + "epoch": 2.76, + "learning_rate": 4.477611940298507e-08, + "logits/chosen": -2.4003734588623047, + "logits/rejected": -2.3077614307403564, + "logps/chosen": -245.1756591796875, + "logps/rejected": -227.53317260742188, + "loss": 0.4946, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.14258424937725067, + "rewards/margins": 0.8004018068313599, + "rewards/rejected": -0.942986011505127, + "step": 2670 + }, + { + "epoch": 2.77, + "learning_rate": 4.2862610026789124e-08, + "logits/chosen": -2.4123265743255615, + "logits/rejected": -2.295915365219116, + "logps/chosen": -259.74932861328125, + "logps/rejected": -223.12002563476562, + "loss": 0.5486, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.15485641360282898, + "rewards/margins": 0.7069037556648254, + "rewards/rejected": -0.861760139465332, + "step": 2680 + }, + { + "epoch": 2.78, + "learning_rate": 4.0949100650593186e-08, + "logits/chosen": -2.3854644298553467, + "logits/rejected": -2.328233003616333, + "logps/chosen": -256.6706848144531, + "logps/rejected": -243.61880493164062, + "loss": 0.5474, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25512591004371643, + "rewards/margins": 0.539734959602356, + "rewards/rejected": -0.7948609590530396, + "step": 2690 + }, + { + "epoch": 2.79, + "learning_rate": 3.903559127439724e-08, + "logits/chosen": -2.2946548461914062, + "logits/rejected": -2.278347969055176, + "logps/chosen": -273.6446838378906, + "logps/rejected": -230.7551727294922, + "loss": 0.5081, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.12195589393377304, + "rewards/margins": 0.7443081140518188, + "rewards/rejected": -0.8662639856338501, + "step": 2700 + }, + { + "epoch": 2.8, + "learning_rate": 3.71220818982013e-08, + "logits/chosen": -2.405937910079956, + "logits/rejected": -2.371584415435791, + "logps/chosen": -273.60845947265625, + "logps/rejected": -218.2377166748047, + "loss": 0.5477, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.24379794299602509, + "rewards/margins": 0.6154786348342896, + "rewards/rejected": -0.8592765927314758, + "step": 2710 + }, + { + "epoch": 2.81, + "learning_rate": 3.520857252200535e-08, + "logits/chosen": -2.3632619380950928, + "logits/rejected": -2.332373857498169, + "logps/chosen": -300.9534912109375, + "logps/rejected": -242.65316772460938, + "loss": 0.5313, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -0.04076363891363144, + "rewards/margins": 0.986183762550354, + "rewards/rejected": -1.0269473791122437, + "step": 2720 + }, + { + "epoch": 2.82, + "learning_rate": 3.3295063145809414e-08, + "logits/chosen": -2.280702829360962, + "logits/rejected": -2.2251949310302734, + "logps/chosen": -267.03338623046875, + "logps/rejected": -244.3408203125, + "loss": 0.5581, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.13874498009681702, + "rewards/margins": 0.7079328298568726, + "rewards/rejected": -0.8466777801513672, + "step": 2730 + }, + { + "epoch": 2.83, + "learning_rate": 3.138155376961347e-08, + "logits/chosen": -2.2881321907043457, + "logits/rejected": -2.3319568634033203, + "logps/chosen": -234.1023712158203, + "logps/rejected": -226.11300659179688, + "loss": 0.5666, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.13704028725624084, + "rewards/margins": 0.7299059629440308, + "rewards/rejected": -0.8669462203979492, + "step": 2740 + }, + { + "epoch": 2.84, + "learning_rate": 2.9468044393417525e-08, + "logits/chosen": -2.4301745891571045, + "logits/rejected": -2.402632236480713, + "logps/chosen": -273.2855529785156, + "logps/rejected": -262.68792724609375, + "loss": 0.5173, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.061246536672115326, + "rewards/margins": 0.8221151232719421, + "rewards/rejected": -0.8833616971969604, + "step": 2750 + }, + { + "epoch": 2.85, + "learning_rate": 2.755453501722158e-08, + "logits/chosen": -2.3756215572357178, + "logits/rejected": -2.3666157722473145, + "logps/chosen": -285.3559265136719, + "logps/rejected": -228.5872344970703, + "loss": 0.509, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.09171368926763535, + "rewards/margins": 0.8335908055305481, + "rewards/rejected": -0.9253045320510864, + "step": 2760 + }, + { + "epoch": 2.86, + "learning_rate": 2.564102564102564e-08, + "logits/chosen": -2.3984246253967285, + "logits/rejected": -2.3833839893341064, + "logps/chosen": -258.1267395019531, + "logps/rejected": -225.0773468017578, + "loss": 0.524, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.1915823221206665, + "rewards/margins": 0.6331661343574524, + "rewards/rejected": -0.8247483968734741, + "step": 2770 + }, + { + "epoch": 2.87, + "learning_rate": 2.3727516264829695e-08, + "logits/chosen": -2.320146083831787, + "logits/rejected": -2.2947006225585938, + "logps/chosen": -238.8065643310547, + "logps/rejected": -234.02822875976562, + "loss": 0.5354, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.17902429401874542, + "rewards/margins": 0.5986486673355103, + "rewards/rejected": -0.7776729464530945, + "step": 2780 + }, + { + "epoch": 2.88, + "learning_rate": 2.1814006888633754e-08, + "logits/chosen": -2.334745168685913, + "logits/rejected": -2.353066921234131, + "logps/chosen": -263.228515625, + "logps/rejected": -233.03515625, + "loss": 0.5472, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.20434530079364777, + "rewards/margins": 0.656082272529602, + "rewards/rejected": -0.8604275584220886, + "step": 2790 + }, + { + "epoch": 2.89, + "learning_rate": 1.990049751243781e-08, + "logits/chosen": -2.396660327911377, + "logits/rejected": -2.3448100090026855, + "logps/chosen": -285.54638671875, + "logps/rejected": -231.98117065429688, + "loss": 0.5327, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1172541007399559, + "rewards/margins": 0.6794244050979614, + "rewards/rejected": -0.7966784238815308, + "step": 2800 + }, + { + "epoch": 2.9, + "learning_rate": 1.7986988136241865e-08, + "logits/chosen": -2.4138553142547607, + "logits/rejected": -2.357382297515869, + "logps/chosen": -262.9327697753906, + "logps/rejected": -225.3331756591797, + "loss": 0.4929, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.031798310577869415, + "rewards/margins": 0.8773431777954102, + "rewards/rejected": -0.9091414213180542, + "step": 2810 + }, + { + "epoch": 2.91, + "learning_rate": 1.6073478760045924e-08, + "logits/chosen": -2.2996766567230225, + "logits/rejected": -2.2243258953094482, + "logps/chosen": -285.4356384277344, + "logps/rejected": -201.10208129882812, + "loss": 0.5427, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.2585764229297638, + "rewards/margins": 0.5490394234657288, + "rewards/rejected": -0.8076158761978149, + "step": 2820 + }, + { + "epoch": 2.92, + "learning_rate": 1.4159969383849981e-08, + "logits/chosen": -2.4938712120056152, + "logits/rejected": -2.4172348976135254, + "logps/chosen": -312.3806457519531, + "logps/rejected": -248.057373046875, + "loss": 0.5262, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.1045370101928711, + "rewards/margins": 0.8157347440719604, + "rewards/rejected": -0.9202718734741211, + "step": 2830 + }, + { + "epoch": 2.93, + "learning_rate": 1.2246460007654037e-08, + "logits/chosen": -2.3658456802368164, + "logits/rejected": -2.304481029510498, + "logps/chosen": -266.4372863769531, + "logps/rejected": -227.3815460205078, + "loss": 0.5701, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.23648759722709656, + "rewards/margins": 0.5965025424957275, + "rewards/rejected": -0.8329901695251465, + "step": 2840 + }, + { + "epoch": 2.94, + "learning_rate": 1.0332950631458094e-08, + "logits/chosen": -2.307523012161255, + "logits/rejected": -2.3531241416931152, + "logps/chosen": -273.6283874511719, + "logps/rejected": -230.62344360351562, + "loss": 0.5356, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.150599867105484, + "rewards/margins": 0.7397447228431702, + "rewards/rejected": -0.8903446197509766, + "step": 2850 + }, + { + "epoch": 2.95, + "learning_rate": 8.419441255262151e-09, + "logits/chosen": -2.298656940460205, + "logits/rejected": -2.319462299346924, + "logps/chosen": -267.07867431640625, + "logps/rejected": -219.8665313720703, + "loss": 0.5007, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.09595485031604767, + "rewards/margins": 0.6610188484191895, + "rewards/rejected": -0.7569736838340759, + "step": 2860 + }, + { + "epoch": 2.96, + "learning_rate": 6.505931879066207e-09, + "logits/chosen": -2.4032797813415527, + "logits/rejected": -2.3461287021636963, + "logps/chosen": -235.07882690429688, + "logps/rejected": -236.2791290283203, + "loss": 0.5338, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.25155526399612427, + "rewards/margins": 0.6543334722518921, + "rewards/rejected": -0.9058888554573059, + "step": 2870 + }, + { + "epoch": 2.97, + "learning_rate": 4.592422502870264e-09, + "logits/chosen": -2.41361141204834, + "logits/rejected": -2.317509651184082, + "logps/chosen": -259.431884765625, + "logps/rejected": -227.45394897460938, + "loss": 0.5457, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.057419806718826294, + "rewards/margins": 0.6843992471694946, + "rewards/rejected": -0.7418190240859985, + "step": 2880 + }, + { + "epoch": 2.98, + "learning_rate": 2.6789131266743202e-09, + "logits/chosen": -2.3416519165039062, + "logits/rejected": -2.272921562194824, + "logps/chosen": -233.4044189453125, + "logps/rejected": -225.3540496826172, + "loss": 0.5175, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.2774786353111267, + "rewards/margins": 0.6129654049873352, + "rewards/rejected": -0.8904439806938171, + "step": 2890 + }, + { + "epoch": 3.0, + "learning_rate": 7.654037504783773e-10, + "logits/chosen": -2.313737630844116, + "logits/rejected": -2.3516671657562256, + "logps/chosen": -238.37814331054688, + "logps/rejected": -245.6781005859375, + "loss": 0.5439, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.1316399872303009, + "rewards/margins": 0.7750416994094849, + "rewards/rejected": -0.9066817164421082, + "step": 2900 + }, { "epoch": 3.0, - "eval_logits/chosen": -2.1211588382720947, - "eval_logits/rejected": -2.000145435333252, - "eval_logps/chosen": -265.46588134765625, - "eval_logps/rejected": -224.6123809814453, - "eval_loss": 0.5657259225845337, - "eval_rewards/accuracies": 0.7020000219345093, - "eval_rewards/chosen": -0.08337792754173279, - "eval_rewards/margins": 0.44957080483436584, - "eval_rewards/rejected": -0.5329487323760986, - "eval_runtime": 600.5346, - "eval_samples_per_second": 3.33, + "eval_logits/chosen": -2.065433979034424, + "eval_logits/rejected": -1.9405803680419922, + "eval_logps/chosen": -266.1706848144531, + "eval_logps/rejected": -228.30780029296875, + "eval_loss": 0.5255534052848816, + "eval_rewards/accuracies": 0.7419999837875366, + "eval_rewards/chosen": -0.15385985374450684, + "eval_rewards/margins": 0.7486297488212585, + "eval_rewards/rejected": -0.9024895429611206, + "eval_runtime": 601.0805, + "eval_samples_per_second": 3.327, "eval_steps_per_second": 0.208, - "step": 726 + "step": 2904 }, { "epoch": 3.0, - "step": 726, + "step": 2904, "total_flos": 0.0, - "train_loss": 0.6034470564241908, - "train_runtime": 84521.5132, - "train_samples_per_second": 2.199, - "train_steps_per_second": 0.009 + "train_loss": 0.5642068754707158, + "train_runtime": 89225.6094, + "train_samples_per_second": 2.083, + "train_steps_per_second": 0.033 } ], "logging_steps": 10, - "max_steps": 726, + "max_steps": 2904, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0,