|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9998757609640949, |
|
"eval_steps": 100, |
|
"global_step": 1509, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 544.525013923645, |
|
"epoch": 0.0033130409574688366, |
|
"grad_norm": 0.2902330414514582, |
|
"kl": 0.00014667510986328126, |
|
"learning_rate": 6.622516556291392e-07, |
|
"loss": 0.0, |
|
"reward": 0.20833333805203438, |
|
"reward_std": 0.1532064698636532, |
|
"rewards/accuracy_reward": 0.20833333805203438, |
|
"rewards/format_reward": 0.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 539.743765258789, |
|
"epoch": 0.006626081914937673, |
|
"grad_norm": 0.4239089441988848, |
|
"kl": 0.0002197861671447754, |
|
"learning_rate": 1.3245033112582784e-06, |
|
"loss": 0.0, |
|
"reward": 0.20000000428408385, |
|
"reward_std": 0.14731391314417125, |
|
"rewards/accuracy_reward": 0.20000000428408385, |
|
"rewards/format_reward": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 555.6875169754028, |
|
"epoch": 0.00993912287240651, |
|
"grad_norm": 0.5514350439282207, |
|
"kl": 0.0004414021968841553, |
|
"learning_rate": 1.9867549668874175e-06, |
|
"loss": 0.0, |
|
"reward": 0.19375000502914191, |
|
"reward_std": 0.13258252199739218, |
|
"rewards/accuracy_reward": 0.1916666716337204, |
|
"rewards/format_reward": 0.002083333395421505, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 569.2666831970215, |
|
"epoch": 0.013252163829875346, |
|
"grad_norm": 0.3228506299764182, |
|
"kl": 0.0017175555229187011, |
|
"learning_rate": 2.6490066225165567e-06, |
|
"loss": 0.0001, |
|
"reward": 0.24375000558793544, |
|
"reward_std": 0.15026019159704446, |
|
"rewards/accuracy_reward": 0.24375000558793544, |
|
"rewards/format_reward": 0.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 547.4958526611329, |
|
"epoch": 0.016565204787344183, |
|
"grad_norm": 0.4765963281561274, |
|
"kl": 0.0036331653594970704, |
|
"learning_rate": 3.311258278145696e-06, |
|
"loss": 0.0001, |
|
"reward": 0.24166667219251395, |
|
"reward_std": 0.13552880007773638, |
|
"rewards/accuracy_reward": 0.24166667219251395, |
|
"rewards/format_reward": 0.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 544.0520998001099, |
|
"epoch": 0.01987824574481302, |
|
"grad_norm": 0.21762305936684645, |
|
"kl": 0.00529632568359375, |
|
"learning_rate": 3.973509933774835e-06, |
|
"loss": 0.0002, |
|
"reward": 0.260416672565043, |
|
"reward_std": 0.10311973933130503, |
|
"rewards/accuracy_reward": 0.260416672565043, |
|
"rewards/format_reward": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 561.004182434082, |
|
"epoch": 0.023191286702281856, |
|
"grad_norm": 0.42754577646802083, |
|
"kl": 0.0049117088317871095, |
|
"learning_rate": 4.635761589403974e-06, |
|
"loss": 0.0002, |
|
"reward": 0.23541667256504298, |
|
"reward_std": 0.15026019141077995, |
|
"rewards/accuracy_reward": 0.23541667256504298, |
|
"rewards/format_reward": 0.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 558.4416833877564, |
|
"epoch": 0.026504327659750693, |
|
"grad_norm": 0.38474962439429805, |
|
"kl": 0.00634613037109375, |
|
"learning_rate": 5.2980132450331135e-06, |
|
"loss": 0.0003, |
|
"reward": 0.23958333898335696, |
|
"reward_std": 0.13258252181112767, |
|
"rewards/accuracy_reward": 0.23958333898335696, |
|
"rewards/format_reward": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 587.2520998001098, |
|
"epoch": 0.02981736861721953, |
|
"grad_norm": 0.38278137652247257, |
|
"kl": 0.008282661437988281, |
|
"learning_rate": 5.960264900662252e-06, |
|
"loss": 0.0003, |
|
"reward": 0.26666667237877845, |
|
"reward_std": 0.12963624354451894, |
|
"rewards/accuracy_reward": 0.26666667237877845, |
|
"rewards/format_reward": 0.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 585.193766784668, |
|
"epoch": 0.033130409574688366, |
|
"grad_norm": 0.46609514480011244, |
|
"kl": 0.011644363403320312, |
|
"learning_rate": 6.622516556291392e-06, |
|
"loss": 0.0005, |
|
"reward": 0.28750000689178706, |
|
"reward_std": 0.1590990262106061, |
|
"rewards/accuracy_reward": 0.28750000689178706, |
|
"rewards/format_reward": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 604.7958530426025, |
|
"epoch": 0.0364434505321572, |
|
"grad_norm": 0.39658289344370634, |
|
"kl": 0.014037322998046876, |
|
"learning_rate": 7.28476821192053e-06, |
|
"loss": 0.0006, |
|
"reward": 0.2145833380520344, |
|
"reward_std": 0.1207974087446928, |
|
"rewards/accuracy_reward": 0.2145833380520344, |
|
"rewards/format_reward": 0.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 573.2937698364258, |
|
"epoch": 0.03975649148962604, |
|
"grad_norm": 0.3562470913913186, |
|
"kl": 0.015195465087890625, |
|
"learning_rate": 7.94701986754967e-06, |
|
"loss": 0.0006, |
|
"reward": 0.2666666736826301, |
|
"reward_std": 0.16499158274382353, |
|
"rewards/accuracy_reward": 0.2666666736826301, |
|
"rewards/format_reward": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 523.7500129699707, |
|
"epoch": 0.043069532447094876, |
|
"grad_norm": 0.44620950890040406, |
|
"kl": 0.021143341064453126, |
|
"learning_rate": 8.609271523178809e-06, |
|
"loss": 0.0008, |
|
"reward": 0.2583333391696215, |
|
"reward_std": 0.1590990262106061, |
|
"rewards/accuracy_reward": 0.2583333391696215, |
|
"rewards/format_reward": 0.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 570.8146015167237, |
|
"epoch": 0.04638257340456371, |
|
"grad_norm": 0.3096717335604667, |
|
"kl": 0.02764739990234375, |
|
"learning_rate": 9.271523178807948e-06, |
|
"loss": 0.0011, |
|
"reward": 0.28541667349636557, |
|
"reward_std": 0.1797229742631316, |
|
"rewards/accuracy_reward": 0.28541667349636557, |
|
"rewards/format_reward": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 560.4291816711426, |
|
"epoch": 0.04969561436203255, |
|
"grad_norm": 0.41540849647955475, |
|
"kl": 0.03727569580078125, |
|
"learning_rate": 9.933774834437086e-06, |
|
"loss": 0.0015, |
|
"reward": 0.3145833402872086, |
|
"reward_std": 0.1738304177299142, |
|
"rewards/accuracy_reward": 0.3145833402872086, |
|
"rewards/format_reward": 0.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 588.3562690734864, |
|
"epoch": 0.053008655319501385, |
|
"grad_norm": 0.3491763818431896, |
|
"kl": 0.0417724609375, |
|
"learning_rate": 1.0596026490066227e-05, |
|
"loss": 0.0017, |
|
"reward": 0.2979166731238365, |
|
"reward_std": 0.16793786101043223, |
|
"rewards/accuracy_reward": 0.2979166731238365, |
|
"rewards/format_reward": 0.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 591.312519454956, |
|
"epoch": 0.05632169627697022, |
|
"grad_norm": 0.2088425343382679, |
|
"kl": 0.05334014892578125, |
|
"learning_rate": 1.1258278145695364e-05, |
|
"loss": 0.0021, |
|
"reward": 0.21458333786576986, |
|
"reward_std": 0.11490485221147537, |
|
"rewards/accuracy_reward": 0.21458333786576986, |
|
"rewards/format_reward": 0.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 559.370846748352, |
|
"epoch": 0.05963473723443906, |
|
"grad_norm": 0.2070181739527896, |
|
"kl": 0.05329437255859375, |
|
"learning_rate": 1.1920529801324505e-05, |
|
"loss": 0.0021, |
|
"reward": 0.2020833384245634, |
|
"reward_std": 0.1561527479439974, |
|
"rewards/accuracy_reward": 0.2020833384245634, |
|
"rewards/format_reward": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 510.6500156402588, |
|
"epoch": 0.0629477781919079, |
|
"grad_norm": 0.37541642613803017, |
|
"kl": 0.0571197509765625, |
|
"learning_rate": 1.2582781456953644e-05, |
|
"loss": 0.0023, |
|
"reward": 0.2145833382382989, |
|
"reward_std": 0.12079740893095732, |
|
"rewards/accuracy_reward": 0.2145833382382989, |
|
"rewards/format_reward": 0.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 525.6958492279052, |
|
"epoch": 0.06626081914937673, |
|
"grad_norm": 0.42588005101800375, |
|
"kl": 0.0678314208984375, |
|
"learning_rate": 1.3245033112582784e-05, |
|
"loss": 0.0027, |
|
"reward": 0.19166667126119136, |
|
"reward_std": 0.12963624373078347, |
|
"rewards/accuracy_reward": 0.19166667126119136, |
|
"rewards/format_reward": 0.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06626081914937673, |
|
"eval_completion_length": 585.5294297162225, |
|
"eval_kl": 0.0954733455882353, |
|
"eval_loss": 0.0038791955448687077, |
|
"eval_reward": 0.1617647111415863, |
|
"eval_reward_std": 0.09012145242270302, |
|
"eval_rewards/accuracy_reward": 0.1617647111415863, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 83.6505, |
|
"eval_samples_per_second": 1.183, |
|
"eval_steps_per_second": 0.108, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 605.8437679290771, |
|
"epoch": 0.06957386010684558, |
|
"grad_norm": 0.38186731473973345, |
|
"kl": 0.087103271484375, |
|
"learning_rate": 1.3907284768211921e-05, |
|
"loss": 0.0035, |
|
"reward": 0.17083333767950534, |
|
"reward_std": 0.12374368719756604, |
|
"rewards/accuracy_reward": 0.17083333767950534, |
|
"rewards/format_reward": 0.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 612.3166820526124, |
|
"epoch": 0.0728869010643144, |
|
"grad_norm": 0.31009682442787734, |
|
"kl": 0.0819427490234375, |
|
"learning_rate": 1.456953642384106e-05, |
|
"loss": 0.0033, |
|
"reward": 0.17916667107492684, |
|
"reward_std": 0.12963624373078347, |
|
"rewards/accuracy_reward": 0.17916667107492684, |
|
"rewards/format_reward": 0.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 588.9187660217285, |
|
"epoch": 0.07619994202178325, |
|
"grad_norm": 0.42260706748908966, |
|
"kl": 0.080755615234375, |
|
"learning_rate": 1.52317880794702e-05, |
|
"loss": 0.0032, |
|
"reward": 0.1541666703298688, |
|
"reward_std": 0.10017346106469631, |
|
"rewards/accuracy_reward": 0.1541666703298688, |
|
"rewards/format_reward": 0.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 499.27501373291017, |
|
"epoch": 0.07951298297925208, |
|
"grad_norm": 0.40235696345652067, |
|
"kl": 0.096820068359375, |
|
"learning_rate": 1.589403973509934e-05, |
|
"loss": 0.0039, |
|
"reward": 0.15000000447034836, |
|
"reward_std": 0.08838834799826145, |
|
"rewards/accuracy_reward": 0.15000000447034836, |
|
"rewards/format_reward": 0.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 500.2541803359985, |
|
"epoch": 0.08282602393672092, |
|
"grad_norm": 0.3313046600076135, |
|
"kl": 0.1076812744140625, |
|
"learning_rate": 1.6556291390728477e-05, |
|
"loss": 0.0043, |
|
"reward": 0.14166667088866233, |
|
"reward_std": 0.10606601759791374, |
|
"rewards/accuracy_reward": 0.14166667088866233, |
|
"rewards/format_reward": 0.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 549.8687664031983, |
|
"epoch": 0.08613906489418975, |
|
"grad_norm": 0.3284282629828314, |
|
"kl": 0.1057586669921875, |
|
"learning_rate": 1.7218543046357617e-05, |
|
"loss": 0.0042, |
|
"reward": 0.14375000391155482, |
|
"reward_std": 0.12668996546417474, |
|
"rewards/accuracy_reward": 0.14375000391155482, |
|
"rewards/format_reward": 0.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 554.0104347229004, |
|
"epoch": 0.0894521058516586, |
|
"grad_norm": 0.3273867814905218, |
|
"kl": 0.1071990966796875, |
|
"learning_rate": 1.7880794701986758e-05, |
|
"loss": 0.0043, |
|
"reward": 0.11041666958481074, |
|
"reward_std": 0.09722718279808759, |
|
"rewards/accuracy_reward": 0.11041666958481074, |
|
"rewards/format_reward": 0.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 580.7979354858398, |
|
"epoch": 0.09276514680912742, |
|
"grad_norm": 0.3375694940246247, |
|
"kl": 0.1095489501953125, |
|
"learning_rate": 1.8543046357615895e-05, |
|
"loss": 0.0044, |
|
"reward": 0.13125000316649676, |
|
"reward_std": 0.10311973933130503, |
|
"rewards/accuracy_reward": 0.13125000316649676, |
|
"rewards/format_reward": 0.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 530.2166820526123, |
|
"epoch": 0.09607818776659627, |
|
"grad_norm": 0.4769828906690383, |
|
"kl": 0.12447509765625, |
|
"learning_rate": 1.9205298013245036e-05, |
|
"loss": 0.005, |
|
"reward": 0.14375000298023224, |
|
"reward_std": 0.12079740893095732, |
|
"rewards/accuracy_reward": 0.14375000298023224, |
|
"rewards/format_reward": 0.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 517.9875141143799, |
|
"epoch": 0.0993912287240651, |
|
"grad_norm": 0.3406117455598423, |
|
"kl": 0.1377685546875, |
|
"learning_rate": 1.9867549668874173e-05, |
|
"loss": 0.0055, |
|
"reward": 0.11250000260770321, |
|
"reward_std": 0.10017346087843179, |
|
"rewards/accuracy_reward": 0.11250000260770321, |
|
"rewards/format_reward": 0.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 529.8146003723144, |
|
"epoch": 0.10270426968153394, |
|
"grad_norm": 0.4711687133275435, |
|
"kl": 0.138232421875, |
|
"learning_rate": 1.999957185872951e-05, |
|
"loss": 0.0055, |
|
"reward": 0.14791667014360427, |
|
"reward_std": 0.1207974087446928, |
|
"rewards/accuracy_reward": 0.14791667014360427, |
|
"rewards/format_reward": 0.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 570.939599609375, |
|
"epoch": 0.10601731063900277, |
|
"grad_norm": 0.25580347999222874, |
|
"kl": 0.1270263671875, |
|
"learning_rate": 1.999783259765003e-05, |
|
"loss": 0.0051, |
|
"reward": 0.12500000298023223, |
|
"reward_std": 0.11195857413113117, |
|
"rewards/accuracy_reward": 0.12500000298023223, |
|
"rewards/format_reward": 0.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 494.0750160217285, |
|
"epoch": 0.10933035159647161, |
|
"grad_norm": 0.4334715879020743, |
|
"kl": 0.19808349609375, |
|
"learning_rate": 1.9994755690455154e-05, |
|
"loss": 0.0079, |
|
"reward": 0.12708333618938922, |
|
"reward_std": 0.12079740893095732, |
|
"rewards/accuracy_reward": 0.12708333618938922, |
|
"rewards/format_reward": 0.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 465.81459465026853, |
|
"epoch": 0.11264339255394044, |
|
"grad_norm": 0.19728139906086956, |
|
"kl": 0.20181884765625, |
|
"learning_rate": 1.99903415488154e-05, |
|
"loss": 0.0081, |
|
"reward": 0.08541666902601719, |
|
"reward_std": 0.06187184359878302, |
|
"rewards/accuracy_reward": 0.08541666902601719, |
|
"rewards/format_reward": 0.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 511.3020980834961, |
|
"epoch": 0.11595643351140929, |
|
"grad_norm": 0.34016272031635464, |
|
"kl": 0.22132568359375, |
|
"learning_rate": 1.9984590763314722e-05, |
|
"loss": 0.0089, |
|
"reward": 0.07083333544433117, |
|
"reward_std": 0.0589255653321743, |
|
"rewards/accuracy_reward": 0.07083333544433117, |
|
"rewards/format_reward": 0.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 591.8833505630494, |
|
"epoch": 0.11926947446887812, |
|
"grad_norm": 0.23300547193780718, |
|
"kl": 0.193109130859375, |
|
"learning_rate": 1.997750410337147e-05, |
|
"loss": 0.0077, |
|
"reward": 0.0416666679084301, |
|
"reward_std": 0.047140452265739444, |
|
"rewards/accuracy_reward": 0.0416666679084301, |
|
"rewards/format_reward": 0.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 607.9291858673096, |
|
"epoch": 0.12258251542634696, |
|
"grad_norm": 0.23907198670526106, |
|
"kl": 0.180279541015625, |
|
"learning_rate": 1.9969082517135463e-05, |
|
"loss": 0.0072, |
|
"reward": 0.07916666846722364, |
|
"reward_std": 0.06481812186539174, |
|
"rewards/accuracy_reward": 0.07916666846722364, |
|
"rewards/format_reward": 0.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 533.2396003723145, |
|
"epoch": 0.1258955563838158, |
|
"grad_norm": 0.32468582498361115, |
|
"kl": 0.1746826171875, |
|
"learning_rate": 1.995932713136112e-05, |
|
"loss": 0.007, |
|
"reward": 0.08125000223517417, |
|
"reward_std": 0.07954951319843531, |
|
"rewards/accuracy_reward": 0.08125000223517417, |
|
"rewards/format_reward": 0.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 578.1500179290772, |
|
"epoch": 0.12920859734128462, |
|
"grad_norm": 0.6318480566887794, |
|
"kl": 0.17467041015625, |
|
"learning_rate": 1.994823925125672e-05, |
|
"loss": 0.007, |
|
"reward": 0.10833333637565375, |
|
"reward_std": 0.10606601759791374, |
|
"rewards/accuracy_reward": 0.10833333637565375, |
|
"rewards/format_reward": 0.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 532.8479331970215, |
|
"epoch": 0.13252163829875346, |
|
"grad_norm": 0.31162375914822543, |
|
"kl": 0.209014892578125, |
|
"learning_rate": 1.993582036030978e-05, |
|
"loss": 0.0084, |
|
"reward": 0.11041666977107525, |
|
"reward_std": 0.09722718279808759, |
|
"rewards/accuracy_reward": 0.11041666977107525, |
|
"rewards/format_reward": 0.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.13252163829875346, |
|
"eval_completion_length": 461.04903097713697, |
|
"eval_kl": 0.22242647058823528, |
|
"eval_loss": 0.009009506553411484, |
|
"eval_reward": 0.1274509846287615, |
|
"eval_reward_std": 0.08318903253358953, |
|
"eval_rewards/accuracy_reward": 0.1274509846287615, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 64.4615, |
|
"eval_samples_per_second": 1.536, |
|
"eval_steps_per_second": 0.14, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 461.4708471298218, |
|
"epoch": 0.1358346792562223, |
|
"grad_norm": 0.941704833763444, |
|
"kl": 0.1978759765625, |
|
"learning_rate": 1.9922072120088537e-05, |
|
"loss": 0.0079, |
|
"reward": 0.08125000242143869, |
|
"reward_std": 0.07954951319843531, |
|
"rewards/accuracy_reward": 0.08125000242143869, |
|
"rewards/format_reward": 0.0, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 525.7250125885009, |
|
"epoch": 0.13914772021369115, |
|
"grad_norm": 0.429496265642979, |
|
"kl": 0.1792236328125, |
|
"learning_rate": 1.9906996370019692e-05, |
|
"loss": 0.0072, |
|
"reward": 0.0916666692122817, |
|
"reward_std": 0.09428090453147889, |
|
"rewards/accuracy_reward": 0.0916666692122817, |
|
"rewards/format_reward": 0.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 566.6875190734863, |
|
"epoch": 0.14246076117115997, |
|
"grad_norm": 1.1240583407373201, |
|
"kl": 15.97672119140625, |
|
"learning_rate": 1.989059512714227e-05, |
|
"loss": 0.6397, |
|
"reward": 0.0916666690260172, |
|
"reward_std": 0.08838834781199693, |
|
"rewards/accuracy_reward": 0.0916666690260172, |
|
"rewards/format_reward": 0.0, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 542.343769454956, |
|
"epoch": 0.1457738021286288, |
|
"grad_norm": 0.3232677681544197, |
|
"kl": 0.189068603515625, |
|
"learning_rate": 1.9872870585837757e-05, |
|
"loss": 0.0076, |
|
"reward": 0.12500000298023223, |
|
"reward_std": 0.10606601759791374, |
|
"rewards/accuracy_reward": 0.12500000298023223, |
|
"rewards/format_reward": 0.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 546.4791807174682, |
|
"epoch": 0.14908684308609765, |
|
"grad_norm": 0.8875397199463153, |
|
"kl": 0.188201904296875, |
|
"learning_rate": 1.9853825117536522e-05, |
|
"loss": 0.0075, |
|
"reward": 0.13333333637565375, |
|
"reward_std": 0.0766032349318266, |
|
"rewards/accuracy_reward": 0.13333333637565375, |
|
"rewards/format_reward": 0.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 540.5750137329102, |
|
"epoch": 0.1523998840435665, |
|
"grad_norm": 0.42221258320463867, |
|
"kl": 0.19306640625, |
|
"learning_rate": 1.983346127040053e-05, |
|
"loss": 0.0077, |
|
"reward": 0.12083333656191826, |
|
"reward_std": 0.12374368719756604, |
|
"rewards/accuracy_reward": 0.12083333656191826, |
|
"rewards/format_reward": 0.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 448.9229295730591, |
|
"epoch": 0.1557129250010353, |
|
"grad_norm": 0.3606831391556023, |
|
"kl": 0.24759521484375, |
|
"learning_rate": 1.9811781768982392e-05, |
|
"loss": 0.0099, |
|
"reward": 0.08125000223517417, |
|
"reward_std": 0.10311973914504051, |
|
"rewards/accuracy_reward": 0.08125000223517417, |
|
"rewards/format_reward": 0.0, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 479.3083478927612, |
|
"epoch": 0.15902596595850416, |
|
"grad_norm": 0.519351205279088, |
|
"kl": 0.19686279296875, |
|
"learning_rate": 1.9788789513860875e-05, |
|
"loss": 0.0079, |
|
"reward": 0.10000000279396773, |
|
"reward_std": 0.09428090453147889, |
|
"rewards/accuracy_reward": 0.10000000279396773, |
|
"rewards/format_reward": 0.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 543.2770980834961, |
|
"epoch": 0.162339006915973, |
|
"grad_norm": 0.2739435166284146, |
|
"kl": 0.192205810546875, |
|
"learning_rate": 1.9764487581252787e-05, |
|
"loss": 0.0077, |
|
"reward": 0.10416666883975267, |
|
"reward_std": 0.10017346087843179, |
|
"rewards/accuracy_reward": 0.10416666883975267, |
|
"rewards/format_reward": 0.0, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 604.7562683105468, |
|
"epoch": 0.16565204787344184, |
|
"grad_norm": 0.22639850925377564, |
|
"kl": 0.18382568359375, |
|
"learning_rate": 1.9738879222601425e-05, |
|
"loss": 0.0074, |
|
"reward": 0.08750000204890966, |
|
"reward_std": 0.08249579146504402, |
|
"rewards/accuracy_reward": 0.08750000204890966, |
|
"rewards/format_reward": 0.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 508.62084617614744, |
|
"epoch": 0.16896508883091066, |
|
"grad_norm": 0.34102274429245655, |
|
"kl": 26.4016845703125, |
|
"learning_rate": 1.9711967864141542e-05, |
|
"loss": 1.0641, |
|
"reward": 0.12083333600312471, |
|
"reward_std": 0.11195857394486666, |
|
"rewards/accuracy_reward": 0.12083333600312471, |
|
"rewards/format_reward": 0.0, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 479.06876335144045, |
|
"epoch": 0.1722781297883795, |
|
"grad_norm": 0.3926595209195284, |
|
"kl": 0.193994140625, |
|
"learning_rate": 1.968375710644093e-05, |
|
"loss": 0.0078, |
|
"reward": 0.11875000279396772, |
|
"reward_std": 0.1207974087446928, |
|
"rewards/accuracy_reward": 0.11875000279396772, |
|
"rewards/format_reward": 0.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 587.6750179290772, |
|
"epoch": 0.17559117074584835, |
|
"grad_norm": 0.24108678658537444, |
|
"kl": 0.18203125, |
|
"learning_rate": 1.9654250723918706e-05, |
|
"loss": 0.0073, |
|
"reward": 0.1104166692122817, |
|
"reward_std": 0.08544206954538822, |
|
"rewards/accuracy_reward": 0.1104166692122817, |
|
"rewards/format_reward": 0.0, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 677.4437683105468, |
|
"epoch": 0.1789042117033172, |
|
"grad_norm": 0.18106832646177998, |
|
"kl": 0.18656005859375, |
|
"learning_rate": 1.9623452664340305e-05, |
|
"loss": 0.0075, |
|
"reward": 0.07708333525806665, |
|
"reward_std": 0.05597928706556558, |
|
"rewards/accuracy_reward": 0.07708333525806665, |
|
"rewards/format_reward": 0.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 580.7875162124634, |
|
"epoch": 0.182217252660786, |
|
"grad_norm": 0.3207457026346929, |
|
"kl": 0.19422607421875, |
|
"learning_rate": 1.9591367048289297e-05, |
|
"loss": 0.0078, |
|
"reward": 0.07916666883975268, |
|
"reward_std": 0.07071067839860916, |
|
"rewards/accuracy_reward": 0.07916666883975268, |
|
"rewards/format_reward": 0.0, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 529.9458477020264, |
|
"epoch": 0.18553029361825485, |
|
"grad_norm": 0.3268544056694053, |
|
"kl": 0.183233642578125, |
|
"learning_rate": 1.9557998168616087e-05, |
|
"loss": 0.0073, |
|
"reward": 0.11875000316649675, |
|
"reward_std": 0.09133462626487017, |
|
"rewards/accuracy_reward": 0.11875000316649675, |
|
"rewards/format_reward": 0.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 465.0104305267334, |
|
"epoch": 0.1888433345757237, |
|
"grad_norm": 0.48827428252889743, |
|
"kl": 0.17584228515625, |
|
"learning_rate": 1.9523350489863545e-05, |
|
"loss": 0.007, |
|
"reward": 0.1437500037252903, |
|
"reward_std": 0.10901229586452246, |
|
"rewards/accuracy_reward": 0.1437500037252903, |
|
"rewards/format_reward": 0.0, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 444.8500112533569, |
|
"epoch": 0.19215637553319254, |
|
"grad_norm": 0.4683984486186781, |
|
"kl": 0.18759765625, |
|
"learning_rate": 1.9487428647669688e-05, |
|
"loss": 0.0075, |
|
"reward": 0.11458333637565374, |
|
"reward_std": 0.09722718279808759, |
|
"rewards/accuracy_reward": 0.11458333637565374, |
|
"rewards/format_reward": 0.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 500.3625148773193, |
|
"epoch": 0.19546941649066138, |
|
"grad_norm": 0.1814212886292743, |
|
"kl": 0.18865966796875, |
|
"learning_rate": 1.9450237448147463e-05, |
|
"loss": 0.0076, |
|
"reward": 0.0958333358168602, |
|
"reward_std": 0.08249579146504402, |
|
"rewards/accuracy_reward": 0.0958333358168602, |
|
"rewards/format_reward": 0.0, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 566.7708499908447, |
|
"epoch": 0.1987824574481302, |
|
"grad_norm": 0.24448480046541934, |
|
"kl": 0.201904296875, |
|
"learning_rate": 1.9411781867241718e-05, |
|
"loss": 0.0081, |
|
"reward": 0.0916666692122817, |
|
"reward_std": 0.09428090453147889, |
|
"rewards/accuracy_reward": 0.0916666692122817, |
|
"rewards/format_reward": 0.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1987824574481302, |
|
"eval_completion_length": 547.3529555376838, |
|
"eval_kl": 0.18393841911764705, |
|
"eval_loss": 0.007359905168414116, |
|
"eval_reward": 0.112745100961012, |
|
"eval_reward_std": 0.10398629176266053, |
|
"eval_rewards/accuracy_reward": 0.112745100961012, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 77.7803, |
|
"eval_samples_per_second": 1.273, |
|
"eval_steps_per_second": 0.116, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 530.7312660217285, |
|
"epoch": 0.20209549840559904, |
|
"grad_norm": 0.20881065313746844, |
|
"kl": 0.196630859375, |
|
"learning_rate": 1.937206705006344e-05, |
|
"loss": 0.0079, |
|
"reward": 0.12708333600312471, |
|
"reward_std": 0.07954951319843531, |
|
"rewards/accuracy_reward": 0.12708333600312471, |
|
"rewards/format_reward": 0.0, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 467.48334827423093, |
|
"epoch": 0.20540853936306788, |
|
"grad_norm": 0.2581604337535273, |
|
"kl": 0.188153076171875, |
|
"learning_rate": 1.9331098310201392e-05, |
|
"loss": 0.0075, |
|
"reward": 0.10416666921228171, |
|
"reward_std": 0.10017346087843179, |
|
"rewards/accuracy_reward": 0.10416666921228171, |
|
"rewards/format_reward": 0.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 550.3041828155517, |
|
"epoch": 0.20872158032053673, |
|
"grad_norm": 0.3018033078636091, |
|
"kl": 0.183837890625, |
|
"learning_rate": 1.9288881129011177e-05, |
|
"loss": 0.0074, |
|
"reward": 0.12083333656191826, |
|
"reward_std": 0.12374368719756604, |
|
"rewards/accuracy_reward": 0.12083333656191826, |
|
"rewards/format_reward": 0.0, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 554.0333499908447, |
|
"epoch": 0.21203462127800554, |
|
"grad_norm": 0.37948238789004246, |
|
"kl": 0.2573486328125, |
|
"learning_rate": 1.9245421154881873e-05, |
|
"loss": 0.0103, |
|
"reward": 0.10416666958481073, |
|
"reward_std": 0.10606601759791374, |
|
"rewards/accuracy_reward": 0.10416666958481073, |
|
"rewards/format_reward": 0.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 495.84793186187744, |
|
"epoch": 0.21534766223547439, |
|
"grad_norm": 0.3507554355169013, |
|
"kl": 0.20789794921875, |
|
"learning_rate": 1.9200724202480305e-05, |
|
"loss": 0.0083, |
|
"reward": 0.08125000223517417, |
|
"reward_std": 0.08544206973165273, |
|
"rewards/accuracy_reward": 0.08125000223517417, |
|
"rewards/format_reward": 0.0, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 439.99584617614744, |
|
"epoch": 0.21866070319294323, |
|
"grad_norm": 0.33415650171071337, |
|
"kl": 0.20986328125, |
|
"learning_rate": 1.9154796251973092e-05, |
|
"loss": 0.0084, |
|
"reward": 0.11250000316649675, |
|
"reward_std": 0.08249579146504402, |
|
"rewards/accuracy_reward": 0.11250000316649675, |
|
"rewards/format_reward": 0.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 502.4770965576172, |
|
"epoch": 0.22197374415041207, |
|
"grad_norm": 0.29077054623985626, |
|
"kl": 0.2102783203125, |
|
"learning_rate": 1.9107643448226536e-05, |
|
"loss": 0.0084, |
|
"reward": 0.12291667032986879, |
|
"reward_std": 0.09722718279808759, |
|
"rewards/accuracy_reward": 0.12291667032986879, |
|
"rewards/format_reward": 0.0, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 492.98126487731935, |
|
"epoch": 0.2252867851078809, |
|
"grad_norm": 0.32276994073724186, |
|
"kl": 0.2305908203125, |
|
"learning_rate": 1.905927209998447e-05, |
|
"loss": 0.0092, |
|
"reward": 0.13750000335276127, |
|
"reward_std": 0.10606601759791374, |
|
"rewards/accuracy_reward": 0.13750000335276127, |
|
"rewards/format_reward": 0.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 496.520845413208, |
|
"epoch": 0.22859982606534973, |
|
"grad_norm": 0.3397890414322317, |
|
"kl": 0.20244140625, |
|
"learning_rate": 1.900968867902419e-05, |
|
"loss": 0.0081, |
|
"reward": 0.1000000024214387, |
|
"reward_std": 0.10017346106469631, |
|
"rewards/accuracy_reward": 0.1000000024214387, |
|
"rewards/format_reward": 0.0, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 539.0625156402588, |
|
"epoch": 0.23191286702281858, |
|
"grad_norm": 0.34646557191148036, |
|
"kl": 0.18717041015625, |
|
"learning_rate": 1.8958899819290592e-05, |
|
"loss": 0.0075, |
|
"reward": 0.1020833358168602, |
|
"reward_std": 0.08544206973165273, |
|
"rewards/accuracy_reward": 0.1020833358168602, |
|
"rewards/format_reward": 0.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 534.3270980834961, |
|
"epoch": 0.23522590798028742, |
|
"grad_norm": 0.306741354263696, |
|
"kl": 0.2046142578125, |
|
"learning_rate": 1.890691231600856e-05, |
|
"loss": 0.0082, |
|
"reward": 0.0916666692122817, |
|
"reward_std": 0.09428090434521437, |
|
"rewards/accuracy_reward": 0.0916666692122817, |
|
"rewards/format_reward": 0.0, |
|
"step": 355 |
|
}, |
|
{ |
|
"completion_length": 551.9895969390869, |
|
"epoch": 0.23853894893775623, |
|
"grad_norm": 0.293113566609471, |
|
"kl": 0.2113525390625, |
|
"learning_rate": 1.8853733124773837e-05, |
|
"loss": 0.0085, |
|
"reward": 0.12083333656191826, |
|
"reward_std": 0.10017346106469631, |
|
"rewards/accuracy_reward": 0.12083333656191826, |
|
"rewards/format_reward": 0.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 487.33751411437987, |
|
"epoch": 0.24185198989522508, |
|
"grad_norm": 0.3636802453570497, |
|
"kl": 0.20135498046875, |
|
"learning_rate": 1.8799369360622394e-05, |
|
"loss": 0.0081, |
|
"reward": 0.13750000316649674, |
|
"reward_std": 0.10606601722538471, |
|
"rewards/accuracy_reward": 0.13750000316649674, |
|
"rewards/format_reward": 0.0, |
|
"step": 365 |
|
}, |
|
{ |
|
"completion_length": 432.9791788101196, |
|
"epoch": 0.24516503085269392, |
|
"grad_norm": 0.7560117165844005, |
|
"kl": 0.20098876953125, |
|
"learning_rate": 1.8743828297078485e-05, |
|
"loss": 0.008, |
|
"reward": 0.1062500026077032, |
|
"reward_std": 0.09133462626487017, |
|
"rewards/accuracy_reward": 0.1062500026077032, |
|
"rewards/format_reward": 0.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 461.3520961761475, |
|
"epoch": 0.24847807181016277, |
|
"grad_norm": 0.24674434059917383, |
|
"kl": 0.2183349609375, |
|
"learning_rate": 1.8687117365181514e-05, |
|
"loss": 0.0087, |
|
"reward": 0.12916666995733977, |
|
"reward_std": 0.10606601759791374, |
|
"rewards/accuracy_reward": 0.12916666995733977, |
|
"rewards/format_reward": 0.0, |
|
"step": 375 |
|
}, |
|
{ |
|
"completion_length": 571.5291835784913, |
|
"epoch": 0.2517911127676316, |
|
"grad_norm": 0.25861125382341793, |
|
"kl": 0.2226318359375, |
|
"learning_rate": 1.8629244152491773e-05, |
|
"loss": 0.0089, |
|
"reward": 0.07291666846722364, |
|
"reward_std": 0.06187184359878302, |
|
"rewards/accuracy_reward": 0.07291666846722364, |
|
"rewards/format_reward": 0.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 584.9791839599609, |
|
"epoch": 0.2551041537251004, |
|
"grad_norm": 0.37996774263610955, |
|
"kl": 0.20263671875, |
|
"learning_rate": 1.8570216402075326e-05, |
|
"loss": 0.0081, |
|
"reward": 0.10416666958481073, |
|
"reward_std": 0.10606601759791374, |
|
"rewards/accuracy_reward": 0.10416666958481073, |
|
"rewards/format_reward": 0.0, |
|
"step": 385 |
|
}, |
|
{ |
|
"completion_length": 480.7541809082031, |
|
"epoch": 0.25841719468256924, |
|
"grad_norm": 0.4807126789722509, |
|
"kl": 0.2365234375, |
|
"learning_rate": 1.8510042011467978e-05, |
|
"loss": 0.0095, |
|
"reward": 0.0979166692122817, |
|
"reward_std": 0.09722718279808759, |
|
"rewards/accuracy_reward": 0.0979166692122817, |
|
"rewards/format_reward": 0.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 464.12292823791506, |
|
"epoch": 0.2617302356400381, |
|
"grad_norm": 0.2806539928587211, |
|
"kl": 0.22197265625, |
|
"learning_rate": 1.8448729031618687e-05, |
|
"loss": 0.0089, |
|
"reward": 0.07083333525806665, |
|
"reward_std": 0.07071067839860916, |
|
"rewards/accuracy_reward": 0.07083333525806665, |
|
"rewards/format_reward": 0.0, |
|
"step": 395 |
|
}, |
|
{ |
|
"completion_length": 521.7875158309937, |
|
"epoch": 0.2650432765975069, |
|
"grad_norm": 0.11840433673859241, |
|
"kl": 0.1973876953125, |
|
"learning_rate": 1.838628566581236e-05, |
|
"loss": 0.0079, |
|
"reward": 0.08333333544433116, |
|
"reward_std": 0.0766032349318266, |
|
"rewards/accuracy_reward": 0.08333333544433116, |
|
"rewards/format_reward": 0.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2650432765975069, |
|
"eval_completion_length": 560.7108028636259, |
|
"eval_kl": 0.17957261029411764, |
|
"eval_loss": 0.007205521687865257, |
|
"eval_reward": 0.07352941307951422, |
|
"eval_reward_std": 0.06239177505759632, |
|
"eval_rewards/accuracy_reward": 0.07352941307951422, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 83.1386, |
|
"eval_samples_per_second": 1.191, |
|
"eval_steps_per_second": 0.108, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 595.2062665939332, |
|
"epoch": 0.2683563175549758, |
|
"grad_norm": 0.33533763786101467, |
|
"kl": 0.1843994140625, |
|
"learning_rate": 1.8322720268572333e-05, |
|
"loss": 0.0074, |
|
"reward": 0.09375000242143869, |
|
"reward_std": 0.07365695666521788, |
|
"rewards/accuracy_reward": 0.09375000242143869, |
|
"rewards/format_reward": 0.0, |
|
"step": 405 |
|
}, |
|
{ |
|
"completion_length": 549.1041831970215, |
|
"epoch": 0.2716693585124446, |
|
"grad_norm": 0.503465226947629, |
|
"kl": 0.19075927734375, |
|
"learning_rate": 1.8258041344542567e-05, |
|
"loss": 0.0076, |
|
"reward": 0.0895833358168602, |
|
"reward_std": 0.07954951319843531, |
|
"rewards/accuracy_reward": 0.0895833358168602, |
|
"rewards/format_reward": 0.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"completion_length": 539.4666854858399, |
|
"epoch": 0.27498239946991343, |
|
"grad_norm": 0.2542908863882778, |
|
"kl": 0.200390625, |
|
"learning_rate": 1.8192257547349805e-05, |
|
"loss": 0.008, |
|
"reward": 0.08541666883975267, |
|
"reward_std": 0.07365695666521788, |
|
"rewards/accuracy_reward": 0.08541666883975267, |
|
"rewards/format_reward": 0.0, |
|
"step": 415 |
|
}, |
|
{ |
|
"completion_length": 490.5791805267334, |
|
"epoch": 0.2782954404273823, |
|
"grad_norm": 0.3338735472126353, |
|
"kl": 0.19786376953125, |
|
"learning_rate": 1.8125377678445755e-05, |
|
"loss": 0.0079, |
|
"reward": 0.08958333563059569, |
|
"reward_std": 0.07954951319843531, |
|
"rewards/accuracy_reward": 0.08958333563059569, |
|
"rewards/format_reward": 0.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 442.22501316070554, |
|
"epoch": 0.2816084813848511, |
|
"grad_norm": 0.31309794250892437, |
|
"kl": 0.1982177734375, |
|
"learning_rate": 1.8057410685929505e-05, |
|
"loss": 0.0079, |
|
"reward": 0.09375000223517418, |
|
"reward_std": 0.08544206954538822, |
|
"rewards/accuracy_reward": 0.09375000223517418, |
|
"rewards/format_reward": 0.0, |
|
"step": 425 |
|
}, |
|
{ |
|
"completion_length": 610.3291862487793, |
|
"epoch": 0.28492152234231993, |
|
"grad_norm": 0.18677755115855318, |
|
"kl": 0.18875732421875, |
|
"learning_rate": 1.7988365663350352e-05, |
|
"loss": 0.0076, |
|
"reward": 0.09375000279396772, |
|
"reward_std": 0.07954951319843531, |
|
"rewards/accuracy_reward": 0.09375000279396772, |
|
"rewards/format_reward": 0.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"completion_length": 618.6645980834961, |
|
"epoch": 0.2882345632997888, |
|
"grad_norm": 0.4382724584840247, |
|
"kl": 0.201416015625, |
|
"learning_rate": 1.7918251848491118e-05, |
|
"loss": 0.0081, |
|
"reward": 0.09375000279396772, |
|
"reward_std": 0.07954951319843531, |
|
"rewards/accuracy_reward": 0.09375000279396772, |
|
"rewards/format_reward": 0.0, |
|
"step": 435 |
|
}, |
|
{ |
|
"completion_length": 509.11668395996094, |
|
"epoch": 0.2915476042572576, |
|
"grad_norm": 0.741790732534879, |
|
"kl": 0.22264404296875, |
|
"learning_rate": 1.7847078622132202e-05, |
|
"loss": 0.0089, |
|
"reward": 0.11666667014360428, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/accuracy_reward": 0.11666667014360428, |
|
"rewards/format_reward": 0.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"completion_length": 455.8645965576172, |
|
"epoch": 0.2948606452147265, |
|
"grad_norm": 0.22712749456065462, |
|
"kl": 0.2342041015625, |
|
"learning_rate": 1.7774855506796497e-05, |
|
"loss": 0.0094, |
|
"reward": 0.08750000223517418, |
|
"reward_std": 0.07071067839860916, |
|
"rewards/accuracy_reward": 0.08750000223517418, |
|
"rewards/format_reward": 0.0, |
|
"step": 445 |
|
}, |
|
{ |
|
"completion_length": 508.2083498001099, |
|
"epoch": 0.2981736861721953, |
|
"grad_norm": 0.2862921365130213, |
|
"kl": 0.19876708984375, |
|
"learning_rate": 1.770159216547532e-05, |
|
"loss": 0.008, |
|
"reward": 0.10833333637565375, |
|
"reward_std": 0.10017346106469631, |
|
"rewards/accuracy_reward": 0.10833333637565375, |
|
"rewards/format_reward": 0.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"completion_length": 610.0416835784912, |
|
"epoch": 0.3014867271296641, |
|
"grad_norm": 0.20694738981947886, |
|
"kl": 0.18927001953125, |
|
"learning_rate": 1.76272984003356e-05, |
|
"loss": 0.0076, |
|
"reward": 0.09583333600312471, |
|
"reward_std": 0.08249579146504402, |
|
"rewards/accuracy_reward": 0.09583333600312471, |
|
"rewards/format_reward": 0.0, |
|
"step": 455 |
|
}, |
|
{ |
|
"completion_length": 579.829182434082, |
|
"epoch": 0.304799768087133, |
|
"grad_norm": 0.27667252528639497, |
|
"kl": 0.203564453125, |
|
"learning_rate": 1.7551984151408363e-05, |
|
"loss": 0.0081, |
|
"reward": 0.13125000335276127, |
|
"reward_std": 0.1207974087446928, |
|
"rewards/accuracy_reward": 0.13125000335276127, |
|
"rewards/format_reward": 0.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"completion_length": 590.5333518981934, |
|
"epoch": 0.3081128090446018, |
|
"grad_norm": 0.2881508370061625, |
|
"kl": 0.22376708984375, |
|
"learning_rate": 1.7475659495258864e-05, |
|
"loss": 0.0089, |
|
"reward": 0.10416666939854621, |
|
"reward_std": 0.08838834799826145, |
|
"rewards/accuracy_reward": 0.10416666939854621, |
|
"rewards/format_reward": 0.0, |
|
"step": 465 |
|
}, |
|
{ |
|
"completion_length": 533.4229331970215, |
|
"epoch": 0.3114258500020706, |
|
"grad_norm": 0.2567790764625792, |
|
"kl": 0.211279296875, |
|
"learning_rate": 1.739833464363838e-05, |
|
"loss": 0.0085, |
|
"reward": 0.11875000298023224, |
|
"reward_std": 0.10311973933130503, |
|
"rewards/accuracy_reward": 0.11875000298023224, |
|
"rewards/format_reward": 0.0, |
|
"step": 470 |
|
}, |
|
{ |
|
"completion_length": 534.5958490371704, |
|
"epoch": 0.3147388909595395, |
|
"grad_norm": 0.28822211835842837, |
|
"kl": 0.21998291015625, |
|
"learning_rate": 1.7320019942117954e-05, |
|
"loss": 0.0088, |
|
"reward": 0.11666666995733976, |
|
"reward_std": 0.09428090453147889, |
|
"rewards/accuracy_reward": 0.11666666995733976, |
|
"rewards/format_reward": 0.0, |
|
"step": 475 |
|
}, |
|
{ |
|
"completion_length": 502.02084770202634, |
|
"epoch": 0.3180519319170083, |
|
"grad_norm": 0.33502620138635836, |
|
"kl": 0.2275390625, |
|
"learning_rate": 1.7240725868704218e-05, |
|
"loss": 0.0091, |
|
"reward": 0.08541666902601719, |
|
"reward_std": 0.07365695666521788, |
|
"rewards/accuracy_reward": 0.08541666902601719, |
|
"rewards/format_reward": 0.0, |
|
"step": 480 |
|
}, |
|
{ |
|
"completion_length": 524.7958484649658, |
|
"epoch": 0.3213649728744772, |
|
"grad_norm": 0.3472442436711827, |
|
"kl": 0.223516845703125, |
|
"learning_rate": 1.71604630324375e-05, |
|
"loss": 0.0089, |
|
"reward": 0.08958333563059569, |
|
"reward_std": 0.08544206973165273, |
|
"rewards/accuracy_reward": 0.08958333563059569, |
|
"rewards/format_reward": 0.0, |
|
"step": 485 |
|
}, |
|
{ |
|
"completion_length": 524.3166828155518, |
|
"epoch": 0.324678013831946, |
|
"grad_norm": 0.3263087112920367, |
|
"kl": 0.22652587890625, |
|
"learning_rate": 1.7079242171972417e-05, |
|
"loss": 0.0091, |
|
"reward": 0.10416666921228171, |
|
"reward_std": 0.09428090453147889, |
|
"rewards/accuracy_reward": 0.10416666921228171, |
|
"rewards/format_reward": 0.0, |
|
"step": 490 |
|
}, |
|
{ |
|
"completion_length": 571.7645992279053, |
|
"epoch": 0.3279910547894148, |
|
"grad_norm": 0.28845204101541877, |
|
"kl": 0.20472412109375, |
|
"learning_rate": 1.6997074154141097e-05, |
|
"loss": 0.0082, |
|
"reward": 0.10833333600312471, |
|
"reward_std": 0.08838834799826145, |
|
"rewards/accuracy_reward": 0.10833333600312471, |
|
"rewards/format_reward": 0.0, |
|
"step": 495 |
|
}, |
|
{ |
|
"completion_length": 551.5854331970215, |
|
"epoch": 0.3313040957468837, |
|
"grad_norm": 0.2761223611585464, |
|
"kl": 0.1948486328125, |
|
"learning_rate": 1.6913969972499272e-05, |
|
"loss": 0.0078, |
|
"reward": 0.12291666977107525, |
|
"reward_std": 0.10311973914504051, |
|
"rewards/accuracy_reward": 0.12291666977107525, |
|
"rewards/format_reward": 0.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3313040957468837, |
|
"eval_completion_length": 505.10295553768384, |
|
"eval_kl": 0.19996553308823528, |
|
"eval_loss": 0.008075407706201077, |
|
"eval_reward": 0.13725490517476024, |
|
"eval_reward_std": 0.1386483881403418, |
|
"eval_rewards/accuracy_reward": 0.13725490517476024, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 76.805, |
|
"eval_samples_per_second": 1.289, |
|
"eval_steps_per_second": 0.117, |
|
"step": 500 |
|
}, |
|
{ |
|
"completion_length": 499.3750162124634, |
|
"epoch": 0.3346171367043525, |
|
"grad_norm": 0.3713745412028642, |
|
"kl": 0.19395751953125, |
|
"learning_rate": 1.682994074585541e-05, |
|
"loss": 0.0078, |
|
"reward": 0.13750000298023224, |
|
"reward_std": 0.10606601741164923, |
|
"rewards/accuracy_reward": 0.13750000298023224, |
|
"rewards/format_reward": 0.0, |
|
"step": 505 |
|
}, |
|
{ |
|
"completion_length": 504.960432434082, |
|
"epoch": 0.3379301776618213, |
|
"grad_norm": 0.2985183472381298, |
|
"kl": 0.208966064453125, |
|
"learning_rate": 1.674499771678309e-05, |
|
"loss": 0.0084, |
|
"reward": 0.11041666958481074, |
|
"reward_std": 0.06776440013200044, |
|
"rewards/accuracy_reward": 0.11041666958481074, |
|
"rewards/format_reward": 0.0, |
|
"step": 510 |
|
}, |
|
{ |
|
"completion_length": 534.1958522796631, |
|
"epoch": 0.3412432186192902, |
|
"grad_norm": 2600.854085321423, |
|
"kl": 52.59376220703125, |
|
"learning_rate": 1.665915225011681e-05, |
|
"loss": 2.0976, |
|
"reward": 0.11458333600312472, |
|
"reward_std": 0.10901229586452246, |
|
"rewards/accuracy_reward": 0.11458333600312472, |
|
"rewards/format_reward": 0.0, |
|
"step": 515 |
|
}, |
|
{ |
|
"completion_length": 599.9729377746582, |
|
"epoch": 0.344556259576759, |
|
"grad_norm": 0.2386666186123784, |
|
"kl": 0.892706298828125, |
|
"learning_rate": 1.6572415831431466e-05, |
|
"loss": 0.0358, |
|
"reward": 0.1062500026077032, |
|
"reward_std": 0.10311973933130503, |
|
"rewards/accuracy_reward": 0.1062500026077032, |
|
"rewards/format_reward": 0.0, |
|
"step": 520 |
|
}, |
|
{ |
|
"completion_length": 546.0208488464356, |
|
"epoch": 0.3478693005342279, |
|
"grad_norm": 0.34730751193742443, |
|
"kl": 0.313232421875, |
|
"learning_rate": 1.6484800065505627e-05, |
|
"loss": 0.0125, |
|
"reward": 0.07708333563059569, |
|
"reward_std": 0.07954951319843531, |
|
"rewards/accuracy_reward": 0.07708333563059569, |
|
"rewards/format_reward": 0.0, |
|
"step": 525 |
|
}, |
|
{ |
|
"completion_length": 464.66667861938475, |
|
"epoch": 0.3511823414916967, |
|
"grad_norm": 0.39701952165391247, |
|
"kl": 0.2570068359375, |
|
"learning_rate": 1.6396316674768914e-05, |
|
"loss": 0.0103, |
|
"reward": 0.10208333600312472, |
|
"reward_std": 0.10311973933130503, |
|
"rewards/accuracy_reward": 0.10208333600312472, |
|
"rewards/format_reward": 0.0, |
|
"step": 530 |
|
}, |
|
{ |
|
"completion_length": 522.2875156402588, |
|
"epoch": 0.3544953824491655, |
|
"grad_norm": 0.28714956052647667, |
|
"kl": 0.222015380859375, |
|
"learning_rate": 1.630697749773359e-05, |
|
"loss": 0.0089, |
|
"reward": 0.12291666958481073, |
|
"reward_std": 0.12079740893095732, |
|
"rewards/accuracy_reward": 0.12291666958481073, |
|
"rewards/format_reward": 0.0, |
|
"step": 535 |
|
}, |
|
{ |
|
"completion_length": 589.6312671661377, |
|
"epoch": 0.3578084234066344, |
|
"grad_norm": 0.30471851904044167, |
|
"kl": 0.219708251953125, |
|
"learning_rate": 1.621679448741067e-05, |
|
"loss": 0.0088, |
|
"reward": 0.0937500026077032, |
|
"reward_std": 0.07365695666521788, |
|
"rewards/accuracy_reward": 0.0937500026077032, |
|
"rewards/format_reward": 0.0, |
|
"step": 540 |
|
}, |
|
{ |
|
"completion_length": 574.7833515167237, |
|
"epoch": 0.3611214643641032, |
|
"grad_norm": 0.2748841871191923, |
|
"kl": 0.220672607421875, |
|
"learning_rate": 1.6125779709710668e-05, |
|
"loss": 0.0088, |
|
"reward": 0.1104166692122817, |
|
"reward_std": 0.07365695666521788, |
|
"rewards/accuracy_reward": 0.1104166692122817, |
|
"rewards/format_reward": 0.0, |
|
"step": 545 |
|
}, |
|
{ |
|
"completion_length": 610.558349609375, |
|
"epoch": 0.364434505321572, |
|
"grad_norm": 0.17413907087385358, |
|
"kl": 0.19656982421875, |
|
"learning_rate": 1.603394534182925e-05, |
|
"loss": 0.0079, |
|
"reward": 0.12291666939854622, |
|
"reward_std": 0.08544206973165273, |
|
"rewards/accuracy_reward": 0.12291666939854622, |
|
"rewards/format_reward": 0.0, |
|
"step": 550 |
|
}, |
|
{ |
|
"completion_length": 632.8937694549561, |
|
"epoch": 0.3677475462790409, |
|
"grad_norm": 0.28335081427323106, |
|
"kl": 0.18609619140625, |
|
"learning_rate": 1.5941303670618018e-05, |
|
"loss": 0.0074, |
|
"reward": 0.11041666902601718, |
|
"reward_std": 0.10901229586452246, |
|
"rewards/accuracy_reward": 0.11041666902601718, |
|
"rewards/format_reward": 0.0, |
|
"step": 555 |
|
}, |
|
{ |
|
"completion_length": 593.0187686920166, |
|
"epoch": 0.3710605872365097, |
|
"grad_norm": 0.34731414645367464, |
|
"kl": 0.191357421875, |
|
"learning_rate": 1.5847867090940602e-05, |
|
"loss": 0.0077, |
|
"reward": 0.12083333656191826, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/accuracy_reward": 0.12083333656191826, |
|
"rewards/format_reward": 0.0, |
|
"step": 560 |
|
}, |
|
{ |
|
"completion_length": 551.2416843414306, |
|
"epoch": 0.37437362819397857, |
|
"grad_norm": 0.3247113074997217, |
|
"kl": 0.196209716796875, |
|
"learning_rate": 1.57536481040143e-05, |
|
"loss": 0.0078, |
|
"reward": 0.11041666958481074, |
|
"reward_std": 0.08544206973165273, |
|
"rewards/accuracy_reward": 0.11041666958481074, |
|
"rewards/format_reward": 0.0, |
|
"step": 565 |
|
}, |
|
{ |
|
"completion_length": 515.1666835784912, |
|
"epoch": 0.3776866691514474, |
|
"grad_norm": 0.2586024101464412, |
|
"kl": 0.2087890625, |
|
"learning_rate": 1.5658659315737505e-05, |
|
"loss": 0.0083, |
|
"reward": 0.11250000279396773, |
|
"reward_std": 0.10606601759791374, |
|
"rewards/accuracy_reward": 0.11250000279396773, |
|
"rewards/format_reward": 0.0, |
|
"step": 570 |
|
}, |
|
{ |
|
"completion_length": 510.70001316070557, |
|
"epoch": 0.3809997101089162, |
|
"grad_norm": 0.29072360950689996, |
|
"kl": 0.20093994140625, |
|
"learning_rate": 1.5562913435003113e-05, |
|
"loss": 0.008, |
|
"reward": 0.12083333674818278, |
|
"reward_std": 0.09428090453147889, |
|
"rewards/accuracy_reward": 0.12083333674818278, |
|
"rewards/format_reward": 0.0, |
|
"step": 575 |
|
}, |
|
{ |
|
"completion_length": 537.1416828155518, |
|
"epoch": 0.38431275106638507, |
|
"grad_norm": 0.4567470923989011, |
|
"kl": 0.19588623046875, |
|
"learning_rate": 1.5466423271998144e-05, |
|
"loss": 0.0078, |
|
"reward": 0.11666666939854622, |
|
"reward_std": 0.0766032349318266, |
|
"rewards/accuracy_reward": 0.11666666939854622, |
|
"rewards/format_reward": 0.0, |
|
"step": 580 |
|
}, |
|
{ |
|
"completion_length": 545.7729301452637, |
|
"epoch": 0.3876257920238539, |
|
"grad_norm": 0.22942721449166853, |
|
"kl": 0.210009765625, |
|
"learning_rate": 1.536920173648984e-05, |
|
"loss": 0.0084, |
|
"reward": 0.10416666939854621, |
|
"reward_std": 0.08838834781199693, |
|
"rewards/accuracy_reward": 0.10416666939854621, |
|
"rewards/format_reward": 0.0, |
|
"step": 585 |
|
}, |
|
{ |
|
"completion_length": 562.1437660217285, |
|
"epoch": 0.39093883298132276, |
|
"grad_norm": 0.16727493145277006, |
|
"kl": 0.21328125, |
|
"learning_rate": 1.5271261836098403e-05, |
|
"loss": 0.0085, |
|
"reward": 0.07708333544433117, |
|
"reward_std": 0.06776440013200044, |
|
"rewards/accuracy_reward": 0.07708333544433117, |
|
"rewards/format_reward": 0.0, |
|
"step": 590 |
|
}, |
|
{ |
|
"completion_length": 559.0500179290772, |
|
"epoch": 0.3942518739387916, |
|
"grad_norm": 0.26024076771764676, |
|
"kl": 0.210040283203125, |
|
"learning_rate": 1.5172616674556673e-05, |
|
"loss": 0.0084, |
|
"reward": 0.0979166692122817, |
|
"reward_std": 0.09133462626487017, |
|
"rewards/accuracy_reward": 0.0979166692122817, |
|
"rewards/format_reward": 0.0, |
|
"step": 595 |
|
}, |
|
{ |
|
"completion_length": 555.5812656402588, |
|
"epoch": 0.3975649148962604, |
|
"grad_norm": 0.2500300158618388, |
|
"kl": 0.17904052734375, |
|
"learning_rate": 1.5073279449956916e-05, |
|
"loss": 0.0072, |
|
"reward": 0.14375000298023224, |
|
"reward_std": 0.10311973914504051, |
|
"rewards/accuracy_reward": 0.14375000298023224, |
|
"rewards/format_reward": 0.0, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.3975649148962604, |
|
"eval_completion_length": 543.9804059196921, |
|
"eval_kl": 0.17497702205882354, |
|
"eval_loss": 0.006994770839810371, |
|
"eval_reward": 0.15196078928077922, |
|
"eval_reward_std": 0.09012145242270302, |
|
"eval_rewards/accuracy_reward": 0.15196078928077922, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 78.4023, |
|
"eval_samples_per_second": 1.263, |
|
"eval_steps_per_second": 0.115, |
|
"step": 600 |
|
}, |
|
{ |
|
"completion_length": 567.908349609375, |
|
"epoch": 0.40087795585372926, |
|
"grad_norm": 0.30328216491481214, |
|
"kl": 0.16656494140625, |
|
"learning_rate": 1.4973263452985023e-05, |
|
"loss": 0.0067, |
|
"reward": 0.15208333749324082, |
|
"reward_std": 0.12079740893095732, |
|
"rewards/accuracy_reward": 0.15208333749324082, |
|
"rewards/format_reward": 0.0, |
|
"step": 605 |
|
}, |
|
{ |
|
"completion_length": 587.0479328155518, |
|
"epoch": 0.4041909968111981, |
|
"grad_norm": 0.28902742765864486, |
|
"kl": 0.166668701171875, |
|
"learning_rate": 1.4872582065142285e-05, |
|
"loss": 0.0067, |
|
"reward": 0.12291666958481073, |
|
"reward_std": 0.10311973914504051, |
|
"rewards/accuracy_reward": 0.12291666958481073, |
|
"rewards/format_reward": 0.0, |
|
"step": 610 |
|
}, |
|
{ |
|
"completion_length": 580.5958499908447, |
|
"epoch": 0.4075040377686669, |
|
"grad_norm": 0.26061142390184844, |
|
"kl": 0.179718017578125, |
|
"learning_rate": 1.4771248756955042e-05, |
|
"loss": 0.0072, |
|
"reward": 0.15416667014360427, |
|
"reward_std": 0.11785113047808408, |
|
"rewards/accuracy_reward": 0.15416667014360427, |
|
"rewards/format_reward": 0.0, |
|
"step": 615 |
|
}, |
|
{ |
|
"completion_length": 621.1146015167236, |
|
"epoch": 0.41081707872613576, |
|
"grad_norm": 0.3726707315873621, |
|
"kl": 0.20567626953125, |
|
"learning_rate": 1.4669277086172406e-05, |
|
"loss": 0.0082, |
|
"reward": 0.10416666958481073, |
|
"reward_std": 0.10017346106469631, |
|
"rewards/accuracy_reward": 0.10416666958481073, |
|
"rewards/format_reward": 0.0, |
|
"step": 620 |
|
}, |
|
{ |
|
"completion_length": 566.5250190734863, |
|
"epoch": 0.4141301196836046, |
|
"grad_norm": 0.28592154315160423, |
|
"kl": 0.21568603515625, |
|
"learning_rate": 1.4566680695952333e-05, |
|
"loss": 0.0086, |
|
"reward": 0.10625000279396772, |
|
"reward_std": 0.09722718279808759, |
|
"rewards/accuracy_reward": 0.10625000279396772, |
|
"rewards/format_reward": 0.0, |
|
"step": 625 |
|
}, |
|
{ |
|
"completion_length": 470.1937656402588, |
|
"epoch": 0.41744316064107345, |
|
"grad_norm": 0.3545452795184588, |
|
"kl": 0.217047119140625, |
|
"learning_rate": 1.4463473313036241e-05, |
|
"loss": 0.0087, |
|
"reward": 0.10625000298023224, |
|
"reward_std": 0.09722718279808759, |
|
"rewards/accuracy_reward": 0.10625000298023224, |
|
"rewards/format_reward": 0.0, |
|
"step": 630 |
|
}, |
|
{ |
|
"completion_length": 489.2375156402588, |
|
"epoch": 0.42075620159854227, |
|
"grad_norm": 0.281093922778775, |
|
"kl": 0.183544921875, |
|
"learning_rate": 1.4359668745912472e-05, |
|
"loss": 0.0073, |
|
"reward": 0.15416667014360427, |
|
"reward_std": 0.10017346106469631, |
|
"rewards/accuracy_reward": 0.15416667014360427, |
|
"rewards/format_reward": 0.0, |
|
"step": 635 |
|
}, |
|
{ |
|
"completion_length": 509.7854309082031, |
|
"epoch": 0.4240692425560111, |
|
"grad_norm": 0.3083814307787709, |
|
"kl": 0.183203125, |
|
"learning_rate": 1.4255280882968787e-05, |
|
"loss": 0.0073, |
|
"reward": 0.1416666701436043, |
|
"reward_std": 0.11195857413113117, |
|
"rewards/accuracy_reward": 0.1416666701436043, |
|
"rewards/format_reward": 0.0, |
|
"step": 640 |
|
}, |
|
{ |
|
"completion_length": 531.0479331970215, |
|
"epoch": 0.42738228351347995, |
|
"grad_norm": 0.4133177147977361, |
|
"kl": 0.219439697265625, |
|
"learning_rate": 1.415032369063422e-05, |
|
"loss": 0.0088, |
|
"reward": 0.1458333371207118, |
|
"reward_std": 0.15320646949112415, |
|
"rewards/accuracy_reward": 0.1458333371207118, |
|
"rewards/format_reward": 0.0, |
|
"step": 645 |
|
}, |
|
{ |
|
"completion_length": 534.3729320526123, |
|
"epoch": 0.43069532447094877, |
|
"grad_norm": 0.24836791560619703, |
|
"kl": 0.194482421875, |
|
"learning_rate": 1.4044811211510419e-05, |
|
"loss": 0.0078, |
|
"reward": 0.11666666977107525, |
|
"reward_std": 0.10017346106469631, |
|
"rewards/accuracy_reward": 0.11666666977107525, |
|
"rewards/format_reward": 0.0, |
|
"step": 650 |
|
}, |
|
{ |
|
"completion_length": 495.2333480834961, |
|
"epoch": 0.4340083654284176, |
|
"grad_norm": 0.26953166407001816, |
|
"kl": 0.20745849609375, |
|
"learning_rate": 1.3938757562492873e-05, |
|
"loss": 0.0083, |
|
"reward": 0.10416666977107525, |
|
"reward_std": 0.09428090453147889, |
|
"rewards/accuracy_reward": 0.10416666977107525, |
|
"rewards/format_reward": 0.0, |
|
"step": 655 |
|
}, |
|
{ |
|
"completion_length": 448.07918033599856, |
|
"epoch": 0.43732140638588646, |
|
"grad_norm": 0.2813525701476749, |
|
"kl": 0.20928955078125, |
|
"learning_rate": 1.3832176932882136e-05, |
|
"loss": 0.0084, |
|
"reward": 0.1250000024214387, |
|
"reward_std": 0.09428090453147889, |
|
"rewards/accuracy_reward": 0.1250000024214387, |
|
"rewards/format_reward": 0.0, |
|
"step": 660 |
|
}, |
|
{ |
|
"completion_length": 482.58334884643557, |
|
"epoch": 0.4406344473433553, |
|
"grad_norm": 0.23983882743615745, |
|
"kl": 0.20230712890625, |
|
"learning_rate": 1.3725083582485397e-05, |
|
"loss": 0.0081, |
|
"reward": 0.12083333618938923, |
|
"reward_std": 0.10017346106469631, |
|
"rewards/accuracy_reward": 0.12083333618938923, |
|
"rewards/format_reward": 0.0, |
|
"step": 665 |
|
}, |
|
{ |
|
"completion_length": 531.0250171661377, |
|
"epoch": 0.44394748830082414, |
|
"grad_norm": 0.23057904466712248, |
|
"kl": 0.18568115234375, |
|
"learning_rate": 1.3617491839708614e-05, |
|
"loss": 0.0074, |
|
"reward": 0.11875000316649675, |
|
"reward_std": 0.08544206973165273, |
|
"rewards/accuracy_reward": 0.11875000316649675, |
|
"rewards/format_reward": 0.0, |
|
"step": 670 |
|
}, |
|
{ |
|
"completion_length": 546.8812679290771, |
|
"epoch": 0.44726052925829296, |
|
"grad_norm": 0.2918960689101493, |
|
"kl": 0.17314453125, |
|
"learning_rate": 1.3509416099639456e-05, |
|
"loss": 0.0069, |
|
"reward": 0.14375000316649675, |
|
"reward_std": 0.13847507834434508, |
|
"rewards/accuracy_reward": 0.14375000316649675, |
|
"rewards/format_reward": 0.0, |
|
"step": 675 |
|
}, |
|
{ |
|
"completion_length": 525.9000158309937, |
|
"epoch": 0.4505735702157618, |
|
"grad_norm": 0.33464307998263254, |
|
"kl": 0.18521728515625, |
|
"learning_rate": 1.3400870822121348e-05, |
|
"loss": 0.0074, |
|
"reward": 0.11250000298023224, |
|
"reward_std": 0.10017346106469631, |
|
"rewards/accuracy_reward": 0.11250000298023224, |
|
"rewards/format_reward": 0.0, |
|
"step": 680 |
|
}, |
|
{ |
|
"completion_length": 496.16668090820315, |
|
"epoch": 0.45388661117323065, |
|
"grad_norm": 0.24577312673005983, |
|
"kl": 0.2126708984375, |
|
"learning_rate": 1.3291870529818809e-05, |
|
"loss": 0.0085, |
|
"reward": 0.08333333544433116, |
|
"reward_std": 0.07071067839860916, |
|
"rewards/accuracy_reward": 0.08333333544433116, |
|
"rewards/format_reward": 0.0, |
|
"step": 685 |
|
}, |
|
{ |
|
"completion_length": 516.3520980834961, |
|
"epoch": 0.45719965213069946, |
|
"grad_norm": 0.28279663282344375, |
|
"kl": 0.241748046875, |
|
"learning_rate": 1.3182429806274442e-05, |
|
"loss": 0.0097, |
|
"reward": 0.1145833358168602, |
|
"reward_std": 0.10311973914504051, |
|
"rewards/accuracy_reward": 0.1145833358168602, |
|
"rewards/format_reward": 0.0, |
|
"step": 690 |
|
}, |
|
{ |
|
"completion_length": 489.8125160217285, |
|
"epoch": 0.4605126930881683, |
|
"grad_norm": 0.24489140060416123, |
|
"kl": 0.17784423828125, |
|
"learning_rate": 1.3072563293957725e-05, |
|
"loss": 0.0071, |
|
"reward": 0.10625000279396772, |
|
"reward_std": 0.09722718279808759, |
|
"rewards/accuracy_reward": 0.10625000279396772, |
|
"rewards/format_reward": 0.0, |
|
"step": 695 |
|
}, |
|
{ |
|
"completion_length": 515.912516784668, |
|
"epoch": 0.46382573404563715, |
|
"grad_norm": 0.25229498956295204, |
|
"kl": 0.17552490234375, |
|
"learning_rate": 1.2962285692305964e-05, |
|
"loss": 0.007, |
|
"reward": 0.12916666958481074, |
|
"reward_std": 0.12374368701130152, |
|
"rewards/accuracy_reward": 0.12916666958481074, |
|
"rewards/format_reward": 0.0, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.46382573404563715, |
|
"eval_completion_length": 477.3774611529182, |
|
"eval_kl": 0.1923828125, |
|
"eval_loss": 0.007697770372033119, |
|
"eval_reward": 0.15196078752770142, |
|
"eval_reward_std": 0.10398629176266053, |
|
"eval_rewards/accuracy_reward": 0.15196078752770142, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 73.7384, |
|
"eval_samples_per_second": 1.343, |
|
"eval_steps_per_second": 0.122, |
|
"step": 700 |
|
}, |
|
{ |
|
"completion_length": 521.3458486557007, |
|
"epoch": 0.46713877500310597, |
|
"grad_norm": 0.3114651168291986, |
|
"kl": 0.19656982421875, |
|
"learning_rate": 1.2851611755757587e-05, |
|
"loss": 0.0079, |
|
"reward": 0.08750000186264514, |
|
"reward_std": 0.0766032349318266, |
|
"rewards/accuracy_reward": 0.08750000186264514, |
|
"rewards/format_reward": 0.0, |
|
"step": 705 |
|
}, |
|
{ |
|
"completion_length": 505.2958469390869, |
|
"epoch": 0.47045181596057484, |
|
"grad_norm": 0.2616041148953221, |
|
"kl": 0.18330078125, |
|
"learning_rate": 1.2740556291778096e-05, |
|
"loss": 0.0073, |
|
"reward": 0.1479166703298688, |
|
"reward_std": 0.10311973914504051, |
|
"rewards/accuracy_reward": 0.1479166703298688, |
|
"rewards/format_reward": 0.0, |
|
"step": 710 |
|
}, |
|
{ |
|
"completion_length": 501.8895986557007, |
|
"epoch": 0.47376485691804365, |
|
"grad_norm": 0.23670699846471074, |
|
"kl": 0.20030517578125, |
|
"learning_rate": 1.2629134158878919e-05, |
|
"loss": 0.008, |
|
"reward": 0.11666666939854622, |
|
"reward_std": 0.09428090453147889, |
|
"rewards/accuracy_reward": 0.11666666939854622, |
|
"rewards/format_reward": 0.0, |
|
"step": 715 |
|
}, |
|
{ |
|
"completion_length": 569.3500164031982, |
|
"epoch": 0.47707789787551247, |
|
"grad_norm": 0.1767297956772261, |
|
"kl": 0.184912109375, |
|
"learning_rate": 1.2517360264629463e-05, |
|
"loss": 0.0074, |
|
"reward": 0.07083333525806665, |
|
"reward_std": 0.05303300879895687, |
|
"rewards/accuracy_reward": 0.07083333525806665, |
|
"rewards/format_reward": 0.0, |
|
"step": 720 |
|
}, |
|
{ |
|
"completion_length": 549.1854320526123, |
|
"epoch": 0.48039093883298134, |
|
"grad_norm": 0.2515191189191733, |
|
"kl": 0.198480224609375, |
|
"learning_rate": 1.2405249563662539e-05, |
|
"loss": 0.0079, |
|
"reward": 0.07500000204890966, |
|
"reward_std": 0.08838834799826145, |
|
"rewards/accuracy_reward": 0.07500000204890966, |
|
"rewards/format_reward": 0.0, |
|
"step": 725 |
|
}, |
|
{ |
|
"completion_length": 502.370849609375, |
|
"epoch": 0.48370397979045016, |
|
"grad_norm": 0.32374848605233897, |
|
"kl": 0.21995849609375, |
|
"learning_rate": 1.2292817055673543e-05, |
|
"loss": 0.0088, |
|
"reward": 0.08125000223517417, |
|
"reward_std": 0.07954951319843531, |
|
"rewards/accuracy_reward": 0.08125000223517417, |
|
"rewards/format_reward": 0.0, |
|
"step": 730 |
|
}, |
|
{ |
|
"completion_length": 496.44584503173826, |
|
"epoch": 0.48701702074791897, |
|
"grad_norm": 0.3225185847351344, |
|
"kl": 0.195855712890625, |
|
"learning_rate": 1.2180077783413601e-05, |
|
"loss": 0.0078, |
|
"reward": 0.08125000223517417, |
|
"reward_std": 0.06776440013200044, |
|
"rewards/accuracy_reward": 0.08125000223517417, |
|
"rewards/format_reward": 0.0, |
|
"step": 735 |
|
}, |
|
{ |
|
"completion_length": 478.5145969390869, |
|
"epoch": 0.49033006170538784, |
|
"grad_norm": 0.3624783956571456, |
|
"kl": 0.192364501953125, |
|
"learning_rate": 1.2067046830676947e-05, |
|
"loss": 0.0077, |
|
"reward": 0.1479166705161333, |
|
"reward_std": 0.13258252181112767, |
|
"rewards/accuracy_reward": 0.1479166705161333, |
|
"rewards/format_reward": 0.0, |
|
"step": 740 |
|
}, |
|
{ |
|
"completion_length": 498.3958461761475, |
|
"epoch": 0.49364310266285666, |
|
"grad_norm": 0.29628298606274384, |
|
"kl": 0.195697021484375, |
|
"learning_rate": 1.1953739320282778e-05, |
|
"loss": 0.0078, |
|
"reward": 0.12708333674818278, |
|
"reward_std": 0.09722718279808759, |
|
"rewards/accuracy_reward": 0.12708333674818278, |
|
"rewards/format_reward": 0.0, |
|
"step": 745 |
|
}, |
|
{ |
|
"completion_length": 522.5895965576171, |
|
"epoch": 0.49695614362032553, |
|
"grad_norm": 0.273049970135215, |
|
"kl": 0.18468017578125, |
|
"learning_rate": 1.1840170412051957e-05, |
|
"loss": 0.0074, |
|
"reward": 0.10833333544433117, |
|
"reward_std": 0.08838834799826145, |
|
"rewards/accuracy_reward": 0.10833333544433117, |
|
"rewards/format_reward": 0.0, |
|
"step": 750 |
|
}, |
|
{ |
|
"completion_length": 523.5166801452636, |
|
"epoch": 0.5002691845777943, |
|
"grad_norm": 0.4360470753971284, |
|
"kl": 0.20830078125, |
|
"learning_rate": 1.1726355300778693e-05, |
|
"loss": 0.0083, |
|
"reward": 0.10416666939854621, |
|
"reward_std": 0.12374368719756604, |
|
"rewards/accuracy_reward": 0.10416666939854621, |
|
"rewards/format_reward": 0.0, |
|
"step": 755 |
|
}, |
|
{ |
|
"completion_length": 585.5500165939332, |
|
"epoch": 0.5035822255352632, |
|
"grad_norm": 0.18267493572339938, |
|
"kl": 0.21558837890625, |
|
"learning_rate": 1.1612309214197599e-05, |
|
"loss": 0.0086, |
|
"reward": 0.10000000260770321, |
|
"reward_std": 0.0766032349318266, |
|
"rewards/accuracy_reward": 0.10000000260770321, |
|
"rewards/format_reward": 0.0, |
|
"step": 760 |
|
}, |
|
{ |
|
"completion_length": 567.3645992279053, |
|
"epoch": 0.506895266492732, |
|
"grad_norm": 0.24654038356285773, |
|
"kl": 0.23746337890625, |
|
"learning_rate": 1.1498047410946307e-05, |
|
"loss": 0.0095, |
|
"reward": 0.0875000024214387, |
|
"reward_std": 0.10017346087843179, |
|
"rewards/accuracy_reward": 0.0875000024214387, |
|
"rewards/format_reward": 0.0, |
|
"step": 765 |
|
}, |
|
{ |
|
"completion_length": 611.8812675476074, |
|
"epoch": 0.5102083074502008, |
|
"grad_norm": 0.17786553342995753, |
|
"kl": 0.222509765625, |
|
"learning_rate": 1.1383585178523955e-05, |
|
"loss": 0.0089, |
|
"reward": 0.09583333600312471, |
|
"reward_std": 0.07071067839860916, |
|
"rewards/accuracy_reward": 0.09583333600312471, |
|
"rewards/format_reward": 0.0, |
|
"step": 770 |
|
}, |
|
{ |
|
"completion_length": 556.170849609375, |
|
"epoch": 0.5135213484076697, |
|
"grad_norm": 0.32776722432627464, |
|
"kl": 0.197369384765625, |
|
"learning_rate": 1.126893783124583e-05, |
|
"loss": 0.0079, |
|
"reward": 0.11875000335276127, |
|
"reward_std": 0.11490485239773988, |
|
"rewards/accuracy_reward": 0.11875000335276127, |
|
"rewards/format_reward": 0.0, |
|
"step": 775 |
|
}, |
|
{ |
|
"completion_length": 489.2875148773193, |
|
"epoch": 0.5168343893651385, |
|
"grad_norm": 0.22661050449666004, |
|
"kl": 0.199169921875, |
|
"learning_rate": 1.1154120708194398e-05, |
|
"loss": 0.008, |
|
"reward": 0.08333333563059568, |
|
"reward_std": 0.07071067839860916, |
|
"rewards/accuracy_reward": 0.08333333563059568, |
|
"rewards/format_reward": 0.0, |
|
"step": 780 |
|
}, |
|
{ |
|
"completion_length": 483.7916805267334, |
|
"epoch": 0.5201474303226074, |
|
"grad_norm": 0.3631761942043462, |
|
"kl": 0.200616455078125, |
|
"learning_rate": 1.1039149171167046e-05, |
|
"loss": 0.008, |
|
"reward": 0.12291666977107525, |
|
"reward_std": 0.10901229586452246, |
|
"rewards/accuracy_reward": 0.12291666977107525, |
|
"rewards/format_reward": 0.0, |
|
"step": 785 |
|
}, |
|
{ |
|
"completion_length": 473.125016784668, |
|
"epoch": 0.5234604712800762, |
|
"grad_norm": 0.26699712541162623, |
|
"kl": 0.192742919921875, |
|
"learning_rate": 1.0924038602620757e-05, |
|
"loss": 0.0077, |
|
"reward": 0.11041666958481074, |
|
"reward_std": 0.09133462626487017, |
|
"rewards/accuracy_reward": 0.11041666958481074, |
|
"rewards/format_reward": 0.0, |
|
"step": 790 |
|
}, |
|
{ |
|
"completion_length": 500.1333484649658, |
|
"epoch": 0.526773512237545, |
|
"grad_norm": 0.41463867334904875, |
|
"kl": 0.18306884765625, |
|
"learning_rate": 1.0808804403614044e-05, |
|
"loss": 0.0073, |
|
"reward": 0.12291666939854622, |
|
"reward_std": 0.10901229567825794, |
|
"rewards/accuracy_reward": 0.12291666939854622, |
|
"rewards/format_reward": 0.0, |
|
"step": 795 |
|
}, |
|
{ |
|
"completion_length": 500.9916820526123, |
|
"epoch": 0.5300865531950139, |
|
"grad_norm": 0.35472513450823207, |
|
"kl": 0.18797607421875, |
|
"learning_rate": 1.0693461991746389e-05, |
|
"loss": 0.0075, |
|
"reward": 0.1416666703298688, |
|
"reward_std": 0.10017346106469631, |
|
"rewards/accuracy_reward": 0.1416666703298688, |
|
"rewards/format_reward": 0.0, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5300865531950139, |
|
"eval_completion_length": 496.34805118336396, |
|
"eval_kl": 0.20209099264705882, |
|
"eval_loss": 0.007962403818964958, |
|
"eval_reward": 0.1421568661051638, |
|
"eval_reward_std": 0.1178511306643486, |
|
"eval_rewards/accuracy_reward": 0.1421568661051638, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 74.1818, |
|
"eval_samples_per_second": 1.335, |
|
"eval_steps_per_second": 0.121, |
|
"step": 800 |
|
}, |
|
{ |
|
"completion_length": 525.1229301452637, |
|
"epoch": 0.5333995941524827, |
|
"grad_norm": 0.34019503442109345, |
|
"kl": 0.232171630859375, |
|
"learning_rate": 1.0578026799095464e-05, |
|
"loss": 0.0093, |
|
"reward": 0.14583333693444728, |
|
"reward_std": 0.12374368701130152, |
|
"rewards/accuracy_reward": 0.14583333693444728, |
|
"rewards/format_reward": 0.0, |
|
"step": 805 |
|
}, |
|
{ |
|
"completion_length": 615.4854366302491, |
|
"epoch": 0.5367126351099516, |
|
"grad_norm": 0.28699140537614237, |
|
"kl": 0.2682373046875, |
|
"learning_rate": 1.046251427015241e-05, |
|
"loss": 0.0107, |
|
"reward": 0.1416666705161333, |
|
"reward_std": 0.10017346106469631, |
|
"rewards/accuracy_reward": 0.1416666705161333, |
|
"rewards/format_reward": 0.0, |
|
"step": 810 |
|
}, |
|
{ |
|
"completion_length": 609.1979358673095, |
|
"epoch": 0.5400256760674204, |
|
"grad_norm": 0.29816057971503035, |
|
"kl": 0.22767333984375, |
|
"learning_rate": 1.0346939859755481e-05, |
|
"loss": 0.0091, |
|
"reward": 0.10416666921228171, |
|
"reward_std": 0.11195857413113117, |
|
"rewards/accuracy_reward": 0.10416666921228171, |
|
"rewards/format_reward": 0.0, |
|
"step": 815 |
|
}, |
|
{ |
|
"completion_length": 649.9166870117188, |
|
"epoch": 0.5433387170248892, |
|
"grad_norm": 1.1960413989590513, |
|
"kl": 0.33731689453125, |
|
"learning_rate": 1.023131903102226e-05, |
|
"loss": 0.0135, |
|
"reward": 0.09791666902601719, |
|
"reward_std": 0.10311973933130503, |
|
"rewards/accuracy_reward": 0.09791666902601719, |
|
"rewards/format_reward": 0.0, |
|
"step": 820 |
|
}, |
|
{ |
|
"completion_length": 554.097931098938, |
|
"epoch": 0.5466517579823581, |
|
"grad_norm": 0.30872099739539816, |
|
"kl": 0.247076416015625, |
|
"learning_rate": 1.0115667253280817e-05, |
|
"loss": 0.0099, |
|
"reward": 0.11041666958481074, |
|
"reward_std": 0.09722718279808759, |
|
"rewards/accuracy_reward": 0.11041666958481074, |
|
"rewards/format_reward": 0.0, |
|
"step": 825 |
|
}, |
|
{ |
|
"completion_length": 552.6354309082031, |
|
"epoch": 0.5499647989398269, |
|
"grad_norm": 0.20054998735675889, |
|
"kl": 0.206219482421875, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0082, |
|
"reward": 0.12500000298023223, |
|
"reward_std": 0.09428090434521437, |
|
"rewards/accuracy_reward": 0.12500000298023223, |
|
"rewards/format_reward": 0.0, |
|
"step": 830 |
|
}, |
|
{ |
|
"completion_length": 546.9625179290772, |
|
"epoch": 0.5532778398972957, |
|
"grad_norm": 0.5641613086876952, |
|
"kl": 0.255999755859375, |
|
"learning_rate": 9.884332746719186e-06, |
|
"loss": 0.0102, |
|
"reward": 0.12708333674818278, |
|
"reward_std": 0.1207974087446928, |
|
"rewards/accuracy_reward": 0.12708333674818278, |
|
"rewards/format_reward": 0.0, |
|
"step": 835 |
|
}, |
|
{ |
|
"completion_length": 554.7666828155518, |
|
"epoch": 0.5565908808547646, |
|
"grad_norm": 0.3738960340913415, |
|
"kl": 0.1666259765625, |
|
"learning_rate": 9.768680968977743e-06, |
|
"loss": 0.0067, |
|
"reward": 0.12083333674818278, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/accuracy_reward": 0.12083333674818278, |
|
"rewards/format_reward": 0.0, |
|
"step": 840 |
|
}, |
|
{ |
|
"completion_length": 566.0395988464355, |
|
"epoch": 0.5599039218122334, |
|
"grad_norm": 0.22732871482096867, |
|
"kl": 0.190960693359375, |
|
"learning_rate": 9.653060140244524e-06, |
|
"loss": 0.0076, |
|
"reward": 0.10000000223517418, |
|
"reward_std": 0.08838834799826145, |
|
"rewards/accuracy_reward": 0.10000000223517418, |
|
"rewards/format_reward": 0.0, |
|
"step": 845 |
|
}, |
|
{ |
|
"completion_length": 618.0916870117187, |
|
"epoch": 0.5632169627697022, |
|
"grad_norm": 0.40863278743755727, |
|
"kl": 0.255059814453125, |
|
"learning_rate": 9.537485729847594e-06, |
|
"loss": 0.0102, |
|
"reward": 0.12500000298023223, |
|
"reward_std": 0.11195857413113117, |
|
"rewards/accuracy_reward": 0.12500000298023223, |
|
"rewards/format_reward": 0.0, |
|
"step": 850 |
|
}, |
|
{ |
|
"completion_length": 615.8875164031982, |
|
"epoch": 0.5665300037271711, |
|
"grad_norm": 0.29927787657044663, |
|
"kl": 0.259979248046875, |
|
"learning_rate": 9.421973200904538e-06, |
|
"loss": 0.0104, |
|
"reward": 0.12291667051613331, |
|
"reward_std": 0.12668996546417474, |
|
"rewards/accuracy_reward": 0.12291667051613331, |
|
"rewards/format_reward": 0.0, |
|
"step": 855 |
|
}, |
|
{ |
|
"completion_length": 564.025016784668, |
|
"epoch": 0.5698430446846399, |
|
"grad_norm": 0.5404660152394563, |
|
"kl": 0.21617431640625, |
|
"learning_rate": 9.306538008253611e-06, |
|
"loss": 0.0086, |
|
"reward": 0.13541667014360428, |
|
"reward_std": 0.1266899650916457, |
|
"rewards/accuracy_reward": 0.13541667014360428, |
|
"rewards/format_reward": 0.0, |
|
"step": 860 |
|
}, |
|
{ |
|
"completion_length": 544.654182434082, |
|
"epoch": 0.5731560856421087, |
|
"grad_norm": 0.3233447008213159, |
|
"kl": 0.181341552734375, |
|
"learning_rate": 9.19119559638596e-06, |
|
"loss": 0.0073, |
|
"reward": 0.14583333749324084, |
|
"reward_std": 0.10017346106469631, |
|
"rewards/accuracy_reward": 0.14583333749324084, |
|
"rewards/format_reward": 0.0, |
|
"step": 865 |
|
}, |
|
{ |
|
"completion_length": 572.0687698364258, |
|
"epoch": 0.5764691265995776, |
|
"grad_norm": 0.3691524796796649, |
|
"kl": 0.217767333984375, |
|
"learning_rate": 9.075961397379247e-06, |
|
"loss": 0.0087, |
|
"reward": 0.13333333656191826, |
|
"reward_std": 0.10017346106469631, |
|
"rewards/accuracy_reward": 0.13333333656191826, |
|
"rewards/format_reward": 0.0, |
|
"step": 870 |
|
}, |
|
{ |
|
"completion_length": 615.9208526611328, |
|
"epoch": 0.5797821675570465, |
|
"grad_norm": 0.2985439339832434, |
|
"kl": 0.21961669921875, |
|
"learning_rate": 8.960850828832958e-06, |
|
"loss": 0.0088, |
|
"reward": 0.12708333618938922, |
|
"reward_std": 0.09133462607860565, |
|
"rewards/accuracy_reward": 0.12708333618938922, |
|
"rewards/format_reward": 0.0, |
|
"step": 875 |
|
}, |
|
{ |
|
"completion_length": 651.279183959961, |
|
"epoch": 0.5830952085145152, |
|
"grad_norm": 0.2039953541987364, |
|
"kl": 0.229315185546875, |
|
"learning_rate": 8.845879291805605e-06, |
|
"loss": 0.0092, |
|
"reward": 0.11250000298023224, |
|
"reward_std": 0.0766032349318266, |
|
"rewards/accuracy_reward": 0.11250000298023224, |
|
"rewards/format_reward": 0.0, |
|
"step": 880 |
|
}, |
|
{ |
|
"completion_length": 635.308352279663, |
|
"epoch": 0.5864082494719841, |
|
"grad_norm": 0.33333680886000144, |
|
"kl": 0.180218505859375, |
|
"learning_rate": 8.731062168754174e-06, |
|
"loss": 0.0072, |
|
"reward": 0.13125000335276127, |
|
"reward_std": 0.12079740893095732, |
|
"rewards/accuracy_reward": 0.13125000335276127, |
|
"rewards/format_reward": 0.0, |
|
"step": 885 |
|
}, |
|
{ |
|
"completion_length": 575.0104343414307, |
|
"epoch": 0.589721290429453, |
|
"grad_norm": 8.973633703652538, |
|
"kl": 0.467108154296875, |
|
"learning_rate": 8.616414821476048e-06, |
|
"loss": 0.0188, |
|
"reward": 0.10416666921228171, |
|
"reward_std": 0.0589255653321743, |
|
"rewards/accuracy_reward": 0.10416666921228171, |
|
"rewards/format_reward": 0.0, |
|
"step": 890 |
|
}, |
|
{ |
|
"completion_length": 574.4812694549561, |
|
"epoch": 0.5930343313869217, |
|
"grad_norm": 0.26182341985351215, |
|
"kl": 0.16265869140625, |
|
"learning_rate": 8.501952589053694e-06, |
|
"loss": 0.0065, |
|
"reward": 0.13541667014360428, |
|
"reward_std": 0.10901229586452246, |
|
"rewards/accuracy_reward": 0.13541667014360428, |
|
"rewards/format_reward": 0.0, |
|
"step": 895 |
|
}, |
|
{ |
|
"completion_length": 579.7250156402588, |
|
"epoch": 0.5963473723443906, |
|
"grad_norm": 0.3188530106914609, |
|
"kl": 0.18770751953125, |
|
"learning_rate": 8.387690785802403e-06, |
|
"loss": 0.0075, |
|
"reward": 0.1500000037252903, |
|
"reward_std": 0.12963624373078347, |
|
"rewards/accuracy_reward": 0.1500000037252903, |
|
"rewards/format_reward": 0.0, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.5963473723443906, |
|
"eval_completion_length": 560.4804077148438, |
|
"eval_kl": 0.21737132352941177, |
|
"eval_loss": 0.008583012968301773, |
|
"eval_reward": 0.17156863168758504, |
|
"eval_reward_std": 0.1455808080294553, |
|
"eval_rewards/accuracy_reward": 0.17156863168758504, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 80.338, |
|
"eval_samples_per_second": 1.232, |
|
"eval_steps_per_second": 0.112, |
|
"step": 900 |
|
}, |
|
{ |
|
"completion_length": 593.8625198364258, |
|
"epoch": 0.5996604133018595, |
|
"grad_norm": 0.2613535270080971, |
|
"kl": 0.201318359375, |
|
"learning_rate": 8.273644699221309e-06, |
|
"loss": 0.0081, |
|
"reward": 0.08958333600312471, |
|
"reward_std": 0.08544206973165273, |
|
"rewards/accuracy_reward": 0.08958333600312471, |
|
"rewards/format_reward": 0.0, |
|
"step": 905 |
|
}, |
|
{ |
|
"completion_length": 599.837515258789, |
|
"epoch": 0.6029734542593282, |
|
"grad_norm": 0.19506591405221774, |
|
"kl": 0.188922119140625, |
|
"learning_rate": 8.159829587948048e-06, |
|
"loss": 0.0076, |
|
"reward": 0.12291666977107525, |
|
"reward_std": 0.12079740893095732, |
|
"rewards/accuracy_reward": 0.12291666977107525, |
|
"rewards/format_reward": 0.0, |
|
"step": 910 |
|
}, |
|
{ |
|
"completion_length": 650.3021049499512, |
|
"epoch": 0.6062864952167971, |
|
"grad_norm": 0.27346090434561215, |
|
"kl": 0.176104736328125, |
|
"learning_rate": 8.046260679717225e-06, |
|
"loss": 0.007, |
|
"reward": 0.1395833369344473, |
|
"reward_std": 0.10901229567825794, |
|
"rewards/accuracy_reward": 0.1395833369344473, |
|
"rewards/format_reward": 0.0, |
|
"step": 915 |
|
}, |
|
{ |
|
"completion_length": 615.7187690734863, |
|
"epoch": 0.609599536174266, |
|
"grad_norm": 0.34804926597561486, |
|
"kl": 0.18919677734375, |
|
"learning_rate": 7.932953169323057e-06, |
|
"loss": 0.0076, |
|
"reward": 0.14375000353902578, |
|
"reward_std": 0.1207974087446928, |
|
"rewards/accuracy_reward": 0.14375000353902578, |
|
"rewards/format_reward": 0.0, |
|
"step": 920 |
|
}, |
|
{ |
|
"completion_length": 565.602099609375, |
|
"epoch": 0.6129125771317347, |
|
"grad_norm": 0.23449254684560134, |
|
"kl": 0.1919677734375, |
|
"learning_rate": 7.8199222165864e-06, |
|
"loss": 0.0077, |
|
"reward": 0.12291666995733977, |
|
"reward_std": 0.10311973933130503, |
|
"rewards/accuracy_reward": 0.12291666995733977, |
|
"rewards/format_reward": 0.0, |
|
"step": 925 |
|
}, |
|
{ |
|
"completion_length": 589.8625164031982, |
|
"epoch": 0.6162256180892036, |
|
"grad_norm": 0.2764473971277821, |
|
"kl": 0.21224365234375, |
|
"learning_rate": 7.70718294432646e-06, |
|
"loss": 0.0085, |
|
"reward": 0.11875000279396772, |
|
"reward_std": 0.10311973933130503, |
|
"rewards/accuracy_reward": 0.11875000279396772, |
|
"rewards/format_reward": 0.0, |
|
"step": 930 |
|
}, |
|
{ |
|
"completion_length": 549.8187686920166, |
|
"epoch": 0.6195386590466725, |
|
"grad_norm": 1.0463426634032484, |
|
"kl": 0.55484619140625, |
|
"learning_rate": 7.594750436337467e-06, |
|
"loss": 0.0222, |
|
"reward": 0.15000000335276126, |
|
"reward_std": 0.13552880026400088, |
|
"rewards/accuracy_reward": 0.15000000335276126, |
|
"rewards/format_reward": 0.0, |
|
"step": 935 |
|
}, |
|
{ |
|
"completion_length": 557.4937660217286, |
|
"epoch": 0.6228517000041413, |
|
"grad_norm": 0.18999745413152938, |
|
"kl": 0.24376220703125, |
|
"learning_rate": 7.482639735370536e-06, |
|
"loss": 0.0098, |
|
"reward": 0.15833333767950536, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/accuracy_reward": 0.15833333767950536, |
|
"rewards/format_reward": 0.0, |
|
"step": 940 |
|
}, |
|
{ |
|
"completion_length": 565.875016784668, |
|
"epoch": 0.6261647409616101, |
|
"grad_norm": 0.2942645244909035, |
|
"kl": 0.235906982421875, |
|
"learning_rate": 7.37086584112108e-06, |
|
"loss": 0.0094, |
|
"reward": 0.1416666705161333, |
|
"reward_std": 0.15320646967738866, |
|
"rewards/accuracy_reward": 0.1416666705161333, |
|
"rewards/format_reward": 0.0, |
|
"step": 945 |
|
}, |
|
{ |
|
"completion_length": 515.4104309082031, |
|
"epoch": 0.629477781919079, |
|
"grad_norm": 0.284453920534699, |
|
"kl": 0.207122802734375, |
|
"learning_rate": 7.2594437082219074e-06, |
|
"loss": 0.0083, |
|
"reward": 0.13333333656191826, |
|
"reward_std": 0.10606601741164923, |
|
"rewards/accuracy_reward": 0.13333333656191826, |
|
"rewards/format_reward": 0.0, |
|
"step": 950 |
|
}, |
|
{ |
|
"completion_length": 519.2750148773193, |
|
"epoch": 0.6327908228765479, |
|
"grad_norm": 0.2551459658462794, |
|
"kl": 0.213092041015625, |
|
"learning_rate": 7.148388244242414e-06, |
|
"loss": 0.0085, |
|
"reward": 0.1354166703298688, |
|
"reward_std": 0.11490485239773988, |
|
"rewards/accuracy_reward": 0.1354166703298688, |
|
"rewards/format_reward": 0.0, |
|
"step": 955 |
|
}, |
|
{ |
|
"completion_length": 551.9812677383422, |
|
"epoch": 0.6361038638340166, |
|
"grad_norm": 0.23334554711541383, |
|
"kl": 0.188629150390625, |
|
"learning_rate": 7.037714307694038e-06, |
|
"loss": 0.0075, |
|
"reward": 0.11250000335276127, |
|
"reward_std": 0.08838834799826145, |
|
"rewards/accuracy_reward": 0.11250000335276127, |
|
"rewards/format_reward": 0.0, |
|
"step": 960 |
|
}, |
|
{ |
|
"completion_length": 575.2812675476074, |
|
"epoch": 0.6394169047914855, |
|
"grad_norm": 0.21745310342706664, |
|
"kl": 0.201898193359375, |
|
"learning_rate": 6.927436706042276e-06, |
|
"loss": 0.0081, |
|
"reward": 0.12708333618938922, |
|
"reward_std": 0.10311973933130503, |
|
"rewards/accuracy_reward": 0.12708333618938922, |
|
"rewards/format_reward": 0.0, |
|
"step": 965 |
|
}, |
|
{ |
|
"completion_length": 544.3521011352539, |
|
"epoch": 0.6427299457489544, |
|
"grad_norm": 0.21392354942012623, |
|
"kl": 0.193035888671875, |
|
"learning_rate": 6.8175701937255645e-06, |
|
"loss": 0.0077, |
|
"reward": 0.1395833369344473, |
|
"reward_std": 0.10901229586452246, |
|
"rewards/accuracy_reward": 0.1395833369344473, |
|
"rewards/format_reward": 0.0, |
|
"step": 970 |
|
}, |
|
{ |
|
"completion_length": 594.6437686920166, |
|
"epoch": 0.6460429867064231, |
|
"grad_norm": 0.3297497458421446, |
|
"kl": 0.20833740234375, |
|
"learning_rate": 6.708129470181197e-06, |
|
"loss": 0.0083, |
|
"reward": 0.1541666707023978, |
|
"reward_std": 0.14731391314417125, |
|
"rewards/accuracy_reward": 0.1541666707023978, |
|
"rewards/format_reward": 0.0, |
|
"step": 975 |
|
}, |
|
{ |
|
"completion_length": 612.2333488464355, |
|
"epoch": 0.649356027663892, |
|
"grad_norm": 0.25579146126693203, |
|
"kl": 0.24373779296875, |
|
"learning_rate": 6.5991291778786556e-06, |
|
"loss": 0.0097, |
|
"reward": 0.12291666977107525, |
|
"reward_std": 0.07954951319843531, |
|
"rewards/accuracy_reward": 0.12291666977107525, |
|
"rewards/format_reward": 0.0, |
|
"step": 980 |
|
}, |
|
{ |
|
"completion_length": 593.9500152587891, |
|
"epoch": 0.6526690686213609, |
|
"grad_norm": 0.308305547519083, |
|
"kl": 0.262152099609375, |
|
"learning_rate": 6.490583900360543e-06, |
|
"loss": 0.0105, |
|
"reward": 0.11875000260770321, |
|
"reward_std": 0.1207974087446928, |
|
"rewards/accuracy_reward": 0.11875000260770321, |
|
"rewards/format_reward": 0.0, |
|
"step": 985 |
|
}, |
|
{ |
|
"completion_length": 569.4208499908448, |
|
"epoch": 0.6559821095788296, |
|
"grad_norm": 0.3215020178086238, |
|
"kl": 0.2141845703125, |
|
"learning_rate": 6.38250816029139e-06, |
|
"loss": 0.0086, |
|
"reward": 0.11458333637565374, |
|
"reward_std": 0.10901229586452246, |
|
"rewards/accuracy_reward": 0.11458333637565374, |
|
"rewards/format_reward": 0.0, |
|
"step": 990 |
|
}, |
|
{ |
|
"completion_length": 569.345849609375, |
|
"epoch": 0.6592951505362985, |
|
"grad_norm": 0.3323215086416121, |
|
"kl": 0.219927978515625, |
|
"learning_rate": 6.274916417514605e-06, |
|
"loss": 0.0088, |
|
"reward": 0.13125000353902577, |
|
"reward_std": 0.1384750785306096, |
|
"rewards/accuracy_reward": 0.13125000353902577, |
|
"rewards/format_reward": 0.0, |
|
"step": 995 |
|
}, |
|
{ |
|
"completion_length": 611.7208526611328, |
|
"epoch": 0.6626081914937674, |
|
"grad_norm": 0.3161012671184911, |
|
"kl": 0.3405517578125, |
|
"learning_rate": 6.167823067117868e-06, |
|
"loss": 0.0136, |
|
"reward": 0.08958333600312471, |
|
"reward_std": 0.09722718279808759, |
|
"rewards/accuracy_reward": 0.08958333600312471, |
|
"rewards/format_reward": 0.0, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6626081914937674, |
|
"eval_completion_length": 612.1666834214154, |
|
"eval_kl": 0.27849264705882354, |
|
"eval_loss": 0.01135108433663845, |
|
"eval_reward": 0.11764706145314609, |
|
"eval_reward_std": 0.11091871077523512, |
|
"eval_rewards/accuracy_reward": 0.11764706145314609, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 87.2908, |
|
"eval_samples_per_second": 1.134, |
|
"eval_steps_per_second": 0.103, |
|
"step": 1000 |
|
}, |
|
{ |
|
"completion_length": 591.775019454956, |
|
"epoch": 0.6659212324512361, |
|
"grad_norm": 0.32740344668707216, |
|
"kl": 0.218963623046875, |
|
"learning_rate": 6.061242437507131e-06, |
|
"loss": 0.0088, |
|
"reward": 0.1104166692122817, |
|
"reward_std": 0.10311973933130503, |
|
"rewards/accuracy_reward": 0.1104166692122817, |
|
"rewards/format_reward": 0.0, |
|
"step": 1005 |
|
}, |
|
{ |
|
"completion_length": 578.4812664031982, |
|
"epoch": 0.669234273408705, |
|
"grad_norm": 0.2198210321894444, |
|
"kl": 0.179779052734375, |
|
"learning_rate": 5.955188788489583e-06, |
|
"loss": 0.0072, |
|
"reward": 0.1729166705161333, |
|
"reward_std": 0.12668996527791024, |
|
"rewards/accuracy_reward": 0.1729166705161333, |
|
"rewards/format_reward": 0.0, |
|
"step": 1010 |
|
}, |
|
{ |
|
"completion_length": 633.9875148773193, |
|
"epoch": 0.6725473143661739, |
|
"grad_norm": 0.24817253204515607, |
|
"kl": 0.180908203125, |
|
"learning_rate": 5.849676309365786e-06, |
|
"loss": 0.0072, |
|
"reward": 0.1395833369344473, |
|
"reward_std": 0.09722718279808759, |
|
"rewards/accuracy_reward": 0.1395833369344473, |
|
"rewards/format_reward": 0.0, |
|
"step": 1015 |
|
}, |
|
{ |
|
"completion_length": 625.5687683105468, |
|
"epoch": 0.6758603553236426, |
|
"grad_norm": 0.26537873271258117, |
|
"kl": 0.197027587890625, |
|
"learning_rate": 5.744719117031217e-06, |
|
"loss": 0.0079, |
|
"reward": 0.15833333767950536, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/accuracy_reward": 0.15833333767950536, |
|
"rewards/format_reward": 0.0, |
|
"step": 1020 |
|
}, |
|
{ |
|
"completion_length": 642.2458499908447, |
|
"epoch": 0.6791733962811115, |
|
"grad_norm": 0.2811132286389355, |
|
"kl": 0.227191162109375, |
|
"learning_rate": 5.6403312540875325e-06, |
|
"loss": 0.0091, |
|
"reward": 0.10416666939854621, |
|
"reward_std": 0.09428090453147889, |
|
"rewards/accuracy_reward": 0.10416666939854621, |
|
"rewards/format_reward": 0.0, |
|
"step": 1025 |
|
}, |
|
{ |
|
"completion_length": 605.6250198364257, |
|
"epoch": 0.6824864372385804, |
|
"grad_norm": 0.29826246716685967, |
|
"kl": 0.1943115234375, |
|
"learning_rate": 5.536526686963762e-06, |
|
"loss": 0.0078, |
|
"reward": 0.14375000391155482, |
|
"reward_std": 0.12079740893095732, |
|
"rewards/accuracy_reward": 0.14375000391155482, |
|
"rewards/format_reward": 0.0, |
|
"step": 1030 |
|
}, |
|
{ |
|
"completion_length": 572.9312675476074, |
|
"epoch": 0.6857994781960493, |
|
"grad_norm": 0.19368256939132425, |
|
"kl": 0.156805419921875, |
|
"learning_rate": 5.433319304047666e-06, |
|
"loss": 0.0063, |
|
"reward": 0.15208333730697632, |
|
"reward_std": 0.09722718261182309, |
|
"rewards/accuracy_reward": 0.15208333730697632, |
|
"rewards/format_reward": 0.0, |
|
"step": 1035 |
|
}, |
|
{ |
|
"completion_length": 556.6062641143799, |
|
"epoch": 0.689112519153518, |
|
"grad_norm": 0.3992963343728779, |
|
"kl": 0.2367431640625, |
|
"learning_rate": 5.330722913827594e-06, |
|
"loss": 0.0095, |
|
"reward": 0.17291667088866233, |
|
"reward_std": 0.10311973914504051, |
|
"rewards/accuracy_reward": 0.17291667088866233, |
|
"rewards/format_reward": 0.0, |
|
"step": 1040 |
|
}, |
|
{ |
|
"completion_length": 560.204182434082, |
|
"epoch": 0.6924255601109869, |
|
"grad_norm": 0.3434721576041372, |
|
"kl": 0.209393310546875, |
|
"learning_rate": 5.228751243044961e-06, |
|
"loss": 0.0084, |
|
"reward": 0.16041667051613331, |
|
"reward_std": 0.12079740893095732, |
|
"rewards/accuracy_reward": 0.16041667051613331, |
|
"rewards/format_reward": 0.0, |
|
"step": 1045 |
|
}, |
|
{ |
|
"completion_length": 605.5666820526124, |
|
"epoch": 0.6957386010684558, |
|
"grad_norm": 0.2306193550339744, |
|
"kl": 0.222296142578125, |
|
"learning_rate": 5.127417934857718e-06, |
|
"loss": 0.0089, |
|
"reward": 0.13333333600312472, |
|
"reward_std": 0.123743686825037, |
|
"rewards/accuracy_reward": 0.13333333600312472, |
|
"rewards/format_reward": 0.0, |
|
"step": 1050 |
|
}, |
|
{ |
|
"completion_length": 526.712515258789, |
|
"epoch": 0.6990516420259245, |
|
"grad_norm": 0.6065517464028967, |
|
"kl": 0.213092041015625, |
|
"learning_rate": 5.026736547014981e-06, |
|
"loss": 0.0085, |
|
"reward": 0.13958333674818277, |
|
"reward_std": 0.12079740893095732, |
|
"rewards/accuracy_reward": 0.13958333674818277, |
|
"rewards/format_reward": 0.0, |
|
"step": 1055 |
|
}, |
|
{ |
|
"completion_length": 547.4145992279052, |
|
"epoch": 0.7023646829833934, |
|
"grad_norm": 0.28705779880249954, |
|
"kl": 0.23438720703125, |
|
"learning_rate": 4.926720550043089e-06, |
|
"loss": 0.0094, |
|
"reward": 0.1166666692122817, |
|
"reward_std": 0.08249579127877951, |
|
"rewards/accuracy_reward": 0.1166666692122817, |
|
"rewards/format_reward": 0.0, |
|
"step": 1060 |
|
}, |
|
{ |
|
"completion_length": 534.8395999908447, |
|
"epoch": 0.7056777239408623, |
|
"grad_norm": 0.2921526699948002, |
|
"kl": 0.238226318359375, |
|
"learning_rate": 4.827383325443331e-06, |
|
"loss": 0.0095, |
|
"reward": 0.1479166703298688, |
|
"reward_std": 0.12079740893095732, |
|
"rewards/accuracy_reward": 0.1479166703298688, |
|
"rewards/format_reward": 0.0, |
|
"step": 1065 |
|
}, |
|
{ |
|
"completion_length": 556.1479335784912, |
|
"epoch": 0.708990764898331, |
|
"grad_norm": 0.3462960302006907, |
|
"kl": 0.23875732421875, |
|
"learning_rate": 4.728738163901597e-06, |
|
"loss": 0.0095, |
|
"reward": 0.13541667014360428, |
|
"reward_std": 0.10311973933130503, |
|
"rewards/accuracy_reward": 0.13541667014360428, |
|
"rewards/format_reward": 0.0, |
|
"step": 1070 |
|
}, |
|
{ |
|
"completion_length": 561.583348083496, |
|
"epoch": 0.7123038058557999, |
|
"grad_norm": 0.3220348313143619, |
|
"kl": 0.186767578125, |
|
"learning_rate": 4.630798263510162e-06, |
|
"loss": 0.0075, |
|
"reward": 0.14375000353902578, |
|
"reward_std": 0.12079740893095732, |
|
"rewards/accuracy_reward": 0.14375000353902578, |
|
"rewards/format_reward": 0.0, |
|
"step": 1075 |
|
}, |
|
{ |
|
"completion_length": 574.2791835784913, |
|
"epoch": 0.7156168468132688, |
|
"grad_norm": 0.361410974713343, |
|
"kl": 0.313189697265625, |
|
"learning_rate": 4.533576728001858e-06, |
|
"loss": 0.0125, |
|
"reward": 0.1458333371207118, |
|
"reward_std": 0.10606601759791374, |
|
"rewards/accuracy_reward": 0.1458333371207118, |
|
"rewards/format_reward": 0.0, |
|
"step": 1080 |
|
}, |
|
{ |
|
"completion_length": 584.5437686920166, |
|
"epoch": 0.7189298877707375, |
|
"grad_norm": 0.3433650794021713, |
|
"kl": 0.16435546875, |
|
"learning_rate": 4.437086564996891e-06, |
|
"loss": 0.0066, |
|
"reward": 0.1479166705161333, |
|
"reward_std": 0.12079740893095732, |
|
"rewards/accuracy_reward": 0.1479166705161333, |
|
"rewards/format_reward": 0.0, |
|
"step": 1085 |
|
}, |
|
{ |
|
"completion_length": 589.4520965576172, |
|
"epoch": 0.7222429287282064, |
|
"grad_norm": 0.28664499371278185, |
|
"kl": 0.362109375, |
|
"learning_rate": 4.341340684262498e-06, |
|
"loss": 0.0145, |
|
"reward": 0.16666667070239782, |
|
"reward_std": 0.12963624354451894, |
|
"rewards/accuracy_reward": 0.16666667070239782, |
|
"rewards/format_reward": 0.0, |
|
"step": 1090 |
|
}, |
|
{ |
|
"completion_length": 589.1021018981934, |
|
"epoch": 0.7255559696856753, |
|
"grad_norm": 0.6726902176410475, |
|
"kl": 0.17657470703125, |
|
"learning_rate": 4.246351895985702e-06, |
|
"loss": 0.0071, |
|
"reward": 0.1687500037252903, |
|
"reward_std": 0.14436763487756252, |
|
"rewards/accuracy_reward": 0.1687500037252903, |
|
"rewards/format_reward": 0.0, |
|
"step": 1095 |
|
}, |
|
{ |
|
"completion_length": 608.1708499908448, |
|
"epoch": 0.728869010643144, |
|
"grad_norm": 0.19762245990532015, |
|
"kl": 0.151568603515625, |
|
"learning_rate": 4.152132909059402e-06, |
|
"loss": 0.0061, |
|
"reward": 0.15000000316649675, |
|
"reward_std": 0.10017346087843179, |
|
"rewards/accuracy_reward": 0.15000000316649675, |
|
"rewards/format_reward": 0.0, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.728869010643144, |
|
"eval_completion_length": 567.0490381577436, |
|
"eval_kl": 0.16104664522058823, |
|
"eval_loss": 0.006509152241051197, |
|
"eval_reward": 0.18627451579360402, |
|
"eval_reward_std": 0.1386483881403418, |
|
"eval_rewards/accuracy_reward": 0.18627451579360402, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 85.6746, |
|
"eval_samples_per_second": 1.156, |
|
"eval_steps_per_second": 0.105, |
|
"step": 1100 |
|
}, |
|
{ |
|
"completion_length": 586.3041831970215, |
|
"epoch": 0.7321820516006129, |
|
"grad_norm": 0.26497157645231056, |
|
"kl": 0.153692626953125, |
|
"learning_rate": 4.058696329381987e-06, |
|
"loss": 0.0061, |
|
"reward": 0.22500000540167092, |
|
"reward_std": 0.11785113029181957, |
|
"rewards/accuracy_reward": 0.22500000540167092, |
|
"rewards/format_reward": 0.0, |
|
"step": 1105 |
|
}, |
|
{ |
|
"completion_length": 640.6333503723145, |
|
"epoch": 0.7354950925580818, |
|
"grad_norm": 0.175788774023308, |
|
"kl": 0.155810546875, |
|
"learning_rate": 3.966054658170754e-06, |
|
"loss": 0.0062, |
|
"reward": 0.12916667014360428, |
|
"reward_std": 0.11195857413113117, |
|
"rewards/accuracy_reward": 0.12916667014360428, |
|
"rewards/format_reward": 0.0, |
|
"step": 1110 |
|
}, |
|
{ |
|
"completion_length": 614.8416843414307, |
|
"epoch": 0.7388081335155506, |
|
"grad_norm": 0.39182645951451167, |
|
"kl": 0.19365234375, |
|
"learning_rate": 3.874220290289337e-06, |
|
"loss": 0.0077, |
|
"reward": 0.17500000391155482, |
|
"reward_std": 0.11785113047808408, |
|
"rewards/accuracy_reward": 0.17500000391155482, |
|
"rewards/format_reward": 0.0, |
|
"step": 1115 |
|
}, |
|
{ |
|
"completion_length": 597.1479331970215, |
|
"epoch": 0.7421211744730194, |
|
"grad_norm": 0.3847191431848802, |
|
"kl": 0.201531982421875, |
|
"learning_rate": 3.7832055125893318e-06, |
|
"loss": 0.0081, |
|
"reward": 0.13750000316649674, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/accuracy_reward": 0.13750000316649674, |
|
"rewards/format_reward": 0.0, |
|
"step": 1120 |
|
}, |
|
{ |
|
"completion_length": 584.2437702178955, |
|
"epoch": 0.7454342154304883, |
|
"grad_norm": 0.42174504116546924, |
|
"kl": 0.158538818359375, |
|
"learning_rate": 3.6930225022664136e-06, |
|
"loss": 0.0063, |
|
"reward": 0.17500000409781932, |
|
"reward_std": 0.12374368719756604, |
|
"rewards/accuracy_reward": 0.17500000409781932, |
|
"rewards/format_reward": 0.0, |
|
"step": 1125 |
|
}, |
|
{ |
|
"completion_length": 580.464596939087, |
|
"epoch": 0.7487472563879571, |
|
"grad_norm": 0.2445425181891289, |
|
"kl": 0.18485107421875, |
|
"learning_rate": 3.6036833252310887e-06, |
|
"loss": 0.0074, |
|
"reward": 0.18750000447034837, |
|
"reward_std": 0.12374368719756604, |
|
"rewards/accuracy_reward": 0.18750000447034837, |
|
"rewards/format_reward": 0.0, |
|
"step": 1130 |
|
}, |
|
{ |
|
"completion_length": 580.1958534240723, |
|
"epoch": 0.7520602973454259, |
|
"grad_norm": 0.36465004329450484, |
|
"kl": 0.198504638671875, |
|
"learning_rate": 3.515199934494373e-06, |
|
"loss": 0.0079, |
|
"reward": 0.1625000037252903, |
|
"reward_std": 0.14142135679721832, |
|
"rewards/accuracy_reward": 0.1625000037252903, |
|
"rewards/format_reward": 0.0, |
|
"step": 1135 |
|
}, |
|
{ |
|
"completion_length": 601.0125175476074, |
|
"epoch": 0.7553733383028948, |
|
"grad_norm": 0.2483965145552254, |
|
"kl": 0.170068359375, |
|
"learning_rate": 3.427584168568535e-06, |
|
"loss": 0.0068, |
|
"reward": 0.1687500037252903, |
|
"reward_std": 0.12079740893095732, |
|
"rewards/accuracy_reward": 0.1687500037252903, |
|
"rewards/format_reward": 0.0, |
|
"step": 1140 |
|
}, |
|
{ |
|
"completion_length": 599.8041851043702, |
|
"epoch": 0.7586863792603636, |
|
"grad_norm": 0.21066990688347378, |
|
"kl": 0.18428955078125, |
|
"learning_rate": 3.3408477498831917e-06, |
|
"loss": 0.0074, |
|
"reward": 0.1333333369344473, |
|
"reward_std": 0.10606601759791374, |
|
"rewards/accuracy_reward": 0.1333333369344473, |
|
"rewards/format_reward": 0.0, |
|
"step": 1145 |
|
}, |
|
{ |
|
"completion_length": 586.0396009445191, |
|
"epoch": 0.7619994202178324, |
|
"grad_norm": 0.28233525146520194, |
|
"kl": 0.170452880859375, |
|
"learning_rate": 3.2550022832169125e-06, |
|
"loss": 0.0068, |
|
"reward": 0.18125000465661287, |
|
"reward_std": 0.10311973933130503, |
|
"rewards/accuracy_reward": 0.18125000465661287, |
|
"rewards/format_reward": 0.0, |
|
"step": 1150 |
|
}, |
|
{ |
|
"completion_length": 590.0333492279053, |
|
"epoch": 0.7653124611753013, |
|
"grad_norm": 0.3931119421182273, |
|
"kl": 0.184490966796875, |
|
"learning_rate": 3.170059254144593e-06, |
|
"loss": 0.0074, |
|
"reward": 0.17291667070239783, |
|
"reward_std": 0.1266899650916457, |
|
"rewards/accuracy_reward": 0.17291667070239783, |
|
"rewards/format_reward": 0.0, |
|
"step": 1155 |
|
}, |
|
{ |
|
"completion_length": 609.7041851043701, |
|
"epoch": 0.7686255021327701, |
|
"grad_norm": 0.25191777564937673, |
|
"kl": 0.2096435546875, |
|
"learning_rate": 3.086030027500728e-06, |
|
"loss": 0.0084, |
|
"reward": 0.11666666995733976, |
|
"reward_std": 0.0766032349318266, |
|
"rewards/accuracy_reward": 0.11666666995733976, |
|
"rewards/format_reward": 0.0, |
|
"step": 1160 |
|
}, |
|
{ |
|
"completion_length": 585.3791835784912, |
|
"epoch": 0.7719385430902389, |
|
"grad_norm": 0.48874888580232095, |
|
"kl": 0.167364501953125, |
|
"learning_rate": 3.002925845858905e-06, |
|
"loss": 0.0067, |
|
"reward": 0.19166667181998492, |
|
"reward_std": 0.14142135679721832, |
|
"rewards/accuracy_reward": 0.19166667181998492, |
|
"rewards/format_reward": 0.0, |
|
"step": 1165 |
|
}, |
|
{ |
|
"completion_length": 603.406270980835, |
|
"epoch": 0.7752515840477078, |
|
"grad_norm": 0.31203818869127303, |
|
"kl": 0.171881103515625, |
|
"learning_rate": 2.920757828027586e-06, |
|
"loss": 0.0069, |
|
"reward": 0.16250000298023223, |
|
"reward_std": 0.11195857394486666, |
|
"rewards/accuracy_reward": 0.16250000298023223, |
|
"rewards/format_reward": 0.0, |
|
"step": 1170 |
|
}, |
|
{ |
|
"completion_length": 597.1666839599609, |
|
"epoch": 0.7785646250051766, |
|
"grad_norm": 0.22045928574301496, |
|
"kl": 0.1704345703125, |
|
"learning_rate": 2.839536967562504e-06, |
|
"loss": 0.0068, |
|
"reward": 0.16250000428408384, |
|
"reward_std": 0.10606601759791374, |
|
"rewards/accuracy_reward": 0.16250000428408384, |
|
"rewards/format_reward": 0.0, |
|
"step": 1175 |
|
}, |
|
{ |
|
"completion_length": 582.6396030426025, |
|
"epoch": 0.7818776659626455, |
|
"grad_norm": 0.31490753838871305, |
|
"kl": 0.1723388671875, |
|
"learning_rate": 2.759274131295787e-06, |
|
"loss": 0.0069, |
|
"reward": 0.17708333749324084, |
|
"reward_std": 0.14436763506382705, |
|
"rewards/accuracy_reward": 0.17708333749324084, |
|
"rewards/format_reward": 0.0, |
|
"step": 1180 |
|
}, |
|
{ |
|
"completion_length": 607.9500186920166, |
|
"epoch": 0.7851907069201143, |
|
"grad_norm": 0.2500499737554532, |
|
"kl": 0.198590087890625, |
|
"learning_rate": 2.679980057882049e-06, |
|
"loss": 0.0079, |
|
"reward": 0.14791667107492684, |
|
"reward_std": 0.09722718279808759, |
|
"rewards/accuracy_reward": 0.14791667107492684, |
|
"rewards/format_reward": 0.0, |
|
"step": 1185 |
|
}, |
|
{ |
|
"completion_length": 590.0312683105469, |
|
"epoch": 0.7885037478775831, |
|
"grad_norm": 0.3890063180094862, |
|
"kl": 0.21483154296875, |
|
"learning_rate": 2.60166535636162e-06, |
|
"loss": 0.0086, |
|
"reward": 0.12916667032986878, |
|
"reward_std": 0.12963624373078347, |
|
"rewards/accuracy_reward": 0.12916667032986878, |
|
"rewards/format_reward": 0.0, |
|
"step": 1190 |
|
}, |
|
{ |
|
"completion_length": 592.3979331970215, |
|
"epoch": 0.791816788835052, |
|
"grad_norm": 0.34955868788134764, |
|
"kl": 0.20556640625, |
|
"learning_rate": 2.5243405047411353e-06, |
|
"loss": 0.0082, |
|
"reward": 0.15208333730697632, |
|
"reward_std": 0.13258252199739218, |
|
"rewards/accuracy_reward": 0.15208333730697632, |
|
"rewards/format_reward": 0.0, |
|
"step": 1195 |
|
}, |
|
{ |
|
"completion_length": 602.2000190734864, |
|
"epoch": 0.7951298297925208, |
|
"grad_norm": 0.320862144828303, |
|
"kl": 0.1858642578125, |
|
"learning_rate": 2.448015848591638e-06, |
|
"loss": 0.0074, |
|
"reward": 0.16458333786576987, |
|
"reward_std": 0.16204530466347933, |
|
"rewards/accuracy_reward": 0.16458333786576987, |
|
"rewards/format_reward": 0.0, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.7951298297925208, |
|
"eval_completion_length": 595.4313946892233, |
|
"eval_kl": 0.16259765625, |
|
"eval_loss": 0.006604722701013088, |
|
"eval_reward": 0.20588235907694874, |
|
"eval_reward_std": 0.12478355011519264, |
|
"eval_rewards/accuracy_reward": 0.20588235907694874, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 82.8847, |
|
"eval_samples_per_second": 1.194, |
|
"eval_steps_per_second": 0.109, |
|
"step": 1200 |
|
}, |
|
{ |
|
"completion_length": 613.1854331970214, |
|
"epoch": 0.7984428707499897, |
|
"grad_norm": 0.2661889633504761, |
|
"kl": 0.18961181640625, |
|
"learning_rate": 2.3727015996644043e-06, |
|
"loss": 0.0076, |
|
"reward": 0.12708333656191825, |
|
"reward_std": 0.10311973933130503, |
|
"rewards/accuracy_reward": 0.12708333656191825, |
|
"rewards/format_reward": 0.0, |
|
"step": 1205 |
|
}, |
|
{ |
|
"completion_length": 594.962519454956, |
|
"epoch": 0.8017559117074585, |
|
"grad_norm": 0.33195445099683457, |
|
"kl": 0.17481689453125, |
|
"learning_rate": 2.298407834524682e-06, |
|
"loss": 0.007, |
|
"reward": 0.16041667051613331, |
|
"reward_std": 0.12079740893095732, |
|
"rewards/accuracy_reward": 0.16041667051613331, |
|
"rewards/format_reward": 0.0, |
|
"step": 1210 |
|
}, |
|
{ |
|
"completion_length": 562.3083518981933, |
|
"epoch": 0.8050689526649273, |
|
"grad_norm": 0.36379765983184786, |
|
"kl": 1.331353759765625, |
|
"learning_rate": 2.2251444932035094e-06, |
|
"loss": 0.0532, |
|
"reward": 0.1875000050291419, |
|
"reward_std": 0.13552880007773638, |
|
"rewards/accuracy_reward": 0.1875000050291419, |
|
"rewards/format_reward": 0.0, |
|
"step": 1215 |
|
}, |
|
{ |
|
"completion_length": 535.5166809082032, |
|
"epoch": 0.8083819936223962, |
|
"grad_norm": 0.22606794243792797, |
|
"kl": 0.16956787109375, |
|
"learning_rate": 2.1529213778677993e-06, |
|
"loss": 0.0068, |
|
"reward": 0.1666666703298688, |
|
"reward_std": 0.10606601759791374, |
|
"rewards/accuracy_reward": 0.1666666703298688, |
|
"rewards/format_reward": 0.0, |
|
"step": 1220 |
|
}, |
|
{ |
|
"completion_length": 559.0458503723145, |
|
"epoch": 0.811695034579865, |
|
"grad_norm": 0.3145793732636785, |
|
"kl": 0.15970458984375, |
|
"learning_rate": 2.081748151508883e-06, |
|
"loss": 0.0064, |
|
"reward": 0.18750000335276126, |
|
"reward_std": 0.10606601741164923, |
|
"rewards/accuracy_reward": 0.18750000335276126, |
|
"rewards/format_reward": 0.0, |
|
"step": 1225 |
|
}, |
|
{ |
|
"completion_length": 546.4958501815796, |
|
"epoch": 0.8150080755373338, |
|
"grad_norm": 0.1573772953559393, |
|
"kl": 0.17611083984375, |
|
"learning_rate": 2.0116343366496493e-06, |
|
"loss": 0.007, |
|
"reward": 0.14791667070239783, |
|
"reward_std": 0.08544206973165273, |
|
"rewards/accuracy_reward": 0.14791667070239783, |
|
"rewards/format_reward": 0.0, |
|
"step": 1230 |
|
}, |
|
{ |
|
"completion_length": 549.3166814804077, |
|
"epoch": 0.8183211164948027, |
|
"grad_norm": 0.45017301026338924, |
|
"kl": 0.1666015625, |
|
"learning_rate": 1.942589314070494e-06, |
|
"loss": 0.0067, |
|
"reward": 0.18541667126119138, |
|
"reward_std": 0.12668996546417474, |
|
"rewards/accuracy_reward": 0.18541667126119138, |
|
"rewards/format_reward": 0.0, |
|
"step": 1235 |
|
}, |
|
{ |
|
"completion_length": 595.6729320526123, |
|
"epoch": 0.8216341574522715, |
|
"grad_norm": 0.2792656285293251, |
|
"kl": 0.1830078125, |
|
"learning_rate": 1.8746223215542482e-06, |
|
"loss": 0.0073, |
|
"reward": 0.16041666977107524, |
|
"reward_std": 0.10901229567825794, |
|
"rewards/accuracy_reward": 0.16041666977107524, |
|
"rewards/format_reward": 0.0, |
|
"step": 1240 |
|
}, |
|
{ |
|
"completion_length": 589.0687671661377, |
|
"epoch": 0.8249471984097403, |
|
"grad_norm": 0.34000796378112624, |
|
"kl": 0.1674072265625, |
|
"learning_rate": 1.8077424526501964e-06, |
|
"loss": 0.0067, |
|
"reward": 0.16875000428408385, |
|
"reward_std": 0.15026019159704446, |
|
"rewards/accuracy_reward": 0.16875000428408385, |
|
"rewards/format_reward": 0.0, |
|
"step": 1245 |
|
}, |
|
{ |
|
"completion_length": 590.1375190734864, |
|
"epoch": 0.8282602393672092, |
|
"grad_norm": 0.1827720037966532, |
|
"kl": 0.1564697265625, |
|
"learning_rate": 1.7419586554574364e-06, |
|
"loss": 0.0063, |
|
"reward": 0.15833333674818278, |
|
"reward_std": 0.11195857413113117, |
|
"rewards/accuracy_reward": 0.15833333674818278, |
|
"rewards/format_reward": 0.0, |
|
"step": 1250 |
|
}, |
|
{ |
|
"completion_length": 547.691682434082, |
|
"epoch": 0.831573280324678, |
|
"grad_norm": 0.2333616789960661, |
|
"kl": 0.148370361328125, |
|
"learning_rate": 1.6772797314276712e-06, |
|
"loss": 0.0059, |
|
"reward": 0.18750000428408384, |
|
"reward_std": 0.12963624373078347, |
|
"rewards/accuracy_reward": 0.18750000428408384, |
|
"rewards/format_reward": 0.0, |
|
"step": 1255 |
|
}, |
|
{ |
|
"completion_length": 590.2541835784912, |
|
"epoch": 0.8348863212821469, |
|
"grad_norm": 0.4458824925413339, |
|
"kl": 0.154559326171875, |
|
"learning_rate": 1.6137143341876439e-06, |
|
"loss": 0.0062, |
|
"reward": 0.19166667051613331, |
|
"reward_std": 0.12374368701130152, |
|
"rewards/accuracy_reward": 0.19166667051613331, |
|
"rewards/format_reward": 0.0, |
|
"step": 1260 |
|
}, |
|
{ |
|
"completion_length": 578.4666858673096, |
|
"epoch": 0.8381993622396157, |
|
"grad_norm": 0.2507774814936557, |
|
"kl": 0.16309814453125, |
|
"learning_rate": 1.5512709683813165e-06, |
|
"loss": 0.0065, |
|
"reward": 0.17083333767950534, |
|
"reward_std": 0.11195857413113117, |
|
"rewards/accuracy_reward": 0.17083333767950534, |
|
"rewards/format_reward": 0.0, |
|
"step": 1265 |
|
}, |
|
{ |
|
"completion_length": 578.7125144958496, |
|
"epoch": 0.8415124031970845, |
|
"grad_norm": 0.29941656086318214, |
|
"kl": 0.2466796875, |
|
"learning_rate": 1.4899579885320237e-06, |
|
"loss": 0.0099, |
|
"reward": 0.1958333371207118, |
|
"reward_std": 0.12374368719756604, |
|
"rewards/accuracy_reward": 0.1958333371207118, |
|
"rewards/format_reward": 0.0, |
|
"step": 1270 |
|
}, |
|
{ |
|
"completion_length": 611.9833530426025, |
|
"epoch": 0.8448254441545534, |
|
"grad_norm": 0.5120540878644168, |
|
"kl": 0.161627197265625, |
|
"learning_rate": 1.4297835979246777e-06, |
|
"loss": 0.0065, |
|
"reward": 0.11875000316649675, |
|
"reward_std": 0.09722718279808759, |
|
"rewards/accuracy_reward": 0.11875000316649675, |
|
"rewards/format_reward": 0.0, |
|
"step": 1275 |
|
}, |
|
{ |
|
"completion_length": 554.3375129699707, |
|
"epoch": 0.8481384851120222, |
|
"grad_norm": 0.24272356502293516, |
|
"kl": 0.166864013671875, |
|
"learning_rate": 1.370755847508226e-06, |
|
"loss": 0.0067, |
|
"reward": 0.1750000048428774, |
|
"reward_std": 0.12374368719756604, |
|
"rewards/accuracy_reward": 0.1750000048428774, |
|
"rewards/format_reward": 0.0, |
|
"step": 1280 |
|
}, |
|
{ |
|
"completion_length": 566.3208484649658, |
|
"epoch": 0.851451526069491, |
|
"grad_norm": 0.33649577414263443, |
|
"kl": 0.176239013671875, |
|
"learning_rate": 1.3128826348184886e-06, |
|
"loss": 0.007, |
|
"reward": 0.21458333749324082, |
|
"reward_std": 0.15615274775773286, |
|
"rewards/accuracy_reward": 0.21458333749324082, |
|
"rewards/format_reward": 0.0, |
|
"step": 1285 |
|
}, |
|
{ |
|
"completion_length": 570.1416812896729, |
|
"epoch": 0.8547645670269599, |
|
"grad_norm": 0.2521899833365297, |
|
"kl": 0.175726318359375, |
|
"learning_rate": 1.256171702921516e-06, |
|
"loss": 0.007, |
|
"reward": 0.1979166716337204, |
|
"reward_std": 0.13847507834434508, |
|
"rewards/accuracy_reward": 0.1979166716337204, |
|
"rewards/format_reward": 0.0, |
|
"step": 1290 |
|
}, |
|
{ |
|
"completion_length": 585.5687664031982, |
|
"epoch": 0.8580776079844287, |
|
"grad_norm": 0.26502078855241107, |
|
"kl": 0.19686279296875, |
|
"learning_rate": 1.200630639377609e-06, |
|
"loss": 0.0079, |
|
"reward": 0.15625000316649676, |
|
"reward_std": 0.10901229567825794, |
|
"rewards/accuracy_reward": 0.15625000316649676, |
|
"rewards/format_reward": 0.0, |
|
"step": 1295 |
|
}, |
|
{ |
|
"completion_length": 595.7041839599609, |
|
"epoch": 0.8613906489418975, |
|
"grad_norm": 0.24839009821426125, |
|
"kl": 0.20594482421875, |
|
"learning_rate": 1.1462668752261652e-06, |
|
"loss": 0.0082, |
|
"reward": 0.17083333767950534, |
|
"reward_std": 0.13552880007773638, |
|
"rewards/accuracy_reward": 0.17083333767950534, |
|
"rewards/format_reward": 0.0, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.8613906489418975, |
|
"eval_completion_length": 596.6029609231388, |
|
"eval_kl": 0.24149816176470587, |
|
"eval_loss": 0.009774969890713692, |
|
"eval_reward": 0.13235294380608728, |
|
"eval_reward_std": 0.09012145286097246, |
|
"eval_rewards/accuracy_reward": 0.13235294380608728, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 84.0185, |
|
"eval_samples_per_second": 1.178, |
|
"eval_steps_per_second": 0.107, |
|
"step": 1300 |
|
}, |
|
{ |
|
"completion_length": 579.6208499908447, |
|
"epoch": 0.8647036898993664, |
|
"grad_norm": 0.23156140004960932, |
|
"kl": 0.208966064453125, |
|
"learning_rate": 1.0930876839914418e-06, |
|
"loss": 0.0084, |
|
"reward": 0.16458333730697633, |
|
"reward_std": 0.12668996527791024, |
|
"rewards/accuracy_reward": 0.16458333730697633, |
|
"rewards/format_reward": 0.0, |
|
"step": 1305 |
|
}, |
|
{ |
|
"completion_length": 581.4416831970215, |
|
"epoch": 0.8680167308568352, |
|
"grad_norm": 0.28452284185332194, |
|
"kl": 0.2054443359375, |
|
"learning_rate": 1.04110018070941e-06, |
|
"loss": 0.0082, |
|
"reward": 0.16666667088866233, |
|
"reward_std": 0.12374368719756604, |
|
"rewards/accuracy_reward": 0.16666667088866233, |
|
"rewards/format_reward": 0.0, |
|
"step": 1310 |
|
}, |
|
{ |
|
"completion_length": 597.6521030426026, |
|
"epoch": 0.871329771814304, |
|
"grad_norm": 0.2454825397955897, |
|
"kl": 0.21685791015625, |
|
"learning_rate": 9.903113209758098e-07, |
|
"loss": 0.0087, |
|
"reward": 0.11458333618938923, |
|
"reward_std": 0.09133462607860565, |
|
"rewards/accuracy_reward": 0.11458333618938923, |
|
"rewards/format_reward": 0.0, |
|
"step": 1315 |
|
}, |
|
{ |
|
"completion_length": 571.9479333877564, |
|
"epoch": 0.8746428127717729, |
|
"grad_norm": 0.3731994534971755, |
|
"kl": 0.22552490234375, |
|
"learning_rate": 9.407279000155311e-07, |
|
"loss": 0.009, |
|
"reward": 0.15000000335276126, |
|
"reward_std": 0.09428090434521437, |
|
"rewards/accuracy_reward": 0.15000000335276126, |
|
"rewards/format_reward": 0.0, |
|
"step": 1320 |
|
}, |
|
{ |
|
"completion_length": 599.3375160217286, |
|
"epoch": 0.8779558537292417, |
|
"grad_norm": 0.312791764382651, |
|
"kl": 0.19879150390625, |
|
"learning_rate": 8.923565517734633e-07, |
|
"loss": 0.0079, |
|
"reward": 0.10416666958481073, |
|
"reward_std": 0.07071067839860916, |
|
"rewards/accuracy_reward": 0.10416666958481073, |
|
"rewards/format_reward": 0.0, |
|
"step": 1325 |
|
}, |
|
{ |
|
"completion_length": 568.7083507537842, |
|
"epoch": 0.8812688946867105, |
|
"grad_norm": 0.25551838738681937, |
|
"kl": 0.21905517578125, |
|
"learning_rate": 8.452037480269082e-07, |
|
"loss": 0.0088, |
|
"reward": 0.20000000447034835, |
|
"reward_std": 0.11195857413113117, |
|
"rewards/accuracy_reward": 0.20000000447034835, |
|
"rewards/format_reward": 0.0, |
|
"step": 1330 |
|
}, |
|
{ |
|
"completion_length": 544.368769454956, |
|
"epoch": 0.8845819356441794, |
|
"grad_norm": 0.26604711389347285, |
|
"kl": 0.16533203125, |
|
"learning_rate": 7.992757975196974e-07, |
|
"loss": 0.0066, |
|
"reward": 0.1708333371207118, |
|
"reward_std": 0.10606601759791374, |
|
"rewards/accuracy_reward": 0.1708333371207118, |
|
"rewards/format_reward": 0.0, |
|
"step": 1335 |
|
}, |
|
{ |
|
"completion_length": 561.4187679290771, |
|
"epoch": 0.8878949766016483, |
|
"grad_norm": 0.3433532227955832, |
|
"kl": 0.174151611328125, |
|
"learning_rate": 7.545788451181313e-07, |
|
"loss": 0.007, |
|
"reward": 0.17916667070239783, |
|
"reward_std": 0.12963624373078347, |
|
"rewards/accuracy_reward": 0.17916667070239783, |
|
"rewards/format_reward": 0.0, |
|
"step": 1340 |
|
}, |
|
{ |
|
"completion_length": 574.2041835784912, |
|
"epoch": 0.891208017559117, |
|
"grad_norm": 0.271754666321704, |
|
"kl": 0.160504150390625, |
|
"learning_rate": 7.11118870988825e-07, |
|
"loss": 0.0064, |
|
"reward": 0.13750000298023224, |
|
"reward_std": 0.10017346106469631, |
|
"rewards/accuracy_reward": 0.13750000298023224, |
|
"rewards/format_reward": 0.0, |
|
"step": 1345 |
|
}, |
|
{ |
|
"completion_length": 567.6791843414306, |
|
"epoch": 0.8945210585165859, |
|
"grad_norm": 0.3130447466581642, |
|
"kl": 0.164306640625, |
|
"learning_rate": 6.689016897986123e-07, |
|
"loss": 0.0066, |
|
"reward": 0.14166666977107525, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/accuracy_reward": 0.14166666977107525, |
|
"rewards/format_reward": 0.0, |
|
"step": 1350 |
|
}, |
|
{ |
|
"completion_length": 585.5437667846679, |
|
"epoch": 0.8978340994740548, |
|
"grad_norm": 0.19957089977636847, |
|
"kl": 0.1621734619140625, |
|
"learning_rate": 6.279329499365649e-07, |
|
"loss": 0.0065, |
|
"reward": 0.17500000447034836, |
|
"reward_std": 0.10017346106469631, |
|
"rewards/accuracy_reward": 0.17500000447034836, |
|
"rewards/format_reward": 0.0, |
|
"step": 1355 |
|
}, |
|
{ |
|
"completion_length": 568.7625167846679, |
|
"epoch": 0.9011471404315236, |
|
"grad_norm": 0.24101026537901626, |
|
"kl": 0.144744873046875, |
|
"learning_rate": 5.88218132758287e-07, |
|
"loss": 0.0058, |
|
"reward": 0.15833333730697632, |
|
"reward_std": 0.12963624335825444, |
|
"rewards/accuracy_reward": 0.15833333730697632, |
|
"rewards/format_reward": 0.0, |
|
"step": 1360 |
|
}, |
|
{ |
|
"completion_length": 553.6021018981934, |
|
"epoch": 0.9044601813889924, |
|
"grad_norm": 0.31615520494124105, |
|
"kl": 0.160650634765625, |
|
"learning_rate": 5.497625518525374e-07, |
|
"loss": 0.0064, |
|
"reward": 0.12291666995733977, |
|
"reward_std": 0.10311973933130503, |
|
"rewards/accuracy_reward": 0.12291666995733977, |
|
"rewards/format_reward": 0.0, |
|
"step": 1365 |
|
}, |
|
{ |
|
"completion_length": 563.3395992279053, |
|
"epoch": 0.9077732223464613, |
|
"grad_norm": 0.26385870637392084, |
|
"kl": 0.143408203125, |
|
"learning_rate": 5.125713523303133e-07, |
|
"loss": 0.0057, |
|
"reward": 0.19583333767950534, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/accuracy_reward": 0.19583333767950534, |
|
"rewards/format_reward": 0.0, |
|
"step": 1370 |
|
}, |
|
{ |
|
"completion_length": 583.402102279663, |
|
"epoch": 0.91108626330393, |
|
"grad_norm": 0.20653175003039265, |
|
"kl": 0.169732666015625, |
|
"learning_rate": 4.7664951013645875e-07, |
|
"loss": 0.0068, |
|
"reward": 0.15625000409781933, |
|
"reward_std": 0.10311973914504051, |
|
"rewards/accuracy_reward": 0.15625000409781933, |
|
"rewards/format_reward": 0.0, |
|
"step": 1375 |
|
}, |
|
{ |
|
"completion_length": 576.2833488464355, |
|
"epoch": 0.9143993042613989, |
|
"grad_norm": 0.33444825058309613, |
|
"kl": 0.15838623046875, |
|
"learning_rate": 4.420018313839147e-07, |
|
"loss": 0.0063, |
|
"reward": 0.17708333767950535, |
|
"reward_std": 0.1384750785306096, |
|
"rewards/accuracy_reward": 0.17708333767950535, |
|
"rewards/format_reward": 0.0, |
|
"step": 1380 |
|
}, |
|
{ |
|
"completion_length": 564.4771015167237, |
|
"epoch": 0.9177123452188678, |
|
"grad_norm": 0.38336632998772435, |
|
"kl": 0.175140380859375, |
|
"learning_rate": 4.086329517107046e-07, |
|
"loss": 0.007, |
|
"reward": 0.19583333786576987, |
|
"reward_std": 0.14731391333043575, |
|
"rewards/accuracy_reward": 0.19583333786576987, |
|
"rewards/format_reward": 0.0, |
|
"step": 1385 |
|
}, |
|
{ |
|
"completion_length": 579.0646015167237, |
|
"epoch": 0.9210253861763366, |
|
"grad_norm": 0.2664468063435788, |
|
"kl": 0.169805908203125, |
|
"learning_rate": 3.7654733565969826e-07, |
|
"loss": 0.0068, |
|
"reward": 0.16041666977107524, |
|
"reward_std": 0.13258252162486314, |
|
"rewards/accuracy_reward": 0.16041666977107524, |
|
"rewards/format_reward": 0.0, |
|
"step": 1390 |
|
}, |
|
{ |
|
"completion_length": 572.5937644958497, |
|
"epoch": 0.9243384271338054, |
|
"grad_norm": 0.29069246784999064, |
|
"kl": 0.172479248046875, |
|
"learning_rate": 3.457492760812975e-07, |
|
"loss": 0.0069, |
|
"reward": 0.20833333749324084, |
|
"reward_std": 0.10606601759791374, |
|
"rewards/accuracy_reward": 0.20833333749324084, |
|
"rewards/format_reward": 0.0, |
|
"step": 1395 |
|
}, |
|
{ |
|
"completion_length": 582.8104347229004, |
|
"epoch": 0.9276514680912743, |
|
"grad_norm": 0.3338040728206735, |
|
"kl": 0.18824462890625, |
|
"learning_rate": 3.1624289355907334e-07, |
|
"loss": 0.0075, |
|
"reward": 0.18125000428408383, |
|
"reward_std": 0.12668996546417474, |
|
"rewards/accuracy_reward": 0.18125000428408383, |
|
"rewards/format_reward": 0.0, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.9276514680912743, |
|
"eval_completion_length": 557.0784409466912, |
|
"eval_kl": 0.15639361213235295, |
|
"eval_loss": 0.006327385548502207, |
|
"eval_reward": 0.1960784367778722, |
|
"eval_reward_std": 0.13864838857861125, |
|
"eval_rewards/accuracy_reward": 0.1960784367778722, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 86.7655, |
|
"eval_samples_per_second": 1.141, |
|
"eval_steps_per_second": 0.104, |
|
"step": 1400 |
|
}, |
|
{ |
|
"completion_length": 589.4166835784912, |
|
"epoch": 0.9309645090487431, |
|
"grad_norm": 0.353780019491935, |
|
"kl": 0.178558349609375, |
|
"learning_rate": 2.8803213585846036e-07, |
|
"loss": 0.0071, |
|
"reward": 0.16458333767950534, |
|
"reward_std": 0.11490485221147537, |
|
"rewards/accuracy_reward": 0.16458333767950534, |
|
"rewards/format_reward": 0.0, |
|
"step": 1405 |
|
}, |
|
{ |
|
"completion_length": 579.3000144958496, |
|
"epoch": 0.9342775500062119, |
|
"grad_norm": 0.34672099101912396, |
|
"kl": 0.158709716796875, |
|
"learning_rate": 2.6112077739857465e-07, |
|
"loss": 0.0063, |
|
"reward": 0.15833333674818278, |
|
"reward_std": 0.12374368701130152, |
|
"rewards/accuracy_reward": 0.15833333674818278, |
|
"rewards/format_reward": 0.0, |
|
"step": 1410 |
|
}, |
|
{ |
|
"completion_length": 578.6437679290772, |
|
"epoch": 0.9375905909636808, |
|
"grad_norm": 0.2557223102198384, |
|
"kl": 0.175360107421875, |
|
"learning_rate": 2.3551241874721353e-07, |
|
"loss": 0.007, |
|
"reward": 0.11666666977107525, |
|
"reward_std": 0.0766032349318266, |
|
"rewards/accuracy_reward": 0.11666666977107525, |
|
"rewards/format_reward": 0.0, |
|
"step": 1415 |
|
}, |
|
{ |
|
"completion_length": 564.343769454956, |
|
"epoch": 0.9409036319211497, |
|
"grad_norm": 0.31991282109346414, |
|
"kl": 0.162542724609375, |
|
"learning_rate": 2.1121048613912843e-07, |
|
"loss": 0.0065, |
|
"reward": 0.19583333749324083, |
|
"reward_std": 0.14142135679721832, |
|
"rewards/accuracy_reward": 0.19583333749324083, |
|
"rewards/format_reward": 0.0, |
|
"step": 1420 |
|
}, |
|
{ |
|
"completion_length": 574.3541858673095, |
|
"epoch": 0.9442166728786184, |
|
"grad_norm": 0.28826580975238403, |
|
"kl": 0.19622802734375, |
|
"learning_rate": 1.8821823101760949e-07, |
|
"loss": 0.0079, |
|
"reward": 0.1812500037252903, |
|
"reward_std": 0.12079740893095732, |
|
"rewards/accuracy_reward": 0.1812500037252903, |
|
"rewards/format_reward": 0.0, |
|
"step": 1425 |
|
}, |
|
{ |
|
"completion_length": 562.9854351043701, |
|
"epoch": 0.9475297138360873, |
|
"grad_norm": 0.34971148204114416, |
|
"kl": 0.149298095703125, |
|
"learning_rate": 1.665387295994747e-07, |
|
"loss": 0.006, |
|
"reward": 0.1541666705161333, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/accuracy_reward": 0.1541666705161333, |
|
"rewards/format_reward": 0.0, |
|
"step": 1430 |
|
}, |
|
{ |
|
"completion_length": 562.6541831970214, |
|
"epoch": 0.9508427547935562, |
|
"grad_norm": 0.36546930503986563, |
|
"kl": 0.158831787109375, |
|
"learning_rate": 1.4617488246348012e-07, |
|
"loss": 0.0064, |
|
"reward": 0.20208333767950534, |
|
"reward_std": 0.13258252199739218, |
|
"rewards/accuracy_reward": 0.20208333767950534, |
|
"rewards/format_reward": 0.0, |
|
"step": 1435 |
|
}, |
|
{ |
|
"completion_length": 590.1208526611329, |
|
"epoch": 0.9541557957510249, |
|
"grad_norm": 0.208162230921283, |
|
"kl": 0.180303955078125, |
|
"learning_rate": 1.271294141622459e-07, |
|
"loss": 0.0072, |
|
"reward": 0.1395833369344473, |
|
"reward_std": 0.08544206973165273, |
|
"rewards/accuracy_reward": 0.1395833369344473, |
|
"rewards/format_reward": 0.0, |
|
"step": 1440 |
|
}, |
|
{ |
|
"completion_length": 587.8396018981933, |
|
"epoch": 0.9574688367084938, |
|
"grad_norm": 0.2594006165397625, |
|
"kl": 0.173162841796875, |
|
"learning_rate": 1.094048728577346e-07, |
|
"loss": 0.0069, |
|
"reward": 0.14583333674818277, |
|
"reward_std": 0.10017346106469631, |
|
"rewards/accuracy_reward": 0.14583333674818277, |
|
"rewards/format_reward": 0.0, |
|
"step": 1445 |
|
}, |
|
{ |
|
"completion_length": 567.8000190734863, |
|
"epoch": 0.9607818776659627, |
|
"grad_norm": 0.32429036574157794, |
|
"kl": 0.1535888671875, |
|
"learning_rate": 9.300362998030832e-08, |
|
"loss": 0.0061, |
|
"reward": 0.17291667070239783, |
|
"reward_std": 0.12668996546417474, |
|
"rewards/accuracy_reward": 0.17291667070239783, |
|
"rewards/format_reward": 0.0, |
|
"step": 1450 |
|
}, |
|
{ |
|
"completion_length": 565.8521003723145, |
|
"epoch": 0.9640949186234314, |
|
"grad_norm": 0.2978650282959838, |
|
"kl": 0.150830078125, |
|
"learning_rate": 7.792787991146356e-08, |
|
"loss": 0.006, |
|
"reward": 0.1958333382382989, |
|
"reward_std": 0.11195857394486666, |
|
"rewards/accuracy_reward": 0.1958333382382989, |
|
"rewards/format_reward": 0.0, |
|
"step": 1455 |
|
}, |
|
{ |
|
"completion_length": 561.8375183105469, |
|
"epoch": 0.9674079595809003, |
|
"grad_norm": 0.2897071588264315, |
|
"kl": 0.1702880859375, |
|
"learning_rate": 6.417963969022389e-08, |
|
"loss": 0.0068, |
|
"reward": 0.21250000540167094, |
|
"reward_std": 0.13552880026400088, |
|
"rewards/accuracy_reward": 0.21250000540167094, |
|
"rewards/format_reward": 0.0, |
|
"step": 1460 |
|
}, |
|
{ |
|
"completion_length": 586.7416843414306, |
|
"epoch": 0.9707210005383692, |
|
"grad_norm": 0.38925090389086237, |
|
"kl": 0.15640869140625, |
|
"learning_rate": 5.176074874327919e-08, |
|
"loss": 0.0063, |
|
"reward": 0.18541667088866234, |
|
"reward_std": 0.1384750785306096, |
|
"rewards/accuracy_reward": 0.18541667088866234, |
|
"rewards/format_reward": 0.0, |
|
"step": 1465 |
|
}, |
|
{ |
|
"completion_length": 561.2791835784913, |
|
"epoch": 0.9740340414958379, |
|
"grad_norm": 0.34721664371729727, |
|
"kl": 0.1898681640625, |
|
"learning_rate": 4.067286863888131e-08, |
|
"loss": 0.0076, |
|
"reward": 0.1895833384245634, |
|
"reward_std": 0.15026019141077995, |
|
"rewards/accuracy_reward": 0.1895833384245634, |
|
"rewards/format_reward": 0.0, |
|
"step": 1470 |
|
}, |
|
{ |
|
"completion_length": 590.7000167846679, |
|
"epoch": 0.9773470824533068, |
|
"grad_norm": 0.2964515178540727, |
|
"kl": 0.168414306640625, |
|
"learning_rate": 3.091748286453866e-08, |
|
"loss": 0.0067, |
|
"reward": 0.1541666707023978, |
|
"reward_std": 0.13552880026400088, |
|
"rewards/accuracy_reward": 0.1541666707023978, |
|
"rewards/format_reward": 0.0, |
|
"step": 1475 |
|
}, |
|
{ |
|
"completion_length": 567.4812641143799, |
|
"epoch": 0.9806601234107757, |
|
"grad_norm": 0.22436357518659494, |
|
"kl": 0.182574462890625, |
|
"learning_rate": 2.2495896628529355e-08, |
|
"loss": 0.0073, |
|
"reward": 0.1833333382382989, |
|
"reward_std": 0.12374368719756604, |
|
"rewards/accuracy_reward": 0.1833333382382989, |
|
"rewards/format_reward": 0.0, |
|
"step": 1480 |
|
}, |
|
{ |
|
"completion_length": 576.3062686920166, |
|
"epoch": 0.9839731643682446, |
|
"grad_norm": 0.2682867728590303, |
|
"kl": 0.160321044921875, |
|
"learning_rate": 1.5409236685277608e-08, |
|
"loss": 0.0064, |
|
"reward": 0.15625000335276126, |
|
"reward_std": 0.13258252199739218, |
|
"rewards/accuracy_reward": 0.15625000335276126, |
|
"rewards/format_reward": 0.0, |
|
"step": 1485 |
|
}, |
|
{ |
|
"completion_length": 578.2854351043701, |
|
"epoch": 0.9872862053257133, |
|
"grad_norm": 0.3291475555349833, |
|
"kl": 0.1758056640625, |
|
"learning_rate": 9.658451184600959e-09, |
|
"loss": 0.007, |
|
"reward": 0.16041667032986878, |
|
"reward_std": 0.13258252181112767, |
|
"rewards/accuracy_reward": 0.16041667032986878, |
|
"rewards/format_reward": 0.0, |
|
"step": 1490 |
|
}, |
|
{ |
|
"completion_length": 577.6250164031983, |
|
"epoch": 0.9905992462831822, |
|
"grad_norm": 0.28807275157813206, |
|
"kl": 0.1435791015625, |
|
"learning_rate": 5.2443095448506674e-09, |
|
"loss": 0.0057, |
|
"reward": 0.16875000428408385, |
|
"reward_std": 0.12079740893095732, |
|
"rewards/accuracy_reward": 0.16875000428408385, |
|
"rewards/format_reward": 0.0, |
|
"step": 1495 |
|
}, |
|
{ |
|
"completion_length": 565.6062662124634, |
|
"epoch": 0.9939122872406511, |
|
"grad_norm": 0.34303096364094354, |
|
"kl": 0.1605682373046875, |
|
"learning_rate": 2.167402349972925e-09, |
|
"loss": 0.0064, |
|
"reward": 0.19791667070239782, |
|
"reward_std": 0.1679378604516387, |
|
"rewards/accuracy_reward": 0.19791667070239782, |
|
"rewards/format_reward": 0.0, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.9939122872406511, |
|
"eval_completion_length": 554.8088396857767, |
|
"eval_kl": 0.18235868566176472, |
|
"eval_loss": 0.007235472556203604, |
|
"eval_reward": 0.21078431869254394, |
|
"eval_reward_std": 0.13171596868949778, |
|
"eval_rewards/accuracy_reward": 0.21078431869254394, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 83.8442, |
|
"eval_samples_per_second": 1.181, |
|
"eval_steps_per_second": 0.107, |
|
"step": 1500 |
|
}, |
|
{ |
|
"completion_length": 581.8271011352539, |
|
"epoch": 0.9972253281981198, |
|
"grad_norm": 0.21304650345487455, |
|
"kl": 0.1817626953125, |
|
"learning_rate": 4.2814127048873553e-10, |
|
"loss": 0.0073, |
|
"reward": 0.2020833371207118, |
|
"reward_std": 0.15026019141077995, |
|
"rewards/accuracy_reward": 0.2020833371207118, |
|
"rewards/format_reward": 0.0, |
|
"step": 1505 |
|
}, |
|
{ |
|
"completion_length": 576.2317910194397, |
|
"epoch": 0.9998757609640949, |
|
"kl": 0.15965652465820312, |
|
"reward": 0.21093750488944352, |
|
"reward_std": 0.1215339784976095, |
|
"rewards/accuracy_reward": 0.21093750488944352, |
|
"rewards/format_reward": 0.0, |
|
"step": 1509, |
|
"total_flos": 0.0, |
|
"train_loss": 0.020143496153197993, |
|
"train_runtime": 113002.1292, |
|
"train_samples_per_second": 0.641, |
|
"train_steps_per_second": 0.013 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1509, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|