{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998757609640949, "eval_steps": 100, "global_step": 1509, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 547.2583505630494, "epoch": 0.0033130409574688366, "grad_norm": 0.8986426559690995, "kl": 0.00011590123176574707, "learning_rate": 6.622516556291392e-07, "loss": 0.0, "reward": 0.4578872682526708, "reward_std": 0.49882074976339935, "rewards/accuracy_reward": 0.1979166718199849, "rewards/cosine_scaled_reward": -0.07614052973804064, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.3361111299134791, "step": 5 }, { "completion_length": 542.4854345321655, "epoch": 0.006626081914937673, "grad_norm": 1.4547975592321127, "kl": 0.0002397090196609497, "learning_rate": 1.3245033112582784e-06, "loss": 0.0, "reward": 0.44909522240632216, "reward_std": 0.5454491914715618, "rewards/accuracy_reward": 0.20000000465661288, "rewards/cosine_scaled_reward": -0.06965479369682726, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.31875001527369023, "step": 10 }, { "completion_length": 565.6958498001098, "epoch": 0.00993912287240651, "grad_norm": 1.0157502001683818, "kl": 0.0002588212490081787, "learning_rate": 1.9867549668874175e-06, "loss": 0.0, "reward": 0.5228961614891887, "reward_std": 0.4716573230922222, "rewards/accuracy_reward": 0.20000000465661288, "rewards/cosine_scaled_reward": -0.06738163912377786, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.39027779633179305, "step": 15 }, { "completion_length": 580.3354320526123, "epoch": 0.013252163829875346, "grad_norm": 0.8203971253087375, "kl": 0.0018373370170593263, "learning_rate": 2.6490066225165567e-06, "loss": 0.0001, "reward": 0.5548897766973824, "reward_std": 0.5027584770694375, "rewards/accuracy_reward": 0.20833333861082792, "rewards/cosine_scaled_reward": -0.06524913519970141, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.4118055749684572, "step": 20 }, { "completion_length": 584.9479331970215, "epoch": 0.016565204787344183, "grad_norm": 0.7674240830324554, "kl": 0.007563400268554688, "learning_rate": 3.311258278145696e-06, "loss": 0.0003, "reward": 0.5626075943931937, "reward_std": 0.47724440647289157, "rewards/accuracy_reward": 0.21458333861082793, "rewards/cosine_scaled_reward": -0.0610035399266053, "rewards/format_reward": 0.002083333395421505, "rewards/reasoning_steps_reward": 0.4069444644264877, "step": 25 }, { "completion_length": 608.933349609375, "epoch": 0.01987824574481302, "grad_norm": 0.7330645414163709, "kl": 0.0036226749420166016, "learning_rate": 3.973509933774835e-06, "loss": 0.0001, "reward": 0.7388441126793623, "reward_std": 0.5278409453574568, "rewards/accuracy_reward": 0.268750006519258, "rewards/cosine_scaled_reward": -0.007683685462689027, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.47777780089527366, "step": 30 }, { "completion_length": 678.1875186920166, "epoch": 0.023191286702281856, "grad_norm": 0.7178503291549978, "kl": 0.006669425964355468, "learning_rate": 4.635761589403974e-06, "loss": 0.0003, "reward": 0.7627530416473747, "reward_std": 0.5493246266618371, "rewards/accuracy_reward": 0.21666667144745588, "rewards/cosine_scaled_reward": -0.0441914358350914, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.590277810767293, "step": 35 }, { "completion_length": 692.6146026611328, "epoch": 0.026504327659750693, "grad_norm": 0.7681929928897241, "kl": 0.010077667236328126, "learning_rate": 5.2980132450331135e-06, "loss": 0.0004, "reward": 0.8794046219438314, "reward_std": 0.43218649495393036, "rewards/accuracy_reward": 0.22083333935588598, "rewards/cosine_scaled_reward": -0.0372620829322841, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.6958333738148212, "step": 40 }, { "completion_length": 707.8916873931885, "epoch": 0.02981736861721953, "grad_norm": 0.6613367538742354, "kl": 0.01708984375, "learning_rate": 5.960264900662252e-06, "loss": 0.0007, "reward": 1.0503661509603262, "reward_std": 0.42615648852661253, "rewards/accuracy_reward": 0.2604166720062494, "rewards/cosine_scaled_reward": 0.0003661068942165002, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.7895833693444729, "step": 45 }, { "completion_length": 711.1687713623047, "epoch": 0.033130409574688366, "grad_norm": 0.6995404944499967, "kl": 0.02463531494140625, "learning_rate": 6.622516556291392e-06, "loss": 0.001, "reward": 1.153285625949502, "reward_std": 0.48801933908835055, "rewards/accuracy_reward": 0.2562500057742, "rewards/cosine_scaled_reward": 0.026202249895140993, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8708333753049373, "step": 50 }, { "completion_length": 704.6083515167236, "epoch": 0.0364434505321572, "grad_norm": 0.701294558257362, "kl": 0.029018402099609375, "learning_rate": 7.28476821192053e-06, "loss": 0.0012, "reward": 1.1085936680436135, "reward_std": 0.33183646101970227, "rewards/accuracy_reward": 0.2041666718199849, "rewards/cosine_scaled_reward": -0.010156369340256788, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9145833604037762, "step": 55 }, { "completion_length": 669.9291900634765, "epoch": 0.03975649148962604, "grad_norm": 0.7163748198816227, "kl": 0.0327178955078125, "learning_rate": 7.94701986754967e-06, "loss": 0.0013, "reward": 1.1198216035962105, "reward_std": 0.44725179062224923, "rewards/accuracy_reward": 0.21250000465661287, "rewards/cosine_scaled_reward": 0.010793794272467494, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8965278092771769, "step": 60 }, { "completion_length": 628.4291805267334, "epoch": 0.043069532447094876, "grad_norm": 0.7340068885080956, "kl": 0.0371856689453125, "learning_rate": 8.609271523178809e-06, "loss": 0.0015, "reward": 1.0585288569331168, "reward_std": 0.40788733437657354, "rewards/accuracy_reward": 0.20416667144745587, "rewards/cosine_scaled_reward": -0.015776735625695436, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8701389238238335, "step": 65 }, { "completion_length": 705.650016784668, "epoch": 0.04638257340456371, "grad_norm": 0.7775239926804596, "kl": 0.04803619384765625, "learning_rate": 9.271523178807948e-06, "loss": 0.0019, "reward": 1.0490481086075305, "reward_std": 0.4179804825922474, "rewards/accuracy_reward": 0.1875000052154064, "rewards/cosine_scaled_reward": -0.04331303813378327, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9048611462116242, "step": 70 }, { "completion_length": 733.939603805542, "epoch": 0.04969561436203255, "grad_norm": 0.6694211632473527, "kl": 0.0597625732421875, "learning_rate": 9.933774834437086e-06, "loss": 0.0024, "reward": 1.1030454210937024, "reward_std": 0.3557716819923371, "rewards/accuracy_reward": 0.19375000484287738, "rewards/cosine_scaled_reward": -0.04903793919365853, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9583333536982537, "step": 75 }, { "completion_length": 713.7625190734864, "epoch": 0.053008655319501385, "grad_norm": 0.6679732945910538, "kl": 0.0666534423828125, "learning_rate": 1.0596026490066227e-05, "loss": 0.0027, "reward": 1.0669297687709332, "reward_std": 0.3843187237624079, "rewards/accuracy_reward": 0.16250000409781934, "rewards/cosine_scaled_reward": -0.06779248882085084, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9722222372889519, "step": 80 }, { "completion_length": 687.8125183105469, "epoch": 0.05632169627697022, "grad_norm": 0.718469855101791, "kl": 0.069549560546875, "learning_rate": 1.1258278145695364e-05, "loss": 0.0028, "reward": 1.031835040077567, "reward_std": 0.3375629215617664, "rewards/accuracy_reward": 0.15000000391155482, "rewards/cosine_scaled_reward": -0.06191499504493549, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9437500238418579, "step": 85 }, { "completion_length": 677.968773651123, "epoch": 0.05963473723443906, "grad_norm": 0.6952114166553607, "kl": 0.081231689453125, "learning_rate": 1.1920529801324505e-05, "loss": 0.0032, "reward": 0.9767402917146683, "reward_std": 0.37912332522682846, "rewards/accuracy_reward": 0.1395833369344473, "rewards/cosine_scaled_reward": -0.08089861996413675, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9180555887520313, "step": 90 }, { "completion_length": 746.2979343414306, "epoch": 0.0629477781919079, "grad_norm": 0.6723631089581203, "kl": 0.0932037353515625, "learning_rate": 1.2582781456953644e-05, "loss": 0.0037, "reward": 0.9129165161401034, "reward_std": 0.2931412507314235, "rewards/accuracy_reward": 0.08750000260770321, "rewards/cosine_scaled_reward": -0.1197224066912895, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9451389148831367, "step": 95 }, { "completion_length": 718.5521018981933, "epoch": 0.06626081914937673, "grad_norm": 0.7389417855175663, "kl": 0.097125244140625, "learning_rate": 1.3245033112582784e-05, "loss": 0.0039, "reward": 0.9509775288403034, "reward_std": 0.30276247672736645, "rewards/accuracy_reward": 0.08750000204890966, "rewards/cosine_scaled_reward": -0.09971694777195808, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9631944663822651, "step": 100 }, { "epoch": 0.06626081914937673, "eval_completion_length": 677.9068818933823, "eval_kl": 0.10776654411764706, "eval_loss": 0.0042822412215173244, "eval_reward": 1.091089995468364, "eval_reward_std": 0.31282253870192694, "eval_rewards/accuracy_reward": 0.15196078665116253, "eval_rewards/cosine_scaled_reward": -0.03799499067313531, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9771242001477409, "eval_runtime": 84.9934, "eval_samples_per_second": 1.165, "eval_steps_per_second": 0.106, "step": 100 }, { "completion_length": 707.7458526611329, "epoch": 0.06957386010684558, "grad_norm": 112588.77048160137, "kl": 358.50518188476565, "learning_rate": 1.3907284768211921e-05, "loss": 14.353, "reward": 1.0418762266635895, "reward_std": 0.3698788687819615, "rewards/accuracy_reward": 0.13750000353902578, "rewards/cosine_scaled_reward": -0.0602071454290126, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9645833536982537, "step": 105 }, { "completion_length": 709.3520999908447, "epoch": 0.0728869010643144, "grad_norm": 2.4125892421397426, "kl": 0.183880615234375, "learning_rate": 1.456953642384106e-05, "loss": 0.0074, "reward": 0.8919016800820827, "reward_std": 0.23831837207544596, "rewards/accuracy_reward": 0.05416666828095913, "rewards/cosine_scaled_reward": -0.12823724588379265, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9659722425043583, "step": 110 }, { "completion_length": 704.5229377746582, "epoch": 0.07619994202178325, "grad_norm": 3.8262682559687327, "kl": 0.150738525390625, "learning_rate": 1.52317880794702e-05, "loss": 0.006, "reward": 0.9812888875603676, "reward_std": 0.28556704440852626, "rewards/accuracy_reward": 0.08958333525806665, "rewards/cosine_scaled_reward": -0.06523891516262666, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9569444641470909, "step": 115 }, { "completion_length": 707.235432434082, "epoch": 0.07951298297925208, "grad_norm": 0.7028359955724703, "kl": 0.158172607421875, "learning_rate": 1.589403973509934e-05, "loss": 0.0063, "reward": 0.9831134401261806, "reward_std": 0.376210459030699, "rewards/accuracy_reward": 0.11666666958481073, "rewards/cosine_scaled_reward": -0.04188658911152743, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9083333566784859, "step": 120 }, { "completion_length": 669.6041843414307, "epoch": 0.08282602393672092, "grad_norm": 0.7161321894207794, "kl": 0.141949462890625, "learning_rate": 1.6556291390728477e-05, "loss": 0.0057, "reward": 0.8790107492357493, "reward_std": 0.3467244952917099, "rewards/accuracy_reward": 0.08541666902601719, "rewards/cosine_scaled_reward": -0.09529483446385711, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8888889268040657, "step": 125 }, { "completion_length": 689.345851135254, "epoch": 0.08613906489418975, "grad_norm": 0.8296412895947082, "kl": 241.67970581054686, "learning_rate": 1.7218543046357617e-05, "loss": 9.7154, "reward": 0.8923697136342525, "reward_std": 0.275027786870487, "rewards/accuracy_reward": 0.08333333563059568, "rewards/cosine_scaled_reward": -0.13054698700434528, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9395833604037762, "step": 130 }, { "completion_length": 754.8604354858398, "epoch": 0.0894521058516586, "grad_norm": 1.11431074259932, "kl": 0.213934326171875, "learning_rate": 1.7880794701986758e-05, "loss": 0.0086, "reward": 0.8914617910981179, "reward_std": 0.25304291686043145, "rewards/accuracy_reward": 0.052083334513008596, "rewards/cosine_scaled_reward": -0.11756601680535823, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9569444641470909, "step": 135 }, { "completion_length": 706.718773651123, "epoch": 0.09276514680912742, "grad_norm": 2.6197216794244094, "kl": 0.197021484375, "learning_rate": 1.8543046357615895e-05, "loss": 0.0079, "reward": 0.8964800350368023, "reward_std": 0.25830878962296994, "rewards/accuracy_reward": 0.05208333469927311, "rewards/cosine_scaled_reward": -0.12088110755430534, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9652777940034867, "step": 140 }, { "completion_length": 619.495846939087, "epoch": 0.09607818776659627, "grad_norm": 3645.6405236903074, "kl": 7.352001953125, "learning_rate": 1.9205298013245036e-05, "loss": 0.2935, "reward": 0.9159494280815125, "reward_std": 0.34424629651475697, "rewards/accuracy_reward": 0.08125000223517417, "rewards/cosine_scaled_reward": -0.11668948565493338, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9513889029622078, "step": 145 }, { "completion_length": 518.2521003723144, "epoch": 0.0993912287240651, "grad_norm": 5.571628636500797, "kl": 6.260498046875, "learning_rate": 1.9867549668874173e-05, "loss": 0.2497, "reward": 0.8889851249754429, "reward_std": 0.377276139636524, "rewards/accuracy_reward": 0.0958333358168602, "rewards/cosine_scaled_reward": -0.1394871225958923, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9326389119029045, "step": 150 }, { "completion_length": 456.8520965576172, "epoch": 0.10270426968153394, "grad_norm": 8.594579288315838, "kl": 1.3730712890625, "learning_rate": 1.999957185872951e-05, "loss": 0.055, "reward": 0.7455938890576362, "reward_std": 0.31601569671183827, "rewards/accuracy_reward": 0.05625000167638063, "rewards/cosine_scaled_reward": -0.22454502481268718, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9138889156281949, "step": 155 }, { "completion_length": 449.2375144958496, "epoch": 0.10601731063900277, "grad_norm": 48.524435493723146, "kl": 1.4756103515625, "learning_rate": 1.999783259765003e-05, "loss": 0.059, "reward": 0.733327554166317, "reward_std": 0.3105776087380946, "rewards/accuracy_reward": 0.0541666679084301, "rewards/cosine_scaled_reward": -0.21875581443309783, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8979167003184557, "step": 160 }, { "completion_length": 643.7875164031982, "epoch": 0.10933035159647161, "grad_norm": 13.898710966497136, "kl": 0.92965087890625, "learning_rate": 1.9994755690455154e-05, "loss": 0.0372, "reward": 0.758381636068225, "reward_std": 0.3267297722399235, "rewards/accuracy_reward": 0.054166668094694614, "rewards/cosine_scaled_reward": -0.18467394965700806, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8888889133930207, "step": 165 }, { "completion_length": 759.6958511352539, "epoch": 0.11264339255394044, "grad_norm": 5.448571386230047, "kl": 0.31900634765625, "learning_rate": 1.99903415488154e-05, "loss": 0.0128, "reward": 0.9591526392847299, "reward_std": 0.3521417257492431, "rewards/accuracy_reward": 0.0958333358168602, "rewards/cosine_scaled_reward": -0.05473627850296907, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9180555880069733, "step": 170 }, { "completion_length": 702.1166870117188, "epoch": 0.11595643351140929, "grad_norm": 0.979463656825196, "kl": 0.30997314453125, "learning_rate": 1.9984590763314722e-05, "loss": 0.0124, "reward": 0.8980235911905765, "reward_std": 0.27625240066554396, "rewards/accuracy_reward": 0.07083333488553763, "rewards/cosine_scaled_reward": -0.08253199499840776, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9097222477197647, "step": 175 }, { "completion_length": 793.5208518981933, "epoch": 0.11926947446887812, "grad_norm": 1.0698933651829063, "kl": 0.24013671875, "learning_rate": 1.997750410337147e-05, "loss": 0.0096, "reward": 0.9569418981671334, "reward_std": 0.26245332225225865, "rewards/accuracy_reward": 0.07708333525806665, "rewards/cosine_scaled_reward": -0.0944470182817895, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9743055656552315, "step": 180 }, { "completion_length": 714.4875190734863, "epoch": 0.12258251542634696, "grad_norm": 4082.5381992062476, "kl": 4.44453125, "learning_rate": 1.9969082517135463e-05, "loss": 0.1771, "reward": 0.9678208701312542, "reward_std": 0.2673031146405265, "rewards/accuracy_reward": 0.07500000204890966, "rewards/cosine_scaled_reward": -0.07592915609129705, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9687500119209289, "step": 185 }, { "completion_length": 644.762520980835, "epoch": 0.1258955563838158, "grad_norm": 173.01479696996375, "kl": 1.59151611328125, "learning_rate": 1.995932713136112e-05, "loss": 0.0637, "reward": 0.8781557366251945, "reward_std": 0.2831448505516164, "rewards/accuracy_reward": 0.05833333488553762, "rewards/cosine_scaled_reward": -0.11003873231820763, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9298611305654049, "step": 190 }, { "completion_length": 575.5854351043702, "epoch": 0.12920859734128462, "grad_norm": 1.1564416677212213, "kl": 194.8779052734375, "learning_rate": 1.994823925125672e-05, "loss": 7.7893, "reward": 0.7465804230421782, "reward_std": 0.31073944554664196, "rewards/accuracy_reward": 0.03958333451300859, "rewards/cosine_scaled_reward": -0.18050292830448597, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8875000294297933, "step": 195 }, { "completion_length": 694.4604377746582, "epoch": 0.13252163829875346, "grad_norm": 11.001706999056374, "kl": 3.44097900390625, "learning_rate": 1.993582036030978e-05, "loss": 0.1373, "reward": 0.881216985732317, "reward_std": 0.2966700808610767, "rewards/accuracy_reward": 0.06458333525806666, "rewards/cosine_scaled_reward": -0.10072748601669446, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9173611357808114, "step": 200 }, { "epoch": 0.13252163829875346, "eval_completion_length": 821.8774737189798, "eval_kl": 114.56985294117646, "eval_loss": 4.744396209716797, "eval_reward": 0.9304876257391537, "eval_reward_std": 0.31497081150026884, "eval_rewards/accuracy_reward": 0.11274510227582034, "eval_rewards/cosine_scaled_reward": -0.11526403554222163, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.933006556595073, "eval_runtime": 87.4223, "eval_samples_per_second": 1.132, "eval_steps_per_second": 0.103, "step": 200 }, { "completion_length": 852.4646064758301, "epoch": 0.1358346792562223, "grad_norm": 12.02648798076259, "kl": 167.95625, "learning_rate": 1.9922072120088537e-05, "loss": 6.7495, "reward": 0.8722536094486714, "reward_std": 0.29699398775119334, "rewards/accuracy_reward": 0.07916666883975268, "rewards/cosine_scaled_reward": -0.15274642113945447, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9458333559334278, "step": 205 }, { "completion_length": 958.8625160217285, "epoch": 0.13914772021369115, "grad_norm": 2.118391117846703, "kl": 1.94228515625, "learning_rate": 1.9906996370019692e-05, "loss": 0.0777, "reward": 0.7352574944496155, "reward_std": 0.2676800017012283, "rewards/accuracy_reward": 0.05833333507180214, "rewards/cosine_scaled_reward": -0.2550203081838845, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9319444663822651, "step": 210 }, { "completion_length": 842.6812675476074, "epoch": 0.14246076117115997, "grad_norm": 5.021910520085136, "kl": 0.9705078125, "learning_rate": 1.989059512714227e-05, "loss": 0.0388, "reward": 0.8104459330439567, "reward_std": 0.20855108068790287, "rewards/accuracy_reward": 0.05833333507180214, "rewards/cosine_scaled_reward": -0.22219298486488698, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9743055701255798, "step": 215 }, { "completion_length": 814.9333541870117, "epoch": 0.1457738021286288, "grad_norm": 1.2340527302565767, "kl": 0.8401123046875, "learning_rate": 1.9872870585837757e-05, "loss": 0.0336, "reward": 0.8514514803886414, "reward_std": 0.22183397794142365, "rewards/accuracy_reward": 0.05208333469927311, "rewards/cosine_scaled_reward": -0.1853540993994102, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9847222313284874, "step": 220 }, { "completion_length": 855.1104377746582, "epoch": 0.14908684308609765, "grad_norm": 3.1109097307292806, "kl": 0.7322265625, "learning_rate": 1.9853825117536522e-05, "loss": 0.0293, "reward": 0.8423201210796833, "reward_std": 0.28028559472877534, "rewards/accuracy_reward": 0.06666666865348816, "rewards/cosine_scaled_reward": -0.17642990179592744, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9520833566784859, "step": 225 }, { "completion_length": 798.1166862487793, "epoch": 0.1523998840435665, "grad_norm": 2.2217097114702002, "kl": 0.7458984375, "learning_rate": 1.983346127040053e-05, "loss": 0.0299, "reward": 0.8415744449943304, "reward_std": 0.28888633460737767, "rewards/accuracy_reward": 0.05625000149011612, "rewards/cosine_scaled_reward": -0.14106446508085355, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9263889186084271, "step": 230 }, { "completion_length": 877.2771034240723, "epoch": 0.1557129250010353, "grad_norm": 1.3758324592869524, "kl": 0.630810546875, "learning_rate": 1.9811781768982392e-05, "loss": 0.0252, "reward": 0.7912506945431232, "reward_std": 0.2919431045651436, "rewards/accuracy_reward": 0.05416666828095913, "rewards/cosine_scaled_reward": -0.21083267419598997, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9479166895151139, "step": 235 }, { "completion_length": 849.1750221252441, "epoch": 0.15902596595850416, "grad_norm": 2.905524359766465, "kl": 0.723193359375, "learning_rate": 1.9788789513860875e-05, "loss": 0.0289, "reward": 0.7299305066466332, "reward_std": 0.20652824777644127, "rewards/accuracy_reward": 0.014583333767950535, "rewards/cosine_scaled_reward": -0.2547917439602315, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9701389014720917, "step": 240 }, { "completion_length": 848.8312675476075, "epoch": 0.162339006915973, "grad_norm": 588.3184991862764, "kl": 1.9171630859375, "learning_rate": 1.9764487581252787e-05, "loss": 0.0767, "reward": 0.7883537173271179, "reward_std": 0.2473137264372781, "rewards/accuracy_reward": 0.03333333432674408, "rewards/cosine_scaled_reward": -0.19150742106721735, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9465277999639511, "step": 245 }, { "completion_length": 893.8854400634766, "epoch": 0.16565204787344184, "grad_norm": 0.9361617050000692, "kl": 0.573095703125, "learning_rate": 1.9738879222601425e-05, "loss": 0.0229, "reward": 0.7728720743209123, "reward_std": 0.1881862965412438, "rewards/accuracy_reward": 0.018750000558793545, "rewards/cosine_scaled_reward": -0.19726684388588184, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9513889059424401, "step": 250 }, { "completion_length": 823.8416877746582, "epoch": 0.16896508883091066, "grad_norm": 0.8726402716561719, "kl": 0.54619140625, "learning_rate": 1.9711967864141542e-05, "loss": 0.0218, "reward": 0.842857076972723, "reward_std": 0.2485630498966202, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.14186516671907157, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.943055585771799, "step": 255 }, { "completion_length": 728.3729362487793, "epoch": 0.1722781297883795, "grad_norm": 0.7944406292942889, "kl": 0.5627685546875, "learning_rate": 1.968375710644093e-05, "loss": 0.0225, "reward": 0.9059418775141239, "reward_std": 0.26922084379475564, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.09336370739620178, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9368055798113346, "step": 260 }, { "completion_length": 763.4500190734864, "epoch": 0.17559117074584835, "grad_norm": 3.5384859223110934, "kl": 0.545751953125, "learning_rate": 1.9654250723918706e-05, "loss": 0.0218, "reward": 0.8759777404367923, "reward_std": 0.2127035611309111, "rewards/accuracy_reward": 0.05625000149011612, "rewards/cosine_scaled_reward": -0.14138339518103749, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9611111342906952, "step": 265 }, { "completion_length": 774.3666862487793, "epoch": 0.1789042117033172, "grad_norm": 3.8609409271792345, "kl": 0.6284912109375, "learning_rate": 1.9623452664340305e-05, "loss": 0.0251, "reward": 0.8751954860985279, "reward_std": 0.26233239779248835, "rewards/accuracy_reward": 0.05625000149011612, "rewards/cosine_scaled_reward": -0.14077677052118814, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.959722239524126, "step": 270 }, { "completion_length": 987.2000114440918, "epoch": 0.182217252660786, "grad_norm": 0.9651452421892472, "kl": 0.4758544921875, "learning_rate": 1.9591367048289297e-05, "loss": 0.019, "reward": 0.7257929421961308, "reward_std": 0.18397955526015722, "rewards/accuracy_reward": 0.03958333432674408, "rewards/cosine_scaled_reward": -0.2811515292618424, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9673611268401145, "step": 275 }, { "completion_length": 977.872932434082, "epoch": 0.18553029361825485, "grad_norm": 3.4763402895334465, "kl": 0.599560546875, "learning_rate": 1.9557998168616087e-05, "loss": 0.024, "reward": 0.7476332891732455, "reward_std": 0.24956524579320102, "rewards/accuracy_reward": 0.03333333432674408, "rewards/cosine_scaled_reward": -0.2079222982225474, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9222222462296485, "step": 280 }, { "completion_length": 914.1083526611328, "epoch": 0.1888433345757237, "grad_norm": 1.026272141063443, "kl": 0.85498046875, "learning_rate": 1.9523350489863545e-05, "loss": 0.0342, "reward": 0.7307914365082979, "reward_std": 0.30679253828711806, "rewards/accuracy_reward": 0.05208333469927311, "rewards/cosine_scaled_reward": -0.17684747959137895, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8555555917322636, "step": 285 }, { "completion_length": 908.7771080017089, "epoch": 0.19215637553319254, "grad_norm": 233.32991199504036, "kl": 3.9479736328125, "learning_rate": 1.9487428647669688e-05, "loss": 0.158, "reward": 0.6776140451431274, "reward_std": 0.30821275734342635, "rewards/accuracy_reward": 0.04375000130385161, "rewards/cosine_scaled_reward": -0.2522470945958048, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8861111424863338, "step": 290 }, { "completion_length": 856.7187698364257, "epoch": 0.19546941649066138, "grad_norm": 3.5644168232067917, "kl": 0.70654296875, "learning_rate": 1.9450237448147463e-05, "loss": 0.0282, "reward": 0.8137124609202147, "reward_std": 0.23462606756947935, "rewards/accuracy_reward": 0.0479166679084301, "rewards/cosine_scaled_reward": -0.18628756331745536, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9520833544433117, "step": 295 }, { "completion_length": 802.3104370117187, "epoch": 0.1987824574481302, "grad_norm": 0.9222175770663644, "kl": 0.6181884765625, "learning_rate": 1.9411781867241718e-05, "loss": 0.0247, "reward": 0.8852791294455529, "reward_std": 0.2076218407601118, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.09458201387897133, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9486111268401146, "step": 300 }, { "epoch": 0.1987824574481302, "eval_completion_length": 867.7745361328125, "eval_kl": 0.8291590073529411, "eval_loss": 0.03293205425143242, "eval_reward": 0.8574772932950188, "eval_reward_std": 0.26536867986707124, "eval_rewards/accuracy_reward": 0.034313726512824785, "eval_rewards/cosine_scaled_reward": -0.09023515945848297, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9133987356634701, "eval_runtime": 89.8763, "eval_samples_per_second": 1.102, "eval_steps_per_second": 0.1, "step": 300 }, { "completion_length": 862.4416877746582, "epoch": 0.20209549840559904, "grad_norm": 28.884606955886465, "kl": 1.3546875, "learning_rate": 1.937206705006344e-05, "loss": 0.0541, "reward": 0.7903165742754936, "reward_std": 0.25898472802946343, "rewards/accuracy_reward": 0.01666666716337204, "rewards/cosine_scaled_reward": -0.11871122810989618, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8923611469566822, "step": 305 }, { "completion_length": 827.6416873931885, "epoch": 0.20540853936306788, "grad_norm": 0.9862494765342581, "kl": 0.840625, "learning_rate": 1.9331098310201392e-05, "loss": 0.0336, "reward": 0.7768306069076061, "reward_std": 0.2837255179416388, "rewards/accuracy_reward": 0.02500000074505806, "rewards/cosine_scaled_reward": -0.1169194189074915, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8687500365078449, "step": 310 }, { "completion_length": 792.6521026611329, "epoch": 0.20872158032053673, "grad_norm": 2.363279046198838, "kl": 2.4453857421875, "learning_rate": 1.9288881129011177e-05, "loss": 0.0975, "reward": 0.7581698017194867, "reward_std": 0.2753759054467082, "rewards/accuracy_reward": 0.018750000558793545, "rewards/cosine_scaled_reward": -0.14599688613088802, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8854167036712169, "step": 315 }, { "completion_length": 887.1166854858399, "epoch": 0.21203462127800554, "grad_norm": 1.8536489928198185, "kl": 0.9609130859375, "learning_rate": 1.9245421154881873e-05, "loss": 0.0385, "reward": 0.7548770979046822, "reward_std": 0.2714485161937773, "rewards/accuracy_reward": 0.02916666753590107, "rewards/cosine_scaled_reward": -0.1888729233876802, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.914583358168602, "step": 320 }, { "completion_length": 779.2416854858399, "epoch": 0.21534766223547439, "grad_norm": 2.208860059222166, "kl": 0.9648681640625, "learning_rate": 1.9200724202480305e-05, "loss": 0.0386, "reward": 0.7854821491986513, "reward_std": 0.3320905451197177, "rewards/accuracy_reward": 0.05000000149011612, "rewards/cosine_scaled_reward": -0.1061845439762692, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8416666977107525, "step": 325 }, { "completion_length": 753.854186630249, "epoch": 0.21866070319294323, "grad_norm": 0.7486142746914528, "kl": 0.4837158203125, "learning_rate": 1.9154796251973092e-05, "loss": 0.0193, "reward": 0.8564842958003283, "reward_std": 0.28438844471238556, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.08240461318637245, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8868055813014507, "step": 330 }, { "completion_length": 919.6937713623047, "epoch": 0.22197374415041207, "grad_norm": 12.592692034453572, "kl": 2.8746826171875, "learning_rate": 1.9107643448226536e-05, "loss": 0.1149, "reward": 0.6902904711663723, "reward_std": 0.29066669328603895, "rewards/accuracy_reward": 0.03541666772216558, "rewards/cosine_scaled_reward": -0.21457066799048335, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8694444738328457, "step": 335 }, { "completion_length": 858.195858001709, "epoch": 0.2252867851078809, "grad_norm": 0.7126891863258673, "kl": 1.474658203125, "learning_rate": 1.905927209998447e-05, "loss": 0.059, "reward": 0.687146489135921, "reward_std": 0.3664645422948524, "rewards/accuracy_reward": 0.047916668094694616, "rewards/cosine_scaled_reward": -0.18715909423772245, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8263889200985431, "step": 340 }, { "completion_length": 857.1104362487793, "epoch": 0.22859982606534973, "grad_norm": 1.4919037404360307, "kl": 0.6818115234375, "learning_rate": 1.900968867902419e-05, "loss": 0.0273, "reward": 0.8239085972309113, "reward_std": 0.2580954360309988, "rewards/accuracy_reward": 0.05416666828095913, "rewards/cosine_scaled_reward": -0.16150809452374232, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.931250025331974, "step": 345 }, { "completion_length": 692.845853805542, "epoch": 0.23191286702281858, "grad_norm": 1.0397197893440666, "kl": 0.6713623046875, "learning_rate": 1.8958899819290592e-05, "loss": 0.0268, "reward": 0.8607403263449669, "reward_std": 0.26304534131195395, "rewards/accuracy_reward": 0.05833333488553762, "rewards/cosine_scaled_reward": -0.11078747901483439, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9131944686174392, "step": 350 }, { "completion_length": 815.2729362487793, "epoch": 0.23522590798028742, "grad_norm": 4.034114599843324, "kl": 2.1651123046875, "learning_rate": 1.890691231600856e-05, "loss": 0.0866, "reward": 0.7483882358297705, "reward_std": 0.3335292543750256, "rewards/accuracy_reward": 0.04166666772216558, "rewards/cosine_scaled_reward": -0.14397290116758085, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8506944794207811, "step": 355 }, { "completion_length": 945.9021018981933, "epoch": 0.23853894893775623, "grad_norm": 0.9740981360402965, "kl": 1.22587890625, "learning_rate": 1.8853733124773837e-05, "loss": 0.049, "reward": 0.6698946075513958, "reward_std": 0.32081263293512163, "rewards/accuracy_reward": 0.02916666753590107, "rewards/cosine_scaled_reward": -0.14191097477450967, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.7826389260590076, "step": 360 }, { "completion_length": 929.4521011352539, "epoch": 0.24185198989522508, "grad_norm": 3.1700746747359405, "kl": 0.9063720703125, "learning_rate": 1.8799369360622394e-05, "loss": 0.0363, "reward": 0.6953002519905567, "reward_std": 0.2888270668219775, "rewards/accuracy_reward": 0.02708333395421505, "rewards/cosine_scaled_reward": -0.15956088887178338, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8277778102084994, "step": 365 }, { "completion_length": 789.5750205993652, "epoch": 0.24516503085269392, "grad_norm": 45.0958167465436, "kl": 1.22314453125, "learning_rate": 1.8743828297078485e-05, "loss": 0.0489, "reward": 0.839135817065835, "reward_std": 0.24224867282900958, "rewards/accuracy_reward": 0.02916666753590107, "rewards/cosine_scaled_reward": -0.1233642159961164, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9333333507180214, "step": 370 }, { "completion_length": 676.9479377746582, "epoch": 0.24847807181016277, "grad_norm": 0.7572791826833487, "kl": 0.6887451171875, "learning_rate": 1.8687117365181514e-05, "loss": 0.0276, "reward": 0.8459825713187457, "reward_std": 0.2010480370139703, "rewards/accuracy_reward": 0.022916667349636554, "rewards/cosine_scaled_reward": -0.12137857028283179, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9444444641470909, "step": 375 }, { "completion_length": 671.2375202178955, "epoch": 0.2517911127676316, "grad_norm": 1540.3346627332912, "kl": 16.9722412109375, "learning_rate": 1.8629244152491773e-05, "loss": 0.679, "reward": 0.7762745449319481, "reward_std": 0.2925152273150161, "rewards/accuracy_reward": 0.04375000130385161, "rewards/cosine_scaled_reward": -0.14941992065869272, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8819444619119168, "step": 380 }, { "completion_length": 620.960436630249, "epoch": 0.2551041537251004, "grad_norm": 2.626596038560775, "kl": 0.659716796875, "learning_rate": 1.8570216402075326e-05, "loss": 0.0264, "reward": 0.5826899779960513, "reward_std": 0.30203871307894586, "rewards/accuracy_reward": 0.01666666716337204, "rewards/cosine_scaled_reward": -0.25619894223054873, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8222222477197647, "step": 385 }, { "completion_length": 721.3437728881836, "epoch": 0.25841719468256924, "grad_norm": 2.513110780859623, "kl": 0.8121826171875, "learning_rate": 1.8510042011467978e-05, "loss": 0.0325, "reward": 0.6874676454812289, "reward_std": 0.3343358116224408, "rewards/accuracy_reward": 0.02500000074505806, "rewards/cosine_scaled_reward": -0.16322682366399022, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8256944805383682, "step": 390 }, { "completion_length": 888.1416854858398, "epoch": 0.2617302356400381, "grad_norm": 7.918090665242179, "kl": 0.8347412109375, "learning_rate": 1.8448729031618687e-05, "loss": 0.0334, "reward": 0.7576752958819271, "reward_std": 0.27606291431002317, "rewards/accuracy_reward": 0.02916666753590107, "rewards/cosine_scaled_reward": -0.14857472942676395, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8770833685994148, "step": 395 }, { "completion_length": 734.8666908264161, "epoch": 0.2650432765975069, "grad_norm": 2.6521198985827485, "kl": 0.62247314453125, "learning_rate": 1.838628566581236e-05, "loss": 0.0249, "reward": 0.7525582857429981, "reward_std": 0.23811102255713196, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.19258062870940193, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9243055827915668, "step": 400 }, { "epoch": 0.2650432765975069, "eval_completion_length": 663.5392348345588, "eval_kl": 18.474839154411764, "eval_loss": 0.7750041484832764, "eval_reward": 0.7423910866765415, "eval_reward_std": 0.2599155850270215, "eval_rewards/accuracy_reward": 0.024509804652017707, "eval_rewards/cosine_scaled_reward": -0.1710076377288822, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.8888889165485606, "eval_runtime": 80.0479, "eval_samples_per_second": 1.237, "eval_steps_per_second": 0.112, "step": 400 }, { "completion_length": 659.4020999908447, "epoch": 0.2683563175549758, "grad_norm": 10.705861519805678, "kl": 1.5389892578125, "learning_rate": 1.8322720268572333e-05, "loss": 0.0617, "reward": 0.8194267123937606, "reward_std": 0.28157884986139836, "rewards/accuracy_reward": 0.03958333451300859, "rewards/cosine_scaled_reward": -0.13612887668059556, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9159722492098809, "step": 405 }, { "completion_length": 721.8250198364258, "epoch": 0.2716693585124446, "grad_norm": 1.8787900525284176, "kl": 0.761474609375, "learning_rate": 1.8258041344542567e-05, "loss": 0.0305, "reward": 0.8566557567566633, "reward_std": 0.2328467371640727, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.12112204866134561, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9361111290752888, "step": 410 }, { "completion_length": 659.3041847229003, "epoch": 0.27498239946991343, "grad_norm": 11.064097897695925, "kl": 6.08837890625, "learning_rate": 1.8192257547349805e-05, "loss": 0.2437, "reward": 0.875496932119131, "reward_std": 0.22843618473852983, "rewards/accuracy_reward": 0.03958333451300859, "rewards/cosine_scaled_reward": -0.12172530895331875, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9576389014720916, "step": 415 }, { "completion_length": 582.1708518981934, "epoch": 0.2782954404273823, "grad_norm": 0.8781057210477057, "kl": 0.9693603515625, "learning_rate": 1.8125377678445755e-05, "loss": 0.0389, "reward": 0.881895923987031, "reward_std": 0.2179583671502769, "rewards/accuracy_reward": 0.03541666772216558, "rewards/cosine_scaled_reward": -0.11740965778008103, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9638888955116272, "step": 420 }, { "completion_length": 590.7396003723145, "epoch": 0.2816084813848511, "grad_norm": 1.1525583189437374, "kl": 125.327587890625, "learning_rate": 1.8057410685929505e-05, "loss": 5.0334, "reward": 0.8893688663840293, "reward_std": 0.206675009499304, "rewards/accuracy_reward": 0.03541666772216558, "rewards/cosine_scaled_reward": -0.10577005181294226, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9597222335636616, "step": 425 }, { "completion_length": 561.1625175476074, "epoch": 0.28492152234231993, "grad_norm": 1.748768910126364, "kl": 0.9494384765625, "learning_rate": 1.7988365663350352e-05, "loss": 0.038, "reward": 0.8399925954639912, "reward_std": 0.2784117936622351, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.10861854468239471, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9069444693624973, "step": 430 }, { "completion_length": 600.0583530426026, "epoch": 0.2882345632997888, "grad_norm": 24.0486523183378, "kl": 1.835791015625, "learning_rate": 1.7918251848491118e-05, "loss": 0.0734, "reward": 0.7380310939624906, "reward_std": 0.35503527710679916, "rewards/accuracy_reward": 0.02916666753590107, "rewards/cosine_scaled_reward": -0.12516337343549822, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8340278100222349, "step": 435 }, { "completion_length": 640.3646022796631, "epoch": 0.2915476042572576, "grad_norm": 1.4698435719737848, "kl": 2.008154296875, "learning_rate": 1.7847078622132202e-05, "loss": 0.0802, "reward": 0.7991350771859288, "reward_std": 0.3836879283422604, "rewards/accuracy_reward": 0.05416666828095913, "rewards/cosine_scaled_reward": -0.07933716488187201, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.82430558539927, "step": 440 }, { "completion_length": 635.225020980835, "epoch": 0.2948606452147265, "grad_norm": 2.6494821005201827, "kl": 0.714306640625, "learning_rate": 1.7774855506796497e-05, "loss": 0.0286, "reward": 0.8397124305367469, "reward_std": 0.36279195114038887, "rewards/accuracy_reward": 0.05000000149011612, "rewards/cosine_scaled_reward": -0.07348204434383661, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8631944686174393, "step": 445 }, { "completion_length": 681.6854377746582, "epoch": 0.2981736861721953, "grad_norm": 4.788101010393131, "kl": 2.3724365234375, "learning_rate": 1.770159216547532e-05, "loss": 0.095, "reward": 0.6784259299747646, "reward_std": 0.44399131783284246, "rewards/accuracy_reward": 0.03750000111758709, "rewards/cosine_scaled_reward": -0.1333796467544744, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.7743055794388056, "step": 450 }, { "completion_length": 703.420853805542, "epoch": 0.3014867271296641, "grad_norm": 3.527558449123951, "kl": 2.01552734375, "learning_rate": 1.76272984003356e-05, "loss": 0.0806, "reward": 0.6863215479068459, "reward_std": 0.4073473752941936, "rewards/accuracy_reward": 0.03333333432674408, "rewards/cosine_scaled_reward": -0.1421506962913554, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.7951389141380787, "step": 455 }, { "completion_length": 656.916686630249, "epoch": 0.304799768087133, "grad_norm": 3.758337673639945, "kl": 2.4762451171875, "learning_rate": 1.7551984151408363e-05, "loss": 0.0991, "reward": 0.6979025349020958, "reward_std": 0.44121168122510424, "rewards/accuracy_reward": 0.03958333451300859, "rewards/cosine_scaled_reward": -0.10626415694132448, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.764583359286189, "step": 460 }, { "completion_length": 572.3250183105469, "epoch": 0.3081128090446018, "grad_norm": 3.2880586219053263, "kl": 1.815576171875, "learning_rate": 1.7475659495258864e-05, "loss": 0.0725, "reward": 0.828828389197588, "reward_std": 0.42342835597228257, "rewards/accuracy_reward": 0.06041666809469461, "rewards/cosine_scaled_reward": -0.05103274945868179, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.819444464892149, "step": 465 }, { "completion_length": 529.1708480834961, "epoch": 0.3114258500020706, "grad_norm": 4.602328586716576, "kl": 1.8807861328125, "learning_rate": 1.739833464363838e-05, "loss": 0.0753, "reward": 0.7670636136084795, "reward_std": 0.4504899204475805, "rewards/accuracy_reward": 0.05833333507180214, "rewards/cosine_scaled_reward": -0.05654751847614534, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.7652778025716543, "step": 470 }, { "completion_length": 660.6312690734864, "epoch": 0.3147388909595395, "grad_norm": 6.369843143707269, "kl": 1.3401611328125, "learning_rate": 1.7320019942117954e-05, "loss": 0.0536, "reward": 0.782780846580863, "reward_std": 0.469034303445369, "rewards/accuracy_reward": 0.06875000204890966, "rewards/cosine_scaled_reward": -0.07069140401072219, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.7847222495824099, "step": 475 }, { "completion_length": 732.4750221252441, "epoch": 0.3180519319170083, "grad_norm": 1.0514790860661878, "kl": 3.8715576171875, "learning_rate": 1.7240725868704218e-05, "loss": 0.1549, "reward": 0.8217490036040545, "reward_std": 0.37456859347294086, "rewards/accuracy_reward": 0.05416666828095913, "rewards/cosine_scaled_reward": -0.08450102035421878, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8520833604037762, "step": 480 }, { "completion_length": 728.2979354858398, "epoch": 0.3213649728744772, "grad_norm": 1.485398684680928, "kl": 1.460791015625, "learning_rate": 1.71604630324375e-05, "loss": 0.0585, "reward": 0.836965924501419, "reward_std": 0.3446637814049609, "rewards/accuracy_reward": 0.0541666679084301, "rewards/cosine_scaled_reward": -0.059561880814726464, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8423611387610436, "step": 485 }, { "completion_length": 690.758349609375, "epoch": 0.324678013831946, "grad_norm": 1.9121860416428433, "kl": 0.79320068359375, "learning_rate": 1.7079242171972417e-05, "loss": 0.0318, "reward": 0.8989938478916883, "reward_std": 0.3036307736299932, "rewards/accuracy_reward": 0.05416666828095913, "rewards/cosine_scaled_reward": -0.05100618019350804, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8958333626389503, "step": 490 }, { "completion_length": 691.6333526611328, "epoch": 0.3279910547894148, "grad_norm": 1.5333588145776778, "kl": 0.6511474609375, "learning_rate": 1.6997074154141097e-05, "loss": 0.0261, "reward": 0.9193649560213089, "reward_std": 0.2695678794640116, "rewards/accuracy_reward": 0.047916668094694616, "rewards/cosine_scaled_reward": -0.058412853971822186, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9298611290752887, "step": 495 }, { "completion_length": 750.3500183105468, "epoch": 0.3313040957468837, "grad_norm": 4.227409094987621, "kl": 0.6984375, "learning_rate": 1.6913969972499272e-05, "loss": 0.0279, "reward": 0.9509195258840919, "reward_std": 0.36410973106976596, "rewards/accuracy_reward": 0.07916666865348816, "rewards/cosine_scaled_reward": -0.027552723238477482, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8993055813014508, "step": 500 }, { "epoch": 0.3313040957468837, "eval_completion_length": 759.7304077148438, "eval_kl": 0.998046875, "eval_loss": 0.04009021446108818, "eval_reward": 0.9331843011519488, "eval_reward_std": 0.2707175987170023, "eval_rewards/accuracy_reward": 0.06862745214911069, "eval_rewards/cosine_scaled_reward": -0.01126019039656967, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.8758170183967141, "eval_runtime": 86.3361, "eval_samples_per_second": 1.147, "eval_steps_per_second": 0.104, "step": 500 }, { "completion_length": 711.3854354858398, "epoch": 0.3346171367043525, "grad_norm": 0.9568539611657129, "kl": 1.13870849609375, "learning_rate": 1.682994074585541e-05, "loss": 0.0455, "reward": 0.9027604317292571, "reward_std": 0.30179907931014893, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.051406268728896976, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8916666928678751, "step": 505 }, { "completion_length": 680.3750171661377, "epoch": 0.3379301776618213, "grad_norm": 9.680636077679255, "kl": 0.96646728515625, "learning_rate": 1.674499771678309e-05, "loss": 0.0387, "reward": 0.942707608640194, "reward_std": 0.21283993816468866, "rewards/accuracy_reward": 0.047916668094694616, "rewards/cosine_scaled_reward": -0.06840352213475853, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9631944581866264, "step": 510 }, { "completion_length": 724.618766784668, "epoch": 0.3412432186192902, "grad_norm": 8.47244604664691, "kl": 0.8087646484375, "learning_rate": 1.665915225011681e-05, "loss": 0.0324, "reward": 0.9447658378630877, "reward_std": 0.24693514857208357, "rewards/accuracy_reward": 0.05625000167638063, "rewards/cosine_scaled_reward": -0.04828974761767313, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9368055790662766, "step": 515 }, { "completion_length": 720.7666854858398, "epoch": 0.344556259576759, "grad_norm": 41.36649312187087, "kl": 1.28271484375, "learning_rate": 1.6572415831431466e-05, "loss": 0.0513, "reward": 0.8966662667691707, "reward_std": 0.24706752334022894, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.0776393148367788, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9326389141380786, "step": 520 }, { "completion_length": 586.439601135254, "epoch": 0.3478693005342279, "grad_norm": 1.4249914111039361, "kl": 1.137109375, "learning_rate": 1.6484800065505627e-05, "loss": 0.0455, "reward": 0.7199330668896436, "reward_std": 0.3295366237871349, "rewards/accuracy_reward": 0.02083333395421505, "rewards/cosine_scaled_reward": -0.1265947333246004, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8256944715976715, "step": 525 }, { "completion_length": 622.0208503723145, "epoch": 0.3511823414916967, "grad_norm": 0.9978155487864159, "kl": 1.0482421875, "learning_rate": 1.6396316674768914e-05, "loss": 0.0419, "reward": 0.816641541570425, "reward_std": 0.24922135260421782, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.13474737045471558, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9201389089226722, "step": 530 }, { "completion_length": 707.7437728881836, "epoch": 0.3544953824491655, "grad_norm": 1.1854447530767172, "kl": 1.2285400390625, "learning_rate": 1.630697749773359e-05, "loss": 0.0492, "reward": 0.9490411698818206, "reward_std": 0.18728532029781492, "rewards/accuracy_reward": 0.03958333451300859, "rewards/cosine_scaled_reward": -0.084986640146235, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9944444477558136, "step": 535 }, { "completion_length": 743.3187705993653, "epoch": 0.3578084234066344, "grad_norm": 1.454815828238096, "kl": 0.673486328125, "learning_rate": 1.621679448741067e-05, "loss": 0.0269, "reward": 0.9328758142888546, "reward_std": 0.22614758528070525, "rewards/accuracy_reward": 0.04583333469927311, "rewards/cosine_scaled_reward": -0.09559643640823197, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9826388947665692, "step": 540 }, { "completion_length": 727.9937713623046, "epoch": 0.3611214643641032, "grad_norm": 1.272613665016836, "kl": 1.1934326171875, "learning_rate": 1.6125779709710668e-05, "loss": 0.0478, "reward": 0.8213923370465637, "reward_std": 0.31375509237404914, "rewards/accuracy_reward": 0.04583333469927311, "rewards/cosine_scaled_reward": -0.11471879750024527, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8902778048068285, "step": 545 }, { "completion_length": 716.895853805542, "epoch": 0.364434505321572, "grad_norm": 0.7472607189613197, "kl": 0.73807373046875, "learning_rate": 1.603394534182925e-05, "loss": 0.0295, "reward": 0.8135474029928446, "reward_std": 0.27684179751668125, "rewards/accuracy_reward": 0.02916666753590107, "rewards/cosine_scaled_reward": -0.07811928552109748, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.862500025331974, "step": 550 }, { "completion_length": 711.8229377746582, "epoch": 0.3677475462790409, "grad_norm": 0.509889238437433, "kl": 0.2975830078125, "learning_rate": 1.5941303670618018e-05, "loss": 0.0119, "reward": 0.8659656617790461, "reward_std": 0.24631512356572785, "rewards/accuracy_reward": 0.02916666753590107, "rewards/cosine_scaled_reward": -0.08472880212357267, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9215277932584286, "step": 555 }, { "completion_length": 706.2146026611329, "epoch": 0.3710605872365097, "grad_norm": 0.5115181033590307, "kl": 0.2783935546875, "learning_rate": 1.5847867090940602e-05, "loss": 0.0111, "reward": 0.8821543082594872, "reward_std": 0.21895654122345148, "rewards/accuracy_reward": 0.02916666753590107, "rewards/cosine_scaled_reward": -0.08867904875660315, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9416666813194752, "step": 560 }, { "completion_length": 731.2646049499511, "epoch": 0.37437362819397857, "grad_norm": 0.48946781213894414, "kl": 0.2870361328125, "learning_rate": 1.57536481040143e-05, "loss": 0.0115, "reward": 0.9268658980727196, "reward_std": 0.19756257701665164, "rewards/accuracy_reward": 0.03750000111758709, "rewards/cosine_scaled_reward": -0.07243968344992027, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9618055641651153, "step": 565 }, { "completion_length": 710.7771034240723, "epoch": 0.3776866691514474, "grad_norm": 0.6647879828200784, "kl": 0.32255859375, "learning_rate": 1.5658659315737505e-05, "loss": 0.0129, "reward": 0.9056837107986212, "reward_std": 0.22030911170877515, "rewards/accuracy_reward": 0.03541666772216558, "rewards/cosine_scaled_reward": -0.07973298630677164, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9500000149011611, "step": 570 }, { "completion_length": 652.4000190734863, "epoch": 0.3809997101089162, "grad_norm": 1.3789247121732728, "kl": 0.4084228515625, "learning_rate": 1.5562913435003113e-05, "loss": 0.0163, "reward": 0.9116351526230574, "reward_std": 0.26273676122073086, "rewards/accuracy_reward": 0.05416666828095913, "rewards/cosine_scaled_reward": -0.0668371033621952, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9243055753409862, "step": 575 }, { "completion_length": 664.8875198364258, "epoch": 0.38431275106638507, "grad_norm": 1.2564435031983983, "kl": 0.9608154296875, "learning_rate": 1.5466423271998144e-05, "loss": 0.0384, "reward": 0.9768917407840491, "reward_std": 0.31770267653628254, "rewards/accuracy_reward": 0.07708333563059569, "rewards/cosine_scaled_reward": -0.02796940163243562, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9277778014540672, "step": 580 }, { "completion_length": 686.3541862487793, "epoch": 0.3876257920238539, "grad_norm": 1.6154786655400664, "kl": 1.129638671875, "learning_rate": 1.536920173648984e-05, "loss": 0.0452, "reward": 0.9404020644724369, "reward_std": 0.2943329735193402, "rewards/accuracy_reward": 0.06458333488553762, "rewards/cosine_scaled_reward": -0.02279240933712572, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.898611132055521, "step": 585 }, { "completion_length": 655.035436630249, "epoch": 0.39093883298132276, "grad_norm": 1.841897952460594, "kl": 1.5520263671875, "learning_rate": 1.5271261836098403e-05, "loss": 0.0621, "reward": 0.8769187476485968, "reward_std": 0.3173756536561996, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.08002571895194706, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9048611432313919, "step": 590 }, { "completion_length": 663.7958549499511, "epoch": 0.3942518739387916, "grad_norm": 1.8600695583478823, "kl": 1.14420166015625, "learning_rate": 1.5172616674556673e-05, "loss": 0.0458, "reward": 0.8190094228833914, "reward_std": 0.2868053139653057, "rewards/accuracy_reward": 0.04166666753590107, "rewards/cosine_scaled_reward": -0.11432393525028602, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.891666692495346, "step": 595 }, { "completion_length": 673.5312675476074, "epoch": 0.3975649148962604, "grad_norm": 25.826966738337322, "kl": 1.2376953125, "learning_rate": 1.5073279449956916e-05, "loss": 0.0495, "reward": 0.8733935471624136, "reward_std": 0.2678844271984417, "rewards/accuracy_reward": 0.04375000111758709, "rewards/cosine_scaled_reward": -0.07452313954709097, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9041666880249977, "step": 600 }, { "epoch": 0.3975649148962604, "eval_completion_length": 716.86766501034, "eval_kl": 0.8521943933823529, "eval_loss": 0.032866623252630234, "eval_reward": 0.9601863868096295, "eval_reward_std": 0.31298351463149576, "eval_rewards/accuracy_reward": 0.06862745258738012, "eval_rewards/cosine_scaled_reward": -0.025107769350356916, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9166666935471928, "eval_runtime": 83.1627, "eval_samples_per_second": 1.19, "eval_steps_per_second": 0.108, "step": 600 }, { "completion_length": 723.000020980835, "epoch": 0.40087795585372926, "grad_norm": 3.5053176136507873, "kl": 0.85250244140625, "learning_rate": 1.4973263452985023e-05, "loss": 0.0341, "reward": 0.8990890353918075, "reward_std": 0.29346464802511035, "rewards/accuracy_reward": 0.05833333488553762, "rewards/cosine_scaled_reward": -0.04327209957991727, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8840278036892414, "step": 605 }, { "completion_length": 769.2354370117188, "epoch": 0.4041909968111981, "grad_norm": 3.1968409631474715, "kl": 1.3515869140625, "learning_rate": 1.4872582065142285e-05, "loss": 0.0541, "reward": 0.8378175616264343, "reward_std": 0.2982491932692938, "rewards/accuracy_reward": 0.0416666679084301, "rewards/cosine_scaled_reward": -0.02815468145417981, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8243055805563927, "step": 610 }, { "completion_length": 727.0312713623047, "epoch": 0.4075040377686669, "grad_norm": 1.4026923224774785, "kl": 0.73531494140625, "learning_rate": 1.4771248756955042e-05, "loss": 0.0294, "reward": 0.9043661113828421, "reward_std": 0.2913089117500931, "rewards/accuracy_reward": 0.0541666679084301, "rewards/cosine_scaled_reward": -0.026883910468313843, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8770833555608988, "step": 615 }, { "completion_length": 720.7458534240723, "epoch": 0.41081707872613576, "grad_norm": 2.0450283154044127, "kl": 0.4779541015625, "learning_rate": 1.4669277086172406e-05, "loss": 0.0191, "reward": 0.9371541447937488, "reward_std": 0.25683711245656016, "rewards/accuracy_reward": 0.05208333469927311, "rewards/cosine_scaled_reward": -0.02465143794725009, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9097222484648227, "step": 620 }, { "completion_length": 714.5229400634765, "epoch": 0.4141301196836046, "grad_norm": 1.930094666675544, "kl": 0.68609619140625, "learning_rate": 1.4566680695952333e-05, "loss": 0.0274, "reward": 0.9720582082867623, "reward_std": 0.27749894536682407, "rewards/accuracy_reward": 0.06250000167638063, "rewards/cosine_scaled_reward": -0.028636260045459494, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9381944634020328, "step": 625 }, { "completion_length": 719.3750228881836, "epoch": 0.41744316064107345, "grad_norm": 1.2427490585338696, "kl": 0.95345458984375, "learning_rate": 1.4463473313036241e-05, "loss": 0.0381, "reward": 1.0004869259893894, "reward_std": 0.21139949709177017, "rewards/accuracy_reward": 0.06041666828095913, "rewards/cosine_scaled_reward": -0.00923532065935433, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9493055835366249, "step": 630 }, { "completion_length": 761.3708549499512, "epoch": 0.42075620159854227, "grad_norm": 0.6484431223301813, "kl": 0.64298095703125, "learning_rate": 1.4359668745912472e-05, "loss": 0.0257, "reward": 1.0493405498564243, "reward_std": 0.2240296528674662, "rewards/accuracy_reward": 0.08333333563059568, "rewards/cosine_scaled_reward": 0.006979415030218661, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9590277917683124, "step": 635 }, { "completion_length": 774.4062698364257, "epoch": 0.4240692425560111, "grad_norm": 1.5220961504307924, "kl": 0.61929931640625, "learning_rate": 1.4255280882968787e-05, "loss": 0.0248, "reward": 0.9054041653871536, "reward_std": 0.16729897408513353, "rewards/accuracy_reward": 0.018750000558793545, "rewards/cosine_scaled_reward": -0.06265141163021326, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9493055775761604, "step": 640 }, { "completion_length": 773.1312713623047, "epoch": 0.42738228351347995, "grad_norm": 4.631130436584823, "kl": 0.68836669921875, "learning_rate": 1.415032369063422e-05, "loss": 0.0275, "reward": 0.9361987210810184, "reward_std": 0.21468218287918717, "rewards/accuracy_reward": 0.04375000130385161, "rewards/cosine_scaled_reward": -0.0700513044372201, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9625000141561031, "step": 645 }, { "completion_length": 802.4937683105469, "epoch": 0.43069532447094877, "grad_norm": 0.7376882271708779, "kl": 2.206640625, "learning_rate": 1.4044811211510419e-05, "loss": 0.0883, "reward": 0.9607818759977818, "reward_std": 0.21701918505132198, "rewards/accuracy_reward": 0.05000000149011612, "rewards/cosine_scaled_reward": -0.0593570476397872, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9701389022171497, "step": 650 }, { "completion_length": 823.3000183105469, "epoch": 0.4340083654284176, "grad_norm": 1.4557463965372042, "kl": 0.9728759765625, "learning_rate": 1.3938757562492873e-05, "loss": 0.039, "reward": 0.876766087859869, "reward_std": 0.23884346910053864, "rewards/accuracy_reward": 0.031250000931322575, "rewards/cosine_scaled_reward": -0.11767837936349679, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9631944626569748, "step": 655 }, { "completion_length": 731.6396057128907, "epoch": 0.43732140638588646, "grad_norm": 1.238457860761198, "kl": 0.840087890625, "learning_rate": 1.3832176932882136e-05, "loss": 0.0336, "reward": 0.950103372335434, "reward_std": 0.2417588339652866, "rewards/accuracy_reward": 0.05416666828095913, "rewards/cosine_scaled_reward": -0.04364665783941746, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9395833514630795, "step": 660 }, { "completion_length": 711.6021041870117, "epoch": 0.4406344473433553, "grad_norm": 1.0911377179318549, "kl": 0.59212646484375, "learning_rate": 1.3725083582485397e-05, "loss": 0.0237, "reward": 0.9273017428815364, "reward_std": 0.270478050771635, "rewards/accuracy_reward": 0.05833333507180214, "rewards/cosine_scaled_reward": -0.05811494703521021, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9270833604037761, "step": 665 }, { "completion_length": 738.7333515167236, "epoch": 0.44394748830082414, "grad_norm": 7.840453512414562, "kl": 4.596484375, "learning_rate": 1.3617491839708614e-05, "loss": 0.1838, "reward": 0.9688164483755827, "reward_std": 0.3597332563949749, "rewards/accuracy_reward": 0.10000000279396773, "rewards/cosine_scaled_reward": -0.015211354894563555, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8840278059244155, "step": 670 }, { "completion_length": 727.2771026611329, "epoch": 0.44726052925829296, "grad_norm": 29.315154881131072, "kl": 0.8662353515625, "learning_rate": 1.3509416099639456e-05, "loss": 0.0347, "reward": 0.9601229157298803, "reward_std": 0.3384942405857146, "rewards/accuracy_reward": 0.08125000242143869, "rewards/cosine_scaled_reward": -0.023904890747508033, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.902777797728777, "step": 675 }, { "completion_length": 686.4437717437744, "epoch": 0.4505735702157618, "grad_norm": 0.7720690438145432, "kl": 0.58448486328125, "learning_rate": 1.3400870822121348e-05, "loss": 0.0234, "reward": 0.9981479395180941, "reward_std": 0.3479425340075977, "rewards/accuracy_reward": 0.08333333563059568, "rewards/cosine_scaled_reward": -0.021990970680781173, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.936805573105812, "step": 680 }, { "completion_length": 657.3271022796631, "epoch": 0.45388661117323065, "grad_norm": 0.618682727431703, "kl": 0.50362548828125, "learning_rate": 1.3291870529818809e-05, "loss": 0.0202, "reward": 0.9656439781188965, "reward_std": 0.31729735336266457, "rewards/accuracy_reward": 0.07291666883975267, "rewards/cosine_scaled_reward": -0.032967154250945895, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9256944604218006, "step": 685 }, { "completion_length": 733.8916862487793, "epoch": 0.45719965213069946, "grad_norm": 0.6961341239109778, "kl": 0.517236328125, "learning_rate": 1.3182429806274442e-05, "loss": 0.0207, "reward": 0.9764438889920711, "reward_std": 0.2684546816162765, "rewards/accuracy_reward": 0.06250000167638063, "rewards/cosine_scaled_reward": -0.018000577902421357, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.931944464892149, "step": 690 }, { "completion_length": 722.145849609375, "epoch": 0.4605126930881683, "grad_norm": 0.7396386944007698, "kl": 0.62032470703125, "learning_rate": 1.3072563293957725e-05, "loss": 0.0248, "reward": 0.9321913201361894, "reward_std": 0.3160252112313174, "rewards/accuracy_reward": 0.06250000167638063, "rewards/cosine_scaled_reward": -0.02266981763532385, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8923611298203469, "step": 695 }, { "completion_length": 685.0750194549561, "epoch": 0.46382573404563715, "grad_norm": 31.2929243471924, "kl": 1.23011474609375, "learning_rate": 1.2962285692305964e-05, "loss": 0.0493, "reward": 0.9321463823318481, "reward_std": 0.2796243290591519, "rewards/accuracy_reward": 0.06458333488553762, "rewards/cosine_scaled_reward": -0.02202030966291204, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8895833499729633, "step": 700 }, { "epoch": 0.46382573404563715, "eval_completion_length": 700.1960987764246, "eval_kl": 1.6740579044117647, "eval_loss": 0.06825181841850281, "eval_reward": 0.94467833813499, "eval_reward_std": 0.3344465803135844, "eval_rewards/accuracy_reward": 0.07843137400991776, "eval_rewards/cosine_scaled_reward": 0.013305778889095081, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.8529411834828994, "eval_runtime": 83.5312, "eval_samples_per_second": 1.185, "eval_steps_per_second": 0.108, "step": 700 }, { "completion_length": 697.9687675476074, "epoch": 0.46713877500310597, "grad_norm": 1.6134325511763703, "kl": 0.5646728515625, "learning_rate": 1.2851611755757587e-05, "loss": 0.0226, "reward": 0.9937051258981228, "reward_std": 0.33686687525478193, "rewards/accuracy_reward": 0.08125000204890967, "rewards/cosine_scaled_reward": -0.0014337893459014594, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9138889037072658, "step": 705 }, { "completion_length": 675.2562713623047, "epoch": 0.47045181596057484, "grad_norm": 0.6943625561210038, "kl": 0.5255859375, "learning_rate": 1.2740556291778096e-05, "loss": 0.021, "reward": 0.9420141892507672, "reward_std": 0.32359609878622, "rewards/accuracy_reward": 0.08958333600312471, "rewards/cosine_scaled_reward": 0.005208605836378411, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8472222492098809, "step": 710 }, { "completion_length": 647.3791893005371, "epoch": 0.47376485691804365, "grad_norm": 0.5534630201300166, "kl": 0.5138916015625, "learning_rate": 1.2629134158878919e-05, "loss": 0.0206, "reward": 0.8885765176266431, "reward_std": 0.29848170580808076, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.045451278623659164, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8715278021991253, "step": 715 }, { "completion_length": 738.968766784668, "epoch": 0.47707789787551247, "grad_norm": 0.5211345461785403, "kl": 0.46080322265625, "learning_rate": 1.2517360264629463e-05, "loss": 0.0184, "reward": 0.9274849720299244, "reward_std": 0.275846075033769, "rewards/accuracy_reward": 0.050000001303851606, "rewards/cosine_scaled_reward": -0.039181716930761466, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9166666902601719, "step": 720 }, { "completion_length": 683.3146018981934, "epoch": 0.48039093883298134, "grad_norm": 0.7087234623515466, "kl": 0.5052978515625, "learning_rate": 1.2405249563662539e-05, "loss": 0.0202, "reward": 0.9626883253455162, "reward_std": 0.32444824001286177, "rewards/accuracy_reward": 0.07708333563059569, "rewards/cosine_scaled_reward": -0.04147836706979433, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9270833536982537, "step": 725 }, { "completion_length": 685.2146026611329, "epoch": 0.48370397979045016, "grad_norm": 17.968130595058522, "kl": 1.60780029296875, "learning_rate": 1.2292817055673543e-05, "loss": 0.0643, "reward": 0.8940968006849289, "reward_std": 0.22779874617699533, "rewards/accuracy_reward": 0.03333333432674408, "rewards/cosine_scaled_reward": -0.0635421129030874, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9243055813014507, "step": 730 }, { "completion_length": 721.5333530426026, "epoch": 0.48701702074791897, "grad_norm": 3.5786328013466937, "kl": 0.9418701171875, "learning_rate": 1.2180077783413601e-05, "loss": 0.0377, "reward": 0.923260135948658, "reward_std": 0.2868120894068852, "rewards/accuracy_reward": 0.047916668094694616, "rewards/cosine_scaled_reward": -0.04618433914147317, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9215278081595898, "step": 735 }, { "completion_length": 713.6896049499512, "epoch": 0.49033006170538784, "grad_norm": 21.53242418472282, "kl": 3.4288330078125, "learning_rate": 1.2067046830676947e-05, "loss": 0.1371, "reward": 0.9498519212007522, "reward_std": 0.30734682325273754, "rewards/accuracy_reward": 0.06250000186264515, "rewards/cosine_scaled_reward": -0.02514811131404713, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9125000238418579, "step": 740 }, { "completion_length": 697.1396034240722, "epoch": 0.49364310266285666, "grad_norm": 1.5486780453271443, "kl": 2.62125244140625, "learning_rate": 1.1953739320282778e-05, "loss": 0.1047, "reward": 0.8954519655555486, "reward_std": 0.3432210959028453, "rewards/accuracy_reward": 0.05833333507180214, "rewards/cosine_scaled_reward": -0.06704806288471446, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9041666962206364, "step": 745 }, { "completion_length": 649.6104343414306, "epoch": 0.49695614362032553, "grad_norm": 1173.3053200275926, "kl": 7.90325927734375, "learning_rate": 1.1840170412051957e-05, "loss": 0.3164, "reward": 0.8023746436461806, "reward_std": 0.3010966286063194, "rewards/accuracy_reward": 0.03333333432674408, "rewards/cosine_scaled_reward": -0.10804204960004427, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8770833674818277, "step": 750 }, { "completion_length": 674.7458564758301, "epoch": 0.5002691845777943, "grad_norm": 14.74814178846135, "kl": 2.3808349609375, "learning_rate": 1.1726355300778693e-05, "loss": 0.0953, "reward": 0.8602411191910505, "reward_std": 0.27471131838392465, "rewards/accuracy_reward": 0.04166666753590107, "rewards/cosine_scaled_reward": -0.057120018915156834, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8756944686174393, "step": 755 }, { "completion_length": 743.4979347229004, "epoch": 0.5035822255352632, "grad_norm": 2.0765022790341012, "kl": 0.8923828125, "learning_rate": 1.1612309214197599e-05, "loss": 0.0357, "reward": 0.8452902540564537, "reward_std": 0.31740867618937046, "rewards/accuracy_reward": 0.04375000111758709, "rewards/cosine_scaled_reward": -0.05609866543672979, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8576389141380787, "step": 760 }, { "completion_length": 761.6396049499511, "epoch": 0.506895266492732, "grad_norm": 1.7231866299667538, "kl": 0.5144775390625, "learning_rate": 1.1498047410946307e-05, "loss": 0.0206, "reward": 0.8678824879229069, "reward_std": 0.3043431679951027, "rewards/accuracy_reward": 0.03958333451300859, "rewards/cosine_scaled_reward": -0.0508675416233018, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8791666932404041, "step": 765 }, { "completion_length": 739.7687683105469, "epoch": 0.5102083074502008, "grad_norm": 0.568218077832533, "kl": 0.421337890625, "learning_rate": 1.1383585178523955e-05, "loss": 0.0168, "reward": 1.007787472754717, "reward_std": 0.2486687523603905, "rewards/accuracy_reward": 0.06458333507180214, "rewards/cosine_scaled_reward": -0.0061014459235593675, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9493055731058121, "step": 770 }, { "completion_length": 734.8041847229003, "epoch": 0.5135213484076697, "grad_norm": 1.3062884170967002, "kl": 0.4234619140625, "learning_rate": 1.126893783124583e-05, "loss": 0.0169, "reward": 1.0365624018013477, "reward_std": 0.2236189019968151, "rewards/accuracy_reward": 0.07291666883975267, "rewards/cosine_scaled_reward": 0.00947904633358121, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9541666805744171, "step": 775 }, { "completion_length": 714.9791885375977, "epoch": 0.5168343893651385, "grad_norm": 0.6124342589578466, "kl": 0.39639892578125, "learning_rate": 1.1154120708194398e-05, "loss": 0.0159, "reward": 0.964961226284504, "reward_std": 0.2262617411557585, "rewards/accuracy_reward": 0.052083334885537624, "rewards/cosine_scaled_reward": -0.005177695211023093, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9180555775761604, "step": 780 }, { "completion_length": 748.1708549499511, "epoch": 0.5201474303226074, "grad_norm": 0.899394670649138, "kl": 0.5111083984375, "learning_rate": 1.1039149171167046e-05, "loss": 0.0205, "reward": 0.9845144189894199, "reward_std": 0.29746524210786446, "rewards/accuracy_reward": 0.07291666846722364, "rewards/cosine_scaled_reward": 0.008819950895849615, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9027778059244156, "step": 785 }, { "completion_length": 710.1166881561279, "epoch": 0.5234604712800762, "grad_norm": 1.2655596934697766, "kl": 6.897509765625, "learning_rate": 1.0924038602620757e-05, "loss": 0.2758, "reward": 1.0180802159011364, "reward_std": 0.1895498657017015, "rewards/accuracy_reward": 0.06458333488553762, "rewards/cosine_scaled_reward": 0.002107965323375538, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9513889037072658, "step": 790 }, { "completion_length": 679.722936630249, "epoch": 0.526773512237545, "grad_norm": 1.092411719329428, "kl": 1.1653564453125, "learning_rate": 1.0808804403614044e-05, "loss": 0.0466, "reward": 1.004332087561488, "reward_std": 0.2646419349708594, "rewards/accuracy_reward": 0.07708333563059569, "rewards/cosine_scaled_reward": -0.006779057979292702, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.934027797728777, "step": 795 }, { "completion_length": 737.0104370117188, "epoch": 0.5300865531950139, "grad_norm": 1.567837458087204, "kl": 0.6482177734375, "learning_rate": 1.0693461991746389e-05, "loss": 0.0259, "reward": 0.9288295730948448, "reward_std": 0.2937559608079027, "rewards/accuracy_reward": 0.06041666828095913, "rewards/cosine_scaled_reward": -0.025337116903392597, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8937500335276127, "step": 800 }, { "epoch": 0.5300865531950139, "eval_completion_length": 742.7353156594669, "eval_kl": 0.43543198529411764, "eval_loss": 0.017235994338989258, "eval_reward": 1.0902815005358528, "eval_reward_std": 0.30682923003812046, "eval_rewards/accuracy_reward": 0.10294117866193547, "eval_rewards/cosine_scaled_reward": 0.03962787949507508, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9477124389480142, "eval_runtime": 84.6374, "eval_samples_per_second": 1.17, "eval_steps_per_second": 0.106, "step": 800 }, { "completion_length": 758.0916877746582, "epoch": 0.5333995941524827, "grad_norm": 2.8541568335150886, "kl": 0.5290771484375, "learning_rate": 1.0578026799095464e-05, "loss": 0.0212, "reward": 0.9965711794793606, "reward_std": 0.24475849202135577, "rewards/accuracy_reward": 0.06458333507180214, "rewards/cosine_scaled_reward": -0.0034288427559658885, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9354166880249977, "step": 805 }, { "completion_length": 700.8396034240723, "epoch": 0.5367126351099516, "grad_norm": 0.9109338182018821, "kl": 0.90511474609375, "learning_rate": 1.046251427015241e-05, "loss": 0.0362, "reward": 1.0379983201622962, "reward_std": 0.2225944455480203, "rewards/accuracy_reward": 0.06666666846722365, "rewards/cosine_scaled_reward": 0.008137176046147943, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9631944604218006, "step": 810 }, { "completion_length": 695.5937713623047, "epoch": 0.5400256760674204, "grad_norm": 1.4954741661397666, "kl": 0.78690185546875, "learning_rate": 1.0346939859755481e-05, "loss": 0.0315, "reward": 0.9543692465871573, "reward_std": 0.2549169279518537, "rewards/accuracy_reward": 0.054166668094694614, "rewards/cosine_scaled_reward": -0.028269669704604893, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9284722492098808, "step": 815 }, { "completion_length": 709.1854377746582, "epoch": 0.5433387170248892, "grad_norm": 2.969920386348259, "kl": 0.73394775390625, "learning_rate": 1.023131903102226e-05, "loss": 0.0294, "reward": 1.025769717246294, "reward_std": 0.2602516794577241, "rewards/accuracy_reward": 0.07083333544433117, "rewards/cosine_scaled_reward": -0.0033969731943216176, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.958333345502615, "step": 820 }, { "completion_length": 665.9583553314209, "epoch": 0.5466517579823581, "grad_norm": 4.16810579527893, "kl": 0.92969970703125, "learning_rate": 1.0115667253280817e-05, "loss": 0.0372, "reward": 1.0662250239402056, "reward_std": 0.2625566049013287, "rewards/accuracy_reward": 0.0895833358168602, "rewards/cosine_scaled_reward": 0.00997499090153724, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9666666850447655, "step": 825 }, { "completion_length": 666.9916839599609, "epoch": 0.5499647989398269, "grad_norm": 5.715907679016816, "kl": 0.79324951171875, "learning_rate": 1e-05, "loss": 0.0317, "reward": 1.0384725525975227, "reward_std": 0.22899209909373894, "rewards/accuracy_reward": 0.07500000167638063, "rewards/cosine_scaled_reward": -0.008749696309678257, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9722222328186035, "step": 830 }, { "completion_length": 720.1583549499512, "epoch": 0.5532778398972957, "grad_norm": 3.099744455129263, "kl": 0.9510986328125, "learning_rate": 9.884332746719186e-06, "loss": 0.038, "reward": 1.009821503609419, "reward_std": 0.2705997523386031, "rewards/accuracy_reward": 0.08125000242143869, "rewards/cosine_scaled_reward": -0.030456301575759424, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9590277925133706, "step": 835 }, { "completion_length": 662.1416858673095, "epoch": 0.5565908808547646, "grad_norm": 2.855618997267347, "kl": 40.13369140625, "learning_rate": 9.768680968977743e-06, "loss": 1.6069, "reward": 0.982228261232376, "reward_std": 0.28527570209116676, "rewards/accuracy_reward": 0.06041666846722364, "rewards/cosine_scaled_reward": -0.028188421328377445, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9500000156462193, "step": 840 }, { "completion_length": 708.8833526611328, "epoch": 0.5599039218122334, "grad_norm": 2.977059186895388, "kl": 1.17364501953125, "learning_rate": 9.653060140244524e-06, "loss": 0.047, "reward": 0.945079381018877, "reward_std": 0.257498794113053, "rewards/accuracy_reward": 0.05416666828095913, "rewards/cosine_scaled_reward": -0.0403373085136991, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9312500156462192, "step": 845 }, { "completion_length": 771.7541870117187, "epoch": 0.5632169627697022, "grad_norm": 0.9200384439475252, "kl": 0.93564453125, "learning_rate": 9.537485729847594e-06, "loss": 0.0374, "reward": 0.9272668905556202, "reward_std": 0.27646568615964495, "rewards/accuracy_reward": 0.06041666828095913, "rewards/cosine_scaled_reward": -0.046344244069769044, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9131944689899683, "step": 850 }, { "completion_length": 718.1583557128906, "epoch": 0.5665300037271711, "grad_norm": 0.6034289002211813, "kl": 0.3440673828125, "learning_rate": 9.421973200904538e-06, "loss": 0.0138, "reward": 0.9842816740274429, "reward_std": 0.2700654156855308, "rewards/accuracy_reward": 0.07083333544433117, "rewards/cosine_scaled_reward": 0.001642756478395313, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9118055731058121, "step": 855 }, { "completion_length": 763.6416885375977, "epoch": 0.5698430446846399, "grad_norm": 0.6604826814021678, "kl": 0.2713134765625, "learning_rate": 9.306538008253611e-06, "loss": 0.0108, "reward": 1.0509522318840028, "reward_std": 0.2729166650911793, "rewards/accuracy_reward": 0.0916666692122817, "rewards/cosine_scaled_reward": 0.021785542974248527, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.937500013411045, "step": 860 }, { "completion_length": 770.4625205993652, "epoch": 0.5731560856421087, "grad_norm": 0.5330115884618337, "kl": 0.3289794921875, "learning_rate": 9.19119559638596e-06, "loss": 0.0132, "reward": 0.9851846359670162, "reward_std": 0.19856081604957582, "rewards/accuracy_reward": 0.05000000149011612, "rewards/cosine_scaled_reward": -0.018982053641229867, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9541666828095913, "step": 865 }, { "completion_length": 724.3750198364257, "epoch": 0.5764691265995776, "grad_norm": 0.5645014829218323, "kl": 0.2625732421875, "learning_rate": 9.075961397379247e-06, "loss": 0.0105, "reward": 1.0784725606441499, "reward_std": 0.23683490817202255, "rewards/accuracy_reward": 0.0895833358168602, "rewards/cosine_scaled_reward": 0.02638920613389928, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9625000134110451, "step": 870 }, { "completion_length": 705.5854362487793, "epoch": 0.5797821675570465, "grad_norm": 0.5855840029762401, "kl": 0.255615234375, "learning_rate": 8.960850828832958e-06, "loss": 0.0102, "reward": 1.0455495871603488, "reward_std": 0.2203972745803185, "rewards/accuracy_reward": 0.07083333507180214, "rewards/cosine_scaled_reward": 0.007355120766442269, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9673611216247082, "step": 875 }, { "completion_length": 700.4583541870118, "epoch": 0.5830952085145152, "grad_norm": 0.8517886923292233, "kl": 0.28126220703125, "learning_rate": 8.845879291805605e-06, "loss": 0.0113, "reward": 1.0279725313186645, "reward_std": 0.22440977127989753, "rewards/accuracy_reward": 0.06666666865348816, "rewards/cosine_scaled_reward": 0.008528062212280928, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9527777910232544, "step": 880 }, { "completion_length": 717.0208549499512, "epoch": 0.5864082494719841, "grad_norm": 2919.81684540161, "kl": 16.19708251953125, "learning_rate": 8.731062168754174e-06, "loss": 0.6468, "reward": 0.9581716526299715, "reward_std": 0.24088463254738599, "rewards/accuracy_reward": 0.05000000149011612, "rewards/cosine_scaled_reward": -0.018217260553501546, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9263889037072659, "step": 885 }, { "completion_length": 754.5708549499511, "epoch": 0.589721290429453, "grad_norm": 0.8581945545634289, "kl": 0.2947509765625, "learning_rate": 8.616414821476048e-06, "loss": 0.0118, "reward": 0.9067786194384098, "reward_std": 0.298170676373411, "rewards/accuracy_reward": 0.0479166679084301, "rewards/cosine_scaled_reward": -0.02794363327557221, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8868055790662766, "step": 890 }, { "completion_length": 712.4521034240722, "epoch": 0.5930343313869217, "grad_norm": 0.7058335299923665, "kl": 0.27764892578125, "learning_rate": 8.501952589053694e-06, "loss": 0.0111, "reward": 1.0002846218645574, "reward_std": 0.29179278418887405, "rewards/accuracy_reward": 0.07708333507180214, "rewards/cosine_scaled_reward": 0.011395707644987851, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9118055745959281, "step": 895 }, { "completion_length": 649.2208549499512, "epoch": 0.5963473723443906, "grad_norm": 0.5606050276325089, "kl": 0.27261962890625, "learning_rate": 8.387690785802403e-06, "loss": 0.0109, "reward": 1.046046482026577, "reward_std": 0.23475574197946117, "rewards/accuracy_reward": 0.08958333525806665, "rewards/cosine_scaled_reward": 0.014796447940170765, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9416666842997075, "step": 900 }, { "epoch": 0.5963473723443906, "eval_completion_length": 644.7010031307445, "eval_kl": 0.25080422794117646, "eval_loss": 0.010022947564721107, "eval_reward": 1.0253153443336487, "eval_reward_std": 0.1935208297389395, "eval_rewards/accuracy_reward": 0.05882353116484249, "eval_rewards/cosine_scaled_reward": -0.0057304411688271694, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9722222405321458, "eval_runtime": 80.0986, "eval_samples_per_second": 1.236, "eval_steps_per_second": 0.112, "step": 900 }, { "completion_length": 689.6271015167237, "epoch": 0.5996604133018595, "grad_norm": 0.5211624527190019, "kl": 0.2569091796875, "learning_rate": 8.273644699221309e-06, "loss": 0.0103, "reward": 1.0711087822914123, "reward_std": 0.2464535224309657, "rewards/accuracy_reward": 0.08541666902601719, "rewards/cosine_scaled_reward": 0.02527542635798454, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9604166775941849, "step": 905 }, { "completion_length": 700.2916862487793, "epoch": 0.6029734542593282, "grad_norm": 0.5683869854937798, "kl": 1.43768310546875, "learning_rate": 8.159829587948048e-06, "loss": 0.0575, "reward": 1.1194160163402558, "reward_std": 0.2532109005260281, "rewards/accuracy_reward": 0.1062500024214387, "rewards/cosine_scaled_reward": 0.04094377264846116, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9722222283482551, "step": 910 }, { "completion_length": 673.3479362487793, "epoch": 0.6062864952167971, "grad_norm": 0.5234637970396707, "kl": 0.2445556640625, "learning_rate": 8.046260679717225e-06, "loss": 0.0098, "reward": 1.0545516267418862, "reward_std": 0.19365703349467367, "rewards/accuracy_reward": 0.06458333488553762, "rewards/cosine_scaled_reward": 0.012190489581553266, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.977777785807848, "step": 915 }, { "completion_length": 663.1041877746582, "epoch": 0.609599536174266, "grad_norm": 0.5292191534552335, "kl": 0.25194091796875, "learning_rate": 7.932953169323057e-06, "loss": 0.0101, "reward": 1.0318547308444976, "reward_std": 0.22293779149185866, "rewards/accuracy_reward": 0.06666666865348816, "rewards/cosine_scaled_reward": 0.004771368706133217, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9604166753590107, "step": 920 }, { "completion_length": 694.0937698364257, "epoch": 0.6129125771317347, "grad_norm": 0.5183130719429009, "kl": 0.2506591796875, "learning_rate": 7.8199222165864e-06, "loss": 0.01, "reward": 1.028163066506386, "reward_std": 0.22450451673648786, "rewards/accuracy_reward": 0.07083333525806665, "rewards/cosine_scaled_reward": 0.011496381685719825, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.945833346247673, "step": 925 }, { "completion_length": 725.65835647583, "epoch": 0.6162256180892036, "grad_norm": 0.6026357117944807, "kl": 0.27022705078125, "learning_rate": 7.70718294432646e-06, "loss": 0.0108, "reward": 1.0014728397130965, "reward_std": 0.20938806016929448, "rewards/accuracy_reward": 0.05625000149011612, "rewards/cosine_scaled_reward": -0.0075549599016085265, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.952777798473835, "step": 930 }, { "completion_length": 721.6521026611329, "epoch": 0.6195386590466725, "grad_norm": 0.5565389609373601, "kl": 0.2677734375, "learning_rate": 7.594750436337467e-06, "loss": 0.0107, "reward": 0.9926806919276714, "reward_std": 0.29521104086888955, "rewards/accuracy_reward": 0.07708333544433117, "rewards/cosine_scaled_reward": 0.01004178102593869, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9055555790662766, "step": 935 }, { "completion_length": 710.0604354858399, "epoch": 0.6228517000041413, "grad_norm": 0.8077343196005132, "kl": 0.260107421875, "learning_rate": 7.482639735370536e-06, "loss": 0.0104, "reward": 0.985926815867424, "reward_std": 0.3472483110264875, "rewards/accuracy_reward": 0.07916666902601718, "rewards/cosine_scaled_reward": 0.006065675290301442, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9006944678723812, "step": 940 }, { "completion_length": 734.9875198364258, "epoch": 0.6261647409616101, "grad_norm": 0.5430288310030688, "kl": 0.238916015625, "learning_rate": 7.37086584112108e-06, "loss": 0.0096, "reward": 0.9927262924611568, "reward_std": 0.25354631019872614, "rewards/accuracy_reward": 0.06458333507180214, "rewards/cosine_scaled_reward": 0.0038373848306946456, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9243055783212185, "step": 945 }, { "completion_length": 736.0666885375977, "epoch": 0.629477781919079, "grad_norm": 0.5812190147268742, "kl": 0.24256591796875, "learning_rate": 7.2594437082219074e-06, "loss": 0.0097, "reward": 1.012109387665987, "reward_std": 0.26125249567558056, "rewards/accuracy_reward": 0.06875000167638064, "rewards/cosine_scaled_reward": 0.0051649166329298165, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9381944566965104, "step": 950 }, { "completion_length": 688.0562713623046, "epoch": 0.6327908228765479, "grad_norm": 0.5190009419185936, "kl": 0.24962158203125, "learning_rate": 7.148388244242414e-06, "loss": 0.01, "reward": 1.1038883820176124, "reward_std": 0.2695035228040069, "rewards/accuracy_reward": 0.0979166692122817, "rewards/cosine_scaled_reward": 0.04138836342608556, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9645833462476731, "step": 955 }, { "completion_length": 683.8479347229004, "epoch": 0.6361038638340166, "grad_norm": 0.5563285719759087, "kl": 0.24112548828125, "learning_rate": 7.037714307694038e-06, "loss": 0.0096, "reward": 1.0375377908349037, "reward_std": 0.23214901172323152, "rewards/accuracy_reward": 0.06666666865348816, "rewards/cosine_scaled_reward": 0.01948220714693889, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9513888999819755, "step": 960 }, { "completion_length": 683.9833511352539, "epoch": 0.6394169047914855, "grad_norm": 0.5968834042878632, "kl": 0.2594482421875, "learning_rate": 6.927436706042276e-06, "loss": 0.0104, "reward": 1.042594925314188, "reward_std": 0.24978068978525697, "rewards/accuracy_reward": 0.07500000223517418, "rewards/cosine_scaled_reward": 0.025928232248406857, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9416666813194752, "step": 965 }, { "completion_length": 672.5958541870117, "epoch": 0.6427299457489544, "grad_norm": 0.6069879730631458, "kl": 0.25821533203125, "learning_rate": 6.8175701937255645e-06, "loss": 0.0103, "reward": 1.0218962721526623, "reward_std": 0.21961677792132833, "rewards/accuracy_reward": 0.06250000167638063, "rewards/cosine_scaled_reward": 0.008701804676093161, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9506944552063942, "step": 970 }, { "completion_length": 675.0771018981934, "epoch": 0.6460429867064231, "grad_norm": 0.5915406306352937, "kl": 0.26412353515625, "learning_rate": 6.708129470181197e-06, "loss": 0.0106, "reward": 1.107464261353016, "reward_std": 0.3173739513847977, "rewards/accuracy_reward": 0.10833333637565375, "rewards/cosine_scaled_reward": 0.05468644829234108, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9444444559514522, "step": 975 }, { "completion_length": 652.9521068572998, "epoch": 0.649356027663892, "grad_norm": 0.5827154364823033, "kl": 0.25538330078125, "learning_rate": 6.5991291778786556e-06, "loss": 0.0102, "reward": 1.042717768251896, "reward_std": 0.2565536227310076, "rewards/accuracy_reward": 0.08125000242143869, "rewards/cosine_scaled_reward": 0.023273307248018682, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9381944589316845, "step": 980 }, { "completion_length": 669.9021049499512, "epoch": 0.6526690686213609, "grad_norm": 0.5832428399345815, "kl": 0.25567626953125, "learning_rate": 6.490583900360543e-06, "loss": 0.0102, "reward": 1.0675076805055141, "reward_std": 0.23609142500790767, "rewards/accuracy_reward": 0.08541666865348815, "rewards/cosine_scaled_reward": 0.030007645767182112, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9520833477377891, "step": 985 }, { "completion_length": 738.0833557128906, "epoch": 0.6559821095788296, "grad_norm": 0.5416621889956812, "kl": 0.24434814453125, "learning_rate": 6.38250816029139e-06, "loss": 0.0098, "reward": 1.0092557102441788, "reward_std": 0.2631334120524116, "rewards/accuracy_reward": 0.07500000167638063, "rewards/cosine_scaled_reward": 0.006477906345389783, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9277777954936027, "step": 990 }, { "completion_length": 699.2333518981934, "epoch": 0.6592951505362985, "grad_norm": 0.5607785467679972, "kl": 0.23555908203125, "learning_rate": 6.274916417514605e-06, "loss": 0.0094, "reward": 1.0329258136451245, "reward_std": 0.26789961218019015, "rewards/accuracy_reward": 0.06875000186264515, "rewards/cosine_scaled_reward": 0.018342451547505335, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9458333514630795, "step": 995 }, { "completion_length": 702.7916854858398, "epoch": 0.6626081914937674, "grad_norm": 0.5651148451019158, "kl": 0.2371337890625, "learning_rate": 6.167823067117868e-06, "loss": 0.0095, "reward": 1.0623246341943742, "reward_std": 0.25413749769795685, "rewards/accuracy_reward": 0.08541666883975267, "rewards/cosine_scaled_reward": 0.02135238423361443, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9555555701255798, "step": 1000 }, { "epoch": 0.6626081914937674, "eval_completion_length": 696.436290067785, "eval_kl": 0.24333639705882354, "eval_loss": 0.009705452248454094, "eval_reward": 1.1996825477656197, "eval_reward_std": 0.2648356732200174, "eval_rewards/accuracy_reward": 0.13725490473649082, "eval_rewards/cosine_scaled_reward": 0.08040148370406207, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9820261457387138, "eval_runtime": 84.3604, "eval_samples_per_second": 1.174, "eval_steps_per_second": 0.107, "step": 1000 }, { "completion_length": 699.8250198364258, "epoch": 0.6659212324512361, "grad_norm": 0.5629888944413032, "kl": 0.24071044921875, "learning_rate": 6.061242437507131e-06, "loss": 0.0096, "reward": 1.0877350278198719, "reward_std": 0.20314363973448052, "rewards/accuracy_reward": 0.08541666902601719, "rewards/cosine_scaled_reward": 0.0266238811891526, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.97569445297122, "step": 1005 }, { "completion_length": 703.8583549499511, "epoch": 0.669234273408705, "grad_norm": 0.5641870905567404, "kl": 0.2332275390625, "learning_rate": 5.955188788489583e-06, "loss": 0.0093, "reward": 1.1175980359315871, "reward_std": 0.2400467697996646, "rewards/accuracy_reward": 0.10000000279396773, "rewards/cosine_scaled_reward": 0.04954244927503169, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9680555678904057, "step": 1010 }, { "completion_length": 705.4708541870117, "epoch": 0.6725473143661739, "grad_norm": 0.6097687532503788, "kl": 0.2300537109375, "learning_rate": 5.849676309365786e-06, "loss": 0.0092, "reward": 1.074021714925766, "reward_std": 0.24223397983296308, "rewards/accuracy_reward": 0.08541666902601719, "rewards/cosine_scaled_reward": 0.03166056994232349, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9569444552063942, "step": 1015 }, { "completion_length": 683.6458511352539, "epoch": 0.6758603553236426, "grad_norm": 0.6006818511926881, "kl": 0.2662353515625, "learning_rate": 5.744719117031217e-06, "loss": 0.0106, "reward": 1.1056484147906303, "reward_std": 0.28973424368305134, "rewards/accuracy_reward": 0.0958333358168602, "rewards/cosine_scaled_reward": 0.05078727581421845, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9590277902781963, "step": 1020 }, { "completion_length": 706.3750190734863, "epoch": 0.6791733962811115, "grad_norm": 0.5366108979063211, "kl": 0.24591064453125, "learning_rate": 5.6403312540875325e-06, "loss": 0.0098, "reward": 1.033188060671091, "reward_std": 0.2257270894420799, "rewards/accuracy_reward": 0.06666666865348816, "rewards/cosine_scaled_reward": 0.014438032626640052, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9520833477377891, "step": 1025 }, { "completion_length": 718.6250205993653, "epoch": 0.6824864372385804, "grad_norm": 0.5658309178067615, "kl": 0.2420166015625, "learning_rate": 5.536526686963762e-06, "loss": 0.0097, "reward": 1.044818665459752, "reward_std": 0.2756887644645758, "rewards/accuracy_reward": 0.08125000223517417, "rewards/cosine_scaled_reward": 0.019124195608310402, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9444444596767425, "step": 1030 }, { "completion_length": 731.0229370117188, "epoch": 0.6857994781960493, "grad_norm": 0.6779968426633557, "kl": 0.24952392578125, "learning_rate": 5.433319304047666e-06, "loss": 0.01, "reward": 1.0660872675478459, "reward_std": 0.24434197215596215, "rewards/accuracy_reward": 0.0875000024214387, "rewards/cosine_scaled_reward": 0.031365022042155036, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9472222365438938, "step": 1035 }, { "completion_length": 705.5687705993653, "epoch": 0.689112519153518, "grad_norm": 0.5766678467196866, "kl": 0.31824951171875, "learning_rate": 5.330722913827594e-06, "loss": 0.0127, "reward": 1.009748300537467, "reward_std": 0.3129956013697665, "rewards/accuracy_reward": 0.08750000260770321, "rewards/cosine_scaled_reward": 0.02085939382086508, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9013889171183109, "step": 1040 }, { "completion_length": 705.1479354858399, "epoch": 0.6924255601109869, "grad_norm": 0.5410222017810264, "kl": 0.25225830078125, "learning_rate": 5.228751243044961e-06, "loss": 0.0101, "reward": 1.0665145918726922, "reward_std": 0.26668303488986567, "rewards/accuracy_reward": 0.0937500026077032, "rewards/cosine_scaled_reward": 0.026931234363291878, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9458333484828472, "step": 1045 }, { "completion_length": 735.3271034240722, "epoch": 0.6957386010684558, "grad_norm": 0.5359056407192179, "kl": 0.2318115234375, "learning_rate": 5.127417934857718e-06, "loss": 0.0093, "reward": 1.099215094745159, "reward_std": 0.31327825486660005, "rewards/accuracy_reward": 0.11458333600312472, "rewards/cosine_scaled_reward": 0.05407617967575788, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9305555738508702, "step": 1050 }, { "completion_length": 712.6312698364258, "epoch": 0.6990516420259245, "grad_norm": 0.5484248439575278, "kl": 0.264306640625, "learning_rate": 5.026736547014981e-06, "loss": 0.0106, "reward": 1.0673648558557034, "reward_std": 0.3585894995951094, "rewards/accuracy_reward": 0.09583333600312471, "rewards/cosine_scaled_reward": 0.034725937712937596, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9368055760860443, "step": 1055 }, { "completion_length": 706.4875194549561, "epoch": 0.7023646829833934, "grad_norm": 0.5567636291351401, "kl": 0.2294189453125, "learning_rate": 4.926720550043089e-06, "loss": 0.0092, "reward": 1.0243882276117802, "reward_std": 0.23136171433143318, "rewards/accuracy_reward": 0.0645833346992731, "rewards/cosine_scaled_reward": 0.0021659893449395896, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9576389007270336, "step": 1060 }, { "completion_length": 685.1187721252442, "epoch": 0.7056777239408623, "grad_norm": 0.6694817001760959, "kl": 0.23017578125, "learning_rate": 4.827383325443331e-06, "loss": 0.0092, "reward": 1.0866692245006562, "reward_std": 0.27554886613506824, "rewards/accuracy_reward": 0.10208333600312472, "rewards/cosine_scaled_reward": 0.0331969631719403, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9513889059424401, "step": 1065 }, { "completion_length": 685.9104347229004, "epoch": 0.708990764898331, "grad_norm": 0.6521262885611052, "kl": 0.24091796875, "learning_rate": 4.728738163901597e-06, "loss": 0.0096, "reward": 1.0506494276225566, "reward_std": 0.2424812117445981, "rewards/accuracy_reward": 0.08125000204890967, "rewards/cosine_scaled_reward": 0.014538292206270852, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9548611260950566, "step": 1070 }, { "completion_length": 690.1791877746582, "epoch": 0.7123038058557999, "grad_norm": 0.5725287484318065, "kl": 0.24205322265625, "learning_rate": 4.630798263510162e-06, "loss": 0.0097, "reward": 1.0749888263642788, "reward_std": 0.27061148614156993, "rewards/accuracy_reward": 0.09583333600312471, "rewards/cosine_scaled_reward": 0.03262769319117069, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9465277925133705, "step": 1075 }, { "completion_length": 691.6666847229004, "epoch": 0.7156168468132688, "grad_norm": 0.5724040789499583, "kl": 0.25650634765625, "learning_rate": 4.533576728001858e-06, "loss": 0.0103, "reward": 0.9847696669399738, "reward_std": 0.28619281406863595, "rewards/accuracy_reward": 0.06458333507180214, "rewards/cosine_scaled_reward": -0.006897027121158317, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9270833559334278, "step": 1080 }, { "completion_length": 700.5854370117188, "epoch": 0.7189298877707375, "grad_norm": 0.6908528329225427, "kl": 0.25274658203125, "learning_rate": 4.437086564996891e-06, "loss": 0.0101, "reward": 1.0387089431285859, "reward_std": 0.23834604396251963, "rewards/accuracy_reward": 0.07291666846722364, "rewards/cosine_scaled_reward": 0.017875579325482248, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9479166835546493, "step": 1085 }, { "completion_length": 701.4646049499512, "epoch": 0.7222429287282064, "grad_norm": 0.5758477782319862, "kl": 0.2464111328125, "learning_rate": 4.341340684262498e-06, "loss": 0.0099, "reward": 1.0454999834299088, "reward_std": 0.2807784583827015, "rewards/accuracy_reward": 0.07708333507180214, "rewards/cosine_scaled_reward": 0.02813884927891195, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9402777954936028, "step": 1090 }, { "completion_length": 694.2437683105469, "epoch": 0.7255559696856753, "grad_norm": 0.60136666874286, "kl": 0.23504638671875, "learning_rate": 4.246351895985702e-06, "loss": 0.0094, "reward": 1.1387354739010334, "reward_std": 0.2430174820823595, "rewards/accuracy_reward": 0.11875000316649675, "rewards/cosine_scaled_reward": 0.05748545229434967, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9625000171363354, "step": 1095 }, { "completion_length": 696.6229362487793, "epoch": 0.728869010643144, "grad_norm": 0.563156396588543, "kl": 0.2312255859375, "learning_rate": 4.152132909059402e-06, "loss": 0.0092, "reward": 1.0300480626523494, "reward_std": 0.2090206912427675, "rewards/accuracy_reward": 0.06875000167638064, "rewards/cosine_scaled_reward": 0.013381362741347403, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9479166865348816, "step": 1100 }, { "epoch": 0.728869010643144, "eval_completion_length": 697.7794404871323, "eval_kl": 0.23885569852941177, "eval_loss": 0.009638833813369274, "eval_reward": 1.086963110110339, "eval_reward_std": 0.28365588593570623, "eval_rewards/accuracy_reward": 0.09803921816980138, "eval_rewards/cosine_scaled_reward": 0.047747393850894534, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9411764881190132, "eval_runtime": 83.2296, "eval_samples_per_second": 1.189, "eval_steps_per_second": 0.108, "step": 1100 }, { "completion_length": 687.5271041870117, "epoch": 0.7321820516006129, "grad_norm": 0.5823793514438087, "kl": 0.2355224609375, "learning_rate": 4.058696329381987e-06, "loss": 0.0094, "reward": 1.0460910208523273, "reward_std": 0.28138685831218024, "rewards/accuracy_reward": 0.08333333563059568, "rewards/cosine_scaled_reward": 0.014146549673750997, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9486111253499985, "step": 1105 }, { "completion_length": 725.7416831970215, "epoch": 0.7354950925580818, "grad_norm": 0.6742278165213453, "kl": 0.23687744140625, "learning_rate": 3.966054658170754e-06, "loss": 0.0095, "reward": 1.0044114805758, "reward_std": 0.23454540363745763, "rewards/accuracy_reward": 0.06041666828095913, "rewards/cosine_scaled_reward": -0.0032274457102175803, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9472222357988358, "step": 1110 }, { "completion_length": 760.8500213623047, "epoch": 0.7388081335155506, "grad_norm": 128.1593177783891, "kl": 1.3335693359375, "learning_rate": 3.874220290289337e-06, "loss": 0.0532, "reward": 1.0164015203714372, "reward_std": 0.29348075333982704, "rewards/accuracy_reward": 0.07916666846722364, "rewards/cosine_scaled_reward": -0.004431843245401979, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9416666828095913, "step": 1115 }, { "completion_length": 736.0604362487793, "epoch": 0.7421211744730194, "grad_norm": 0.6705838698370622, "kl": 0.25177001953125, "learning_rate": 3.7832055125893318e-06, "loss": 0.0101, "reward": 1.0943914458155632, "reward_std": 0.30084287009376565, "rewards/accuracy_reward": 0.10416666902601719, "rewards/cosine_scaled_reward": 0.040919190156273544, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9493055760860443, "step": 1120 }, { "completion_length": 718.4875236511231, "epoch": 0.7454342154304883, "grad_norm": 0.5039074482364153, "kl": 0.233251953125, "learning_rate": 3.6930225022664136e-06, "loss": 0.0093, "reward": 1.1117434859275819, "reward_std": 0.2212675413582474, "rewards/accuracy_reward": 0.10000000298023223, "rewards/cosine_scaled_reward": 0.04785456540994346, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9638889022171497, "step": 1125 }, { "completion_length": 722.5166870117188, "epoch": 0.7487472563879571, "grad_norm": 0.6539610352819505, "kl": 0.24056396484375, "learning_rate": 3.6036833252310887e-06, "loss": 0.0096, "reward": 1.1434138268232346, "reward_std": 0.30227450939128175, "rewards/accuracy_reward": 0.11250000279396773, "rewards/cosine_scaled_reward": 0.07535824882797897, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9555555678904056, "step": 1130 }, { "completion_length": 712.8541870117188, "epoch": 0.7520602973454259, "grad_norm": 0.7945847077767145, "kl": 0.2441162109375, "learning_rate": 3.515199934494373e-06, "loss": 0.0098, "reward": 1.0877235375344754, "reward_std": 0.28627775572240355, "rewards/accuracy_reward": 0.0895833358168602, "rewards/cosine_scaled_reward": 0.03980684823472984, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9583333536982537, "step": 1135 }, { "completion_length": 715.6958541870117, "epoch": 0.7553733383028948, "grad_norm": 0.7021143665931746, "kl": 0.248291015625, "learning_rate": 3.427584168568535e-06, "loss": 0.0099, "reward": 1.0960090607404709, "reward_std": 0.2802498764358461, "rewards/accuracy_reward": 0.09375000279396772, "rewards/cosine_scaled_reward": 0.04809235641732812, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9541666798293591, "step": 1140 }, { "completion_length": 730.9041893005372, "epoch": 0.7586863792603636, "grad_norm": 0.6375347310859943, "kl": 0.268896484375, "learning_rate": 3.3408477498831917e-06, "loss": 0.0108, "reward": 1.0276512533426285, "reward_std": 0.24158016670262442, "rewards/accuracy_reward": 0.07083333488553763, "rewards/cosine_scaled_reward": 0.020706782315392046, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9361111275851727, "step": 1145 }, { "completion_length": 713.5625213623047, "epoch": 0.7619994202178324, "grad_norm": 0.5641155041304022, "kl": 0.2553955078125, "learning_rate": 3.2550022832169125e-06, "loss": 0.0102, "reward": 1.0406024023890494, "reward_std": 0.29889187471126205, "rewards/accuracy_reward": 0.07916666902601718, "rewards/cosine_scaled_reward": 0.026713477505836636, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9347222402691842, "step": 1150 }, { "completion_length": 719.2958526611328, "epoch": 0.7653124611753013, "grad_norm": 0.6350261443263014, "kl": 0.2781005859375, "learning_rate": 3.170059254144593e-06, "loss": 0.0111, "reward": 1.0504739299416541, "reward_std": 0.3410513519484084, "rewards/accuracy_reward": 0.10208333618938922, "rewards/cosine_scaled_reward": 0.047001674235798416, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9013889148831368, "step": 1155 }, { "completion_length": 708.4125175476074, "epoch": 0.7686255021327701, "grad_norm": 0.6157481438141363, "kl": 0.24415283203125, "learning_rate": 3.086030027500728e-06, "loss": 0.0098, "reward": 1.1004967622458934, "reward_std": 0.30008399614016523, "rewards/accuracy_reward": 0.10833333600312471, "rewards/cosine_scaled_reward": 0.06369117611902766, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9284722439944744, "step": 1160 }, { "completion_length": 697.7937721252441, "epoch": 0.7719385430902389, "grad_norm": 0.6509694144940286, "kl": 0.23865966796875, "learning_rate": 3.002925845858905e-06, "loss": 0.0095, "reward": 1.0825313463807107, "reward_std": 0.25022683480056, "rewards/accuracy_reward": 0.08541666902601719, "rewards/cosine_scaled_reward": 0.037392429204192015, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9597222380340099, "step": 1165 }, { "completion_length": 698.5250190734863, "epoch": 0.7752515840477078, "grad_norm": 0.6048667966577433, "kl": 0.22767333984375, "learning_rate": 2.920757828027586e-06, "loss": 0.0091, "reward": 1.083505555987358, "reward_std": 0.24698185920133256, "rewards/accuracy_reward": 0.09375000279396772, "rewards/cosine_scaled_reward": 0.046005524846259506, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9437500186264515, "step": 1170 }, { "completion_length": 694.9375221252442, "epoch": 0.7785646250051766, "grad_norm": 0.564866469272664, "kl": 3.107080078125, "learning_rate": 2.839536967562504e-06, "loss": 0.1242, "reward": 1.1438683733344077, "reward_std": 0.33052165554836394, "rewards/accuracy_reward": 0.11875000335276127, "rewards/cosine_scaled_reward": 0.06817389702191576, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9569444634020329, "step": 1175 }, { "completion_length": 718.3271049499511, "epoch": 0.7818776659626455, "grad_norm": 0.5807109771447004, "kl": 0.22999267578125, "learning_rate": 2.759274131295787e-06, "loss": 0.0092, "reward": 1.075054731220007, "reward_std": 0.2507293263653992, "rewards/accuracy_reward": 0.09375000223517418, "rewards/cosine_scaled_reward": 0.041026921581942585, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9402777947485447, "step": 1180 }, { "completion_length": 721.0312713623047, "epoch": 0.7851907069201143, "grad_norm": 0.5642350428427413, "kl": 0.22774658203125, "learning_rate": 2.679980057882049e-06, "loss": 0.0091, "reward": 1.062976571172476, "reward_std": 0.2711804293037858, "rewards/accuracy_reward": 0.07916666883975268, "rewards/cosine_scaled_reward": 0.03242098632035777, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9513889089226722, "step": 1185 }, { "completion_length": 702.827103805542, "epoch": 0.7885037478775831, "grad_norm": 0.7137684664844179, "kl": 0.23399658203125, "learning_rate": 2.60166535636162e-06, "loss": 0.0094, "reward": 1.0552208371460439, "reward_std": 0.25100512909702954, "rewards/accuracy_reward": 0.07708333507180214, "rewards/cosine_scaled_reward": 0.03369304013904184, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9444444581866265, "step": 1190 }, { "completion_length": 699.7812721252442, "epoch": 0.791816788835052, "grad_norm": 0.5380423713134228, "kl": 0.2351806640625, "learning_rate": 2.5243405047411353e-06, "loss": 0.0094, "reward": 1.1528331734240056, "reward_std": 0.30599280503229237, "rewards/accuracy_reward": 0.13333333637565375, "rewards/cosine_scaled_reward": 0.08547202623449265, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.934027798473835, "step": 1195 }, { "completion_length": 689.8916847229004, "epoch": 0.7951298297925208, "grad_norm": 0.5685754819448275, "kl": 0.21748046875, "learning_rate": 2.448015848591638e-06, "loss": 0.0087, "reward": 1.1659582242369653, "reward_std": 0.29099906139890663, "rewards/accuracy_reward": 0.12708333637565375, "rewards/cosine_scaled_reward": 0.07706930527929216, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9618055664002896, "step": 1200 }, { "epoch": 0.7951298297925208, "eval_completion_length": 686.0686430089614, "eval_kl": 0.23006663602941177, "eval_loss": 0.009243295527994633, "eval_reward": 1.1754143728929407, "eval_reward_std": 0.2901155304382829, "eval_rewards/accuracy_reward": 0.13725490473649082, "eval_rewards/cosine_scaled_reward": 0.0937149930087959, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9444444565212026, "eval_runtime": 84.052, "eval_samples_per_second": 1.178, "eval_steps_per_second": 0.107, "step": 1200 }, { "completion_length": 708.1396034240722, "epoch": 0.7984428707499897, "grad_norm": 0.5671705641077195, "kl": 0.2204833984375, "learning_rate": 2.3727015996644043e-06, "loss": 0.0088, "reward": 1.0816673040390015, "reward_std": 0.2487310474156402, "rewards/accuracy_reward": 0.09166666883975268, "rewards/cosine_scaled_reward": 0.04902839425485581, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.940972238779068, "step": 1205 }, { "completion_length": 696.5854377746582, "epoch": 0.8017559117074585, "grad_norm": 0.6557400602769374, "kl": 0.21082763671875, "learning_rate": 2.298407834524682e-06, "loss": 0.0084, "reward": 1.0709110483527184, "reward_std": 0.24072149136918597, "rewards/accuracy_reward": 0.08125000186264515, "rewards/cosine_scaled_reward": 0.02299434217857197, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9666666813194752, "step": 1210 }, { "completion_length": 678.5875228881836, "epoch": 0.8050689526649273, "grad_norm": 0.5285542098034939, "kl": 0.21624755859375, "learning_rate": 2.2251444932035094e-06, "loss": 0.0087, "reward": 1.1003202863037587, "reward_std": 0.2405082249257248, "rewards/accuracy_reward": 0.08541666883975267, "rewards/cosine_scaled_reward": 0.03920914768241346, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9756944544613362, "step": 1215 }, { "completion_length": 686.9521034240722, "epoch": 0.8083819936223962, "grad_norm": 0.577761392483963, "kl": 0.22117919921875, "learning_rate": 2.1529213778677993e-06, "loss": 0.0089, "reward": 1.0638688035309314, "reward_std": 0.26320576341531704, "rewards/accuracy_reward": 0.07500000204890966, "rewards/cosine_scaled_reward": 0.033313213015208024, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9555555701255798, "step": 1220 }, { "completion_length": 693.1354351043701, "epoch": 0.811695034579865, "grad_norm": 0.553556015985704, "kl": 0.216015625, "learning_rate": 2.081748151508883e-06, "loss": 0.0086, "reward": 1.0954834826290607, "reward_std": 0.3008945170440711, "rewards/accuracy_reward": 0.0916666692122817, "rewards/cosine_scaled_reward": 0.04687234191223979, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9569444596767426, "step": 1225 }, { "completion_length": 673.9687698364257, "epoch": 0.8150080755373338, "grad_norm": 0.6964937402041743, "kl": 0.22882080078125, "learning_rate": 2.0116343366496493e-06, "loss": 0.0092, "reward": 1.126390317082405, "reward_std": 0.3129663784086006, "rewards/accuracy_reward": 0.10833333600312471, "rewards/cosine_scaled_reward": 0.05486250657122582, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9631944559514523, "step": 1230 }, { "completion_length": 674.3375205993652, "epoch": 0.8183211164948027, "grad_norm": 0.5920398254582698, "kl": 0.30960693359375, "learning_rate": 1.942589314070494e-06, "loss": 0.0124, "reward": 1.1138412103056907, "reward_std": 0.2601606338354031, "rewards/accuracy_reward": 0.1000000024214387, "rewards/cosine_scaled_reward": 0.054813407792244105, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9590277940034866, "step": 1235 }, { "completion_length": 698.9021026611329, "epoch": 0.8216341574522715, "grad_norm": 0.5828877262002554, "kl": 0.23211669921875, "learning_rate": 1.8746223215542482e-06, "loss": 0.0093, "reward": 1.0541487254202366, "reward_std": 0.2587520010827575, "rewards/accuracy_reward": 0.08125000242143869, "rewards/cosine_scaled_reward": 0.03262091748947569, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9402777947485447, "step": 1240 }, { "completion_length": 707.8062698364258, "epoch": 0.8249471984097403, "grad_norm": 0.5564758710960112, "kl": 0.22384033203125, "learning_rate": 1.8077424526501964e-06, "loss": 0.0089, "reward": 1.1040533937513828, "reward_std": 0.26052900922368283, "rewards/accuracy_reward": 0.09583333618938923, "rewards/cosine_scaled_reward": 0.05752558750100434, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9506944619119168, "step": 1245 }, { "completion_length": 692.0291862487793, "epoch": 0.8282602393672092, "grad_norm": 0.702975083010214, "kl": 0.2284912109375, "learning_rate": 1.7419586554574364e-06, "loss": 0.0091, "reward": 1.1278405405580998, "reward_std": 0.22963403367321006, "rewards/accuracy_reward": 0.09583333563059568, "rewards/cosine_scaled_reward": 0.05284050657064654, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9791666746139527, "step": 1250 }, { "completion_length": 693.6312713623047, "epoch": 0.831573280324678, "grad_norm": 0.7127370755129566, "kl": 0.2173095703125, "learning_rate": 1.6772797314276712e-06, "loss": 0.0087, "reward": 1.1144855566322804, "reward_std": 0.28662063417141326, "rewards/accuracy_reward": 0.09583333600312471, "rewards/cosine_scaled_reward": 0.052679980307584626, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9659722343087196, "step": 1255 }, { "completion_length": 695.8354400634765, "epoch": 0.8348863212821469, "grad_norm": 0.5649979842296693, "kl": 0.21492919921875, "learning_rate": 1.6137143341876439e-06, "loss": 0.0086, "reward": 1.133350669592619, "reward_std": 0.24948284951387906, "rewards/accuracy_reward": 0.11041667014360428, "rewards/cosine_scaled_reward": 0.061128420592285695, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.961805573105812, "step": 1260 }, { "completion_length": 669.5125198364258, "epoch": 0.8381993622396157, "grad_norm": 0.6028044283955993, "kl": 0.22464599609375, "learning_rate": 1.5512709683813165e-06, "loss": 0.009, "reward": 1.0796908847987652, "reward_std": 0.2626152715180069, "rewards/accuracy_reward": 0.08541666902601719, "rewards/cosine_scaled_reward": 0.04149640873074532, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9527777917683125, "step": 1265 }, { "completion_length": 674.6937705993653, "epoch": 0.8415124031970845, "grad_norm": 0.5472131559727627, "kl": 0.21929931640625, "learning_rate": 1.4899579885320237e-06, "loss": 0.0088, "reward": 1.0800478100776671, "reward_std": 0.24584392708638916, "rewards/accuracy_reward": 0.08333333544433116, "rewards/cosine_scaled_reward": 0.036992224189452826, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9597222372889519, "step": 1270 }, { "completion_length": 700.8833534240723, "epoch": 0.8448254441545534, "grad_norm": 0.6621275593122103, "kl": 0.21627197265625, "learning_rate": 1.4297835979246777e-06, "loss": 0.0086, "reward": 1.1098698511719705, "reward_std": 0.23763526385591832, "rewards/accuracy_reward": 0.0937500026077032, "rewards/cosine_scaled_reward": 0.052230933296959846, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9638889022171497, "step": 1275 }, { "completion_length": 690.8562690734864, "epoch": 0.8481384851120222, "grad_norm": 3.198891703997116, "kl": 0.218701171875, "learning_rate": 1.370755847508226e-06, "loss": 0.0087, "reward": 1.0767874464392662, "reward_std": 0.25625000601285136, "rewards/accuracy_reward": 0.08125000223517417, "rewards/cosine_scaled_reward": 0.038592972815968095, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9569444611668587, "step": 1280 }, { "completion_length": 679.8000205993652, "epoch": 0.851451526069491, "grad_norm": 0.6066307244308354, "kl": 0.22574462890625, "learning_rate": 1.3128826348184886e-06, "loss": 0.009, "reward": 1.0736933879554271, "reward_std": 0.253808597495663, "rewards/accuracy_reward": 0.08125000223517417, "rewards/cosine_scaled_reward": 0.03619336179108359, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9562500156462193, "step": 1285 }, { "completion_length": 683.5937690734863, "epoch": 0.8547645670269599, "grad_norm": 0.6494587542229079, "kl": 0.21436767578125, "learning_rate": 1.256171702921516e-06, "loss": 0.0086, "reward": 1.1028260089457036, "reward_std": 0.2539305640355451, "rewards/accuracy_reward": 0.0958333358168602, "rewards/cosine_scaled_reward": 0.044492648518644276, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9625000089406968, "step": 1290 }, { "completion_length": 693.2479370117187, "epoch": 0.8580776079844287, "grad_norm": 0.7987439563235657, "kl": 0.22764892578125, "learning_rate": 1.200630639377609e-06, "loss": 0.0091, "reward": 1.0953412406146525, "reward_std": 0.23627875379752367, "rewards/accuracy_reward": 0.0937500026077032, "rewards/cosine_scaled_reward": 0.047424542799126355, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9541666842997074, "step": 1295 }, { "completion_length": 687.9583549499512, "epoch": 0.8613906489418975, "grad_norm": 0.8954000260247649, "kl": 0.21829833984375, "learning_rate": 1.1462668752261652e-06, "loss": 0.0087, "reward": 1.0874564148485661, "reward_std": 0.26065834192268084, "rewards/accuracy_reward": 0.0916666692122817, "rewards/cosine_scaled_reward": 0.0506508283666335, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9451389066874981, "step": 1300 }, { "epoch": 0.8613906489418975, "eval_completion_length": 698.9657090130975, "eval_kl": 0.21834788602941177, "eval_loss": 0.008626604452729225, "eval_reward": 1.0870709559496712, "eval_reward_std": 0.26526113763889847, "eval_rewards/accuracy_reward": 0.09803921816980138, "eval_rewards/cosine_scaled_reward": 0.051123207435011864, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9379085162106682, "eval_runtime": 85.4798, "eval_samples_per_second": 1.158, "eval_steps_per_second": 0.105, "step": 1300 }, { "completion_length": 689.6396018981934, "epoch": 0.8647036898993664, "grad_norm": 0.5673792232155471, "kl": 0.2687744140625, "learning_rate": 1.0930876839914418e-06, "loss": 0.0107, "reward": 1.1014880381524563, "reward_std": 0.2780650518951006, "rewards/accuracy_reward": 0.10000000260770321, "rewards/cosine_scaled_reward": 0.050099123571999374, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9513889074325561, "step": 1305 }, { "completion_length": 691.9125175476074, "epoch": 0.8680167308568352, "grad_norm": 0.5685375943619576, "kl": 0.223779296875, "learning_rate": 1.04110018070941e-06, "loss": 0.009, "reward": 1.10931745916605, "reward_std": 0.34339213649218436, "rewards/accuracy_reward": 0.10833333600312471, "rewards/cosine_scaled_reward": 0.06487297938438133, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9361111283302307, "step": 1310 }, { "completion_length": 687.3062698364258, "epoch": 0.871329771814304, "grad_norm": 0.9388072487911722, "kl": 0.23115234375, "learning_rate": 9.903113209758098e-07, "loss": 0.0092, "reward": 1.0614679992198943, "reward_std": 0.26736960905836893, "rewards/accuracy_reward": 0.08958333563059569, "rewards/cosine_scaled_reward": 0.03091242607915774, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.940972239524126, "step": 1315 }, { "completion_length": 694.6375213623047, "epoch": 0.8746428127717729, "grad_norm": 0.5864788503819439, "kl": 0.23155517578125, "learning_rate": 9.407279000155311e-07, "loss": 0.0093, "reward": 1.0609800457954406, "reward_std": 0.2526944712852128, "rewards/accuracy_reward": 0.07916666902601718, "rewards/cosine_scaled_reward": 0.02834113404387608, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.953472238779068, "step": 1320 }, { "completion_length": 704.5708549499511, "epoch": 0.8779558537292417, "grad_norm": 0.7254569072566448, "kl": 0.23623046875, "learning_rate": 8.923565517734633e-07, "loss": 0.0094, "reward": 1.0658707104623317, "reward_std": 0.2499769182759337, "rewards/accuracy_reward": 0.0854166692122817, "rewards/cosine_scaled_reward": 0.04156513197813183, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9388889089226723, "step": 1325 }, { "completion_length": 696.4937698364258, "epoch": 0.8812688946867105, "grad_norm": 0.8767327949076303, "kl": 0.22974853515625, "learning_rate": 8.452037480269082e-07, "loss": 0.0092, "reward": 1.0582940235733986, "reward_std": 0.2840153209341224, "rewards/accuracy_reward": 0.07916666883975268, "rewards/cosine_scaled_reward": 0.031210664578247817, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9479166850447655, "step": 1330 }, { "completion_length": 700.2896015167237, "epoch": 0.8845819356441794, "grad_norm": 0.6046494923754447, "kl": 0.25595703125, "learning_rate": 7.992757975196974e-07, "loss": 0.0102, "reward": 1.054824821650982, "reward_std": 0.26202141686226244, "rewards/accuracy_reward": 0.0833333358168602, "rewards/cosine_scaled_reward": 0.03121368814026937, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9402777932584285, "step": 1335 }, { "completion_length": 705.0166893005371, "epoch": 0.8878949766016483, "grad_norm": 0.9005917747962447, "kl": 0.2489501953125, "learning_rate": 7.545788451181313e-07, "loss": 0.01, "reward": 1.1147420533001422, "reward_std": 0.2937927826191299, "rewards/accuracy_reward": 0.11041666977107525, "rewards/cosine_scaled_reward": 0.056408689240925014, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9479166820645333, "step": 1340 }, { "completion_length": 705.0833511352539, "epoch": 0.891208017559117, "grad_norm": 0.6480563264048018, "kl": 0.2248291015625, "learning_rate": 7.11118870988825e-07, "loss": 0.009, "reward": 1.0496106386184691, "reward_std": 0.32909488283330574, "rewards/accuracy_reward": 0.07916666902601718, "rewards/cosine_scaled_reward": 0.03294394265394658, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9375000208616256, "step": 1345 }, { "completion_length": 706.8666862487793, "epoch": 0.8945210585165859, "grad_norm": 0.8189152959666164, "kl": 0.2332275390625, "learning_rate": 6.689016897986123e-07, "loss": 0.0093, "reward": 1.0961603365838528, "reward_std": 0.2675176261429442, "rewards/accuracy_reward": 0.1000000024214387, "rewards/cosine_scaled_reward": 0.057271424785722044, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9388889029622078, "step": 1350 }, { "completion_length": 712.0479393005371, "epoch": 0.8978340994740548, "grad_norm": 0.6723385387517044, "kl": 0.238427734375, "learning_rate": 6.279329499365649e-07, "loss": 0.0095, "reward": 1.0590997479856015, "reward_std": 0.2600853289361112, "rewards/accuracy_reward": 0.0854166692122817, "rewards/cosine_scaled_reward": 0.03548861061572097, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9381944686174393, "step": 1355 }, { "completion_length": 713.3916900634765, "epoch": 0.9011471404315236, "grad_norm": 0.6239743663817402, "kl": 0.242529296875, "learning_rate": 5.88218132758287e-07, "loss": 0.0097, "reward": 1.0413473382592202, "reward_std": 0.26567225792095994, "rewards/accuracy_reward": 0.07916666865348816, "rewards/cosine_scaled_reward": 0.026069535012356936, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9361111290752888, "step": 1360 }, { "completion_length": 712.7250221252441, "epoch": 0.9044601813889924, "grad_norm": 0.5981778172151471, "kl": 0.25528564453125, "learning_rate": 5.497625518525374e-07, "loss": 0.0102, "reward": 1.055962248146534, "reward_std": 0.284910136385588, "rewards/accuracy_reward": 0.08541666902601719, "rewards/cosine_scaled_reward": 0.0323511115508154, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9381944619119167, "step": 1365 }, { "completion_length": 718.0333534240723, "epoch": 0.9077732223464613, "grad_norm": 0.7401804095601529, "kl": 0.25889892578125, "learning_rate": 5.125713523303133e-07, "loss": 0.0104, "reward": 1.0684035860002041, "reward_std": 0.2855658239132026, "rewards/accuracy_reward": 0.09375000242143869, "rewards/cosine_scaled_reward": 0.03576466972008348, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9388889104127884, "step": 1370 }, { "completion_length": 728.3187705993653, "epoch": 0.91108626330393, "grad_norm": 0.7858800120987066, "kl": 0.2416015625, "learning_rate": 4.7664951013645875e-07, "loss": 0.0097, "reward": 1.0451556123793124, "reward_std": 0.27854832728626205, "rewards/accuracy_reward": 0.08333333525806666, "rewards/cosine_scaled_reward": 0.029877811259939335, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9319444634020329, "step": 1375 }, { "completion_length": 723.3416854858399, "epoch": 0.9143993042613989, "grad_norm": 1.404475188878266, "kl": 0.22696533203125, "learning_rate": 4.420018313839147e-07, "loss": 0.0091, "reward": 1.049865160137415, "reward_std": 0.3253047477803193, "rewards/accuracy_reward": 0.08541666902601719, "rewards/cosine_scaled_reward": 0.02903180572320707, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9354166880249977, "step": 1380 }, { "completion_length": 721.9250244140625, "epoch": 0.9177123452188678, "grad_norm": 1.087760459049171, "kl": 0.2325927734375, "learning_rate": 4.086329517107046e-07, "loss": 0.0093, "reward": 1.0888452626764775, "reward_std": 0.27404293974395844, "rewards/accuracy_reward": 0.10000000260770321, "rewards/cosine_scaled_reward": 0.047873024456202985, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9409722402691841, "step": 1385 }, { "completion_length": 704.7958541870117, "epoch": 0.9210253861763366, "grad_norm": 0.9914491611629392, "kl": 0.238232421875, "learning_rate": 3.7654733565969826e-07, "loss": 0.0095, "reward": 1.1314453668892384, "reward_std": 0.3430222579801921, "rewards/accuracy_reward": 0.11875000298023224, "rewards/cosine_scaled_reward": 0.07936200675321743, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9333333499729634, "step": 1390 }, { "completion_length": 695.5625205993653, "epoch": 0.9243384271338054, "grad_norm": 0.6089582321778417, "kl": 0.2903564453125, "learning_rate": 3.457492760812975e-07, "loss": 0.0116, "reward": 1.1250825211405755, "reward_std": 0.317639110411983, "rewards/accuracy_reward": 0.11250000260770321, "rewards/cosine_scaled_reward": 0.06327693813364021, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.949305571615696, "step": 1395 }, { "completion_length": 705.9187675476074, "epoch": 0.9276514680912743, "grad_norm": 0.6391800429911892, "kl": 0.24630126953125, "learning_rate": 3.1624289355907334e-07, "loss": 0.0098, "reward": 1.0702355980873108, "reward_std": 0.23250129760126584, "rewards/accuracy_reward": 0.0916666692122817, "rewards/cosine_scaled_reward": 0.04037445327558089, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.938194464147091, "step": 1400 }, { "epoch": 0.9276514680912743, "eval_completion_length": 700.0735473632812, "eval_kl": 0.22983685661764705, "eval_loss": 0.009279442019760609, "eval_reward": 1.1201362399493946, "eval_reward_std": 0.28640370750251937, "eval_rewards/accuracy_reward": 0.11764706101487665, "eval_rewards/cosine_scaled_reward": 0.07438459050129442, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9281046004856334, "eval_runtime": 80.7771, "eval_samples_per_second": 1.226, "eval_steps_per_second": 0.111, "step": 1400 }, { "completion_length": 717.8875213623047, "epoch": 0.9309645090487431, "grad_norm": 0.6835557387795058, "kl": 0.23819580078125, "learning_rate": 2.8803213585846036e-07, "loss": 0.0095, "reward": 1.0553066372871398, "reward_std": 0.2549894027062692, "rewards/accuracy_reward": 0.09375000279396772, "rewards/cosine_scaled_reward": 0.042112173559144137, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9194444641470909, "step": 1405 }, { "completion_length": 685.7479331970214, "epoch": 0.9342775500062119, "grad_norm": 0.7088347918615974, "kl": 0.22274169921875, "learning_rate": 2.6112077739857465e-07, "loss": 0.0089, "reward": 1.0665720701217651, "reward_std": 0.25481581311905754, "rewards/accuracy_reward": 0.08125000223517417, "rewards/cosine_scaled_reward": 0.03323871046304703, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9520833477377891, "step": 1410 }, { "completion_length": 704.9562705993652, "epoch": 0.9375905909636808, "grad_norm": 0.691577077592554, "kl": 0.235888671875, "learning_rate": 2.3551241874721353e-07, "loss": 0.0094, "reward": 1.0381302520632745, "reward_std": 0.28534621774451807, "rewards/accuracy_reward": 0.07708333544433117, "rewards/cosine_scaled_reward": 0.027713555796071886, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9333333522081375, "step": 1415 }, { "completion_length": 697.8021026611328, "epoch": 0.9409036319211497, "grad_norm": 2.0485610770341998, "kl": 0.22833251953125, "learning_rate": 2.1121048613912843e-07, "loss": 0.0091, "reward": 1.087983725592494, "reward_std": 0.3151285564294085, "rewards/accuracy_reward": 0.10416666977107525, "rewards/cosine_scaled_reward": 0.057428138051182034, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9263889066874981, "step": 1420 }, { "completion_length": 690.93960647583, "epoch": 0.9442166728786184, "grad_norm": 0.666251108534613, "kl": 0.22415771484375, "learning_rate": 1.8821823101760949e-07, "loss": 0.009, "reward": 1.0924677282571793, "reward_std": 0.29339295757818035, "rewards/accuracy_reward": 0.10000000204890966, "rewards/cosine_scaled_reward": 0.051495483738835904, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9409722432494163, "step": 1425 }, { "completion_length": 690.7021018981934, "epoch": 0.9475297138360873, "grad_norm": 2.7191940866718802, "kl": 0.2302978515625, "learning_rate": 1.665387295994747e-07, "loss": 0.0092, "reward": 1.0474169701337814, "reward_std": 0.29303541539702566, "rewards/accuracy_reward": 0.07916666883975268, "rewards/cosine_scaled_reward": 0.032833608938381074, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9354166887700558, "step": 1430 }, { "completion_length": 707.6896057128906, "epoch": 0.9508427547935562, "grad_norm": 0.6106449500828635, "kl": 0.2447021484375, "learning_rate": 1.4617488246348012e-07, "loss": 0.0098, "reward": 1.0700841918587685, "reward_std": 0.3453480552649125, "rewards/accuracy_reward": 0.09791666939854622, "rewards/cosine_scaled_reward": 0.045084162172861396, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9270833499729634, "step": 1435 }, { "completion_length": 694.4583557128906, "epoch": 0.9541557957510249, "grad_norm": 0.7321351943055284, "kl": 0.23602294921875, "learning_rate": 1.271294141622459e-07, "loss": 0.0094, "reward": 1.1047494940459728, "reward_std": 0.28909234585589727, "rewards/accuracy_reward": 0.1083333358168602, "rewards/cosine_scaled_reward": 0.06030502080684528, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.936111131310463, "step": 1440 }, { "completion_length": 690.3562721252441, "epoch": 0.9574688367084938, "grad_norm": 0.5484337563215285, "kl": 0.21298828125, "learning_rate": 1.094048728577346e-07, "loss": 0.0085, "reward": 1.1469990946352482, "reward_std": 0.26795126046054063, "rewards/accuracy_reward": 0.11875000279396772, "rewards/cosine_scaled_reward": 0.06783241051016375, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9604166798293591, "step": 1445 }, { "completion_length": 703.7291854858398, "epoch": 0.9607818776659627, "grad_norm": 0.7045568444696371, "kl": 0.21536865234375, "learning_rate": 9.300362998030832e-08, "loss": 0.0086, "reward": 1.0955403231084346, "reward_std": 0.3071222034457605, "rewards/accuracy_reward": 0.11041666939854622, "rewards/cosine_scaled_reward": 0.05804029582650401, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9270833529531955, "step": 1450 }, { "completion_length": 696.5146034240722, "epoch": 0.9640949186234314, "grad_norm": 0.5643443806992297, "kl": 0.2209716796875, "learning_rate": 7.792787991146356e-08, "loss": 0.0088, "reward": 1.1237544424831867, "reward_std": 0.3226728780689882, "rewards/accuracy_reward": 0.11041666958481074, "rewards/cosine_scaled_reward": 0.05847664371249266, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9548611268401146, "step": 1455 }, { "completion_length": 698.7021045684814, "epoch": 0.9674079595809003, "grad_norm": 1.4726951296432622, "kl": 0.2478759765625, "learning_rate": 6.417963969022389e-08, "loss": 0.0099, "reward": 1.1310194060206413, "reward_std": 0.300541485712165, "rewards/accuracy_reward": 0.12500000279396772, "rewards/cosine_scaled_reward": 0.07615825935499743, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9298611298203469, "step": 1460 }, { "completion_length": 695.1916862487793, "epoch": 0.9707210005383692, "grad_norm": 0.6772357793748816, "kl": 0.22127685546875, "learning_rate": 5.176074874327919e-08, "loss": 0.0089, "reward": 1.1259605154395103, "reward_std": 0.324521057237871, "rewards/accuracy_reward": 0.11041666958481074, "rewards/cosine_scaled_reward": 0.06554382540634833, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9500000163912773, "step": 1465 }, { "completion_length": 700.1166908264161, "epoch": 0.9740340414958379, "grad_norm": 0.9280512399085947, "kl": 0.22369384765625, "learning_rate": 4.067286863888131e-08, "loss": 0.0089, "reward": 1.116700118035078, "reward_std": 0.30002225610078315, "rewards/accuracy_reward": 0.11250000335276127, "rewards/cosine_scaled_reward": 0.05836675295140594, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9458333514630795, "step": 1470 }, { "completion_length": 695.5937744140625, "epoch": 0.9773470824533068, "grad_norm": 0.6609700465204748, "kl": 0.225048828125, "learning_rate": 3.091748286453866e-08, "loss": 0.009, "reward": 1.078812862187624, "reward_std": 0.24239173660316737, "rewards/accuracy_reward": 0.08333333563059568, "rewards/cosine_scaled_reward": 0.040618392394389954, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9548611275851726, "step": 1475 }, { "completion_length": 690.068766784668, "epoch": 0.9806601234107757, "grad_norm": 0.5497284396300925, "kl": 0.216259765625, "learning_rate": 2.2495896628529355e-08, "loss": 0.0087, "reward": 1.1554948560893537, "reward_std": 0.333444345445605, "rewards/accuracy_reward": 0.13333333637565375, "rewards/cosine_scaled_reward": 0.0798003840725869, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.942361131310463, "step": 1480 }, { "completion_length": 710.7146034240723, "epoch": 0.9839731643682446, "grad_norm": 0.6180714789670924, "kl": 0.218316650390625, "learning_rate": 1.5409236685277608e-08, "loss": 0.0087, "reward": 1.115602646768093, "reward_std": 0.3021911540068686, "rewards/accuracy_reward": 0.11041666977107525, "rewards/cosine_scaled_reward": 0.06004706410458312, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.94513890594244, "step": 1485 }, { "completion_length": 701.2916839599609, "epoch": 0.9872862053257133, "grad_norm": 0.6733871699508633, "kl": 0.23233642578125, "learning_rate": 9.658451184600959e-09, "loss": 0.0093, "reward": 1.1246757924556732, "reward_std": 0.3239902650122531, "rewards/accuracy_reward": 0.12083333637565374, "rewards/cosine_scaled_reward": 0.06842575815971941, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9354166872799397, "step": 1490 }, { "completion_length": 692.4354377746582, "epoch": 0.9905992462831822, "grad_norm": 0.5974961049477129, "kl": 0.218798828125, "learning_rate": 5.2443095448506674e-09, "loss": 0.0087, "reward": 1.0733229786157608, "reward_std": 0.3221176668535918, "rewards/accuracy_reward": 0.0916666692122817, "rewards/cosine_scaled_reward": 0.04415629003779031, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9375000193715095, "step": 1495 }, { "completion_length": 693.5416900634766, "epoch": 0.9939122872406511, "grad_norm": 1.3933332238808005, "kl": 0.22105712890625, "learning_rate": 2.167402349972925e-09, "loss": 0.0088, "reward": 1.1403438955545426, "reward_std": 0.30477734181913546, "rewards/accuracy_reward": 0.11875000335276127, "rewards/cosine_scaled_reward": 0.07298276004439685, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9486111260950565, "step": 1500 }, { "epoch": 0.9939122872406511, "eval_completion_length": 699.0637530158548, "eval_kl": 0.23213465073529413, "eval_loss": 0.009332895278930664, "eval_reward": 1.158429626156302, "eval_reward_std": 0.3137010185267119, "eval_rewards/accuracy_reward": 0.1323529451208956, "eval_rewards/cosine_scaled_reward": 0.07836420439621981, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9477124459603253, "eval_runtime": 82.1122, "eval_samples_per_second": 1.206, "eval_steps_per_second": 0.11, "step": 1500 }, { "completion_length": 697.9646026611329, "epoch": 0.9972253281981198, "grad_norm": 0.7311978807571526, "kl": 0.218212890625, "learning_rate": 4.2814127048873553e-10, "loss": 0.0087, "reward": 1.1320341393351554, "reward_std": 0.26264935575891285, "rewards/accuracy_reward": 0.11041666958481074, "rewards/cosine_scaled_reward": 0.06258967398898677, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9590277940034866, "step": 1505 }, { "completion_length": 707.6172103881836, "epoch": 0.9998757609640949, "kl": 0.2285003662109375, "reward": 1.218618675135076, "reward_std": 0.3531808884363272, "rewards/accuracy_reward": 0.15885417046956718, "rewards/cosine_scaled_reward": 0.11445197471766733, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9453125167638063, "step": 1509, "total_flos": 0.0, "train_loss": 0.1835271008142303, "train_runtime": 123241.1861, "train_samples_per_second": 0.588, "train_steps_per_second": 0.012 } ], "logging_steps": 5, "max_steps": 1509, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }