diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6507 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 498, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 82.65625, + "epoch": 0.012048192771084338, + "grad_norm": 5.841508927710052, + "kl": 0.0, + "learning_rate": 9.97991967871486e-07, + "loss": 0.0, + "reward": 1.4489864706993103, + "reward_std": 0.8421240150928497, + "rewards/accuracy_reward": 0.8005490005016327, + "rewards/format_reward": 0.6484375, + "step": 1 + }, + { + "completion_length": 91.453125, + "epoch": 0.024096385542168676, + "grad_norm": 4.392637703815363, + "kl": 0.00279998779296875, + "learning_rate": 9.959839357429717e-07, + "loss": 0.0001, + "reward": 1.3076424598693848, + "reward_std": 0.8380775451660156, + "rewards/accuracy_reward": 0.6123300492763519, + "rewards/format_reward": 0.6953125, + "step": 2 + }, + { + "completion_length": 79.171875, + "epoch": 0.03614457831325301, + "grad_norm": 5.134937236220538, + "kl": 0.009063720703125, + "learning_rate": 9.93975903614458e-07, + "loss": 0.0004, + "reward": 1.650797963142395, + "reward_std": 0.8256142735481262, + "rewards/accuracy_reward": 0.8773605227470398, + "rewards/format_reward": 0.7734375, + "step": 3 + }, + { + "completion_length": 90.8671875, + "epoch": 0.04819277108433735, + "grad_norm": 4.181043208735878, + "kl": 0.0099029541015625, + "learning_rate": 9.919678714859437e-07, + "loss": 0.0004, + "reward": 1.4978268146514893, + "reward_std": 0.7668428122997284, + "rewards/accuracy_reward": 0.6618892848491669, + "rewards/format_reward": 0.8359375, + "step": 4 + }, + { + "completion_length": 83.15625, + "epoch": 0.060240963855421686, + "grad_norm": 4.623169300333461, + "kl": 0.028106689453125, + "learning_rate": 9.899598393574296e-07, + "loss": 0.0011, + "reward": 1.959537386894226, + "reward_std": 0.6147363781929016, + "rewards/accuracy_reward": 1.0532873272895813, + "rewards/format_reward": 0.90625, + "step": 5 + }, + { + "completion_length": 75.1484375, + "epoch": 0.07228915662650602, + "grad_norm": 5.568012410409197, + "kl": 0.03021240234375, + "learning_rate": 9.879518072289156e-07, + "loss": 0.0012, + "reward": 2.047786593437195, + "reward_std": 0.4053535610437393, + "rewards/accuracy_reward": 1.0946615934371948, + "rewards/format_reward": 0.953125, + "step": 6 + }, + { + "completion_length": 76.03125, + "epoch": 0.08433734939759036, + "grad_norm": 4.7579852016782045, + "kl": 0.033935546875, + "learning_rate": 9.859437751004016e-07, + "loss": 0.0014, + "reward": 2.1630080938339233, + "reward_std": 0.3877447098493576, + "rewards/accuracy_reward": 1.2333204746246338, + "rewards/format_reward": 0.9296875, + "step": 7 + }, + { + "completion_length": 71.546875, + "epoch": 0.0963855421686747, + "grad_norm": 9.256093312505593, + "kl": 0.244384765625, + "learning_rate": 9.839357429718876e-07, + "loss": 0.0097, + "reward": 2.015242576599121, + "reward_std": 0.4337102472782135, + "rewards/accuracy_reward": 1.054305076599121, + "rewards/format_reward": 0.9609375, + "step": 8 + }, + { + "completion_length": 72.1796875, + "epoch": 0.10843373493975904, + "grad_norm": 9.959610046323814, + "kl": 0.2841796875, + "learning_rate": 9.819277108433734e-07, + "loss": 0.0114, + "reward": 1.9989103078842163, + "reward_std": 0.38074547052383423, + "rewards/accuracy_reward": 1.0145351886749268, + "rewards/format_reward": 0.984375, + "step": 9 + }, + { + "completion_length": 67.0078125, + "epoch": 0.12048192771084337, + "grad_norm": 4.494217301954794, + "kl": 0.0677490234375, + "learning_rate": 9.799196787148593e-07, + "loss": 0.0027, + "reward": 2.208647847175598, + "reward_std": 0.20472895354032516, + "rewards/accuracy_reward": 1.2086476683616638, + "rewards/format_reward": 1.0, + "step": 10 + }, + { + "completion_length": 66.3125, + "epoch": 0.13253012048192772, + "grad_norm": 4.205085729740715, + "kl": 0.111083984375, + "learning_rate": 9.779116465863453e-07, + "loss": 0.0044, + "reward": 2.016738772392273, + "reward_std": 0.39626075327396393, + "rewards/accuracy_reward": 1.0323637425899506, + "rewards/format_reward": 0.984375, + "step": 11 + }, + { + "completion_length": 64.2265625, + "epoch": 0.14457831325301204, + "grad_norm": 5.285643902891126, + "kl": 0.0670166015625, + "learning_rate": 9.759036144578313e-07, + "loss": 0.0027, + "reward": 2.0809445977211, + "reward_std": 0.3285638391971588, + "rewards/accuracy_reward": 1.080944538116455, + "rewards/format_reward": 1.0, + "step": 12 + }, + { + "completion_length": 57.7265625, + "epoch": 0.1566265060240964, + "grad_norm": 5.332797970620105, + "kl": 0.07958984375, + "learning_rate": 9.738955823293173e-07, + "loss": 0.0032, + "reward": 2.1677627563476562, + "reward_std": 0.32235731184482574, + "rewards/accuracy_reward": 1.1677626371383667, + "rewards/format_reward": 1.0, + "step": 13 + }, + { + "completion_length": 62.765625, + "epoch": 0.1686746987951807, + "grad_norm": 7.594424067233083, + "kl": 0.086181640625, + "learning_rate": 9.718875502008033e-07, + "loss": 0.0034, + "reward": 2.287484049797058, + "reward_std": 0.2577601447701454, + "rewards/accuracy_reward": 1.3031091094017029, + "rewards/format_reward": 0.984375, + "step": 14 + }, + { + "completion_length": 61.28125, + "epoch": 0.18072289156626506, + "grad_norm": 6.602361615736723, + "kl": 0.087890625, + "learning_rate": 9.69879518072289e-07, + "loss": 0.0035, + "reward": 2.28032910823822, + "reward_std": 0.38463760912418365, + "rewards/accuracy_reward": 1.2881416082382202, + "rewards/format_reward": 0.9921875, + "step": 15 + }, + { + "completion_length": 63.6796875, + "epoch": 0.1927710843373494, + "grad_norm": 4.1986480450121135, + "kl": 0.078125, + "learning_rate": 9.67871485943775e-07, + "loss": 0.0031, + "reward": 2.1277613639831543, + "reward_std": 0.2963729351758957, + "rewards/accuracy_reward": 1.1433865427970886, + "rewards/format_reward": 0.984375, + "step": 16 + }, + { + "completion_length": 60.65625, + "epoch": 0.20481927710843373, + "grad_norm": 6.921299965436032, + "kl": 0.088134765625, + "learning_rate": 9.65863453815261e-07, + "loss": 0.0035, + "reward": 2.157727599143982, + "reward_std": 0.30868735909461975, + "rewards/accuracy_reward": 1.1733525395393372, + "rewards/format_reward": 0.984375, + "step": 17 + }, + { + "completion_length": 59.2265625, + "epoch": 0.21686746987951808, + "grad_norm": 4.904213548043611, + "kl": 0.07666015625, + "learning_rate": 9.63855421686747e-07, + "loss": 0.0031, + "reward": 2.24626088142395, + "reward_std": 0.22766248881816864, + "rewards/accuracy_reward": 1.2540735006332397, + "rewards/format_reward": 0.9921875, + "step": 18 + }, + { + "completion_length": 58.703125, + "epoch": 0.2289156626506024, + "grad_norm": 4.786279154756674, + "kl": 0.109619140625, + "learning_rate": 9.61847389558233e-07, + "loss": 0.0044, + "reward": 2.050855040550232, + "reward_std": 0.35161878168582916, + "rewards/accuracy_reward": 1.0586674511432648, + "rewards/format_reward": 0.9921875, + "step": 19 + }, + { + "completion_length": 58.109375, + "epoch": 0.24096385542168675, + "grad_norm": 4.05967579782597, + "kl": 0.08056640625, + "learning_rate": 9.598393574297187e-07, + "loss": 0.0032, + "reward": 2.20633327960968, + "reward_std": 0.3129453659057617, + "rewards/accuracy_reward": 1.2219581604003906, + "rewards/format_reward": 0.984375, + "step": 20 + }, + { + "completion_length": 57.71875, + "epoch": 0.25301204819277107, + "grad_norm": 5.8300935596675885, + "kl": 0.080078125, + "learning_rate": 9.57831325301205e-07, + "loss": 0.0032, + "reward": 2.417273759841919, + "reward_std": 0.28760989010334015, + "rewards/accuracy_reward": 1.4250862002372742, + "rewards/format_reward": 0.9921875, + "step": 21 + }, + { + "completion_length": 54.5859375, + "epoch": 0.26506024096385544, + "grad_norm": 7.535044861581114, + "kl": 0.106201171875, + "learning_rate": 9.558232931726907e-07, + "loss": 0.0042, + "reward": 2.2527129650115967, + "reward_std": 0.2951706647872925, + "rewards/accuracy_reward": 1.2683378458023071, + "rewards/format_reward": 0.984375, + "step": 22 + }, + { + "completion_length": 61.09375, + "epoch": 0.27710843373493976, + "grad_norm": 4.416172924233661, + "kl": 0.10009765625, + "learning_rate": 9.538152610441766e-07, + "loss": 0.004, + "reward": 2.1894314289093018, + "reward_std": 0.21257736533880234, + "rewards/accuracy_reward": 1.1894314289093018, + "rewards/format_reward": 1.0, + "step": 23 + }, + { + "completion_length": 54.9921875, + "epoch": 0.2891566265060241, + "grad_norm": 4.553446996976198, + "kl": 0.09814453125, + "learning_rate": 9.518072289156625e-07, + "loss": 0.0039, + "reward": 2.3037142753601074, + "reward_std": 0.3323938250541687, + "rewards/accuracy_reward": 1.3115268349647522, + "rewards/format_reward": 0.9921875, + "step": 24 + }, + { + "completion_length": 55.9921875, + "epoch": 0.30120481927710846, + "grad_norm": 8.671383785487564, + "kl": 0.120849609375, + "learning_rate": 9.497991967871486e-07, + "loss": 0.0048, + "reward": 2.239556074142456, + "reward_std": 0.3447880446910858, + "rewards/accuracy_reward": 1.2551808953285217, + "rewards/format_reward": 0.984375, + "step": 25 + }, + { + "completion_length": 58.7890625, + "epoch": 0.3132530120481928, + "grad_norm": 8.322624639517006, + "kl": 0.12353515625, + "learning_rate": 9.477911646586345e-07, + "loss": 0.0049, + "reward": 2.2209770679473877, + "reward_std": 0.3139883056282997, + "rewards/accuracy_reward": 1.2287896275520325, + "rewards/format_reward": 0.9921875, + "step": 26 + }, + { + "completion_length": 56.2421875, + "epoch": 0.3253012048192771, + "grad_norm": 20.55146941012377, + "kl": 0.130126953125, + "learning_rate": 9.457831325301205e-07, + "loss": 0.0052, + "reward": 2.344720959663391, + "reward_std": 0.25742725282907486, + "rewards/accuracy_reward": 1.3525334596633911, + "rewards/format_reward": 0.9921875, + "step": 27 + }, + { + "completion_length": 52.3671875, + "epoch": 0.3373493975903614, + "grad_norm": 4.550988243582887, + "kl": 0.12548828125, + "learning_rate": 9.437751004016063e-07, + "loss": 0.005, + "reward": 2.407941460609436, + "reward_std": 0.3139786869287491, + "rewards/accuracy_reward": 1.4313790798187256, + "rewards/format_reward": 0.9765625, + "step": 28 + }, + { + "completion_length": 53.328125, + "epoch": 0.3493975903614458, + "grad_norm": 5.133796660962732, + "kl": 0.1435546875, + "learning_rate": 9.417670682730924e-07, + "loss": 0.0057, + "reward": 2.3306795358657837, + "reward_std": 0.3039723336696625, + "rewards/accuracy_reward": 1.3463045954704285, + "rewards/format_reward": 0.984375, + "step": 29 + }, + { + "completion_length": 53.8125, + "epoch": 0.3614457831325301, + "grad_norm": 6.796717577260548, + "kl": 0.27880859375, + "learning_rate": 9.397590361445783e-07, + "loss": 0.0112, + "reward": 2.2834625244140625, + "reward_std": 0.3063512295484543, + "rewards/accuracy_reward": 1.2834625244140625, + "rewards/format_reward": 1.0, + "step": 30 + }, + { + "completion_length": 56.3203125, + "epoch": 0.37349397590361444, + "grad_norm": 4.3393989853337285, + "kl": 0.14794921875, + "learning_rate": 9.377510040160642e-07, + "loss": 0.0059, + "reward": 2.354575991630554, + "reward_std": 0.314766064286232, + "rewards/accuracy_reward": 1.3623886704444885, + "rewards/format_reward": 0.9921875, + "step": 31 + }, + { + "completion_length": 54.171875, + "epoch": 0.3855421686746988, + "grad_norm": 4.279946209704863, + "kl": 0.197265625, + "learning_rate": 9.357429718875502e-07, + "loss": 0.0079, + "reward": 2.1385136246681213, + "reward_std": 0.24586574733257294, + "rewards/accuracy_reward": 1.1463261544704437, + "rewards/format_reward": 0.9921875, + "step": 32 + }, + { + "completion_length": 51.4140625, + "epoch": 0.39759036144578314, + "grad_norm": 5.88762957444806, + "kl": 0.1630859375, + "learning_rate": 9.33734939759036e-07, + "loss": 0.0065, + "reward": 2.2907108068466187, + "reward_std": 0.25231631100177765, + "rewards/accuracy_reward": 1.2907109260559082, + "rewards/format_reward": 1.0, + "step": 33 + }, + { + "completion_length": 50.4609375, + "epoch": 0.40963855421686746, + "grad_norm": 5.469228934242547, + "kl": 0.16845703125, + "learning_rate": 9.317269076305221e-07, + "loss": 0.0067, + "reward": 2.2533600330352783, + "reward_std": 0.25808002054691315, + "rewards/accuracy_reward": 1.2611725330352783, + "rewards/format_reward": 0.9921875, + "step": 34 + }, + { + "completion_length": 47.84375, + "epoch": 0.42168674698795183, + "grad_norm": 5.412602747215773, + "kl": 0.177734375, + "learning_rate": 9.29718875502008e-07, + "loss": 0.0071, + "reward": 2.3132054805755615, + "reward_std": 0.2454073503613472, + "rewards/accuracy_reward": 1.3132054805755615, + "rewards/format_reward": 1.0, + "step": 35 + }, + { + "completion_length": 44.21875, + "epoch": 0.43373493975903615, + "grad_norm": 5.190368238545804, + "kl": 0.2275390625, + "learning_rate": 9.27710843373494e-07, + "loss": 0.0091, + "reward": 2.2854232788085938, + "reward_std": 0.29085223376750946, + "rewards/accuracy_reward": 1.293235719203949, + "rewards/format_reward": 0.9921875, + "step": 36 + }, + { + "completion_length": 48.71875, + "epoch": 0.4457831325301205, + "grad_norm": 4.780274291960778, + "kl": 0.20751953125, + "learning_rate": 9.257028112449798e-07, + "loss": 0.0083, + "reward": 2.246184825897217, + "reward_std": 0.31601477414369583, + "rewards/accuracy_reward": 1.261809766292572, + "rewards/format_reward": 0.984375, + "step": 37 + }, + { + "completion_length": 42.265625, + "epoch": 0.4578313253012048, + "grad_norm": 6.234590681750942, + "kl": 0.265625, + "learning_rate": 9.236947791164659e-07, + "loss": 0.0106, + "reward": 2.112604260444641, + "reward_std": 0.30199334025382996, + "rewards/accuracy_reward": 1.1126042604446411, + "rewards/format_reward": 1.0, + "step": 38 + }, + { + "completion_length": 45.1015625, + "epoch": 0.46987951807228917, + "grad_norm": 4.611394363412455, + "kl": 0.15576171875, + "learning_rate": 9.216867469879518e-07, + "loss": 0.0062, + "reward": 2.3590028285980225, + "reward_std": 0.2973439395427704, + "rewards/accuracy_reward": 1.3746278285980225, + "rewards/format_reward": 0.984375, + "step": 39 + }, + { + "completion_length": 45.3046875, + "epoch": 0.4819277108433735, + "grad_norm": 6.117578716606278, + "kl": 0.17626953125, + "learning_rate": 9.196787148594377e-07, + "loss": 0.0071, + "reward": 2.2271867990493774, + "reward_std": 0.22323830425739288, + "rewards/accuracy_reward": 1.234999418258667, + "rewards/format_reward": 0.9921875, + "step": 40 + }, + { + "completion_length": 41.9453125, + "epoch": 0.4939759036144578, + "grad_norm": 4.858430237306144, + "kl": 0.2236328125, + "learning_rate": 9.176706827309237e-07, + "loss": 0.0089, + "reward": 2.217424750328064, + "reward_std": 0.2663164809346199, + "rewards/accuracy_reward": 1.2252373099327087, + "rewards/format_reward": 0.9921875, + "step": 41 + }, + { + "completion_length": 41.0234375, + "epoch": 0.5060240963855421, + "grad_norm": 4.127212546225013, + "kl": 0.18212890625, + "learning_rate": 9.156626506024095e-07, + "loss": 0.0073, + "reward": 2.16755473613739, + "reward_std": 0.3387562334537506, + "rewards/accuracy_reward": 1.1753671169281006, + "rewards/format_reward": 0.9921875, + "step": 42 + }, + { + "completion_length": 42.6640625, + "epoch": 0.5180722891566265, + "grad_norm": 5.226665280180925, + "kl": 0.23193359375, + "learning_rate": 9.136546184738956e-07, + "loss": 0.0093, + "reward": 2.203770875930786, + "reward_std": 0.3409430831670761, + "rewards/accuracy_reward": 1.2350206971168518, + "rewards/format_reward": 0.96875, + "step": 43 + }, + { + "completion_length": 40.9609375, + "epoch": 0.5301204819277109, + "grad_norm": 4.308668359699942, + "kl": 0.134033203125, + "learning_rate": 9.116465863453815e-07, + "loss": 0.0054, + "reward": 2.2817225456237793, + "reward_std": 0.19574209302663803, + "rewards/accuracy_reward": 1.281722605228424, + "rewards/format_reward": 1.0, + "step": 44 + }, + { + "completion_length": 38.7734375, + "epoch": 0.5421686746987951, + "grad_norm": 6.033974360622575, + "kl": 0.13232421875, + "learning_rate": 9.096385542168675e-07, + "loss": 0.0053, + "reward": 2.2139052152633667, + "reward_std": 0.28486668318510056, + "rewards/accuracy_reward": 1.2451552748680115, + "rewards/format_reward": 0.96875, + "step": 45 + }, + { + "completion_length": 41.1484375, + "epoch": 0.5542168674698795, + "grad_norm": 5.314865555502224, + "kl": 0.11279296875, + "learning_rate": 9.076305220883533e-07, + "loss": 0.0045, + "reward": 2.4188212156295776, + "reward_std": 0.2556447684764862, + "rewards/accuracy_reward": 1.4266336560249329, + "rewards/format_reward": 0.9921875, + "step": 46 + }, + { + "completion_length": 42.7109375, + "epoch": 0.5662650602409639, + "grad_norm": 3.687080063413381, + "kl": 0.123046875, + "learning_rate": 9.056224899598393e-07, + "loss": 0.0049, + "reward": 2.2985291481018066, + "reward_std": 0.2858593165874481, + "rewards/accuracy_reward": 1.3063417077064514, + "rewards/format_reward": 0.9921875, + "step": 47 + }, + { + "completion_length": 46.859375, + "epoch": 0.5783132530120482, + "grad_norm": 4.277184476359137, + "kl": 0.20166015625, + "learning_rate": 9.036144578313253e-07, + "loss": 0.0081, + "reward": 2.1704814434051514, + "reward_std": 0.3619203567504883, + "rewards/accuracy_reward": 1.186106562614441, + "rewards/format_reward": 0.984375, + "step": 48 + }, + { + "completion_length": 45.21875, + "epoch": 0.5903614457831325, + "grad_norm": 3.7971557376020577, + "kl": 0.124267578125, + "learning_rate": 9.016064257028112e-07, + "loss": 0.005, + "reward": 2.1000068187713623, + "reward_std": 0.2924596816301346, + "rewards/accuracy_reward": 1.123444378376007, + "rewards/format_reward": 0.9765625, + "step": 49 + }, + { + "completion_length": 44.7734375, + "epoch": 0.6024096385542169, + "grad_norm": 4.458817172061971, + "kl": 0.111083984375, + "learning_rate": 8.995983935742972e-07, + "loss": 0.0044, + "reward": 2.2635247707366943, + "reward_std": 0.3522821515798569, + "rewards/accuracy_reward": 1.2869621515274048, + "rewards/format_reward": 0.9765625, + "step": 50 + }, + { + "completion_length": 51.5859375, + "epoch": 0.6144578313253012, + "grad_norm": 5.351600002967812, + "kl": 0.115234375, + "learning_rate": 8.97590361445783e-07, + "loss": 0.0046, + "reward": 2.321009397506714, + "reward_std": 0.23405297100543976, + "rewards/accuracy_reward": 1.3366344571113586, + "rewards/format_reward": 0.984375, + "step": 51 + }, + { + "completion_length": 50.421875, + "epoch": 0.6265060240963856, + "grad_norm": 4.213335817741083, + "kl": 0.1396484375, + "learning_rate": 8.955823293172691e-07, + "loss": 0.0056, + "reward": 2.3553450107574463, + "reward_std": 0.25443293899297714, + "rewards/accuracy_reward": 1.3944076299667358, + "rewards/format_reward": 0.9609375, + "step": 52 + }, + { + "completion_length": 60.6015625, + "epoch": 0.6385542168674698, + "grad_norm": 6.123689334744157, + "kl": 0.121337890625, + "learning_rate": 8.93574297188755e-07, + "loss": 0.0049, + "reward": 2.112071990966797, + "reward_std": 0.30149899423122406, + "rewards/accuracy_reward": 1.1433220505714417, + "rewards/format_reward": 0.96875, + "step": 53 + }, + { + "completion_length": 50.0703125, + "epoch": 0.6506024096385542, + "grad_norm": 4.396654754831157, + "kl": 0.1337890625, + "learning_rate": 8.915662650602409e-07, + "loss": 0.0053, + "reward": 2.233729839324951, + "reward_std": 0.23247240483760834, + "rewards/accuracy_reward": 1.2571672797203064, + "rewards/format_reward": 0.9765625, + "step": 54 + }, + { + "completion_length": 60.2890625, + "epoch": 0.6626506024096386, + "grad_norm": 7.03985835954293, + "kl": 0.10498046875, + "learning_rate": 8.895582329317268e-07, + "loss": 0.0042, + "reward": 2.196902871131897, + "reward_std": 0.2882121652364731, + "rewards/accuracy_reward": 1.2125278115272522, + "rewards/format_reward": 0.984375, + "step": 55 + }, + { + "completion_length": 50.640625, + "epoch": 0.6746987951807228, + "grad_norm": 4.86896494949543, + "kl": 0.12451171875, + "learning_rate": 8.875502008032128e-07, + "loss": 0.005, + "reward": 2.171112537384033, + "reward_std": 0.16461243480443954, + "rewards/accuracy_reward": 1.1867375373840332, + "rewards/format_reward": 0.984375, + "step": 56 + }, + { + "completion_length": 53.21875, + "epoch": 0.6867469879518072, + "grad_norm": 3.557538165261062, + "kl": 0.1240234375, + "learning_rate": 8.855421686746988e-07, + "loss": 0.005, + "reward": 2.2328275442123413, + "reward_std": 0.2752218544483185, + "rewards/accuracy_reward": 1.2406402230262756, + "rewards/format_reward": 0.9921875, + "step": 57 + }, + { + "completion_length": 47.8671875, + "epoch": 0.6987951807228916, + "grad_norm": 5.180162989820259, + "kl": 0.125, + "learning_rate": 8.835341365461847e-07, + "loss": 0.005, + "reward": 2.2453041076660156, + "reward_std": 0.315682128071785, + "rewards/accuracy_reward": 1.268741488456726, + "rewards/format_reward": 0.9765625, + "step": 58 + }, + { + "completion_length": 57.9765625, + "epoch": 0.7108433734939759, + "grad_norm": 3.899105782667564, + "kl": 0.10205078125, + "learning_rate": 8.815261044176707e-07, + "loss": 0.0041, + "reward": 2.284543514251709, + "reward_std": 0.25333235412836075, + "rewards/accuracy_reward": 1.292356252670288, + "rewards/format_reward": 0.9921875, + "step": 59 + }, + { + "completion_length": 46.5859375, + "epoch": 0.7228915662650602, + "grad_norm": 13.765129472909528, + "kl": 0.106201171875, + "learning_rate": 8.795180722891565e-07, + "loss": 0.0042, + "reward": 2.113099694252014, + "reward_std": 0.326066330075264, + "rewards/accuracy_reward": 1.1287246942520142, + "rewards/format_reward": 0.984375, + "step": 60 + }, + { + "completion_length": 46.375, + "epoch": 0.7349397590361446, + "grad_norm": 6.1270425433473, + "kl": 0.16357421875, + "learning_rate": 8.775100401606425e-07, + "loss": 0.0065, + "reward": 1.9968695640563965, + "reward_std": 0.34320104122161865, + "rewards/accuracy_reward": 1.0124945640563965, + "rewards/format_reward": 0.984375, + "step": 61 + }, + { + "completion_length": 53.09375, + "epoch": 0.7469879518072289, + "grad_norm": 4.3056291481606745, + "kl": 0.1513671875, + "learning_rate": 8.755020080321285e-07, + "loss": 0.0061, + "reward": 2.1780970096588135, + "reward_std": 0.2706674858927727, + "rewards/accuracy_reward": 1.2093469500541687, + "rewards/format_reward": 0.96875, + "step": 62 + }, + { + "completion_length": 55.9375, + "epoch": 0.7590361445783133, + "grad_norm": 3.2395174572422416, + "kl": 0.14501953125, + "learning_rate": 8.734939759036144e-07, + "loss": 0.0058, + "reward": 2.1430922746658325, + "reward_std": 0.24412654340267181, + "rewards/accuracy_reward": 1.1665297150611877, + "rewards/format_reward": 0.9765625, + "step": 63 + }, + { + "completion_length": 56.6328125, + "epoch": 0.7710843373493976, + "grad_norm": 4.190814109425291, + "kl": 0.11962890625, + "learning_rate": 8.714859437751003e-07, + "loss": 0.0048, + "reward": 2.1700193881988525, + "reward_std": 0.2942150831222534, + "rewards/accuracy_reward": 1.1934569478034973, + "rewards/format_reward": 0.9765625, + "step": 64 + }, + { + "completion_length": 64.3984375, + "epoch": 0.7831325301204819, + "grad_norm": 3.226137200230793, + "kl": 0.102783203125, + "learning_rate": 8.694779116465863e-07, + "loss": 0.0041, + "reward": 2.2898290157318115, + "reward_std": 0.2443845123052597, + "rewards/accuracy_reward": 1.3132665753364563, + "rewards/format_reward": 0.9765625, + "step": 65 + }, + { + "completion_length": 67.7109375, + "epoch": 0.7951807228915663, + "grad_norm": 3.9157620361816314, + "kl": 0.0927734375, + "learning_rate": 8.674698795180723e-07, + "loss": 0.0037, + "reward": 2.161790609359741, + "reward_std": 0.29590657353401184, + "rewards/accuracy_reward": 1.1696029901504517, + "rewards/format_reward": 0.9921875, + "step": 66 + }, + { + "completion_length": 74.3203125, + "epoch": 0.8072289156626506, + "grad_norm": 3.1212414712368375, + "kl": 0.082763671875, + "learning_rate": 8.654618473895582e-07, + "loss": 0.0033, + "reward": 2.215745210647583, + "reward_std": 0.2766411006450653, + "rewards/accuracy_reward": 1.2313700914382935, + "rewards/format_reward": 0.984375, + "step": 67 + }, + { + "completion_length": 74.0390625, + "epoch": 0.8192771084337349, + "grad_norm": 3.446969302283755, + "kl": 0.074951171875, + "learning_rate": 8.634538152610441e-07, + "loss": 0.003, + "reward": 2.1964612007141113, + "reward_std": 0.235237754881382, + "rewards/accuracy_reward": 1.2198986411094666, + "rewards/format_reward": 0.9765625, + "step": 68 + }, + { + "completion_length": 76.9375, + "epoch": 0.8313253012048193, + "grad_norm": 3.310962519125171, + "kl": 0.08154296875, + "learning_rate": 8.614457831325301e-07, + "loss": 0.0033, + "reward": 2.1269989013671875, + "reward_std": 0.2448011264204979, + "rewards/accuracy_reward": 1.1426239013671875, + "rewards/format_reward": 0.984375, + "step": 69 + }, + { + "completion_length": 71.3984375, + "epoch": 0.8433734939759037, + "grad_norm": 3.2998576155248966, + "kl": 0.0888671875, + "learning_rate": 8.59437751004016e-07, + "loss": 0.0036, + "reward": 2.2479825019836426, + "reward_std": 0.2886482775211334, + "rewards/accuracy_reward": 1.2636074423789978, + "rewards/format_reward": 0.984375, + "step": 70 + }, + { + "completion_length": 72.1484375, + "epoch": 0.8554216867469879, + "grad_norm": 7.668000907111886, + "kl": 0.07861328125, + "learning_rate": 8.57429718875502e-07, + "loss": 0.0031, + "reward": 2.2247371673583984, + "reward_std": 0.2391326129436493, + "rewards/accuracy_reward": 1.2637996673583984, + "rewards/format_reward": 0.9609375, + "step": 71 + }, + { + "completion_length": 77.7734375, + "epoch": 0.8674698795180723, + "grad_norm": 3.4104191137958013, + "kl": 0.068359375, + "learning_rate": 8.554216867469879e-07, + "loss": 0.0027, + "reward": 2.2031702995300293, + "reward_std": 0.21321924775838852, + "rewards/accuracy_reward": 1.210982859134674, + "rewards/format_reward": 0.9921875, + "step": 72 + }, + { + "completion_length": 76.5546875, + "epoch": 0.8795180722891566, + "grad_norm": 3.884229840630286, + "kl": 0.0947265625, + "learning_rate": 8.534136546184738e-07, + "loss": 0.0038, + "reward": 2.2307136058807373, + "reward_std": 0.2959597185254097, + "rewards/accuracy_reward": 1.2463387250900269, + "rewards/format_reward": 0.984375, + "step": 73 + }, + { + "completion_length": 73.7265625, + "epoch": 0.891566265060241, + "grad_norm": 7.2397255809983525, + "kl": 0.170654296875, + "learning_rate": 8.514056224899598e-07, + "loss": 0.0068, + "reward": 2.311343193054199, + "reward_std": 0.21377335488796234, + "rewards/accuracy_reward": 1.319155752658844, + "rewards/format_reward": 0.9921875, + "step": 74 + }, + { + "completion_length": 71.5859375, + "epoch": 0.9036144578313253, + "grad_norm": 3.397020763244455, + "kl": 0.073974609375, + "learning_rate": 8.493975903614458e-07, + "loss": 0.003, + "reward": 2.3479005098342896, + "reward_std": 0.2722414582967758, + "rewards/accuracy_reward": 1.3713379502296448, + "rewards/format_reward": 0.9765625, + "step": 75 + }, + { + "completion_length": 64.34375, + "epoch": 0.9156626506024096, + "grad_norm": 4.709358727325993, + "kl": 0.116455078125, + "learning_rate": 8.473895582329317e-07, + "loss": 0.0047, + "reward": 2.1038066148757935, + "reward_std": 0.3149692267179489, + "rewards/accuracy_reward": 1.158493995666504, + "rewards/format_reward": 0.9453125, + "step": 76 + }, + { + "completion_length": 69.390625, + "epoch": 0.927710843373494, + "grad_norm": 3.3768601117352923, + "kl": 0.11376953125, + "learning_rate": 8.453815261044176e-07, + "loss": 0.0046, + "reward": 2.02778023481369, + "reward_std": 0.3105141818523407, + "rewards/accuracy_reward": 1.074655294418335, + "rewards/format_reward": 0.953125, + "step": 77 + }, + { + "completion_length": 67.328125, + "epoch": 0.9397590361445783, + "grad_norm": 3.504578270706009, + "kl": 0.115234375, + "learning_rate": 8.433734939759036e-07, + "loss": 0.0046, + "reward": 2.194709539413452, + "reward_std": 0.27273692935705185, + "rewards/accuracy_reward": 1.2181469202041626, + "rewards/format_reward": 0.9765625, + "step": 78 + }, + { + "completion_length": 75.1640625, + "epoch": 0.9518072289156626, + "grad_norm": 4.043012399812061, + "kl": 0.123046875, + "learning_rate": 8.413654618473895e-07, + "loss": 0.0049, + "reward": 2.13509202003479, + "reward_std": 0.313528910279274, + "rewards/accuracy_reward": 1.18196702003479, + "rewards/format_reward": 0.953125, + "step": 79 + }, + { + "completion_length": 70.0234375, + "epoch": 0.963855421686747, + "grad_norm": 4.870660538899373, + "kl": 0.086181640625, + "learning_rate": 8.393574297188755e-07, + "loss": 0.0035, + "reward": 2.1953389644622803, + "reward_std": 0.26908765733242035, + "rewards/accuracy_reward": 1.2265888452529907, + "rewards/format_reward": 0.96875, + "step": 80 + }, + { + "completion_length": 80.859375, + "epoch": 0.9759036144578314, + "grad_norm": 3.8261245848047065, + "kl": 0.1015625, + "learning_rate": 8.373493975903614e-07, + "loss": 0.0041, + "reward": 2.0212653279304504, + "reward_std": 0.3835397958755493, + "rewards/accuracy_reward": 1.0915777683258057, + "rewards/format_reward": 0.9296875, + "step": 81 + }, + { + "completion_length": 74.046875, + "epoch": 0.9879518072289156, + "grad_norm": 4.0964460767880535, + "kl": 0.083984375, + "learning_rate": 8.353413654618474e-07, + "loss": 0.0034, + "reward": 2.2536615133285522, + "reward_std": 0.2658763527870178, + "rewards/accuracy_reward": 1.2770991325378418, + "rewards/format_reward": 0.9765625, + "step": 82 + }, + { + "completion_length": 74.58333587646484, + "epoch": 1.0, + "grad_norm": 2.9272571318373655, + "kl": 0.1044921875, + "learning_rate": 8.333333333333333e-07, + "loss": 0.004, + "reward": 2.1187774538993835, + "reward_std": 0.1469321921467781, + "rewards/accuracy_reward": 1.1187774240970612, + "rewards/format_reward": 1.0, + "step": 83 + }, + { + "completion_length": 67.5390625, + "epoch": 1.0120481927710843, + "grad_norm": 4.360041456699287, + "kl": 0.116455078125, + "learning_rate": 8.313253012048192e-07, + "loss": 0.0047, + "reward": 2.2748764753341675, + "reward_std": 0.30198951065540314, + "rewards/accuracy_reward": 1.2983139157295227, + "rewards/format_reward": 0.9765625, + "step": 84 + }, + { + "completion_length": 71.640625, + "epoch": 1.0240963855421688, + "grad_norm": 3.852904865115574, + "kl": 0.100341796875, + "learning_rate": 8.293172690763052e-07, + "loss": 0.004, + "reward": 2.22179639339447, + "reward_std": 0.2614322751760483, + "rewards/accuracy_reward": 1.2452340126037598, + "rewards/format_reward": 0.9765625, + "step": 85 + }, + { + "completion_length": 77.71875, + "epoch": 1.036144578313253, + "grad_norm": 4.570601093607917, + "kl": 0.086181640625, + "learning_rate": 8.273092369477911e-07, + "loss": 0.0034, + "reward": 2.3267804384231567, + "reward_std": 0.1871008574962616, + "rewards/accuracy_reward": 1.3424054384231567, + "rewards/format_reward": 0.984375, + "step": 86 + }, + { + "completion_length": 74.0703125, + "epoch": 1.0481927710843373, + "grad_norm": 4.387034223472388, + "kl": 0.09033203125, + "learning_rate": 8.253012048192771e-07, + "loss": 0.0036, + "reward": 2.280067205429077, + "reward_std": 0.2090277522802353, + "rewards/accuracy_reward": 1.2800670266151428, + "rewards/format_reward": 1.0, + "step": 87 + }, + { + "completion_length": 72.8828125, + "epoch": 1.0602409638554218, + "grad_norm": 3.640432077142004, + "kl": 0.097412109375, + "learning_rate": 8.23293172690763e-07, + "loss": 0.0039, + "reward": 2.2264442443847656, + "reward_std": 0.2877971976995468, + "rewards/accuracy_reward": 1.2576942443847656, + "rewards/format_reward": 0.96875, + "step": 88 + }, + { + "completion_length": 68.9765625, + "epoch": 1.072289156626506, + "grad_norm": 3.6617214501921755, + "kl": 0.10107421875, + "learning_rate": 8.21285140562249e-07, + "loss": 0.004, + "reward": 2.232625722885132, + "reward_std": 0.26599176973104477, + "rewards/accuracy_reward": 1.2482507824897766, + "rewards/format_reward": 0.984375, + "step": 89 + }, + { + "completion_length": 74.765625, + "epoch": 1.0843373493975903, + "grad_norm": 4.600311265578528, + "kl": 0.09130859375, + "learning_rate": 8.192771084337349e-07, + "loss": 0.0037, + "reward": 2.253629207611084, + "reward_std": 0.21175827831029892, + "rewards/accuracy_reward": 1.269254207611084, + "rewards/format_reward": 0.984375, + "step": 90 + }, + { + "completion_length": 76.59375, + "epoch": 1.0963855421686748, + "grad_norm": 4.145602929032845, + "kl": 0.087646484375, + "learning_rate": 8.172690763052207e-07, + "loss": 0.0035, + "reward": 2.2744953632354736, + "reward_std": 0.24358398467302322, + "rewards/accuracy_reward": 1.2901203632354736, + "rewards/format_reward": 0.984375, + "step": 91 + }, + { + "completion_length": 75.875, + "epoch": 1.108433734939759, + "grad_norm": 3.8292102418969853, + "kl": 0.10693359375, + "learning_rate": 8.152610441767068e-07, + "loss": 0.0043, + "reward": 2.4102468490600586, + "reward_std": 0.22168071568012238, + "rewards/accuracy_reward": 1.4180592894554138, + "rewards/format_reward": 0.9921875, + "step": 92 + }, + { + "completion_length": 73.5078125, + "epoch": 1.1204819277108433, + "grad_norm": 3.889694391559541, + "kl": 0.0859375, + "learning_rate": 8.132530120481927e-07, + "loss": 0.0034, + "reward": 2.19115674495697, + "reward_std": 0.191669300198555, + "rewards/accuracy_reward": 1.1989692449569702, + "rewards/format_reward": 0.9921875, + "step": 93 + }, + { + "completion_length": 74.359375, + "epoch": 1.1325301204819278, + "grad_norm": 13.572499915490392, + "kl": 0.115966796875, + "learning_rate": 8.112449799196787e-07, + "loss": 0.0046, + "reward": 2.3821544647216797, + "reward_std": 0.2079356163740158, + "rewards/accuracy_reward": 1.3899668455123901, + "rewards/format_reward": 0.9921875, + "step": 94 + }, + { + "completion_length": 70.875, + "epoch": 1.144578313253012, + "grad_norm": 3.96863603284974, + "kl": 0.096923828125, + "learning_rate": 8.092369477911646e-07, + "loss": 0.0039, + "reward": 2.301279664039612, + "reward_std": 0.17724627256393433, + "rewards/accuracy_reward": 1.309092104434967, + "rewards/format_reward": 0.9921875, + "step": 95 + }, + { + "completion_length": 69.3125, + "epoch": 1.1566265060240963, + "grad_norm": 3.4379001474745206, + "kl": 0.090087890625, + "learning_rate": 8.072289156626506e-07, + "loss": 0.0036, + "reward": 2.371612310409546, + "reward_std": 0.1584479957818985, + "rewards/accuracy_reward": 1.371612310409546, + "rewards/format_reward": 1.0, + "step": 96 + }, + { + "completion_length": 68.6171875, + "epoch": 1.1686746987951806, + "grad_norm": 4.586260816062996, + "kl": 0.09375, + "learning_rate": 8.052208835341365e-07, + "loss": 0.0037, + "reward": 2.4862219095230103, + "reward_std": 0.20000579208135605, + "rewards/accuracy_reward": 1.4862220287322998, + "rewards/format_reward": 1.0, + "step": 97 + }, + { + "completion_length": 70.015625, + "epoch": 1.180722891566265, + "grad_norm": 4.047101829945655, + "kl": 0.112060546875, + "learning_rate": 8.032128514056225e-07, + "loss": 0.0045, + "reward": 2.2514266967773438, + "reward_std": 0.22294947504997253, + "rewards/accuracy_reward": 1.2514267563819885, + "rewards/format_reward": 1.0, + "step": 98 + }, + { + "completion_length": 66.9140625, + "epoch": 1.1927710843373494, + "grad_norm": 5.444249065473958, + "kl": 0.088134765625, + "learning_rate": 8.012048192771084e-07, + "loss": 0.0035, + "reward": 2.333179473876953, + "reward_std": 0.1811930388212204, + "rewards/accuracy_reward": 1.3331794738769531, + "rewards/format_reward": 1.0, + "step": 99 + }, + { + "completion_length": 65.828125, + "epoch": 1.2048192771084336, + "grad_norm": 7.074570957060863, + "kl": 0.1064453125, + "learning_rate": 7.991967871485942e-07, + "loss": 0.0043, + "reward": 2.278498649597168, + "reward_std": 0.17714769393205643, + "rewards/accuracy_reward": 1.2863109111785889, + "rewards/format_reward": 0.9921875, + "step": 100 + }, + { + "completion_length": 62.6875, + "epoch": 1.216867469879518, + "grad_norm": 6.600402598086416, + "kl": 0.099609375, + "learning_rate": 7.971887550200803e-07, + "loss": 0.004, + "reward": 2.3798866271972656, + "reward_std": 0.1492375209927559, + "rewards/accuracy_reward": 1.3798866868019104, + "rewards/format_reward": 1.0, + "step": 101 + }, + { + "completion_length": 67.234375, + "epoch": 1.2289156626506024, + "grad_norm": 5.4322907915163645, + "kl": 0.0927734375, + "learning_rate": 7.951807228915662e-07, + "loss": 0.0037, + "reward": 2.295409917831421, + "reward_std": 0.26540718972682953, + "rewards/accuracy_reward": 1.311034917831421, + "rewards/format_reward": 0.984375, + "step": 102 + }, + { + "completion_length": 62.59375, + "epoch": 1.2409638554216866, + "grad_norm": 4.734234621294123, + "kl": 0.10986328125, + "learning_rate": 7.931726907630522e-07, + "loss": 0.0044, + "reward": 2.3131519556045532, + "reward_std": 0.2041746824979782, + "rewards/accuracy_reward": 1.3209643959999084, + "rewards/format_reward": 0.9921875, + "step": 103 + }, + { + "completion_length": 65.0078125, + "epoch": 1.2530120481927711, + "grad_norm": 11.27432402123553, + "kl": 0.094482421875, + "learning_rate": 7.911646586345381e-07, + "loss": 0.0038, + "reward": 2.423591375350952, + "reward_std": 0.17853456735610962, + "rewards/accuracy_reward": 1.4235913753509521, + "rewards/format_reward": 1.0, + "step": 104 + }, + { + "completion_length": 61.96875, + "epoch": 1.2650602409638554, + "grad_norm": 5.605209449566961, + "kl": 0.10595703125, + "learning_rate": 7.891566265060241e-07, + "loss": 0.0042, + "reward": 2.2498486042022705, + "reward_std": 0.2505866587162018, + "rewards/accuracy_reward": 1.2576610445976257, + "rewards/format_reward": 0.9921875, + "step": 105 + }, + { + "completion_length": 69.890625, + "epoch": 1.2771084337349397, + "grad_norm": 9.555144265496201, + "kl": 0.1015625, + "learning_rate": 7.8714859437751e-07, + "loss": 0.0041, + "reward": 2.153669834136963, + "reward_std": 0.2159716784954071, + "rewards/accuracy_reward": 1.161482334136963, + "rewards/format_reward": 0.9921875, + "step": 106 + }, + { + "completion_length": 63.5625, + "epoch": 1.2891566265060241, + "grad_norm": 4.205528221959235, + "kl": 0.100341796875, + "learning_rate": 7.851405622489959e-07, + "loss": 0.004, + "reward": 2.2599010467529297, + "reward_std": 0.22189538180828094, + "rewards/accuracy_reward": 1.2599008083343506, + "rewards/format_reward": 1.0, + "step": 107 + }, + { + "completion_length": 60.3359375, + "epoch": 1.3012048192771084, + "grad_norm": 4.549607105799596, + "kl": 0.13525390625, + "learning_rate": 7.831325301204819e-07, + "loss": 0.0054, + "reward": 2.2945663928985596, + "reward_std": 0.2269488275051117, + "rewards/accuracy_reward": 1.2945663928985596, + "rewards/format_reward": 1.0, + "step": 108 + }, + { + "completion_length": 63.9765625, + "epoch": 1.3132530120481927, + "grad_norm": 7.122658458301131, + "kl": 0.10400390625, + "learning_rate": 7.811244979919679e-07, + "loss": 0.0042, + "reward": 2.223813772201538, + "reward_std": 0.2691728472709656, + "rewards/accuracy_reward": 1.2316263318061829, + "rewards/format_reward": 0.9921875, + "step": 109 + }, + { + "completion_length": 64.0390625, + "epoch": 1.3253012048192772, + "grad_norm": 4.0970391288989285, + "kl": 0.102783203125, + "learning_rate": 7.791164658634538e-07, + "loss": 0.0041, + "reward": 2.402035713195801, + "reward_std": 0.2192593812942505, + "rewards/accuracy_reward": 1.409848153591156, + "rewards/format_reward": 0.9921875, + "step": 110 + }, + { + "completion_length": 61.984375, + "epoch": 1.3373493975903614, + "grad_norm": 5.00798288991921, + "kl": 0.100830078125, + "learning_rate": 7.771084337349397e-07, + "loss": 0.004, + "reward": 2.268544912338257, + "reward_std": 0.17878198623657227, + "rewards/accuracy_reward": 1.2685450315475464, + "rewards/format_reward": 1.0, + "step": 111 + }, + { + "completion_length": 58.296875, + "epoch": 1.3493975903614457, + "grad_norm": 4.283142882967245, + "kl": 0.10888671875, + "learning_rate": 7.751004016064257e-07, + "loss": 0.0044, + "reward": 2.373852849006653, + "reward_std": 0.17504306137561798, + "rewards/accuracy_reward": 1.3738529086112976, + "rewards/format_reward": 1.0, + "step": 112 + }, + { + "completion_length": 60.484375, + "epoch": 1.3614457831325302, + "grad_norm": 4.840347639337677, + "kl": 0.097412109375, + "learning_rate": 7.730923694779116e-07, + "loss": 0.0039, + "reward": 2.2944198846817017, + "reward_std": 0.2088237851858139, + "rewards/accuracy_reward": 1.2944198250770569, + "rewards/format_reward": 1.0, + "step": 113 + }, + { + "completion_length": 59.6328125, + "epoch": 1.3734939759036144, + "grad_norm": 3.441438097506757, + "kl": 0.095458984375, + "learning_rate": 7.710843373493975e-07, + "loss": 0.0038, + "reward": 2.2015284299850464, + "reward_std": 0.22288134694099426, + "rewards/accuracy_reward": 1.201528549194336, + "rewards/format_reward": 1.0, + "step": 114 + }, + { + "completion_length": 58.3203125, + "epoch": 1.3855421686746987, + "grad_norm": 5.2560716101244545, + "kl": 0.12890625, + "learning_rate": 7.690763052208835e-07, + "loss": 0.0052, + "reward": 2.395646095275879, + "reward_std": 0.21848639845848083, + "rewards/accuracy_reward": 1.3956461548805237, + "rewards/format_reward": 1.0, + "step": 115 + }, + { + "completion_length": 58.2734375, + "epoch": 1.3975903614457832, + "grad_norm": 5.450406858307557, + "kl": 0.1064453125, + "learning_rate": 7.670682730923694e-07, + "loss": 0.0043, + "reward": 2.4746010303497314, + "reward_std": 0.1482101045548916, + "rewards/accuracy_reward": 1.4746010303497314, + "rewards/format_reward": 1.0, + "step": 116 + }, + { + "completion_length": 57.65625, + "epoch": 1.4096385542168675, + "grad_norm": 4.642950561404122, + "kl": 0.124267578125, + "learning_rate": 7.650602409638554e-07, + "loss": 0.005, + "reward": 2.1899147033691406, + "reward_std": 0.2073155865073204, + "rewards/accuracy_reward": 1.1977271437644958, + "rewards/format_reward": 0.9921875, + "step": 117 + }, + { + "completion_length": 56.609375, + "epoch": 1.4216867469879517, + "grad_norm": 9.36763410057133, + "kl": 0.112548828125, + "learning_rate": 7.630522088353414e-07, + "loss": 0.0045, + "reward": 2.457427501678467, + "reward_std": 0.248141810297966, + "rewards/accuracy_reward": 1.4574276804924011, + "rewards/format_reward": 1.0, + "step": 118 + }, + { + "completion_length": 55.59375, + "epoch": 1.4337349397590362, + "grad_norm": 4.076025029890633, + "kl": 0.095947265625, + "learning_rate": 7.610441767068273e-07, + "loss": 0.0038, + "reward": 2.3175806999206543, + "reward_std": 0.21353702247142792, + "rewards/accuracy_reward": 1.3175806999206543, + "rewards/format_reward": 1.0, + "step": 119 + }, + { + "completion_length": 56.359375, + "epoch": 1.4457831325301205, + "grad_norm": 4.1118838634058905, + "kl": 0.10693359375, + "learning_rate": 7.590361445783132e-07, + "loss": 0.0043, + "reward": 2.306099772453308, + "reward_std": 0.2674330025911331, + "rewards/accuracy_reward": 1.3217247128486633, + "rewards/format_reward": 0.984375, + "step": 120 + }, + { + "completion_length": 56.765625, + "epoch": 1.4578313253012047, + "grad_norm": 4.370520474393478, + "kl": 0.10302734375, + "learning_rate": 7.570281124497991e-07, + "loss": 0.0041, + "reward": 2.1378331184387207, + "reward_std": 0.24683931469917297, + "rewards/accuracy_reward": 1.1378332376480103, + "rewards/format_reward": 1.0, + "step": 121 + }, + { + "completion_length": 61.4453125, + "epoch": 1.4698795180722892, + "grad_norm": 3.7827942646929427, + "kl": 0.120361328125, + "learning_rate": 7.550200803212851e-07, + "loss": 0.0048, + "reward": 2.1952574253082275, + "reward_std": 0.163675457239151, + "rewards/accuracy_reward": 1.1952574849128723, + "rewards/format_reward": 1.0, + "step": 122 + }, + { + "completion_length": 64.2734375, + "epoch": 1.4819277108433735, + "grad_norm": 3.7942059326042887, + "kl": 0.115478515625, + "learning_rate": 7.53012048192771e-07, + "loss": 0.0046, + "reward": 2.052876114845276, + "reward_std": 0.3279467225074768, + "rewards/accuracy_reward": 1.0606885850429535, + "rewards/format_reward": 0.9921875, + "step": 123 + }, + { + "completion_length": 61.7578125, + "epoch": 1.4939759036144578, + "grad_norm": 4.163145774578374, + "kl": 0.1083984375, + "learning_rate": 7.51004016064257e-07, + "loss": 0.0043, + "reward": 2.483773946762085, + "reward_std": 0.21236886084079742, + "rewards/accuracy_reward": 1.483773946762085, + "rewards/format_reward": 1.0, + "step": 124 + }, + { + "completion_length": 69.8359375, + "epoch": 1.5060240963855422, + "grad_norm": 8.540024207287942, + "kl": 0.122314453125, + "learning_rate": 7.489959839357429e-07, + "loss": 0.0049, + "reward": 2.207366466522217, + "reward_std": 0.22365009784698486, + "rewards/accuracy_reward": 1.2073664665222168, + "rewards/format_reward": 1.0, + "step": 125 + }, + { + "completion_length": 68.21875, + "epoch": 1.5180722891566265, + "grad_norm": 4.163585518888115, + "kl": 0.097412109375, + "learning_rate": 7.469879518072289e-07, + "loss": 0.0039, + "reward": 2.3682451248168945, + "reward_std": 0.17314215004444122, + "rewards/accuracy_reward": 1.3682451844215393, + "rewards/format_reward": 1.0, + "step": 126 + }, + { + "completion_length": 74.7734375, + "epoch": 1.5301204819277108, + "grad_norm": 5.7954755578535595, + "kl": 0.09912109375, + "learning_rate": 7.449799196787149e-07, + "loss": 0.004, + "reward": 2.3054428100585938, + "reward_std": 0.166117824614048, + "rewards/accuracy_reward": 1.313255250453949, + "rewards/format_reward": 0.9921875, + "step": 127 + }, + { + "completion_length": 77.3046875, + "epoch": 1.5421686746987953, + "grad_norm": 4.318669163836461, + "kl": 0.091796875, + "learning_rate": 7.429718875502008e-07, + "loss": 0.0037, + "reward": 2.1308990716934204, + "reward_std": 0.19852972030639648, + "rewards/accuracy_reward": 1.13089919090271, + "rewards/format_reward": 1.0, + "step": 128 + }, + { + "completion_length": 78.1015625, + "epoch": 1.5542168674698795, + "grad_norm": 4.096032296356097, + "kl": 0.102783203125, + "learning_rate": 7.409638554216867e-07, + "loss": 0.0041, + "reward": 2.445680260658264, + "reward_std": 0.1704091727733612, + "rewards/accuracy_reward": 1.4456802010536194, + "rewards/format_reward": 1.0, + "step": 129 + }, + { + "completion_length": 74.75, + "epoch": 1.5662650602409638, + "grad_norm": 4.47404453525868, + "kl": 0.100341796875, + "learning_rate": 7.389558232931726e-07, + "loss": 0.004, + "reward": 2.2448705434799194, + "reward_std": 0.21340852975845337, + "rewards/accuracy_reward": 1.2448704838752747, + "rewards/format_reward": 1.0, + "step": 130 + }, + { + "completion_length": 75.3671875, + "epoch": 1.5783132530120483, + "grad_norm": 23.135090346261265, + "kl": 1.1025390625, + "learning_rate": 7.369477911646586e-07, + "loss": 0.0444, + "reward": 2.368005871772766, + "reward_std": 0.24276328086853027, + "rewards/accuracy_reward": 1.3680058717727661, + "rewards/format_reward": 1.0, + "step": 131 + }, + { + "completion_length": 76.5234375, + "epoch": 1.5903614457831325, + "grad_norm": 3.560296625305877, + "kl": 0.14111328125, + "learning_rate": 7.349397590361446e-07, + "loss": 0.0056, + "reward": 2.3832234144210815, + "reward_std": 0.2271246314048767, + "rewards/accuracy_reward": 1.398848533630371, + "rewards/format_reward": 0.984375, + "step": 132 + }, + { + "completion_length": 78.515625, + "epoch": 1.6024096385542168, + "grad_norm": 4.271885997013165, + "kl": 0.103271484375, + "learning_rate": 7.329317269076305e-07, + "loss": 0.0041, + "reward": 2.11967396736145, + "reward_std": 0.21069814264774323, + "rewards/accuracy_reward": 1.119674026966095, + "rewards/format_reward": 1.0, + "step": 133 + }, + { + "completion_length": 81.2109375, + "epoch": 1.6144578313253013, + "grad_norm": 3.989749340172797, + "kl": 0.10009765625, + "learning_rate": 7.309236947791164e-07, + "loss": 0.004, + "reward": 2.2381746768951416, + "reward_std": 0.2712934762239456, + "rewards/accuracy_reward": 1.2537997961044312, + "rewards/format_reward": 0.984375, + "step": 134 + }, + { + "completion_length": 84.828125, + "epoch": 1.6265060240963856, + "grad_norm": 5.101727030105181, + "kl": 0.0927734375, + "learning_rate": 7.289156626506024e-07, + "loss": 0.0037, + "reward": 2.3006190061569214, + "reward_std": 0.2388201355934143, + "rewards/accuracy_reward": 1.3084314465522766, + "rewards/format_reward": 0.9921875, + "step": 135 + }, + { + "completion_length": 78.3984375, + "epoch": 1.6385542168674698, + "grad_norm": 7.945369222479043, + "kl": 0.109130859375, + "learning_rate": 7.269076305220884e-07, + "loss": 0.0044, + "reward": 2.187756061553955, + "reward_std": 0.22536994516849518, + "rewards/accuracy_reward": 1.2033808827400208, + "rewards/format_reward": 0.984375, + "step": 136 + }, + { + "completion_length": 83.0234375, + "epoch": 1.6506024096385543, + "grad_norm": 7.511759922163927, + "kl": 0.074462890625, + "learning_rate": 7.248995983935742e-07, + "loss": 0.003, + "reward": 2.299572706222534, + "reward_std": 0.22408785670995712, + "rewards/accuracy_reward": 1.3073852062225342, + "rewards/format_reward": 0.9921875, + "step": 137 + }, + { + "completion_length": 84.640625, + "epoch": 1.6626506024096386, + "grad_norm": 3.2982396535282623, + "kl": 0.0810546875, + "learning_rate": 7.228915662650602e-07, + "loss": 0.0032, + "reward": 2.3804391622543335, + "reward_std": 0.2060808688402176, + "rewards/accuracy_reward": 1.3804389834403992, + "rewards/format_reward": 1.0, + "step": 138 + }, + { + "completion_length": 87.8125, + "epoch": 1.6746987951807228, + "grad_norm": 8.41708008218346, + "kl": 0.0810546875, + "learning_rate": 7.208835341365461e-07, + "loss": 0.0032, + "reward": 2.2146860361099243, + "reward_std": 0.2540859431028366, + "rewards/accuracy_reward": 1.2146860361099243, + "rewards/format_reward": 1.0, + "step": 139 + }, + { + "completion_length": 86.140625, + "epoch": 1.6867469879518073, + "grad_norm": 3.5435273544538815, + "kl": 0.072998046875, + "learning_rate": 7.188755020080321e-07, + "loss": 0.0029, + "reward": 2.3307693004608154, + "reward_std": 0.20385809987783432, + "rewards/accuracy_reward": 1.3385818004608154, + "rewards/format_reward": 0.9921875, + "step": 140 + }, + { + "completion_length": 85.9375, + "epoch": 1.6987951807228916, + "grad_norm": 3.544683408089574, + "kl": 0.083984375, + "learning_rate": 7.168674698795181e-07, + "loss": 0.0034, + "reward": 2.2913438081741333, + "reward_std": 0.26863446831703186, + "rewards/accuracy_reward": 1.3069688081741333, + "rewards/format_reward": 0.984375, + "step": 141 + }, + { + "completion_length": 83.2578125, + "epoch": 1.7108433734939759, + "grad_norm": 4.741927242341381, + "kl": 0.12548828125, + "learning_rate": 7.14859437751004e-07, + "loss": 0.005, + "reward": 2.3960628509521484, + "reward_std": 0.2550785541534424, + "rewards/accuracy_reward": 1.3960627913475037, + "rewards/format_reward": 1.0, + "step": 142 + }, + { + "completion_length": 86.671875, + "epoch": 1.7228915662650603, + "grad_norm": 3.0874349711182494, + "kl": 0.07470703125, + "learning_rate": 7.128514056224899e-07, + "loss": 0.003, + "reward": 2.3813560009002686, + "reward_std": 0.25298502296209335, + "rewards/accuracy_reward": 1.381356120109558, + "rewards/format_reward": 1.0, + "step": 143 + }, + { + "completion_length": 80.40625, + "epoch": 1.7349397590361446, + "grad_norm": 9.215211678123678, + "kl": 0.085693359375, + "learning_rate": 7.108433734939758e-07, + "loss": 0.0034, + "reward": 2.3150322437286377, + "reward_std": 0.23231424391269684, + "rewards/accuracy_reward": 1.315032422542572, + "rewards/format_reward": 1.0, + "step": 144 + }, + { + "completion_length": 79.5859375, + "epoch": 1.7469879518072289, + "grad_norm": 3.3677362414264307, + "kl": 0.098876953125, + "learning_rate": 7.088353413654619e-07, + "loss": 0.0039, + "reward": 2.2901567220687866, + "reward_std": 0.21487458050251007, + "rewards/accuracy_reward": 1.2979693412780762, + "rewards/format_reward": 0.9921875, + "step": 145 + }, + { + "completion_length": 87.2734375, + "epoch": 1.7590361445783134, + "grad_norm": 3.8053306313986037, + "kl": 0.104736328125, + "learning_rate": 7.068273092369477e-07, + "loss": 0.0042, + "reward": 2.2074761390686035, + "reward_std": 0.24223129451274872, + "rewards/accuracy_reward": 1.2074760794639587, + "rewards/format_reward": 1.0, + "step": 146 + }, + { + "completion_length": 88.984375, + "epoch": 1.7710843373493976, + "grad_norm": 4.960937467624004, + "kl": 0.08251953125, + "learning_rate": 7.048192771084337e-07, + "loss": 0.0033, + "reward": 2.2357683181762695, + "reward_std": 0.2608248367905617, + "rewards/accuracy_reward": 1.2435806393623352, + "rewards/format_reward": 0.9921875, + "step": 147 + }, + { + "completion_length": 80.421875, + "epoch": 1.783132530120482, + "grad_norm": 3.5313461555382717, + "kl": 0.106689453125, + "learning_rate": 7.028112449799196e-07, + "loss": 0.0042, + "reward": 2.223365068435669, + "reward_std": 0.20793087780475616, + "rewards/accuracy_reward": 1.2311774492263794, + "rewards/format_reward": 0.9921875, + "step": 148 + }, + { + "completion_length": 81.6328125, + "epoch": 1.7951807228915664, + "grad_norm": 3.917968857756188, + "kl": 0.082763671875, + "learning_rate": 7.008032128514057e-07, + "loss": 0.0033, + "reward": 2.431049346923828, + "reward_std": 0.25210463255643845, + "rewards/accuracy_reward": 1.4310495257377625, + "rewards/format_reward": 1.0, + "step": 149 + }, + { + "completion_length": 82.71875, + "epoch": 1.8072289156626506, + "grad_norm": 3.2751640437820417, + "kl": 0.105224609375, + "learning_rate": 6.987951807228916e-07, + "loss": 0.0042, + "reward": 2.167607069015503, + "reward_std": 0.20023201406002045, + "rewards/accuracy_reward": 1.183232069015503, + "rewards/format_reward": 0.984375, + "step": 150 + }, + { + "completion_length": 80.1015625, + "epoch": 1.819277108433735, + "grad_norm": 3.696030829693263, + "kl": 0.09716796875, + "learning_rate": 6.967871485943774e-07, + "loss": 0.0039, + "reward": 2.545083999633789, + "reward_std": 0.17634352296590805, + "rewards/accuracy_reward": 1.5450841188430786, + "rewards/format_reward": 1.0, + "step": 151 + }, + { + "completion_length": 81.6484375, + "epoch": 1.8313253012048194, + "grad_norm": 5.419229696650584, + "kl": 0.119873046875, + "learning_rate": 6.947791164658634e-07, + "loss": 0.0048, + "reward": 2.144273281097412, + "reward_std": 0.2491978257894516, + "rewards/accuracy_reward": 1.152085781097412, + "rewards/format_reward": 0.9921875, + "step": 152 + }, + { + "completion_length": 77.96875, + "epoch": 1.8433734939759037, + "grad_norm": 34.81233821704641, + "kl": 0.09619140625, + "learning_rate": 6.927710843373493e-07, + "loss": 0.0039, + "reward": 2.4207249879837036, + "reward_std": 0.22066732123494148, + "rewards/accuracy_reward": 1.4207251071929932, + "rewards/format_reward": 1.0, + "step": 153 + }, + { + "completion_length": 81.3984375, + "epoch": 1.855421686746988, + "grad_norm": 4.095705367504911, + "kl": 0.101806640625, + "learning_rate": 6.907630522088354e-07, + "loss": 0.0041, + "reward": 2.160383105278015, + "reward_std": 0.27165083587169647, + "rewards/accuracy_reward": 1.1681956052780151, + "rewards/format_reward": 0.9921875, + "step": 154 + }, + { + "completion_length": 79.78125, + "epoch": 1.8674698795180724, + "grad_norm": 3.0440685644807663, + "kl": 0.11865234375, + "learning_rate": 6.887550200803212e-07, + "loss": 0.0047, + "reward": 2.4971319437026978, + "reward_std": 0.16808781027793884, + "rewards/accuracy_reward": 1.4971320629119873, + "rewards/format_reward": 1.0, + "step": 155 + }, + { + "completion_length": 83.09375, + "epoch": 1.8795180722891565, + "grad_norm": 3.1771226883841206, + "kl": 0.10498046875, + "learning_rate": 6.867469879518072e-07, + "loss": 0.0042, + "reward": 2.1450811624526978, + "reward_std": 0.2694619745016098, + "rewards/accuracy_reward": 1.1450812816619873, + "rewards/format_reward": 1.0, + "step": 156 + }, + { + "completion_length": 81.9453125, + "epoch": 1.891566265060241, + "grad_norm": 3.4230588560037583, + "kl": 0.113525390625, + "learning_rate": 6.847389558232931e-07, + "loss": 0.0045, + "reward": 2.44959032535553, + "reward_std": 0.16196198761463165, + "rewards/accuracy_reward": 1.4574028253555298, + "rewards/format_reward": 0.9921875, + "step": 157 + }, + { + "completion_length": 86.203125, + "epoch": 1.9036144578313254, + "grad_norm": 5.9344079114737, + "kl": 0.1015625, + "learning_rate": 6.827309236947792e-07, + "loss": 0.0041, + "reward": 2.1924350261688232, + "reward_std": 0.1869198903441429, + "rewards/accuracy_reward": 1.1924351453781128, + "rewards/format_reward": 1.0, + "step": 158 + }, + { + "completion_length": 84.7734375, + "epoch": 1.9156626506024095, + "grad_norm": 3.7338258911048707, + "kl": 0.105224609375, + "learning_rate": 6.807228915662651e-07, + "loss": 0.0042, + "reward": 2.298088550567627, + "reward_std": 0.2152806669473648, + "rewards/accuracy_reward": 1.3059011697769165, + "rewards/format_reward": 0.9921875, + "step": 159 + }, + { + "completion_length": 88.2109375, + "epoch": 1.927710843373494, + "grad_norm": 3.2737012532681535, + "kl": 0.124755859375, + "learning_rate": 6.787148594377509e-07, + "loss": 0.005, + "reward": 2.3695740699768066, + "reward_std": 0.300421878695488, + "rewards/accuracy_reward": 1.3930113911628723, + "rewards/format_reward": 0.9765625, + "step": 160 + }, + { + "completion_length": 82.9921875, + "epoch": 1.9397590361445785, + "grad_norm": 14.347253854862437, + "kl": 0.119873046875, + "learning_rate": 6.767068273092369e-07, + "loss": 0.0048, + "reward": 2.306626796722412, + "reward_std": 0.2548489645123482, + "rewards/accuracy_reward": 1.3222516179084778, + "rewards/format_reward": 0.984375, + "step": 161 + }, + { + "completion_length": 87.734375, + "epoch": 1.9518072289156625, + "grad_norm": 3.457686333163172, + "kl": 0.109375, + "learning_rate": 6.746987951807228e-07, + "loss": 0.0044, + "reward": 2.2328758239746094, + "reward_std": 0.28791245073080063, + "rewards/accuracy_reward": 1.2641257643699646, + "rewards/format_reward": 0.96875, + "step": 162 + }, + { + "completion_length": 83.25, + "epoch": 1.963855421686747, + "grad_norm": 4.1768305143971824, + "kl": 0.12353515625, + "learning_rate": 6.726907630522089e-07, + "loss": 0.0049, + "reward": 2.2161502838134766, + "reward_std": 0.25863420963287354, + "rewards/accuracy_reward": 1.2630252242088318, + "rewards/format_reward": 0.953125, + "step": 163 + }, + { + "completion_length": 88.734375, + "epoch": 1.9759036144578315, + "grad_norm": 4.842793088552531, + "kl": 0.105712890625, + "learning_rate": 6.706827309236947e-07, + "loss": 0.0042, + "reward": 2.090719521045685, + "reward_std": 0.25029148161411285, + "rewards/accuracy_reward": 1.1141569316387177, + "rewards/format_reward": 0.9765625, + "step": 164 + }, + { + "completion_length": 86.1953125, + "epoch": 1.9879518072289155, + "grad_norm": 3.657481472750154, + "kl": 0.125244140625, + "learning_rate": 6.686746987951807e-07, + "loss": 0.005, + "reward": 2.2765581607818604, + "reward_std": 0.2915503680706024, + "rewards/accuracy_reward": 1.30780827999115, + "rewards/format_reward": 0.96875, + "step": 165 + }, + { + "completion_length": 92.16666793823242, + "epoch": 2.0, + "grad_norm": 3.6057161188599776, + "kl": 0.125732421875, + "learning_rate": 6.666666666666666e-07, + "loss": 0.0047, + "reward": 2.234604835510254, + "reward_std": 0.2570358142256737, + "rewards/accuracy_reward": 1.2346049845218658, + "rewards/format_reward": 1.0, + "step": 166 + }, + { + "completion_length": 87.1484375, + "epoch": 2.0120481927710845, + "grad_norm": 3.7603470456590564, + "kl": 0.094482421875, + "learning_rate": 6.646586345381526e-07, + "loss": 0.0038, + "reward": 2.2034374475479126, + "reward_std": 0.3387380540370941, + "rewards/accuracy_reward": 1.2112498879432678, + "rewards/format_reward": 0.9921875, + "step": 167 + }, + { + "completion_length": 86.1953125, + "epoch": 2.0240963855421685, + "grad_norm": 4.4381952945033465, + "kl": 0.09765625, + "learning_rate": 6.626506024096386e-07, + "loss": 0.0039, + "reward": 2.222957730293274, + "reward_std": 0.2284381240606308, + "rewards/accuracy_reward": 1.238582730293274, + "rewards/format_reward": 0.984375, + "step": 168 + }, + { + "completion_length": 84.3125, + "epoch": 2.036144578313253, + "grad_norm": 3.399081917667578, + "kl": 0.0966796875, + "learning_rate": 6.606425702811244e-07, + "loss": 0.0039, + "reward": 2.2074966430664062, + "reward_std": 0.2783028930425644, + "rewards/accuracy_reward": 1.2231215238571167, + "rewards/format_reward": 0.984375, + "step": 169 + }, + { + "completion_length": 84.1640625, + "epoch": 2.0481927710843375, + "grad_norm": 3.794821230336393, + "kl": 0.10400390625, + "learning_rate": 6.586345381526104e-07, + "loss": 0.0042, + "reward": 2.2774429321289062, + "reward_std": 0.18755661696195602, + "rewards/accuracy_reward": 1.2774428129196167, + "rewards/format_reward": 1.0, + "step": 170 + }, + { + "completion_length": 84.7421875, + "epoch": 2.0602409638554215, + "grad_norm": 5.41653478361753, + "kl": 0.09130859375, + "learning_rate": 6.566265060240963e-07, + "loss": 0.0036, + "reward": 2.2825827598571777, + "reward_std": 0.20142250508069992, + "rewards/accuracy_reward": 1.2825825810432434, + "rewards/format_reward": 1.0, + "step": 171 + }, + { + "completion_length": 78.421875, + "epoch": 2.072289156626506, + "grad_norm": 4.831319526617051, + "kl": 0.099365234375, + "learning_rate": 6.546184738955824e-07, + "loss": 0.004, + "reward": 2.4247552156448364, + "reward_std": 0.19953592866659164, + "rewards/accuracy_reward": 1.4247552752494812, + "rewards/format_reward": 1.0, + "step": 172 + }, + { + "completion_length": 78.359375, + "epoch": 2.0843373493975905, + "grad_norm": 3.8109915515963038, + "kl": 0.10498046875, + "learning_rate": 6.526104417670682e-07, + "loss": 0.0042, + "reward": 2.3325507640838623, + "reward_std": 0.26026056706905365, + "rewards/accuracy_reward": 1.348175823688507, + "rewards/format_reward": 0.984375, + "step": 173 + }, + { + "completion_length": 79.21875, + "epoch": 2.0963855421686746, + "grad_norm": 4.94758596751216, + "kl": 0.130615234375, + "learning_rate": 6.506024096385541e-07, + "loss": 0.0052, + "reward": 2.3614529371261597, + "reward_std": 0.23941361159086227, + "rewards/accuracy_reward": 1.3614528179168701, + "rewards/format_reward": 1.0, + "step": 174 + }, + { + "completion_length": 80.8984375, + "epoch": 2.108433734939759, + "grad_norm": 4.645980861130919, + "kl": 0.12646484375, + "learning_rate": 6.485943775100401e-07, + "loss": 0.0051, + "reward": 2.148719310760498, + "reward_std": 0.2538711354136467, + "rewards/accuracy_reward": 1.1487191915512085, + "rewards/format_reward": 1.0, + "step": 175 + }, + { + "completion_length": 78.921875, + "epoch": 2.1204819277108435, + "grad_norm": 3.362542245290514, + "kl": 0.090576171875, + "learning_rate": 6.465863453815261e-07, + "loss": 0.0036, + "reward": 2.3466458320617676, + "reward_std": 0.21008533239364624, + "rewards/accuracy_reward": 1.346645712852478, + "rewards/format_reward": 1.0, + "step": 176 + }, + { + "completion_length": 78.5546875, + "epoch": 2.1325301204819276, + "grad_norm": 3.6960106974538585, + "kl": 0.0908203125, + "learning_rate": 6.445783132530121e-07, + "loss": 0.0036, + "reward": 2.4223729372024536, + "reward_std": 0.15239863470196724, + "rewards/accuracy_reward": 1.4223730564117432, + "rewards/format_reward": 1.0, + "step": 177 + }, + { + "completion_length": 76.890625, + "epoch": 2.144578313253012, + "grad_norm": 3.5646239400027913, + "kl": 0.103515625, + "learning_rate": 6.425702811244979e-07, + "loss": 0.0041, + "reward": 2.4388126134872437, + "reward_std": 0.22842204570770264, + "rewards/accuracy_reward": 1.4466250538825989, + "rewards/format_reward": 0.9921875, + "step": 178 + }, + { + "completion_length": 78.796875, + "epoch": 2.1566265060240966, + "grad_norm": 3.531186908359453, + "kl": 0.099609375, + "learning_rate": 6.405622489959839e-07, + "loss": 0.004, + "reward": 2.1039586067199707, + "reward_std": 0.23404612392187119, + "rewards/accuracy_reward": 1.1273961663246155, + "rewards/format_reward": 0.9765625, + "step": 179 + }, + { + "completion_length": 75.75, + "epoch": 2.1686746987951806, + "grad_norm": 5.0096541073452485, + "kl": 0.1015625, + "learning_rate": 6.385542168674698e-07, + "loss": 0.0041, + "reward": 2.374882221221924, + "reward_std": 0.2003496214747429, + "rewards/accuracy_reward": 1.374882161617279, + "rewards/format_reward": 1.0, + "step": 180 + }, + { + "completion_length": 79.9375, + "epoch": 2.180722891566265, + "grad_norm": 3.929802835585037, + "kl": 0.102294921875, + "learning_rate": 6.365461847389559e-07, + "loss": 0.0041, + "reward": 2.4310786724090576, + "reward_std": 0.20660096406936646, + "rewards/accuracy_reward": 1.4310787916183472, + "rewards/format_reward": 1.0, + "step": 181 + }, + { + "completion_length": 80.7578125, + "epoch": 2.1927710843373496, + "grad_norm": 4.226674931816659, + "kl": 0.09619140625, + "learning_rate": 6.345381526104418e-07, + "loss": 0.0038, + "reward": 2.3952780961990356, + "reward_std": 0.2160111963748932, + "rewards/accuracy_reward": 1.3952780961990356, + "rewards/format_reward": 1.0, + "step": 182 + }, + { + "completion_length": 80.484375, + "epoch": 2.2048192771084336, + "grad_norm": 3.463553859166022, + "kl": 0.107421875, + "learning_rate": 6.325301204819276e-07, + "loss": 0.0043, + "reward": 2.3913345336914062, + "reward_std": 0.22311442345380783, + "rewards/accuracy_reward": 1.3991470336914062, + "rewards/format_reward": 0.9921875, + "step": 183 + }, + { + "completion_length": 78.484375, + "epoch": 2.216867469879518, + "grad_norm": 3.9553841913647356, + "kl": 0.08642578125, + "learning_rate": 6.305220883534136e-07, + "loss": 0.0035, + "reward": 2.353707432746887, + "reward_std": 0.2809625118970871, + "rewards/accuracy_reward": 1.3615199327468872, + "rewards/format_reward": 0.9921875, + "step": 184 + }, + { + "completion_length": 86.203125, + "epoch": 2.2289156626506026, + "grad_norm": 6.103835532514207, + "kl": 0.075439453125, + "learning_rate": 6.285140562248996e-07, + "loss": 0.003, + "reward": 2.411812663078308, + "reward_std": 0.17931858450174332, + "rewards/accuracy_reward": 1.411812663078308, + "rewards/format_reward": 1.0, + "step": 185 + }, + { + "completion_length": 77.515625, + "epoch": 2.2409638554216866, + "grad_norm": 3.91857543195832, + "kl": 0.10107421875, + "learning_rate": 6.265060240963856e-07, + "loss": 0.004, + "reward": 2.2299575805664062, + "reward_std": 0.2100789025425911, + "rewards/accuracy_reward": 1.2377700209617615, + "rewards/format_reward": 0.9921875, + "step": 186 + }, + { + "completion_length": 77.09375, + "epoch": 2.253012048192771, + "grad_norm": 3.8592654709883796, + "kl": 0.095947265625, + "learning_rate": 6.244979919678714e-07, + "loss": 0.0038, + "reward": 2.47510826587677, + "reward_std": 0.2556135207414627, + "rewards/accuracy_reward": 1.4829206466674805, + "rewards/format_reward": 0.9921875, + "step": 187 + }, + { + "completion_length": 79.2890625, + "epoch": 2.2650602409638556, + "grad_norm": 6.921774157099546, + "kl": 0.093017578125, + "learning_rate": 6.224899598393574e-07, + "loss": 0.0037, + "reward": 2.3394941091537476, + "reward_std": 0.23163118958473206, + "rewards/accuracy_reward": 1.3394939303398132, + "rewards/format_reward": 1.0, + "step": 188 + }, + { + "completion_length": 79.546875, + "epoch": 2.2771084337349397, + "grad_norm": 5.699992937395376, + "kl": 0.08544921875, + "learning_rate": 6.204819277108434e-07, + "loss": 0.0034, + "reward": 2.330021381378174, + "reward_std": 0.21045994758605957, + "rewards/accuracy_reward": 1.3300212621688843, + "rewards/format_reward": 1.0, + "step": 189 + }, + { + "completion_length": 77.421875, + "epoch": 2.289156626506024, + "grad_norm": 4.425700742489554, + "kl": 0.098388671875, + "learning_rate": 6.184738955823293e-07, + "loss": 0.0039, + "reward": 2.2294440269470215, + "reward_std": 0.21671444922685623, + "rewards/accuracy_reward": 1.2294440865516663, + "rewards/format_reward": 1.0, + "step": 190 + }, + { + "completion_length": 74.6640625, + "epoch": 2.3012048192771086, + "grad_norm": 3.5141288907091783, + "kl": 0.08154296875, + "learning_rate": 6.164658634538153e-07, + "loss": 0.0033, + "reward": 2.417364239692688, + "reward_std": 0.18784678727388382, + "rewards/accuracy_reward": 1.4173641800880432, + "rewards/format_reward": 1.0, + "step": 191 + }, + { + "completion_length": 74.53125, + "epoch": 2.3132530120481927, + "grad_norm": 4.6610918738389095, + "kl": 0.096435546875, + "learning_rate": 6.144578313253011e-07, + "loss": 0.0039, + "reward": 2.4048426151275635, + "reward_std": 0.2764005810022354, + "rewards/accuracy_reward": 1.412655234336853, + "rewards/format_reward": 0.9921875, + "step": 192 + }, + { + "completion_length": 80.8984375, + "epoch": 2.325301204819277, + "grad_norm": 6.933183617809393, + "kl": 0.07861328125, + "learning_rate": 6.124497991967871e-07, + "loss": 0.0031, + "reward": 2.2180745601654053, + "reward_std": 0.2127843052148819, + "rewards/accuracy_reward": 1.21807461977005, + "rewards/format_reward": 1.0, + "step": 193 + }, + { + "completion_length": 80.9296875, + "epoch": 2.337349397590361, + "grad_norm": 4.526116466506062, + "kl": 0.088623046875, + "learning_rate": 6.104417670682731e-07, + "loss": 0.0035, + "reward": 2.2327487468719482, + "reward_std": 0.2369586005806923, + "rewards/accuracy_reward": 1.240561306476593, + "rewards/format_reward": 0.9921875, + "step": 194 + }, + { + "completion_length": 79.8359375, + "epoch": 2.3493975903614457, + "grad_norm": 3.410370565415923, + "kl": 0.09326171875, + "learning_rate": 6.084337349397591e-07, + "loss": 0.0037, + "reward": 2.222264051437378, + "reward_std": 0.26303592324256897, + "rewards/accuracy_reward": 1.230076551437378, + "rewards/format_reward": 0.9921875, + "step": 195 + }, + { + "completion_length": 73.8828125, + "epoch": 2.36144578313253, + "grad_norm": 3.962197046428477, + "kl": 0.103271484375, + "learning_rate": 6.064257028112449e-07, + "loss": 0.0041, + "reward": 2.296523690223694, + "reward_std": 0.370675727725029, + "rewards/accuracy_reward": 1.2965235710144043, + "rewards/format_reward": 1.0, + "step": 196 + }, + { + "completion_length": 74.515625, + "epoch": 2.3734939759036147, + "grad_norm": 3.7849181083166066, + "kl": 0.100341796875, + "learning_rate": 6.044176706827308e-07, + "loss": 0.004, + "reward": 2.1898573637008667, + "reward_std": 0.2903239354491234, + "rewards/accuracy_reward": 1.1898574829101562, + "rewards/format_reward": 1.0, + "step": 197 + }, + { + "completion_length": 71.015625, + "epoch": 2.3855421686746987, + "grad_norm": 4.598411590922377, + "kl": 0.09716796875, + "learning_rate": 6.024096385542169e-07, + "loss": 0.0039, + "reward": 2.3405251502990723, + "reward_std": 0.1668776124715805, + "rewards/accuracy_reward": 1.3405250310897827, + "rewards/format_reward": 1.0, + "step": 198 + }, + { + "completion_length": 72.0234375, + "epoch": 2.397590361445783, + "grad_norm": 4.094960420612339, + "kl": 0.08447265625, + "learning_rate": 6.004016064257028e-07, + "loss": 0.0034, + "reward": 2.2692129611968994, + "reward_std": 0.22979120910167694, + "rewards/accuracy_reward": 1.2848379015922546, + "rewards/format_reward": 0.984375, + "step": 199 + }, + { + "completion_length": 76.34375, + "epoch": 2.4096385542168672, + "grad_norm": 5.228591551586785, + "kl": 0.0771484375, + "learning_rate": 5.983935742971888e-07, + "loss": 0.0031, + "reward": 2.29106342792511, + "reward_std": 0.22756240516901016, + "rewards/accuracy_reward": 1.2910634279251099, + "rewards/format_reward": 1.0, + "step": 200 + }, + { + "completion_length": 79.3828125, + "epoch": 2.4216867469879517, + "grad_norm": 3.532651567007306, + "kl": 0.140869140625, + "learning_rate": 5.963855421686746e-07, + "loss": 0.0056, + "reward": 2.218053698539734, + "reward_std": 0.24822543561458588, + "rewards/accuracy_reward": 1.2180536985397339, + "rewards/format_reward": 1.0, + "step": 201 + }, + { + "completion_length": 76.0, + "epoch": 2.433734939759036, + "grad_norm": 3.316768093202225, + "kl": 0.088134765625, + "learning_rate": 5.943775100401606e-07, + "loss": 0.0035, + "reward": 2.26613187789917, + "reward_std": 0.24750088155269623, + "rewards/accuracy_reward": 1.2739443182945251, + "rewards/format_reward": 0.9921875, + "step": 202 + }, + { + "completion_length": 70.5234375, + "epoch": 2.4457831325301207, + "grad_norm": 9.031966519770473, + "kl": 0.099853515625, + "learning_rate": 5.923694779116466e-07, + "loss": 0.004, + "reward": 2.317081928253174, + "reward_std": 0.24299181252717972, + "rewards/accuracy_reward": 1.3248944282531738, + "rewards/format_reward": 0.9921875, + "step": 203 + }, + { + "completion_length": 72.1484375, + "epoch": 2.4578313253012047, + "grad_norm": 4.923799185057533, + "kl": 0.09716796875, + "learning_rate": 5.903614457831325e-07, + "loss": 0.0039, + "reward": 2.202351689338684, + "reward_std": 0.24287213385105133, + "rewards/accuracy_reward": 1.2023517489433289, + "rewards/format_reward": 1.0, + "step": 204 + }, + { + "completion_length": 75.5390625, + "epoch": 2.4698795180722892, + "grad_norm": 10.424209527328602, + "kl": 0.0849609375, + "learning_rate": 5.883534136546184e-07, + "loss": 0.0034, + "reward": 2.3431246280670166, + "reward_std": 0.21441341936588287, + "rewards/accuracy_reward": 1.3431245684623718, + "rewards/format_reward": 1.0, + "step": 205 + }, + { + "completion_length": 74.1328125, + "epoch": 2.4819277108433733, + "grad_norm": 5.39794558294026, + "kl": 0.08349609375, + "learning_rate": 5.863453815261043e-07, + "loss": 0.0033, + "reward": 2.318004846572876, + "reward_std": 0.1649407297372818, + "rewards/accuracy_reward": 1.3180049657821655, + "rewards/format_reward": 1.0, + "step": 206 + }, + { + "completion_length": 70.828125, + "epoch": 2.4939759036144578, + "grad_norm": 5.651509118393077, + "kl": 0.099609375, + "learning_rate": 5.843373493975904e-07, + "loss": 0.004, + "reward": 2.2745083570480347, + "reward_std": 0.1795399785041809, + "rewards/accuracy_reward": 1.27450829744339, + "rewards/format_reward": 1.0, + "step": 207 + }, + { + "completion_length": 75.1484375, + "epoch": 2.5060240963855422, + "grad_norm": 3.374258945078158, + "kl": 0.099853515625, + "learning_rate": 5.823293172690763e-07, + "loss": 0.004, + "reward": 2.183190941810608, + "reward_std": 0.19665208458900452, + "rewards/accuracy_reward": 1.183190941810608, + "rewards/format_reward": 1.0, + "step": 208 + }, + { + "completion_length": 75.15625, + "epoch": 2.5180722891566267, + "grad_norm": 3.680961209255419, + "kl": 0.085693359375, + "learning_rate": 5.803212851405623e-07, + "loss": 0.0034, + "reward": 2.3783202171325684, + "reward_std": 0.21517369151115417, + "rewards/accuracy_reward": 1.3861328959465027, + "rewards/format_reward": 0.9921875, + "step": 209 + }, + { + "completion_length": 75.890625, + "epoch": 2.5301204819277108, + "grad_norm": 4.203577590596214, + "kl": 0.093017578125, + "learning_rate": 5.783132530120481e-07, + "loss": 0.0037, + "reward": 2.232303738594055, + "reward_std": 0.21822457760572433, + "rewards/accuracy_reward": 1.2401162385940552, + "rewards/format_reward": 0.9921875, + "step": 210 + }, + { + "completion_length": 72.5234375, + "epoch": 2.5421686746987953, + "grad_norm": 5.049709537985753, + "kl": 0.09033203125, + "learning_rate": 5.76305220883534e-07, + "loss": 0.0036, + "reward": 2.3138071298599243, + "reward_std": 0.18903522193431854, + "rewards/accuracy_reward": 1.3138071298599243, + "rewards/format_reward": 1.0, + "step": 211 + }, + { + "completion_length": 77.6796875, + "epoch": 2.5542168674698793, + "grad_norm": 4.79270453347689, + "kl": 0.10791015625, + "learning_rate": 5.742971887550201e-07, + "loss": 0.0043, + "reward": 2.35454523563385, + "reward_std": 0.260717436671257, + "rewards/accuracy_reward": 1.36235773563385, + "rewards/format_reward": 0.9921875, + "step": 212 + }, + { + "completion_length": 75.5234375, + "epoch": 2.566265060240964, + "grad_norm": 3.8110594359613694, + "kl": 0.132080078125, + "learning_rate": 5.72289156626506e-07, + "loss": 0.0053, + "reward": 2.3396618366241455, + "reward_std": 0.2776957154273987, + "rewards/accuracy_reward": 1.3474743366241455, + "rewards/format_reward": 0.9921875, + "step": 213 + }, + { + "completion_length": 78.8203125, + "epoch": 2.5783132530120483, + "grad_norm": 3.5277793226603467, + "kl": 0.082763671875, + "learning_rate": 5.70281124497992e-07, + "loss": 0.0033, + "reward": 2.282657027244568, + "reward_std": 0.20082392543554306, + "rewards/accuracy_reward": 1.2826570868492126, + "rewards/format_reward": 1.0, + "step": 214 + }, + { + "completion_length": 79.7265625, + "epoch": 2.5903614457831328, + "grad_norm": 5.661825173466666, + "kl": 0.070068359375, + "learning_rate": 5.682730923694778e-07, + "loss": 0.0028, + "reward": 2.2916386127471924, + "reward_std": 0.22843700647354126, + "rewards/accuracy_reward": 1.2916386723518372, + "rewards/format_reward": 1.0, + "step": 215 + }, + { + "completion_length": 75.484375, + "epoch": 2.602409638554217, + "grad_norm": 5.408656767411551, + "kl": 0.074951171875, + "learning_rate": 5.662650602409639e-07, + "loss": 0.003, + "reward": 2.4862678050994873, + "reward_std": 0.17430586367845535, + "rewards/accuracy_reward": 1.4862679243087769, + "rewards/format_reward": 1.0, + "step": 216 + }, + { + "completion_length": 75.4140625, + "epoch": 2.6144578313253013, + "grad_norm": 4.437169209890788, + "kl": 0.1123046875, + "learning_rate": 5.642570281124498e-07, + "loss": 0.0045, + "reward": 2.2881970405578613, + "reward_std": 0.24159938842058182, + "rewards/accuracy_reward": 1.3116344809532166, + "rewards/format_reward": 0.9765625, + "step": 217 + }, + { + "completion_length": 77.1484375, + "epoch": 2.6265060240963853, + "grad_norm": 3.7017405154535608, + "kl": 0.0849609375, + "learning_rate": 5.622489959839358e-07, + "loss": 0.0034, + "reward": 2.42057728767395, + "reward_std": 0.1918034851551056, + "rewards/accuracy_reward": 1.4205771684646606, + "rewards/format_reward": 1.0, + "step": 218 + }, + { + "completion_length": 74.9921875, + "epoch": 2.63855421686747, + "grad_norm": 3.0572748613034184, + "kl": 0.08056640625, + "learning_rate": 5.602409638554216e-07, + "loss": 0.0032, + "reward": 2.296902298927307, + "reward_std": 0.22776726633310318, + "rewards/accuracy_reward": 1.2969022989273071, + "rewards/format_reward": 1.0, + "step": 219 + }, + { + "completion_length": 77.9375, + "epoch": 2.6506024096385543, + "grad_norm": 5.142063259050984, + "kl": 0.08251953125, + "learning_rate": 5.582329317269075e-07, + "loss": 0.0033, + "reward": 2.411815643310547, + "reward_std": 0.20656804740428925, + "rewards/accuracy_reward": 1.4118155241012573, + "rewards/format_reward": 1.0, + "step": 220 + }, + { + "completion_length": 75.0625, + "epoch": 2.662650602409639, + "grad_norm": 9.244315362233946, + "kl": 0.094482421875, + "learning_rate": 5.562248995983936e-07, + "loss": 0.0038, + "reward": 2.2525359392166138, + "reward_std": 0.23683273047208786, + "rewards/accuracy_reward": 1.2681609392166138, + "rewards/format_reward": 0.984375, + "step": 221 + }, + { + "completion_length": 78.390625, + "epoch": 2.674698795180723, + "grad_norm": 4.89406748105177, + "kl": 0.078125, + "learning_rate": 5.542168674698795e-07, + "loss": 0.0031, + "reward": 2.33753764629364, + "reward_std": 0.21247170120477676, + "rewards/accuracy_reward": 1.3453501462936401, + "rewards/format_reward": 0.9921875, + "step": 222 + }, + { + "completion_length": 73.0859375, + "epoch": 2.6867469879518073, + "grad_norm": 3.6393688137680464, + "kl": 0.0810546875, + "learning_rate": 5.522088353413655e-07, + "loss": 0.0032, + "reward": 2.2808330059051514, + "reward_std": 0.1841505616903305, + "rewards/accuracy_reward": 1.280833125114441, + "rewards/format_reward": 1.0, + "step": 223 + }, + { + "completion_length": 77.1484375, + "epoch": 2.6987951807228914, + "grad_norm": 2.9614100491209516, + "kl": 0.08447265625, + "learning_rate": 5.502008032128513e-07, + "loss": 0.0034, + "reward": 2.256025791168213, + "reward_std": 0.22689195722341537, + "rewards/accuracy_reward": 1.271650791168213, + "rewards/format_reward": 0.984375, + "step": 224 + }, + { + "completion_length": 72.6015625, + "epoch": 2.710843373493976, + "grad_norm": 4.624802749562738, + "kl": 0.0810546875, + "learning_rate": 5.481927710843374e-07, + "loss": 0.0032, + "reward": 2.367666721343994, + "reward_std": 0.20605457574129105, + "rewards/accuracy_reward": 1.367666482925415, + "rewards/format_reward": 1.0, + "step": 225 + }, + { + "completion_length": 70.859375, + "epoch": 2.7228915662650603, + "grad_norm": 6.0943428059060505, + "kl": 0.10205078125, + "learning_rate": 5.461847389558233e-07, + "loss": 0.0041, + "reward": 2.3246583938598633, + "reward_std": 0.17254704982042313, + "rewards/accuracy_reward": 1.3324708938598633, + "rewards/format_reward": 0.9921875, + "step": 226 + }, + { + "completion_length": 75.640625, + "epoch": 2.734939759036145, + "grad_norm": 4.26546660385252, + "kl": 0.090087890625, + "learning_rate": 5.441767068273092e-07, + "loss": 0.0036, + "reward": 2.307809591293335, + "reward_std": 0.2002812698483467, + "rewards/accuracy_reward": 1.315622091293335, + "rewards/format_reward": 0.9921875, + "step": 227 + }, + { + "completion_length": 73.671875, + "epoch": 2.746987951807229, + "grad_norm": 3.4690497244218435, + "kl": 0.0927734375, + "learning_rate": 5.421686746987951e-07, + "loss": 0.0037, + "reward": 2.4064533710479736, + "reward_std": 0.1763758659362793, + "rewards/accuracy_reward": 1.4142658710479736, + "rewards/format_reward": 0.9921875, + "step": 228 + }, + { + "completion_length": 77.265625, + "epoch": 2.7590361445783134, + "grad_norm": 3.8015660942675313, + "kl": 0.107666015625, + "learning_rate": 5.401606425702811e-07, + "loss": 0.0043, + "reward": 2.417749524116516, + "reward_std": 0.20080577582120895, + "rewards/accuracy_reward": 1.4333745837211609, + "rewards/format_reward": 0.984375, + "step": 229 + }, + { + "completion_length": 78.6484375, + "epoch": 2.7710843373493974, + "grad_norm": 4.593078230781537, + "kl": 0.081298828125, + "learning_rate": 5.381526104417671e-07, + "loss": 0.0032, + "reward": 2.310904383659363, + "reward_std": 0.20601534098386765, + "rewards/accuracy_reward": 1.326529324054718, + "rewards/format_reward": 0.984375, + "step": 230 + }, + { + "completion_length": 69.75, + "epoch": 2.783132530120482, + "grad_norm": 4.781119598148597, + "kl": 0.092041015625, + "learning_rate": 5.36144578313253e-07, + "loss": 0.0037, + "reward": 2.4060455560684204, + "reward_std": 0.1945626586675644, + "rewards/accuracy_reward": 1.41385817527771, + "rewards/format_reward": 0.9921875, + "step": 231 + }, + { + "completion_length": 72.125, + "epoch": 2.7951807228915664, + "grad_norm": 3.6431689651666925, + "kl": 0.084716796875, + "learning_rate": 5.34136546184739e-07, + "loss": 0.0034, + "reward": 2.2687569856643677, + "reward_std": 0.20781449228525162, + "rewards/accuracy_reward": 1.2765693664550781, + "rewards/format_reward": 0.9921875, + "step": 232 + }, + { + "completion_length": 75.28125, + "epoch": 2.807228915662651, + "grad_norm": 3.463525581618983, + "kl": 0.0830078125, + "learning_rate": 5.321285140562248e-07, + "loss": 0.0033, + "reward": 2.2786985635757446, + "reward_std": 0.1869373545050621, + "rewards/accuracy_reward": 1.2865110039710999, + "rewards/format_reward": 0.9921875, + "step": 233 + }, + { + "completion_length": 72.390625, + "epoch": 2.819277108433735, + "grad_norm": 3.989550051539227, + "kl": 0.08935546875, + "learning_rate": 5.301204819277109e-07, + "loss": 0.0036, + "reward": 2.2122349739074707, + "reward_std": 0.17366793006658554, + "rewards/accuracy_reward": 1.212234914302826, + "rewards/format_reward": 1.0, + "step": 234 + }, + { + "completion_length": 68.4296875, + "epoch": 2.8313253012048194, + "grad_norm": 5.293732432179004, + "kl": 0.1162109375, + "learning_rate": 5.281124497991968e-07, + "loss": 0.0046, + "reward": 2.273004412651062, + "reward_std": 0.21551835536956787, + "rewards/accuracy_reward": 1.2730044722557068, + "rewards/format_reward": 1.0, + "step": 235 + }, + { + "completion_length": 70.4765625, + "epoch": 2.8433734939759034, + "grad_norm": 3.483964465031993, + "kl": 0.08642578125, + "learning_rate": 5.261044176706827e-07, + "loss": 0.0035, + "reward": 2.5097464323043823, + "reward_std": 0.21660751849412918, + "rewards/accuracy_reward": 1.509746491909027, + "rewards/format_reward": 1.0, + "step": 236 + }, + { + "completion_length": 67.1796875, + "epoch": 2.855421686746988, + "grad_norm": 3.2613871176315286, + "kl": 0.109619140625, + "learning_rate": 5.240963855421686e-07, + "loss": 0.0044, + "reward": 2.2154468297958374, + "reward_std": 0.2426525428891182, + "rewards/accuracy_reward": 1.2154468894004822, + "rewards/format_reward": 1.0, + "step": 237 + }, + { + "completion_length": 73.875, + "epoch": 2.8674698795180724, + "grad_norm": 5.04569953866162, + "kl": 0.105224609375, + "learning_rate": 5.220883534136546e-07, + "loss": 0.0042, + "reward": 2.3947439193725586, + "reward_std": 0.16551193594932556, + "rewards/accuracy_reward": 1.3947439193725586, + "rewards/format_reward": 1.0, + "step": 238 + }, + { + "completion_length": 70.03125, + "epoch": 2.8795180722891565, + "grad_norm": 3.2080049289623997, + "kl": 0.10986328125, + "learning_rate": 5.200803212851406e-07, + "loss": 0.0044, + "reward": 2.394848346710205, + "reward_std": 0.22504138201475143, + "rewards/accuracy_reward": 1.394848346710205, + "rewards/format_reward": 1.0, + "step": 239 + }, + { + "completion_length": 70.90625, + "epoch": 2.891566265060241, + "grad_norm": 3.843192487462901, + "kl": 0.1171875, + "learning_rate": 5.180722891566265e-07, + "loss": 0.0047, + "reward": 2.2219191789627075, + "reward_std": 0.2526251822710037, + "rewards/accuracy_reward": 1.2219191193580627, + "rewards/format_reward": 1.0, + "step": 240 + }, + { + "completion_length": 67.1328125, + "epoch": 2.9036144578313254, + "grad_norm": 3.0217979987505394, + "kl": 0.104248046875, + "learning_rate": 5.160642570281125e-07, + "loss": 0.0042, + "reward": 2.2357059717178345, + "reward_std": 0.181558758020401, + "rewards/accuracy_reward": 1.235705852508545, + "rewards/format_reward": 1.0, + "step": 241 + }, + { + "completion_length": 67.0390625, + "epoch": 2.9156626506024095, + "grad_norm": 4.171949473201647, + "kl": 0.1044921875, + "learning_rate": 5.140562248995983e-07, + "loss": 0.0042, + "reward": 2.3148874044418335, + "reward_std": 0.17748098075389862, + "rewards/accuracy_reward": 1.3148874640464783, + "rewards/format_reward": 1.0, + "step": 242 + }, + { + "completion_length": 65.8671875, + "epoch": 2.927710843373494, + "grad_norm": 8.908769866071971, + "kl": 0.11181640625, + "learning_rate": 5.120481927710843e-07, + "loss": 0.0045, + "reward": 2.2218422889709473, + "reward_std": 0.1961566060781479, + "rewards/accuracy_reward": 1.2296549081802368, + "rewards/format_reward": 0.9921875, + "step": 243 + }, + { + "completion_length": 63.6953125, + "epoch": 2.9397590361445785, + "grad_norm": 12.929344924116855, + "kl": 0.106201171875, + "learning_rate": 5.100401606425703e-07, + "loss": 0.0042, + "reward": 2.4831990003585815, + "reward_std": 0.17936265468597412, + "rewards/accuracy_reward": 1.4831989407539368, + "rewards/format_reward": 1.0, + "step": 244 + }, + { + "completion_length": 62.28125, + "epoch": 2.9518072289156625, + "grad_norm": 3.4705083145900404, + "kl": 0.111328125, + "learning_rate": 5.080321285140562e-07, + "loss": 0.0044, + "reward": 2.352734327316284, + "reward_std": 0.2174607664346695, + "rewards/accuracy_reward": 1.3683592081069946, + "rewards/format_reward": 0.984375, + "step": 245 + }, + { + "completion_length": 69.640625, + "epoch": 2.963855421686747, + "grad_norm": 4.178352503452598, + "kl": 0.111572265625, + "learning_rate": 5.060240963855421e-07, + "loss": 0.0045, + "reward": 2.3825145959854126, + "reward_std": 0.21491926908493042, + "rewards/accuracy_reward": 1.3903270959854126, + "rewards/format_reward": 0.9921875, + "step": 246 + }, + { + "completion_length": 65.875, + "epoch": 2.9759036144578315, + "grad_norm": 4.426857679190133, + "kl": 0.149169921875, + "learning_rate": 5.040160642570281e-07, + "loss": 0.006, + "reward": 2.1721856594085693, + "reward_std": 0.2390434294939041, + "rewards/accuracy_reward": 1.1721857190132141, + "rewards/format_reward": 1.0, + "step": 247 + }, + { + "completion_length": 70.9921875, + "epoch": 2.9879518072289155, + "grad_norm": 4.720913912936636, + "kl": 0.114013671875, + "learning_rate": 5.020080321285141e-07, + "loss": 0.0046, + "reward": 2.2051347494125366, + "reward_std": 0.2722553163766861, + "rewards/accuracy_reward": 1.2285721898078918, + "rewards/format_reward": 0.9765625, + "step": 248 + }, + { + "completion_length": 64.25000190734863, + "epoch": 3.0, + "grad_norm": 3.5181266600609904, + "kl": 0.11962890625, + "learning_rate": 5e-07, + "loss": 0.0048, + "reward": 2.1161320209503174, + "reward_std": 0.430472195148468, + "rewards/accuracy_reward": 1.1994653940200806, + "rewards/format_reward": 0.9166666865348816, + "step": 249 + }, + { + "completion_length": 68.1875, + "epoch": 3.0120481927710845, + "grad_norm": 3.5431810235066643, + "kl": 0.09619140625, + "learning_rate": 4.979919678714859e-07, + "loss": 0.0038, + "reward": 2.323817491531372, + "reward_std": 0.23299024999141693, + "rewards/accuracy_reward": 1.3316298723220825, + "rewards/format_reward": 0.9921875, + "step": 250 + }, + { + "completion_length": 71.6953125, + "epoch": 3.0240963855421685, + "grad_norm": 3.3542739826451173, + "kl": 0.08642578125, + "learning_rate": 4.959839357429718e-07, + "loss": 0.0035, + "reward": 2.411439895629883, + "reward_std": 0.19917739927768707, + "rewards/accuracy_reward": 1.4114398956298828, + "rewards/format_reward": 1.0, + "step": 251 + }, + { + "completion_length": 68.109375, + "epoch": 3.036144578313253, + "grad_norm": 12.151823073672764, + "kl": 0.110107421875, + "learning_rate": 4.939759036144578e-07, + "loss": 0.0044, + "reward": 2.5318474769592285, + "reward_std": 0.18056734651327133, + "rewards/accuracy_reward": 1.5396599173545837, + "rewards/format_reward": 0.9921875, + "step": 252 + }, + { + "completion_length": 72.578125, + "epoch": 3.0481927710843375, + "grad_norm": 3.219943316402962, + "kl": 0.099853515625, + "learning_rate": 4.919678714859438e-07, + "loss": 0.004, + "reward": 2.3200578689575195, + "reward_std": 0.15618911385536194, + "rewards/accuracy_reward": 1.3200578689575195, + "rewards/format_reward": 1.0, + "step": 253 + }, + { + "completion_length": 61.3828125, + "epoch": 3.0602409638554215, + "grad_norm": 3.865556225897638, + "kl": 0.10888671875, + "learning_rate": 4.899598393574297e-07, + "loss": 0.0044, + "reward": 2.209138035774231, + "reward_std": 0.17473262548446655, + "rewards/accuracy_reward": 1.2091379761695862, + "rewards/format_reward": 1.0, + "step": 254 + }, + { + "completion_length": 66.7421875, + "epoch": 3.072289156626506, + "grad_norm": 4.017362101946035, + "kl": 0.1259765625, + "learning_rate": 4.879518072289156e-07, + "loss": 0.005, + "reward": 2.139701724052429, + "reward_std": 0.22376088798046112, + "rewards/accuracy_reward": 1.1397016048431396, + "rewards/format_reward": 1.0, + "step": 255 + }, + { + "completion_length": 62.71875, + "epoch": 3.0843373493975905, + "grad_norm": 3.4288754746391947, + "kl": 0.140625, + "learning_rate": 4.859437751004016e-07, + "loss": 0.0056, + "reward": 2.2105259895324707, + "reward_std": 0.22984497249126434, + "rewards/accuracy_reward": 1.2261508703231812, + "rewards/format_reward": 0.984375, + "step": 256 + }, + { + "completion_length": 66.6953125, + "epoch": 3.0963855421686746, + "grad_norm": 3.481985490355864, + "kl": 0.1181640625, + "learning_rate": 4.839357429718875e-07, + "loss": 0.0047, + "reward": 2.5049203634262085, + "reward_std": 0.1857297122478485, + "rewards/accuracy_reward": 1.5049203634262085, + "rewards/format_reward": 1.0, + "step": 257 + }, + { + "completion_length": 67.484375, + "epoch": 3.108433734939759, + "grad_norm": 3.6977753194922403, + "kl": 0.107666015625, + "learning_rate": 4.819277108433735e-07, + "loss": 0.0043, + "reward": 2.3002774715423584, + "reward_std": 0.21863283962011337, + "rewards/accuracy_reward": 1.3080899119377136, + "rewards/format_reward": 0.9921875, + "step": 258 + }, + { + "completion_length": 71.984375, + "epoch": 3.1204819277108435, + "grad_norm": 3.2391554999759054, + "kl": 0.099853515625, + "learning_rate": 4.799196787148594e-07, + "loss": 0.004, + "reward": 2.404132843017578, + "reward_std": 0.19443362206220627, + "rewards/accuracy_reward": 1.4119452238082886, + "rewards/format_reward": 0.9921875, + "step": 259 + }, + { + "completion_length": 70.3984375, + "epoch": 3.1325301204819276, + "grad_norm": 3.8470897735347993, + "kl": 0.11181640625, + "learning_rate": 4.779116465863453e-07, + "loss": 0.0045, + "reward": 2.2314306497573853, + "reward_std": 0.1860732138156891, + "rewards/accuracy_reward": 1.2392430305480957, + "rewards/format_reward": 0.9921875, + "step": 260 + }, + { + "completion_length": 71.7109375, + "epoch": 3.144578313253012, + "grad_norm": 5.7256880192839965, + "kl": 0.101806640625, + "learning_rate": 4.7590361445783126e-07, + "loss": 0.0041, + "reward": 2.3397083282470703, + "reward_std": 0.21985551714897156, + "rewards/accuracy_reward": 1.3397083282470703, + "rewards/format_reward": 1.0, + "step": 261 + }, + { + "completion_length": 72.7265625, + "epoch": 3.1566265060240966, + "grad_norm": 4.6788843643036255, + "kl": 0.183837890625, + "learning_rate": 4.7389558232931724e-07, + "loss": 0.0074, + "reward": 2.288654088973999, + "reward_std": 0.25063957273960114, + "rewards/accuracy_reward": 1.296466588973999, + "rewards/format_reward": 0.9921875, + "step": 262 + }, + { + "completion_length": 66.96875, + "epoch": 3.1686746987951806, + "grad_norm": 4.000735227178484, + "kl": 0.1171875, + "learning_rate": 4.7188755020080317e-07, + "loss": 0.0047, + "reward": 2.385547637939453, + "reward_std": 0.179743941873312, + "rewards/accuracy_reward": 1.393360197544098, + "rewards/format_reward": 0.9921875, + "step": 263 + }, + { + "completion_length": 73.078125, + "epoch": 3.180722891566265, + "grad_norm": 3.2436175706744903, + "kl": 0.08837890625, + "learning_rate": 4.6987951807228915e-07, + "loss": 0.0035, + "reward": 2.3714927434921265, + "reward_std": 0.1866167113184929, + "rewards/accuracy_reward": 1.3793052434921265, + "rewards/format_reward": 0.9921875, + "step": 264 + }, + { + "completion_length": 67.7578125, + "epoch": 3.1927710843373496, + "grad_norm": 4.16773338040152, + "kl": 0.09619140625, + "learning_rate": 4.678714859437751e-07, + "loss": 0.0038, + "reward": 2.256360650062561, + "reward_std": 0.2188187688589096, + "rewards/accuracy_reward": 1.256360650062561, + "rewards/format_reward": 1.0, + "step": 265 + }, + { + "completion_length": 71.6796875, + "epoch": 3.2048192771084336, + "grad_norm": 3.7554898641141388, + "kl": 0.094482421875, + "learning_rate": 4.6586345381526106e-07, + "loss": 0.0038, + "reward": 2.285356283187866, + "reward_std": 0.2733229324221611, + "rewards/accuracy_reward": 1.2853562831878662, + "rewards/format_reward": 1.0, + "step": 266 + }, + { + "completion_length": 69.53125, + "epoch": 3.216867469879518, + "grad_norm": 3.1396081677261747, + "kl": 0.11572265625, + "learning_rate": 4.63855421686747e-07, + "loss": 0.0046, + "reward": 2.194140672683716, + "reward_std": 0.2116081416606903, + "rewards/accuracy_reward": 1.1941407322883606, + "rewards/format_reward": 1.0, + "step": 267 + }, + { + "completion_length": 67.8203125, + "epoch": 3.2289156626506026, + "grad_norm": 7.260439555595242, + "kl": 0.08837890625, + "learning_rate": 4.6184738955823296e-07, + "loss": 0.0035, + "reward": 2.252182364463806, + "reward_std": 0.1803755983710289, + "rewards/accuracy_reward": 1.259994924068451, + "rewards/format_reward": 0.9921875, + "step": 268 + }, + { + "completion_length": 67.390625, + "epoch": 3.2409638554216866, + "grad_norm": 3.5049860895757696, + "kl": 0.08935546875, + "learning_rate": 4.5983935742971884e-07, + "loss": 0.0036, + "reward": 2.2208237648010254, + "reward_std": 0.23105446994304657, + "rewards/accuracy_reward": 1.2286362648010254, + "rewards/format_reward": 0.9921875, + "step": 269 + }, + { + "completion_length": 70.8515625, + "epoch": 3.253012048192771, + "grad_norm": 5.489156591080696, + "kl": 0.131591796875, + "learning_rate": 4.5783132530120476e-07, + "loss": 0.0053, + "reward": 2.2373805046081543, + "reward_std": 0.2680865153670311, + "rewards/accuracy_reward": 1.2373805046081543, + "rewards/format_reward": 1.0, + "step": 270 + }, + { + "completion_length": 67.3359375, + "epoch": 3.2650602409638556, + "grad_norm": 3.943203757539833, + "kl": 0.102783203125, + "learning_rate": 4.5582329317269074e-07, + "loss": 0.0041, + "reward": 2.2856905460357666, + "reward_std": 0.2643607556819916, + "rewards/accuracy_reward": 1.2856906652450562, + "rewards/format_reward": 1.0, + "step": 271 + }, + { + "completion_length": 76.703125, + "epoch": 3.2771084337349397, + "grad_norm": 4.067837029288379, + "kl": 0.14794921875, + "learning_rate": 4.5381526104417667e-07, + "loss": 0.0059, + "reward": 2.2173361778259277, + "reward_std": 0.23457611352205276, + "rewards/accuracy_reward": 1.2251486778259277, + "rewards/format_reward": 0.9921875, + "step": 272 + }, + { + "completion_length": 70.9765625, + "epoch": 3.289156626506024, + "grad_norm": 3.356513487854019, + "kl": 0.105712890625, + "learning_rate": 4.5180722891566265e-07, + "loss": 0.0042, + "reward": 2.3274762630462646, + "reward_std": 0.1404755339026451, + "rewards/accuracy_reward": 1.327476143836975, + "rewards/format_reward": 1.0, + "step": 273 + }, + { + "completion_length": 73.5546875, + "epoch": 3.3012048192771086, + "grad_norm": 2.8662666869018194, + "kl": 0.087646484375, + "learning_rate": 4.497991967871486e-07, + "loss": 0.0035, + "reward": 2.4234249591827393, + "reward_std": 0.23345230519771576, + "rewards/accuracy_reward": 1.4234249591827393, + "rewards/format_reward": 1.0, + "step": 274 + }, + { + "completion_length": 76.2890625, + "epoch": 3.3132530120481927, + "grad_norm": 3.6359732134875027, + "kl": 0.0849609375, + "learning_rate": 4.4779116465863456e-07, + "loss": 0.0034, + "reward": 2.2799594402313232, + "reward_std": 0.17667143046855927, + "rewards/accuracy_reward": 1.2799595594406128, + "rewards/format_reward": 1.0, + "step": 275 + }, + { + "completion_length": 74.9296875, + "epoch": 3.325301204819277, + "grad_norm": 3.4769457078888513, + "kl": 0.1181640625, + "learning_rate": 4.4578313253012043e-07, + "loss": 0.0047, + "reward": 2.282673478126526, + "reward_std": 0.20452508330345154, + "rewards/accuracy_reward": 1.282673418521881, + "rewards/format_reward": 1.0, + "step": 276 + }, + { + "completion_length": 73.828125, + "epoch": 3.337349397590361, + "grad_norm": 5.230024279024117, + "kl": 0.0830078125, + "learning_rate": 4.437751004016064e-07, + "loss": 0.0033, + "reward": 2.2097089290618896, + "reward_std": 0.22180304676294327, + "rewards/accuracy_reward": 1.2097087502479553, + "rewards/format_reward": 1.0, + "step": 277 + }, + { + "completion_length": 72.7109375, + "epoch": 3.3493975903614457, + "grad_norm": 3.8728422379908416, + "kl": 0.095458984375, + "learning_rate": 4.4176706827309234e-07, + "loss": 0.0038, + "reward": 2.491241931915283, + "reward_std": 0.22739917039871216, + "rewards/accuracy_reward": 1.4912420511245728, + "rewards/format_reward": 1.0, + "step": 278 + }, + { + "completion_length": 78.5078125, + "epoch": 3.36144578313253, + "grad_norm": 3.6858021846036535, + "kl": 0.0908203125, + "learning_rate": 4.3975903614457827e-07, + "loss": 0.0036, + "reward": 2.243127226829529, + "reward_std": 0.22939348965883255, + "rewards/accuracy_reward": 1.2431272268295288, + "rewards/format_reward": 1.0, + "step": 279 + }, + { + "completion_length": 72.765625, + "epoch": 3.3734939759036147, + "grad_norm": 4.156042584491376, + "kl": 0.1044921875, + "learning_rate": 4.3775100401606425e-07, + "loss": 0.0042, + "reward": 2.2150485515594482, + "reward_std": 0.23025363683700562, + "rewards/accuracy_reward": 1.2228610515594482, + "rewards/format_reward": 0.9921875, + "step": 280 + }, + { + "completion_length": 77.0390625, + "epoch": 3.3855421686746987, + "grad_norm": 3.3549823921313475, + "kl": 0.100341796875, + "learning_rate": 4.3574297188755017e-07, + "loss": 0.004, + "reward": 2.211505889892578, + "reward_std": 0.24677567183971405, + "rewards/accuracy_reward": 1.227130949497223, + "rewards/format_reward": 0.984375, + "step": 281 + }, + { + "completion_length": 78.296875, + "epoch": 3.397590361445783, + "grad_norm": 3.5036767872389514, + "kl": 0.0859375, + "learning_rate": 4.3373493975903615e-07, + "loss": 0.0034, + "reward": 2.346588611602783, + "reward_std": 0.20112959295511246, + "rewards/accuracy_reward": 1.3465884923934937, + "rewards/format_reward": 1.0, + "step": 282 + }, + { + "completion_length": 84.484375, + "epoch": 3.4096385542168672, + "grad_norm": 3.0794227415803874, + "kl": 0.09326171875, + "learning_rate": 4.3172690763052203e-07, + "loss": 0.0037, + "reward": 2.230928421020508, + "reward_std": 0.26287955790758133, + "rewards/accuracy_reward": 1.2387409210205078, + "rewards/format_reward": 0.9921875, + "step": 283 + }, + { + "completion_length": 84.0546875, + "epoch": 3.4216867469879517, + "grad_norm": 9.632017573370238, + "kl": 0.086181640625, + "learning_rate": 4.29718875502008e-07, + "loss": 0.0034, + "reward": 2.2049087285995483, + "reward_std": 0.19046999514102936, + "rewards/accuracy_reward": 1.204908847808838, + "rewards/format_reward": 1.0, + "step": 284 + }, + { + "completion_length": 74.875, + "epoch": 3.433734939759036, + "grad_norm": 3.04437077789607, + "kl": 0.07861328125, + "learning_rate": 4.2771084337349393e-07, + "loss": 0.0031, + "reward": 2.3966974020004272, + "reward_std": 0.1937796175479889, + "rewards/accuracy_reward": 1.3966973423957825, + "rewards/format_reward": 1.0, + "step": 285 + }, + { + "completion_length": 75.8359375, + "epoch": 3.4457831325301207, + "grad_norm": 5.311045139915637, + "kl": 0.163330078125, + "learning_rate": 4.257028112449799e-07, + "loss": 0.0065, + "reward": 2.3752543926239014, + "reward_std": 0.2273067831993103, + "rewards/accuracy_reward": 1.3830668926239014, + "rewards/format_reward": 0.9921875, + "step": 286 + }, + { + "completion_length": 78.6328125, + "epoch": 3.4578313253012047, + "grad_norm": 3.0911678350526763, + "kl": 0.082763671875, + "learning_rate": 4.2369477911646584e-07, + "loss": 0.0033, + "reward": 2.3473113775253296, + "reward_std": 0.14994988590478897, + "rewards/accuracy_reward": 1.3473113775253296, + "rewards/format_reward": 1.0, + "step": 287 + }, + { + "completion_length": 79.1640625, + "epoch": 3.4698795180722892, + "grad_norm": 3.5847413181475947, + "kl": 0.0849609375, + "learning_rate": 4.216867469879518e-07, + "loss": 0.0034, + "reward": 2.433477997779846, + "reward_std": 0.1769290268421173, + "rewards/accuracy_reward": 1.4334778785705566, + "rewards/format_reward": 1.0, + "step": 288 + }, + { + "completion_length": 83.390625, + "epoch": 3.4819277108433733, + "grad_norm": 4.01569190307187, + "kl": 0.09521484375, + "learning_rate": 4.1967871485943775e-07, + "loss": 0.0038, + "reward": 2.2789034843444824, + "reward_std": 0.2845103293657303, + "rewards/accuracy_reward": 1.2867161631584167, + "rewards/format_reward": 0.9921875, + "step": 289 + }, + { + "completion_length": 81.90625, + "epoch": 3.4939759036144578, + "grad_norm": 3.286849126987869, + "kl": 0.08642578125, + "learning_rate": 4.176706827309237e-07, + "loss": 0.0035, + "reward": 2.362874150276184, + "reward_std": 0.19387810677289963, + "rewards/accuracy_reward": 1.362874150276184, + "rewards/format_reward": 1.0, + "step": 290 + }, + { + "completion_length": 82.6640625, + "epoch": 3.5060240963855422, + "grad_norm": 3.658103173473351, + "kl": 0.10888671875, + "learning_rate": 4.156626506024096e-07, + "loss": 0.0043, + "reward": 2.0810331106185913, + "reward_std": 0.3057002127170563, + "rewards/accuracy_reward": 1.088845670223236, + "rewards/format_reward": 0.9921875, + "step": 291 + }, + { + "completion_length": 78.921875, + "epoch": 3.5180722891566267, + "grad_norm": 3.7103596490236774, + "kl": 0.08349609375, + "learning_rate": 4.1365461847389553e-07, + "loss": 0.0033, + "reward": 2.511967420578003, + "reward_std": 0.16890805214643478, + "rewards/accuracy_reward": 1.5119673609733582, + "rewards/format_reward": 1.0, + "step": 292 + }, + { + "completion_length": 79.0703125, + "epoch": 3.5301204819277108, + "grad_norm": 4.407185593870522, + "kl": 0.099853515625, + "learning_rate": 4.116465863453815e-07, + "loss": 0.004, + "reward": 2.298495650291443, + "reward_std": 0.18783311545848846, + "rewards/accuracy_reward": 1.2984956502914429, + "rewards/format_reward": 1.0, + "step": 293 + }, + { + "completion_length": 77.796875, + "epoch": 3.5421686746987953, + "grad_norm": 4.826014110118868, + "kl": 0.09814453125, + "learning_rate": 4.0963855421686744e-07, + "loss": 0.0039, + "reward": 2.2871015071868896, + "reward_std": 0.2442024052143097, + "rewards/accuracy_reward": 1.2871016263961792, + "rewards/format_reward": 1.0, + "step": 294 + }, + { + "completion_length": 81.0390625, + "epoch": 3.5542168674698793, + "grad_norm": 5.044218587715949, + "kl": 0.1220703125, + "learning_rate": 4.076305220883534e-07, + "loss": 0.0049, + "reward": 2.3120492696762085, + "reward_std": 0.26864828169345856, + "rewards/accuracy_reward": 1.3198617696762085, + "rewards/format_reward": 0.9921875, + "step": 295 + }, + { + "completion_length": 81.8046875, + "epoch": 3.566265060240964, + "grad_norm": 4.035337217053536, + "kl": 0.102783203125, + "learning_rate": 4.0562248995983934e-07, + "loss": 0.0041, + "reward": 2.2244678735733032, + "reward_std": 0.19216852635145187, + "rewards/accuracy_reward": 1.2244678139686584, + "rewards/format_reward": 1.0, + "step": 296 + }, + { + "completion_length": 82.1875, + "epoch": 3.5783132530120483, + "grad_norm": 5.473424541297646, + "kl": 0.082275390625, + "learning_rate": 4.036144578313253e-07, + "loss": 0.0033, + "reward": 2.1482508182525635, + "reward_std": 0.2517557144165039, + "rewards/accuracy_reward": 1.1560633182525635, + "rewards/format_reward": 0.9921875, + "step": 297 + }, + { + "completion_length": 76.8828125, + "epoch": 3.5903614457831328, + "grad_norm": 3.624065660089473, + "kl": 0.099609375, + "learning_rate": 4.0160642570281125e-07, + "loss": 0.004, + "reward": 2.460606813430786, + "reward_std": 0.20688265562057495, + "rewards/accuracy_reward": 1.476231873035431, + "rewards/format_reward": 0.984375, + "step": 298 + }, + { + "completion_length": 73.8828125, + "epoch": 3.602409638554217, + "grad_norm": 3.2496622555871775, + "kl": 0.10302734375, + "learning_rate": 3.995983935742971e-07, + "loss": 0.0041, + "reward": 2.448202967643738, + "reward_std": 0.20513835549354553, + "rewards/accuracy_reward": 1.4482029676437378, + "rewards/format_reward": 1.0, + "step": 299 + }, + { + "completion_length": 73.8828125, + "epoch": 3.6144578313253013, + "grad_norm": 3.248403260656612, + "kl": 0.1142578125, + "learning_rate": 3.975903614457831e-07, + "loss": 0.0046, + "reward": 2.3579249382019043, + "reward_std": 0.26106585562229156, + "rewards/accuracy_reward": 1.3657374382019043, + "rewards/format_reward": 0.9921875, + "step": 300 + }, + { + "completion_length": 81.78125, + "epoch": 3.6265060240963853, + "grad_norm": 4.192951592702023, + "kl": 0.090087890625, + "learning_rate": 3.9558232931726903e-07, + "loss": 0.0036, + "reward": 2.320730686187744, + "reward_std": 0.17225497588515282, + "rewards/accuracy_reward": 1.3207308053970337, + "rewards/format_reward": 1.0, + "step": 301 + }, + { + "completion_length": 81.78125, + "epoch": 3.63855421686747, + "grad_norm": 3.914334064533718, + "kl": 0.082763671875, + "learning_rate": 3.93574297188755e-07, + "loss": 0.0033, + "reward": 2.2756303548812866, + "reward_std": 0.21440081298351288, + "rewards/accuracy_reward": 1.2834429144859314, + "rewards/format_reward": 0.9921875, + "step": 302 + }, + { + "completion_length": 83.984375, + "epoch": 3.6506024096385543, + "grad_norm": 2.9158995310046705, + "kl": 0.09326171875, + "learning_rate": 3.9156626506024094e-07, + "loss": 0.0037, + "reward": 2.340207576751709, + "reward_std": 0.22486132383346558, + "rewards/accuracy_reward": 1.3402075171470642, + "rewards/format_reward": 1.0, + "step": 303 + }, + { + "completion_length": 73.0078125, + "epoch": 3.662650602409639, + "grad_norm": 3.64523826351094, + "kl": 0.130615234375, + "learning_rate": 3.895582329317269e-07, + "loss": 0.0052, + "reward": 2.306045651435852, + "reward_std": 0.21042678505182266, + "rewards/accuracy_reward": 1.313858151435852, + "rewards/format_reward": 0.9921875, + "step": 304 + }, + { + "completion_length": 77.140625, + "epoch": 3.674698795180723, + "grad_norm": 4.763683185347457, + "kl": 0.09619140625, + "learning_rate": 3.8755020080321285e-07, + "loss": 0.0038, + "reward": 2.292635202407837, + "reward_std": 0.24200939387083054, + "rewards/accuracy_reward": 1.308260202407837, + "rewards/format_reward": 0.984375, + "step": 305 + }, + { + "completion_length": 80.6875, + "epoch": 3.6867469879518073, + "grad_norm": 15.378313149094321, + "kl": 0.130126953125, + "learning_rate": 3.8554216867469877e-07, + "loss": 0.0052, + "reward": 2.2641184329986572, + "reward_std": 0.20184506475925446, + "rewards/accuracy_reward": 1.2719308137893677, + "rewards/format_reward": 0.9921875, + "step": 306 + }, + { + "completion_length": 72.4453125, + "epoch": 3.6987951807228914, + "grad_norm": 6.1838290298686225, + "kl": 0.114501953125, + "learning_rate": 3.835341365461847e-07, + "loss": 0.0046, + "reward": 2.4186692237854004, + "reward_std": 0.20656991004943848, + "rewards/accuracy_reward": 1.4264817833900452, + "rewards/format_reward": 0.9921875, + "step": 307 + }, + { + "completion_length": 73.71875, + "epoch": 3.710843373493976, + "grad_norm": 3.6680281562358794, + "kl": 0.092041015625, + "learning_rate": 3.815261044176707e-07, + "loss": 0.0037, + "reward": 2.3598402738571167, + "reward_std": 0.1814076155424118, + "rewards/accuracy_reward": 1.3598402738571167, + "rewards/format_reward": 1.0, + "step": 308 + }, + { + "completion_length": 75.5625, + "epoch": 3.7228915662650603, + "grad_norm": 4.1513164017455635, + "kl": 0.11962890625, + "learning_rate": 3.795180722891566e-07, + "loss": 0.0048, + "reward": 2.2364041805267334, + "reward_std": 0.20799466967582703, + "rewards/accuracy_reward": 1.236404299736023, + "rewards/format_reward": 1.0, + "step": 309 + }, + { + "completion_length": 76.2109375, + "epoch": 3.734939759036145, + "grad_norm": 4.53835509987933, + "kl": 0.088623046875, + "learning_rate": 3.7751004016064253e-07, + "loss": 0.0036, + "reward": 2.3527251482009888, + "reward_std": 0.17692391574382782, + "rewards/accuracy_reward": 1.3527252078056335, + "rewards/format_reward": 1.0, + "step": 310 + }, + { + "completion_length": 80.4375, + "epoch": 3.746987951807229, + "grad_norm": 3.703393707261026, + "kl": 0.1103515625, + "learning_rate": 3.755020080321285e-07, + "loss": 0.0044, + "reward": 2.298377275466919, + "reward_std": 0.21109677106142044, + "rewards/accuracy_reward": 1.2983773350715637, + "rewards/format_reward": 1.0, + "step": 311 + }, + { + "completion_length": 77.8125, + "epoch": 3.7590361445783134, + "grad_norm": 3.914375784414754, + "kl": 0.138916015625, + "learning_rate": 3.7349397590361444e-07, + "loss": 0.0056, + "reward": 2.1520947217941284, + "reward_std": 0.19967754930257797, + "rewards/accuracy_reward": 1.1520947813987732, + "rewards/format_reward": 1.0, + "step": 312 + }, + { + "completion_length": 79.2578125, + "epoch": 3.7710843373493974, + "grad_norm": 5.606330092523797, + "kl": 0.091064453125, + "learning_rate": 3.714859437751004e-07, + "loss": 0.0036, + "reward": 2.3204472064971924, + "reward_std": 0.1748044565320015, + "rewards/accuracy_reward": 1.3204472661018372, + "rewards/format_reward": 1.0, + "step": 313 + }, + { + "completion_length": 74.84375, + "epoch": 3.783132530120482, + "grad_norm": 3.2348525038063736, + "kl": 0.08447265625, + "learning_rate": 3.694779116465863e-07, + "loss": 0.0034, + "reward": 2.496751070022583, + "reward_std": 0.2072158306837082, + "rewards/accuracy_reward": 1.496751070022583, + "rewards/format_reward": 1.0, + "step": 314 + }, + { + "completion_length": 74.296875, + "epoch": 3.7951807228915664, + "grad_norm": 3.7371491385040483, + "kl": 0.0771484375, + "learning_rate": 3.674698795180723e-07, + "loss": 0.0031, + "reward": 2.395453691482544, + "reward_std": 0.16877512633800507, + "rewards/accuracy_reward": 1.3954537510871887, + "rewards/format_reward": 1.0, + "step": 315 + }, + { + "completion_length": 72.8671875, + "epoch": 3.807228915662651, + "grad_norm": 5.799331345023467, + "kl": 0.09619140625, + "learning_rate": 3.654618473895582e-07, + "loss": 0.0039, + "reward": 2.307594895362854, + "reward_std": 0.1985296756029129, + "rewards/accuracy_reward": 1.307594895362854, + "rewards/format_reward": 1.0, + "step": 316 + }, + { + "completion_length": 72.84375, + "epoch": 3.819277108433735, + "grad_norm": 5.215215330938529, + "kl": 0.11083984375, + "learning_rate": 3.634538152610442e-07, + "loss": 0.0044, + "reward": 2.2713290452957153, + "reward_std": 0.15980049967765808, + "rewards/accuracy_reward": 1.2791414856910706, + "rewards/format_reward": 0.9921875, + "step": 317 + }, + { + "completion_length": 66.28125, + "epoch": 3.8313253012048194, + "grad_norm": 9.42828281313003, + "kl": 0.106201171875, + "learning_rate": 3.614457831325301e-07, + "loss": 0.0042, + "reward": 2.441011667251587, + "reward_std": 0.21370699256658554, + "rewards/accuracy_reward": 1.4566364884376526, + "rewards/format_reward": 0.984375, + "step": 318 + }, + { + "completion_length": 74.3359375, + "epoch": 3.8433734939759034, + "grad_norm": 3.380164477319568, + "kl": 0.094970703125, + "learning_rate": 3.5943775100401604e-07, + "loss": 0.0038, + "reward": 2.5070927143096924, + "reward_std": 0.16660126298666, + "rewards/accuracy_reward": 1.5149051547050476, + "rewards/format_reward": 0.9921875, + "step": 319 + }, + { + "completion_length": 71.3046875, + "epoch": 3.855421686746988, + "grad_norm": 4.006205885169367, + "kl": 0.128662109375, + "learning_rate": 3.57429718875502e-07, + "loss": 0.0051, + "reward": 2.3042829036712646, + "reward_std": 0.2031613141298294, + "rewards/accuracy_reward": 1.3042829036712646, + "rewards/format_reward": 1.0, + "step": 320 + }, + { + "completion_length": 73.9609375, + "epoch": 3.8674698795180724, + "grad_norm": 5.771036516275782, + "kl": 0.093017578125, + "learning_rate": 3.554216867469879e-07, + "loss": 0.0037, + "reward": 2.422416090965271, + "reward_std": 0.19139418005943298, + "rewards/accuracy_reward": 1.4302285313606262, + "rewards/format_reward": 0.9921875, + "step": 321 + }, + { + "completion_length": 71.734375, + "epoch": 3.8795180722891565, + "grad_norm": 5.860041479699707, + "kl": 0.110595703125, + "learning_rate": 3.5341365461847387e-07, + "loss": 0.0044, + "reward": 2.100473999977112, + "reward_std": 0.21565508097410202, + "rewards/accuracy_reward": 1.1004739999771118, + "rewards/format_reward": 1.0, + "step": 322 + }, + { + "completion_length": 69.046875, + "epoch": 3.891566265060241, + "grad_norm": 4.962719097630754, + "kl": 0.1396484375, + "learning_rate": 3.514056224899598e-07, + "loss": 0.0056, + "reward": 2.337049961090088, + "reward_std": 0.201468363404274, + "rewards/accuracy_reward": 1.337049961090088, + "rewards/format_reward": 1.0, + "step": 323 + }, + { + "completion_length": 70.0234375, + "epoch": 3.9036144578313254, + "grad_norm": 3.786778485554144, + "kl": 0.1064453125, + "learning_rate": 3.493975903614458e-07, + "loss": 0.0043, + "reward": 2.282514452934265, + "reward_std": 0.2470734864473343, + "rewards/accuracy_reward": 1.2903268933296204, + "rewards/format_reward": 0.9921875, + "step": 324 + }, + { + "completion_length": 66.5546875, + "epoch": 3.9156626506024095, + "grad_norm": 5.681847770854111, + "kl": 0.14599609375, + "learning_rate": 3.473895582329317e-07, + "loss": 0.0059, + "reward": 2.2830464839935303, + "reward_std": 0.16951018571853638, + "rewards/accuracy_reward": 1.2830466032028198, + "rewards/format_reward": 1.0, + "step": 325 + }, + { + "completion_length": 69.9765625, + "epoch": 3.927710843373494, + "grad_norm": 3.545177223680582, + "kl": 0.1123046875, + "learning_rate": 3.453815261044177e-07, + "loss": 0.0045, + "reward": 2.3249276876449585, + "reward_std": 0.23469389975070953, + "rewards/accuracy_reward": 1.3249276876449585, + "rewards/format_reward": 1.0, + "step": 326 + }, + { + "completion_length": 67.2109375, + "epoch": 3.9397590361445785, + "grad_norm": 4.464381426334607, + "kl": 0.111328125, + "learning_rate": 3.433734939759036e-07, + "loss": 0.0045, + "reward": 2.313346743583679, + "reward_std": 0.24960950016975403, + "rewards/accuracy_reward": 1.321159303188324, + "rewards/format_reward": 0.9921875, + "step": 327 + }, + { + "completion_length": 69.5390625, + "epoch": 3.9518072289156625, + "grad_norm": 5.503294892764904, + "kl": 0.13818359375, + "learning_rate": 3.413654618473896e-07, + "loss": 0.0055, + "reward": 2.250451922416687, + "reward_std": 0.19627484679222107, + "rewards/accuracy_reward": 1.2582644820213318, + "rewards/format_reward": 0.9921875, + "step": 328 + }, + { + "completion_length": 72.875, + "epoch": 3.963855421686747, + "grad_norm": 3.94333602961405, + "kl": 0.126953125, + "learning_rate": 3.3935742971887547e-07, + "loss": 0.0051, + "reward": 2.4282917976379395, + "reward_std": 0.23817364871501923, + "rewards/accuracy_reward": 1.4361043572425842, + "rewards/format_reward": 0.9921875, + "step": 329 + }, + { + "completion_length": 68.078125, + "epoch": 3.9759036144578315, + "grad_norm": 4.246221946155538, + "kl": 0.10302734375, + "learning_rate": 3.373493975903614e-07, + "loss": 0.0041, + "reward": 2.3756778240203857, + "reward_std": 0.23032685369253159, + "rewards/accuracy_reward": 1.3756778836250305, + "rewards/format_reward": 1.0, + "step": 330 + }, + { + "completion_length": 63.171875, + "epoch": 3.9879518072289155, + "grad_norm": 4.823180720092978, + "kl": 0.14111328125, + "learning_rate": 3.353413654618474e-07, + "loss": 0.0057, + "reward": 2.2716495990753174, + "reward_std": 0.25546562671661377, + "rewards/accuracy_reward": 1.2794621586799622, + "rewards/format_reward": 0.9921875, + "step": 331 + }, + { + "completion_length": 79.75000381469727, + "epoch": 4.0, + "grad_norm": 3.966089593429622, + "kl": 0.10986328125, + "learning_rate": 3.333333333333333e-07, + "loss": 0.0047, + "reward": 1.9844202995300293, + "reward_std": 0.41577973030507565, + "rewards/accuracy_reward": 0.9844204187393188, + "rewards/format_reward": 1.0, + "step": 332 + }, + { + "completion_length": 67.8984375, + "epoch": 4.0120481927710845, + "grad_norm": 3.4890518846644203, + "kl": 0.112548828125, + "learning_rate": 3.313253012048193e-07, + "loss": 0.0045, + "reward": 2.273194432258606, + "reward_std": 0.1845482587814331, + "rewards/accuracy_reward": 1.2810069918632507, + "rewards/format_reward": 0.9921875, + "step": 333 + }, + { + "completion_length": 70.1328125, + "epoch": 4.024096385542169, + "grad_norm": 3.1401475074211698, + "kl": 0.106201171875, + "learning_rate": 3.293172690763052e-07, + "loss": 0.0042, + "reward": 2.348654627799988, + "reward_std": 0.20452319085597992, + "rewards/accuracy_reward": 1.3564670085906982, + "rewards/format_reward": 0.9921875, + "step": 334 + }, + { + "completion_length": 67.4296875, + "epoch": 4.036144578313253, + "grad_norm": 4.049959483426693, + "kl": 0.107177734375, + "learning_rate": 3.273092369477912e-07, + "loss": 0.0043, + "reward": 2.270454525947571, + "reward_std": 0.21142029762268066, + "rewards/accuracy_reward": 1.2704546451568604, + "rewards/format_reward": 1.0, + "step": 335 + }, + { + "completion_length": 71.1484375, + "epoch": 4.048192771084337, + "grad_norm": 3.9561612834766273, + "kl": 0.097412109375, + "learning_rate": 3.2530120481927706e-07, + "loss": 0.0039, + "reward": 2.1833893060684204, + "reward_std": 0.1801520176231861, + "rewards/accuracy_reward": 1.1912018656730652, + "rewards/format_reward": 0.9921875, + "step": 336 + }, + { + "completion_length": 69.59375, + "epoch": 4.0602409638554215, + "grad_norm": 3.977655100011985, + "kl": 0.1474609375, + "learning_rate": 3.2329317269076304e-07, + "loss": 0.0059, + "reward": 2.2047336101531982, + "reward_std": 0.1999206244945526, + "rewards/accuracy_reward": 1.204733669757843, + "rewards/format_reward": 1.0, + "step": 337 + }, + { + "completion_length": 61.4765625, + "epoch": 4.072289156626506, + "grad_norm": 4.191698428231115, + "kl": 0.12939453125, + "learning_rate": 3.2128514056224897e-07, + "loss": 0.0052, + "reward": 2.3498200178146362, + "reward_std": 0.2275300845503807, + "rewards/accuracy_reward": 1.3498198986053467, + "rewards/format_reward": 1.0, + "step": 338 + }, + { + "completion_length": 64.4140625, + "epoch": 4.0843373493975905, + "grad_norm": 3.9067810348739114, + "kl": 0.116943359375, + "learning_rate": 3.192771084337349e-07, + "loss": 0.0047, + "reward": 2.352308511734009, + "reward_std": 0.22002745419740677, + "rewards/accuracy_reward": 1.3523083925247192, + "rewards/format_reward": 1.0, + "step": 339 + }, + { + "completion_length": 73.2890625, + "epoch": 4.096385542168675, + "grad_norm": 4.489032904646898, + "kl": 0.104736328125, + "learning_rate": 3.172690763052209e-07, + "loss": 0.0042, + "reward": 2.1710336208343506, + "reward_std": 0.17718148604035378, + "rewards/accuracy_reward": 1.1710334420204163, + "rewards/format_reward": 1.0, + "step": 340 + }, + { + "completion_length": 74.3671875, + "epoch": 4.108433734939759, + "grad_norm": 4.230949730619595, + "kl": 0.139892578125, + "learning_rate": 3.152610441767068e-07, + "loss": 0.0056, + "reward": 2.084486246109009, + "reward_std": 0.2170683741569519, + "rewards/accuracy_reward": 1.0922988057136536, + "rewards/format_reward": 0.9921875, + "step": 341 + }, + { + "completion_length": 65.5625, + "epoch": 4.120481927710843, + "grad_norm": 5.461293103432774, + "kl": 0.1044921875, + "learning_rate": 3.132530120481928e-07, + "loss": 0.0042, + "reward": 2.381394147872925, + "reward_std": 0.193039670586586, + "rewards/accuracy_reward": 1.38139408826828, + "rewards/format_reward": 1.0, + "step": 342 + }, + { + "completion_length": 66.15625, + "epoch": 4.132530120481928, + "grad_norm": 4.070866693962467, + "kl": 0.111572265625, + "learning_rate": 3.112449799196787e-07, + "loss": 0.0045, + "reward": 2.357278347015381, + "reward_std": 0.15215902030467987, + "rewards/accuracy_reward": 1.3729035258293152, + "rewards/format_reward": 0.984375, + "step": 343 + }, + { + "completion_length": 69.1328125, + "epoch": 4.144578313253012, + "grad_norm": 4.335873726549927, + "kl": 0.123046875, + "learning_rate": 3.0923694779116464e-07, + "loss": 0.0049, + "reward": 2.282222032546997, + "reward_std": 0.25280918926000595, + "rewards/accuracy_reward": 1.2978470921516418, + "rewards/format_reward": 0.984375, + "step": 344 + }, + { + "completion_length": 73.6015625, + "epoch": 4.156626506024097, + "grad_norm": 4.412489990442917, + "kl": 0.09765625, + "learning_rate": 3.0722891566265056e-07, + "loss": 0.0039, + "reward": 2.421238660812378, + "reward_std": 0.21779820322990417, + "rewards/accuracy_reward": 1.4290512800216675, + "rewards/format_reward": 0.9921875, + "step": 345 + }, + { + "completion_length": 67.3984375, + "epoch": 4.168674698795181, + "grad_norm": 3.7050619604015775, + "kl": 0.111083984375, + "learning_rate": 3.0522088353413654e-07, + "loss": 0.0044, + "reward": 2.4159966707229614, + "reward_std": 0.17116259038448334, + "rewards/accuracy_reward": 1.4159966707229614, + "rewards/format_reward": 1.0, + "step": 346 + }, + { + "completion_length": 68.7109375, + "epoch": 4.180722891566265, + "grad_norm": 4.638840034522594, + "kl": 0.119873046875, + "learning_rate": 3.0321285140562247e-07, + "loss": 0.0048, + "reward": 2.430918335914612, + "reward_std": 0.23829656839370728, + "rewards/accuracy_reward": 1.4309183359146118, + "rewards/format_reward": 1.0, + "step": 347 + }, + { + "completion_length": 68.203125, + "epoch": 4.192771084337349, + "grad_norm": 7.531973472034052, + "kl": 0.124267578125, + "learning_rate": 3.0120481927710845e-07, + "loss": 0.005, + "reward": 2.2654261589050293, + "reward_std": 0.214869923889637, + "rewards/accuracy_reward": 1.2966760993003845, + "rewards/format_reward": 0.96875, + "step": 348 + }, + { + "completion_length": 66.3046875, + "epoch": 4.204819277108434, + "grad_norm": 6.290139006407989, + "kl": 0.15673828125, + "learning_rate": 2.991967871485944e-07, + "loss": 0.0063, + "reward": 2.440833330154419, + "reward_std": 0.20570393651723862, + "rewards/accuracy_reward": 1.4642709493637085, + "rewards/format_reward": 0.9765625, + "step": 349 + }, + { + "completion_length": 68.5078125, + "epoch": 4.216867469879518, + "grad_norm": 3.870085506410607, + "kl": 0.11376953125, + "learning_rate": 2.971887550200803e-07, + "loss": 0.0046, + "reward": 2.4419082403182983, + "reward_std": 0.1332126259803772, + "rewards/accuracy_reward": 1.441908359527588, + "rewards/format_reward": 1.0, + "step": 350 + }, + { + "completion_length": 67.7109375, + "epoch": 4.228915662650603, + "grad_norm": 5.222390077968289, + "kl": 0.12548828125, + "learning_rate": 2.9518072289156623e-07, + "loss": 0.005, + "reward": 2.354392647743225, + "reward_std": 0.250136561691761, + "rewards/accuracy_reward": 1.3700175285339355, + "rewards/format_reward": 0.984375, + "step": 351 + }, + { + "completion_length": 63.75, + "epoch": 4.240963855421687, + "grad_norm": 5.7394258697520835, + "kl": 0.13671875, + "learning_rate": 2.9317269076305216e-07, + "loss": 0.0055, + "reward": 2.1846532821655273, + "reward_std": 0.27685467153787613, + "rewards/accuracy_reward": 1.2080907225608826, + "rewards/format_reward": 0.9765625, + "step": 352 + }, + { + "completion_length": 68.734375, + "epoch": 4.253012048192771, + "grad_norm": 3.522967170920438, + "kl": 0.10400390625, + "learning_rate": 2.9116465863453814e-07, + "loss": 0.0041, + "reward": 2.315014600753784, + "reward_std": 0.13816260546445847, + "rewards/accuracy_reward": 1.3150146007537842, + "rewards/format_reward": 1.0, + "step": 353 + }, + { + "completion_length": 72.8125, + "epoch": 4.265060240963855, + "grad_norm": 3.727859373676823, + "kl": 0.12939453125, + "learning_rate": 2.8915662650602407e-07, + "loss": 0.0052, + "reward": 2.206972360610962, + "reward_std": 0.23467965424060822, + "rewards/accuracy_reward": 1.2069722414016724, + "rewards/format_reward": 1.0, + "step": 354 + }, + { + "completion_length": 70.3359375, + "epoch": 4.27710843373494, + "grad_norm": 3.380662774166939, + "kl": 0.09716796875, + "learning_rate": 2.8714859437751005e-07, + "loss": 0.0039, + "reward": 2.1916306018829346, + "reward_std": 0.23339906334877014, + "rewards/accuracy_reward": 1.2072556018829346, + "rewards/format_reward": 0.984375, + "step": 355 + }, + { + "completion_length": 72.4375, + "epoch": 4.289156626506024, + "grad_norm": 3.5703829288777764, + "kl": 0.11376953125, + "learning_rate": 2.85140562248996e-07, + "loss": 0.0046, + "reward": 2.142443895339966, + "reward_std": 0.2050827294588089, + "rewards/accuracy_reward": 1.1580689549446106, + "rewards/format_reward": 0.984375, + "step": 356 + }, + { + "completion_length": 66.9921875, + "epoch": 4.301204819277109, + "grad_norm": 3.6787951883313275, + "kl": 0.119873046875, + "learning_rate": 2.8313253012048195e-07, + "loss": 0.0048, + "reward": 2.6013587713241577, + "reward_std": 0.17792491614818573, + "rewards/accuracy_reward": 1.6013588309288025, + "rewards/format_reward": 1.0, + "step": 357 + }, + { + "completion_length": 67.1875, + "epoch": 4.313253012048193, + "grad_norm": 7.9299540096420476, + "kl": 0.111328125, + "learning_rate": 2.811244979919679e-07, + "loss": 0.0044, + "reward": 2.2114800214767456, + "reward_std": 0.2541910707950592, + "rewards/accuracy_reward": 1.2271050810813904, + "rewards/format_reward": 0.984375, + "step": 358 + }, + { + "completion_length": 69.1953125, + "epoch": 4.325301204819277, + "grad_norm": 3.7315177619787687, + "kl": 0.10400390625, + "learning_rate": 2.7911646586345376e-07, + "loss": 0.0042, + "reward": 2.2850147485733032, + "reward_std": 0.24116653203964233, + "rewards/accuracy_reward": 1.3084524869918823, + "rewards/format_reward": 0.9765625, + "step": 359 + }, + { + "completion_length": 76.6640625, + "epoch": 4.337349397590361, + "grad_norm": 3.8031600707561886, + "kl": 0.08984375, + "learning_rate": 2.7710843373493974e-07, + "loss": 0.0036, + "reward": 2.372725009918213, + "reward_std": 0.23598377406597137, + "rewards/accuracy_reward": 1.380537509918213, + "rewards/format_reward": 0.9921875, + "step": 360 + }, + { + "completion_length": 72.6015625, + "epoch": 4.349397590361446, + "grad_norm": 6.29903230301134, + "kl": 0.10205078125, + "learning_rate": 2.7510040160642566e-07, + "loss": 0.0041, + "reward": 2.3671088218688965, + "reward_std": 0.21375955641269684, + "rewards/accuracy_reward": 1.3749213814735413, + "rewards/format_reward": 0.9921875, + "step": 361 + }, + { + "completion_length": 74.546875, + "epoch": 4.36144578313253, + "grad_norm": 4.5097271327174555, + "kl": 0.100341796875, + "learning_rate": 2.7309236947791164e-07, + "loss": 0.004, + "reward": 2.338581085205078, + "reward_std": 0.21793486177921295, + "rewards/accuracy_reward": 1.3463934063911438, + "rewards/format_reward": 0.9921875, + "step": 362 + }, + { + "completion_length": 73.203125, + "epoch": 4.373493975903615, + "grad_norm": 7.563928087147195, + "kl": 0.093505859375, + "learning_rate": 2.7108433734939757e-07, + "loss": 0.0037, + "reward": 2.4811813831329346, + "reward_std": 0.1661686971783638, + "rewards/accuracy_reward": 1.4811814427375793, + "rewards/format_reward": 1.0, + "step": 363 + }, + { + "completion_length": 72.2109375, + "epoch": 4.385542168674699, + "grad_norm": 4.157739455544304, + "kl": 0.11767578125, + "learning_rate": 2.6907630522088355e-07, + "loss": 0.0047, + "reward": 2.227518320083618, + "reward_std": 0.2459297701716423, + "rewards/accuracy_reward": 1.235330879688263, + "rewards/format_reward": 0.9921875, + "step": 364 + }, + { + "completion_length": 73.125, + "epoch": 4.397590361445783, + "grad_norm": 3.957643739786318, + "kl": 0.130126953125, + "learning_rate": 2.670682730923695e-07, + "loss": 0.0052, + "reward": 2.398737668991089, + "reward_std": 0.2508920058608055, + "rewards/accuracy_reward": 1.406550109386444, + "rewards/format_reward": 0.9921875, + "step": 365 + }, + { + "completion_length": 80.6484375, + "epoch": 4.409638554216867, + "grad_norm": 8.267939908268028, + "kl": 0.126220703125, + "learning_rate": 2.6506024096385546e-07, + "loss": 0.005, + "reward": 2.1884970664978027, + "reward_std": 0.32723745703697205, + "rewards/accuracy_reward": 1.2119346857070923, + "rewards/format_reward": 0.9765625, + "step": 366 + }, + { + "completion_length": 80.09375, + "epoch": 4.421686746987952, + "grad_norm": 3.0023836541953988, + "kl": 0.089111328125, + "learning_rate": 2.6305220883534133e-07, + "loss": 0.0036, + "reward": 2.4019484519958496, + "reward_std": 0.20879995077848434, + "rewards/accuracy_reward": 1.4019483923912048, + "rewards/format_reward": 1.0, + "step": 367 + }, + { + "completion_length": 76.890625, + "epoch": 4.433734939759036, + "grad_norm": 3.8760535577901916, + "kl": 0.110107421875, + "learning_rate": 2.610441767068273e-07, + "loss": 0.0044, + "reward": 2.217389702796936, + "reward_std": 0.20581622421741486, + "rewards/accuracy_reward": 1.225202202796936, + "rewards/format_reward": 0.9921875, + "step": 368 + }, + { + "completion_length": 70.046875, + "epoch": 4.445783132530121, + "grad_norm": 4.189426211226252, + "kl": 0.09912109375, + "learning_rate": 2.5903614457831324e-07, + "loss": 0.004, + "reward": 2.3884357213974, + "reward_std": 0.23216703534126282, + "rewards/accuracy_reward": 1.4118732213974, + "rewards/format_reward": 0.9765625, + "step": 369 + }, + { + "completion_length": 75.3125, + "epoch": 4.457831325301205, + "grad_norm": 3.5709834038432886, + "kl": 0.112060546875, + "learning_rate": 2.5702811244979916e-07, + "loss": 0.0045, + "reward": 2.4395360946655273, + "reward_std": 0.25345855951309204, + "rewards/accuracy_reward": 1.4551611542701721, + "rewards/format_reward": 0.984375, + "step": 370 + }, + { + "completion_length": 76.03125, + "epoch": 4.469879518072289, + "grad_norm": 3.8012985013892897, + "kl": 0.11962890625, + "learning_rate": 2.5502008032128514e-07, + "loss": 0.0048, + "reward": 2.2614444494247437, + "reward_std": 0.25984859466552734, + "rewards/accuracy_reward": 1.2692569494247437, + "rewards/format_reward": 0.9921875, + "step": 371 + }, + { + "completion_length": 72.34375, + "epoch": 4.481927710843373, + "grad_norm": 3.81905493683615, + "kl": 0.118408203125, + "learning_rate": 2.5301204819277107e-07, + "loss": 0.0047, + "reward": 2.24534273147583, + "reward_std": 0.2783522978425026, + "rewards/accuracy_reward": 1.25315523147583, + "rewards/format_reward": 0.9921875, + "step": 372 + }, + { + "completion_length": 73.625, + "epoch": 4.493975903614458, + "grad_norm": 5.859434170398068, + "kl": 0.129638671875, + "learning_rate": 2.5100401606425705e-07, + "loss": 0.0052, + "reward": 2.242166519165039, + "reward_std": 0.19818732887506485, + "rewards/accuracy_reward": 1.2421664595603943, + "rewards/format_reward": 1.0, + "step": 373 + }, + { + "completion_length": 70.7734375, + "epoch": 4.506024096385542, + "grad_norm": 4.577359942879205, + "kl": 0.113037109375, + "learning_rate": 2.489959839357429e-07, + "loss": 0.0045, + "reward": 2.40807843208313, + "reward_std": 0.16506175324320793, + "rewards/accuracy_reward": 1.408078372478485, + "rewards/format_reward": 1.0, + "step": 374 + }, + { + "completion_length": 71.6484375, + "epoch": 4.518072289156627, + "grad_norm": 3.6969886550918627, + "kl": 0.0947265625, + "learning_rate": 2.469879518072289e-07, + "loss": 0.0038, + "reward": 2.4090828895568848, + "reward_std": 0.17872843891382217, + "rewards/accuracy_reward": 1.4090829491615295, + "rewards/format_reward": 1.0, + "step": 375 + }, + { + "completion_length": 75.640625, + "epoch": 4.530120481927711, + "grad_norm": 3.182069910394249, + "kl": 0.112548828125, + "learning_rate": 2.4497991967871483e-07, + "loss": 0.0045, + "reward": 2.429325222969055, + "reward_std": 0.18355486541986465, + "rewards/accuracy_reward": 1.4371376037597656, + "rewards/format_reward": 0.9921875, + "step": 376 + }, + { + "completion_length": 76.8515625, + "epoch": 4.542168674698795, + "grad_norm": 4.3761923522139625, + "kl": 0.103515625, + "learning_rate": 2.429718875502008e-07, + "loss": 0.0041, + "reward": 2.215627670288086, + "reward_std": 0.29024538397789, + "rewards/accuracy_reward": 1.2234401106834412, + "rewards/format_reward": 0.9921875, + "step": 377 + }, + { + "completion_length": 72.640625, + "epoch": 4.554216867469879, + "grad_norm": 5.739152465768093, + "kl": 0.096923828125, + "learning_rate": 2.4096385542168674e-07, + "loss": 0.0039, + "reward": 2.3864386081695557, + "reward_std": 0.14991050213575363, + "rewards/accuracy_reward": 1.3864384889602661, + "rewards/format_reward": 1.0, + "step": 378 + }, + { + "completion_length": 73.7890625, + "epoch": 4.566265060240964, + "grad_norm": 4.330609617515541, + "kl": 0.105712890625, + "learning_rate": 2.3895582329317267e-07, + "loss": 0.0042, + "reward": 2.2676793336868286, + "reward_std": 0.1841476932168007, + "rewards/accuracy_reward": 1.2754917740821838, + "rewards/format_reward": 0.9921875, + "step": 379 + }, + { + "completion_length": 69.5859375, + "epoch": 4.578313253012048, + "grad_norm": 16.70825245009543, + "kl": 0.103515625, + "learning_rate": 2.3694779116465862e-07, + "loss": 0.0041, + "reward": 2.3687047958374023, + "reward_std": 0.23368250578641891, + "rewards/accuracy_reward": 1.3765172958374023, + "rewards/format_reward": 0.9921875, + "step": 380 + }, + { + "completion_length": 68.5703125, + "epoch": 4.590361445783133, + "grad_norm": 4.946973705468274, + "kl": 0.11865234375, + "learning_rate": 2.3493975903614457e-07, + "loss": 0.0047, + "reward": 2.409714102745056, + "reward_std": 0.17494437843561172, + "rewards/accuracy_reward": 1.4175265431404114, + "rewards/format_reward": 0.9921875, + "step": 381 + }, + { + "completion_length": 69.09375, + "epoch": 4.602409638554217, + "grad_norm": 3.4407209788639155, + "kl": 0.108154296875, + "learning_rate": 2.3293172690763053e-07, + "loss": 0.0043, + "reward": 2.3722596168518066, + "reward_std": 0.2456066906452179, + "rewards/accuracy_reward": 1.3722596764564514, + "rewards/format_reward": 1.0, + "step": 382 + }, + { + "completion_length": 73.40625, + "epoch": 4.614457831325301, + "grad_norm": 6.785057754949663, + "kl": 0.093017578125, + "learning_rate": 2.3092369477911648e-07, + "loss": 0.0037, + "reward": 2.390730619430542, + "reward_std": 0.13034258037805557, + "rewards/accuracy_reward": 1.390730619430542, + "rewards/format_reward": 1.0, + "step": 383 + }, + { + "completion_length": 69.578125, + "epoch": 4.626506024096385, + "grad_norm": 4.146766679362004, + "kl": 0.110107421875, + "learning_rate": 2.2891566265060238e-07, + "loss": 0.0044, + "reward": 2.457837224006653, + "reward_std": 0.19646844267845154, + "rewards/accuracy_reward": 1.465649664402008, + "rewards/format_reward": 0.9921875, + "step": 384 + }, + { + "completion_length": 71.4765625, + "epoch": 4.63855421686747, + "grad_norm": 3.5134218173180884, + "kl": 0.10791015625, + "learning_rate": 2.2690763052208834e-07, + "loss": 0.0043, + "reward": 2.2395870685577393, + "reward_std": 0.23986083269119263, + "rewards/accuracy_reward": 1.2630245089530945, + "rewards/format_reward": 0.9765625, + "step": 385 + }, + { + "completion_length": 67.8984375, + "epoch": 4.650602409638554, + "grad_norm": 3.5532098801033323, + "kl": 0.112060546875, + "learning_rate": 2.248995983935743e-07, + "loss": 0.0045, + "reward": 2.155800759792328, + "reward_std": 0.26599714159965515, + "rewards/accuracy_reward": 1.1714258790016174, + "rewards/format_reward": 0.984375, + "step": 386 + }, + { + "completion_length": 67.921875, + "epoch": 4.662650602409639, + "grad_norm": 3.977191337497143, + "kl": 0.12353515625, + "learning_rate": 2.2289156626506022e-07, + "loss": 0.0049, + "reward": 2.1573885679244995, + "reward_std": 0.19674725830554962, + "rewards/accuracy_reward": 1.165201187133789, + "rewards/format_reward": 0.9921875, + "step": 387 + }, + { + "completion_length": 73.3671875, + "epoch": 4.674698795180722, + "grad_norm": 3.4384187805900894, + "kl": 0.1005859375, + "learning_rate": 2.2088353413654617e-07, + "loss": 0.004, + "reward": 2.238619089126587, + "reward_std": 0.1663391888141632, + "rewards/accuracy_reward": 1.2386190295219421, + "rewards/format_reward": 1.0, + "step": 388 + }, + { + "completion_length": 71.3515625, + "epoch": 4.686746987951807, + "grad_norm": 3.6715987846617737, + "kl": 0.1103515625, + "learning_rate": 2.1887550200803212e-07, + "loss": 0.0044, + "reward": 2.2813053131103516, + "reward_std": 0.20307840406894684, + "rewards/accuracy_reward": 1.2891177535057068, + "rewards/format_reward": 0.9921875, + "step": 389 + }, + { + "completion_length": 67.8671875, + "epoch": 4.698795180722891, + "grad_norm": 4.1990886176906566, + "kl": 0.1181640625, + "learning_rate": 2.1686746987951808e-07, + "loss": 0.0047, + "reward": 2.3316123485565186, + "reward_std": 0.18899912387132645, + "rewards/accuracy_reward": 1.339424967765808, + "rewards/format_reward": 0.9921875, + "step": 390 + }, + { + "completion_length": 73.5390625, + "epoch": 4.710843373493976, + "grad_norm": 4.5848307121684035, + "kl": 0.11767578125, + "learning_rate": 2.14859437751004e-07, + "loss": 0.0047, + "reward": 2.3556346893310547, + "reward_std": 0.17518161982297897, + "rewards/accuracy_reward": 1.3634473085403442, + "rewards/format_reward": 0.9921875, + "step": 391 + }, + { + "completion_length": 73.3828125, + "epoch": 4.72289156626506, + "grad_norm": 4.308895887462787, + "kl": 0.09716796875, + "learning_rate": 2.1285140562248996e-07, + "loss": 0.0039, + "reward": 2.3230199813842773, + "reward_std": 0.2215501293540001, + "rewards/accuracy_reward": 1.3230200409889221, + "rewards/format_reward": 1.0, + "step": 392 + }, + { + "completion_length": 71.625, + "epoch": 4.734939759036145, + "grad_norm": 3.8869195849917335, + "kl": 0.117919921875, + "learning_rate": 2.108433734939759e-07, + "loss": 0.0047, + "reward": 2.311624765396118, + "reward_std": 0.233637273311615, + "rewards/accuracy_reward": 1.3116250038146973, + "rewards/format_reward": 1.0, + "step": 393 + }, + { + "completion_length": 67.828125, + "epoch": 4.746987951807229, + "grad_norm": 4.950759054297939, + "kl": 0.10888671875, + "learning_rate": 2.0883534136546184e-07, + "loss": 0.0044, + "reward": 2.379747152328491, + "reward_std": 0.19298578798770905, + "rewards/accuracy_reward": 1.3797469735145569, + "rewards/format_reward": 1.0, + "step": 394 + }, + { + "completion_length": 72.2578125, + "epoch": 4.759036144578313, + "grad_norm": 45.47765651174386, + "kl": 0.126708984375, + "learning_rate": 2.0682730923694776e-07, + "loss": 0.0051, + "reward": 2.078563928604126, + "reward_std": 0.253988578915596, + "rewards/accuracy_reward": 1.0941888689994812, + "rewards/format_reward": 0.984375, + "step": 395 + }, + { + "completion_length": 71.6484375, + "epoch": 4.771084337349397, + "grad_norm": 6.044646695827286, + "kl": 0.13916015625, + "learning_rate": 2.0481927710843372e-07, + "loss": 0.0056, + "reward": 2.485829472541809, + "reward_std": 0.180104598402977, + "rewards/accuracy_reward": 1.4858292937278748, + "rewards/format_reward": 1.0, + "step": 396 + }, + { + "completion_length": 65.09375, + "epoch": 4.783132530120482, + "grad_norm": 4.360820446081869, + "kl": 0.1416015625, + "learning_rate": 2.0281124497991967e-07, + "loss": 0.0057, + "reward": 2.1638635396957397, + "reward_std": 0.31551285088062286, + "rewards/accuracy_reward": 1.1873010993003845, + "rewards/format_reward": 0.9765625, + "step": 397 + }, + { + "completion_length": 70.6328125, + "epoch": 4.795180722891566, + "grad_norm": 5.234619949658262, + "kl": 0.115966796875, + "learning_rate": 2.0080321285140563e-07, + "loss": 0.0046, + "reward": 2.424190402030945, + "reward_std": 0.23157334327697754, + "rewards/accuracy_reward": 1.4241904616355896, + "rewards/format_reward": 1.0, + "step": 398 + }, + { + "completion_length": 70.4375, + "epoch": 4.807228915662651, + "grad_norm": 5.2543384630783265, + "kl": 0.12060546875, + "learning_rate": 1.9879518072289155e-07, + "loss": 0.0048, + "reward": 2.3333520889282227, + "reward_std": 0.2145429253578186, + "rewards/accuracy_reward": 1.3411647081375122, + "rewards/format_reward": 0.9921875, + "step": 399 + }, + { + "completion_length": 65.421875, + "epoch": 4.8192771084337345, + "grad_norm": 6.050688926597152, + "kl": 0.125732421875, + "learning_rate": 1.967871485943775e-07, + "loss": 0.005, + "reward": 2.412783145904541, + "reward_std": 0.2059781178832054, + "rewards/accuracy_reward": 1.420595645904541, + "rewards/format_reward": 0.9921875, + "step": 400 + }, + { + "completion_length": 63.5546875, + "epoch": 4.831325301204819, + "grad_norm": 4.14350718873446, + "kl": 0.143798828125, + "learning_rate": 1.9477911646586346e-07, + "loss": 0.0057, + "reward": 2.3667309284210205, + "reward_std": 0.1764308363199234, + "rewards/accuracy_reward": 1.3745434284210205, + "rewards/format_reward": 0.9921875, + "step": 401 + }, + { + "completion_length": 71.8671875, + "epoch": 4.843373493975903, + "grad_norm": 4.134424932683493, + "kl": 0.126953125, + "learning_rate": 1.9277108433734939e-07, + "loss": 0.0051, + "reward": 2.2129541635513306, + "reward_std": 0.1565767452120781, + "rewards/accuracy_reward": 1.2129541635513306, + "rewards/format_reward": 1.0, + "step": 402 + }, + { + "completion_length": 64.0390625, + "epoch": 4.855421686746988, + "grad_norm": 4.135875391105592, + "kl": 0.166015625, + "learning_rate": 1.9076305220883534e-07, + "loss": 0.0066, + "reward": 2.3259581327438354, + "reward_std": 0.2349315583705902, + "rewards/accuracy_reward": 1.3259583115577698, + "rewards/format_reward": 1.0, + "step": 403 + }, + { + "completion_length": 66.515625, + "epoch": 4.867469879518072, + "grad_norm": 4.276605246406482, + "kl": 0.138916015625, + "learning_rate": 1.8875502008032127e-07, + "loss": 0.0056, + "reward": 2.306966781616211, + "reward_std": 0.2081274688243866, + "rewards/accuracy_reward": 1.3069666624069214, + "rewards/format_reward": 1.0, + "step": 404 + }, + { + "completion_length": 62.28125, + "epoch": 4.879518072289157, + "grad_norm": 4.594134632277065, + "kl": 0.1826171875, + "learning_rate": 1.8674698795180722e-07, + "loss": 0.0073, + "reward": 2.126552700996399, + "reward_std": 0.255823478102684, + "rewards/accuracy_reward": 1.1421778202056885, + "rewards/format_reward": 0.984375, + "step": 405 + }, + { + "completion_length": 62.3671875, + "epoch": 4.891566265060241, + "grad_norm": 3.568434088807843, + "kl": 0.14013671875, + "learning_rate": 1.8473895582329315e-07, + "loss": 0.0056, + "reward": 2.417848587036133, + "reward_std": 0.22225632518529892, + "rewards/accuracy_reward": 1.4334735870361328, + "rewards/format_reward": 0.984375, + "step": 406 + }, + { + "completion_length": 66.5078125, + "epoch": 4.903614457831325, + "grad_norm": 4.123527789276523, + "kl": 0.10986328125, + "learning_rate": 1.827309236947791e-07, + "loss": 0.0044, + "reward": 2.294624924659729, + "reward_std": 0.19924252480268478, + "rewards/accuracy_reward": 1.3024373650550842, + "rewards/format_reward": 0.9921875, + "step": 407 + }, + { + "completion_length": 66.390625, + "epoch": 4.9156626506024095, + "grad_norm": 3.62978164804241, + "kl": 0.12890625, + "learning_rate": 1.8072289156626505e-07, + "loss": 0.0051, + "reward": 2.543404698371887, + "reward_std": 0.1362360306084156, + "rewards/accuracy_reward": 1.5434046983718872, + "rewards/format_reward": 1.0, + "step": 408 + }, + { + "completion_length": 63.9765625, + "epoch": 4.927710843373494, + "grad_norm": 4.35384844886202, + "kl": 0.12890625, + "learning_rate": 1.78714859437751e-07, + "loss": 0.0052, + "reward": 2.418124198913574, + "reward_std": 0.22236012667417526, + "rewards/accuracy_reward": 1.4337490797042847, + "rewards/format_reward": 0.984375, + "step": 409 + }, + { + "completion_length": 68.90625, + "epoch": 4.9397590361445785, + "grad_norm": 5.014972518639089, + "kl": 0.1103515625, + "learning_rate": 1.7670682730923694e-07, + "loss": 0.0044, + "reward": 2.4006751775741577, + "reward_std": 0.16714774072170258, + "rewards/accuracy_reward": 1.4006752967834473, + "rewards/format_reward": 1.0, + "step": 410 + }, + { + "completion_length": 69.59375, + "epoch": 4.951807228915663, + "grad_norm": 7.696032017895469, + "kl": 0.13916015625, + "learning_rate": 1.746987951807229e-07, + "loss": 0.0056, + "reward": 2.395194172859192, + "reward_std": 0.16039493680000305, + "rewards/accuracy_reward": 1.3951941132545471, + "rewards/format_reward": 1.0, + "step": 411 + }, + { + "completion_length": 70.125, + "epoch": 4.9638554216867465, + "grad_norm": 4.628350833888434, + "kl": 0.149169921875, + "learning_rate": 1.7269076305220884e-07, + "loss": 0.006, + "reward": 2.1348607540130615, + "reward_std": 0.1709538996219635, + "rewards/accuracy_reward": 1.1348606944084167, + "rewards/format_reward": 1.0, + "step": 412 + }, + { + "completion_length": 66.2109375, + "epoch": 4.975903614457831, + "grad_norm": 3.188607704812383, + "kl": 0.12646484375, + "learning_rate": 1.706827309236948e-07, + "loss": 0.0051, + "reward": 2.302504062652588, + "reward_std": 0.2623682767152786, + "rewards/accuracy_reward": 1.3181291222572327, + "rewards/format_reward": 0.984375, + "step": 413 + }, + { + "completion_length": 64.171875, + "epoch": 4.9879518072289155, + "grad_norm": 3.9665667179390773, + "kl": 0.128662109375, + "learning_rate": 1.686746987951807e-07, + "loss": 0.0052, + "reward": 2.4097338914871216, + "reward_std": 0.17293449118733406, + "rewards/accuracy_reward": 1.4097338318824768, + "rewards/format_reward": 1.0, + "step": 414 + }, + { + "completion_length": 77.33333587646484, + "epoch": 5.0, + "grad_norm": 3.313170759959086, + "kl": 0.1083984375, + "learning_rate": 1.6666666666666665e-07, + "loss": 0.004, + "reward": 2.2759520411491394, + "reward_std": 0.1403224766254425, + "rewards/accuracy_reward": 1.2759520411491394, + "rewards/format_reward": 1.0, + "step": 415 + }, + { + "completion_length": 66.3203125, + "epoch": 5.0120481927710845, + "grad_norm": 4.277881132595083, + "kl": 0.14306640625, + "learning_rate": 1.646586345381526e-07, + "loss": 0.0057, + "reward": 2.373741865158081, + "reward_std": 0.20744601637125015, + "rewards/accuracy_reward": 1.3815542459487915, + "rewards/format_reward": 0.9921875, + "step": 416 + }, + { + "completion_length": 66.53125, + "epoch": 5.024096385542169, + "grad_norm": 3.9929439696450575, + "kl": 0.12939453125, + "learning_rate": 1.6265060240963853e-07, + "loss": 0.0052, + "reward": 2.35166335105896, + "reward_std": 0.2503097951412201, + "rewards/accuracy_reward": 1.35166335105896, + "rewards/format_reward": 1.0, + "step": 417 + }, + { + "completion_length": 68.625, + "epoch": 5.036144578313253, + "grad_norm": 4.023924792103433, + "kl": 0.114013671875, + "learning_rate": 1.6064257028112448e-07, + "loss": 0.0046, + "reward": 2.2476612329483032, + "reward_std": 0.185993991792202, + "rewards/accuracy_reward": 1.2554737329483032, + "rewards/format_reward": 0.9921875, + "step": 418 + }, + { + "completion_length": 65.7421875, + "epoch": 5.048192771084337, + "grad_norm": 3.5711137415239618, + "kl": 0.134033203125, + "learning_rate": 1.5863453815261044e-07, + "loss": 0.0054, + "reward": 2.2856324911117554, + "reward_std": 0.14102690666913986, + "rewards/accuracy_reward": 1.2856324911117554, + "rewards/format_reward": 1.0, + "step": 419 + }, + { + "completion_length": 65.1328125, + "epoch": 5.0602409638554215, + "grad_norm": 5.8881280705003505, + "kl": 0.1259765625, + "learning_rate": 1.566265060240964e-07, + "loss": 0.005, + "reward": 2.474275588989258, + "reward_std": 0.2030300498008728, + "rewards/accuracy_reward": 1.474275529384613, + "rewards/format_reward": 1.0, + "step": 420 + }, + { + "completion_length": 59.453125, + "epoch": 5.072289156626506, + "grad_norm": 17.487945694806488, + "kl": 0.1279296875, + "learning_rate": 1.5461847389558232e-07, + "loss": 0.0051, + "reward": 2.468233823776245, + "reward_std": 0.17333931475877762, + "rewards/accuracy_reward": 1.4682338237762451, + "rewards/format_reward": 1.0, + "step": 421 + }, + { + "completion_length": 67.7421875, + "epoch": 5.0843373493975905, + "grad_norm": 4.5642738703913865, + "kl": 0.12646484375, + "learning_rate": 1.5261044176706827e-07, + "loss": 0.0051, + "reward": 2.39510977268219, + "reward_std": 0.1837218478322029, + "rewards/accuracy_reward": 1.3951098918914795, + "rewards/format_reward": 1.0, + "step": 422 + }, + { + "completion_length": 64.515625, + "epoch": 5.096385542168675, + "grad_norm": 7.684070732359071, + "kl": 0.139892578125, + "learning_rate": 1.5060240963855423e-07, + "loss": 0.0056, + "reward": 2.16294264793396, + "reward_std": 0.14895135164260864, + "rewards/accuracy_reward": 1.1707550883293152, + "rewards/format_reward": 0.9921875, + "step": 423 + }, + { + "completion_length": 64.46875, + "epoch": 5.108433734939759, + "grad_norm": 3.930344733874979, + "kl": 0.11669921875, + "learning_rate": 1.4859437751004015e-07, + "loss": 0.0047, + "reward": 2.3980486392974854, + "reward_std": 0.15896277129650116, + "rewards/accuracy_reward": 1.3980485796928406, + "rewards/format_reward": 1.0, + "step": 424 + }, + { + "completion_length": 68.875, + "epoch": 5.120481927710843, + "grad_norm": 6.912033255857147, + "kl": 0.118896484375, + "learning_rate": 1.4658634538152608e-07, + "loss": 0.0048, + "reward": 2.4401201009750366, + "reward_std": 0.18969366699457169, + "rewards/accuracy_reward": 1.440119981765747, + "rewards/format_reward": 1.0, + "step": 425 + }, + { + "completion_length": 65.609375, + "epoch": 5.132530120481928, + "grad_norm": 3.6477005267341163, + "kl": 0.1708984375, + "learning_rate": 1.4457831325301203e-07, + "loss": 0.0068, + "reward": 2.300011992454529, + "reward_std": 0.2104162722826004, + "rewards/accuracy_reward": 1.300011932849884, + "rewards/format_reward": 1.0, + "step": 426 + }, + { + "completion_length": 65.0859375, + "epoch": 5.144578313253012, + "grad_norm": 5.390081007205584, + "kl": 0.12548828125, + "learning_rate": 1.42570281124498e-07, + "loss": 0.005, + "reward": 2.407547354698181, + "reward_std": 0.19479839503765106, + "rewards/accuracy_reward": 1.4075472354888916, + "rewards/format_reward": 1.0, + "step": 427 + }, + { + "completion_length": 65.8046875, + "epoch": 5.156626506024097, + "grad_norm": 5.842696773596783, + "kl": 0.12255859375, + "learning_rate": 1.4056224899598394e-07, + "loss": 0.0049, + "reward": 2.2872836589813232, + "reward_std": 0.2501709461212158, + "rewards/accuracy_reward": 1.2950963973999023, + "rewards/format_reward": 0.9921875, + "step": 428 + }, + { + "completion_length": 67.2890625, + "epoch": 5.168674698795181, + "grad_norm": 3.9373211288360612, + "kl": 0.134765625, + "learning_rate": 1.3855421686746987e-07, + "loss": 0.0054, + "reward": 2.4114162921905518, + "reward_std": 0.22173649817705154, + "rewards/accuracy_reward": 1.419228732585907, + "rewards/format_reward": 0.9921875, + "step": 429 + }, + { + "completion_length": 65.7265625, + "epoch": 5.180722891566265, + "grad_norm": 5.989728831260378, + "kl": 0.20263671875, + "learning_rate": 1.3654618473895582e-07, + "loss": 0.0081, + "reward": 2.349661111831665, + "reward_std": 0.24485966563224792, + "rewards/accuracy_reward": 1.3496609926223755, + "rewards/format_reward": 1.0, + "step": 430 + }, + { + "completion_length": 71.0390625, + "epoch": 5.192771084337349, + "grad_norm": 4.9722233041190425, + "kl": 0.11083984375, + "learning_rate": 1.3453815261044177e-07, + "loss": 0.0044, + "reward": 2.423168659210205, + "reward_std": 0.16536322236061096, + "rewards/accuracy_reward": 1.4231685996055603, + "rewards/format_reward": 1.0, + "step": 431 + }, + { + "completion_length": 66.234375, + "epoch": 5.204819277108434, + "grad_norm": 3.5058259130400162, + "kl": 0.1376953125, + "learning_rate": 1.3253012048192773e-07, + "loss": 0.0055, + "reward": 2.2352651357650757, + "reward_std": 0.18688317388296127, + "rewards/accuracy_reward": 1.2352651357650757, + "rewards/format_reward": 1.0, + "step": 432 + }, + { + "completion_length": 72.8203125, + "epoch": 5.216867469879518, + "grad_norm": 3.8748331360003485, + "kl": 0.130859375, + "learning_rate": 1.3052208835341366e-07, + "loss": 0.0052, + "reward": 2.3151748180389404, + "reward_std": 0.21110112965106964, + "rewards/accuracy_reward": 1.3229871988296509, + "rewards/format_reward": 0.9921875, + "step": 433 + }, + { + "completion_length": 68.8671875, + "epoch": 5.228915662650603, + "grad_norm": 3.985332448415374, + "kl": 0.1220703125, + "learning_rate": 1.2851405622489958e-07, + "loss": 0.0049, + "reward": 2.26615047454834, + "reward_std": 0.20259422063827515, + "rewards/accuracy_reward": 1.2739630937576294, + "rewards/format_reward": 0.9921875, + "step": 434 + }, + { + "completion_length": 64.0234375, + "epoch": 5.240963855421687, + "grad_norm": 4.209088113123041, + "kl": 0.119873046875, + "learning_rate": 1.2650602409638554e-07, + "loss": 0.0048, + "reward": 2.345677137374878, + "reward_std": 0.16655350476503372, + "rewards/accuracy_reward": 1.345677137374878, + "rewards/format_reward": 1.0, + "step": 435 + }, + { + "completion_length": 72.2109375, + "epoch": 5.253012048192771, + "grad_norm": 3.7180924645581994, + "kl": 0.13427734375, + "learning_rate": 1.2449799196787146e-07, + "loss": 0.0054, + "reward": 2.163213849067688, + "reward_std": 0.3149610310792923, + "rewards/accuracy_reward": 1.1866515278816223, + "rewards/format_reward": 0.9765625, + "step": 436 + }, + { + "completion_length": 65.328125, + "epoch": 5.265060240963855, + "grad_norm": 3.8280472693841556, + "kl": 0.12744140625, + "learning_rate": 1.2248995983935742e-07, + "loss": 0.0051, + "reward": 2.3446794748306274, + "reward_std": 0.22430174052715302, + "rewards/accuracy_reward": 1.3446794152259827, + "rewards/format_reward": 1.0, + "step": 437 + }, + { + "completion_length": 64.65625, + "epoch": 5.27710843373494, + "grad_norm": 5.861122122648032, + "kl": 0.12060546875, + "learning_rate": 1.2048192771084337e-07, + "loss": 0.0048, + "reward": 2.379356861114502, + "reward_std": 0.1506607085466385, + "rewards/accuracy_reward": 1.3871691226959229, + "rewards/format_reward": 0.9921875, + "step": 438 + }, + { + "completion_length": 71.1171875, + "epoch": 5.289156626506024, + "grad_norm": 3.8119653679452092, + "kl": 0.12353515625, + "learning_rate": 1.1847389558232931e-07, + "loss": 0.0049, + "reward": 2.388357400894165, + "reward_std": 0.23687779903411865, + "rewards/accuracy_reward": 1.3961697816848755, + "rewards/format_reward": 0.9921875, + "step": 439 + }, + { + "completion_length": 72.3515625, + "epoch": 5.301204819277109, + "grad_norm": 3.9178115284886372, + "kl": 0.095458984375, + "learning_rate": 1.1646586345381526e-07, + "loss": 0.0038, + "reward": 2.6513583660125732, + "reward_std": 0.17830242216587067, + "rewards/accuracy_reward": 1.6513583660125732, + "rewards/format_reward": 1.0, + "step": 440 + }, + { + "completion_length": 68.921875, + "epoch": 5.313253012048193, + "grad_norm": 4.623442869387058, + "kl": 0.100830078125, + "learning_rate": 1.1445783132530119e-07, + "loss": 0.004, + "reward": 2.549654483795166, + "reward_std": 0.16079290956258774, + "rewards/accuracy_reward": 1.5574671030044556, + "rewards/format_reward": 0.9921875, + "step": 441 + }, + { + "completion_length": 71.3203125, + "epoch": 5.325301204819277, + "grad_norm": 5.278895722638805, + "kl": 0.10986328125, + "learning_rate": 1.1244979919678714e-07, + "loss": 0.0044, + "reward": 2.203883409500122, + "reward_std": 0.258064404129982, + "rewards/accuracy_reward": 1.2116957902908325, + "rewards/format_reward": 0.9921875, + "step": 442 + }, + { + "completion_length": 69.515625, + "epoch": 5.337349397590361, + "grad_norm": 4.142710717599773, + "kl": 0.113525390625, + "learning_rate": 1.1044176706827308e-07, + "loss": 0.0045, + "reward": 2.1769516468048096, + "reward_std": 0.275626465678215, + "rewards/accuracy_reward": 1.1769516468048096, + "rewards/format_reward": 1.0, + "step": 443 + }, + { + "completion_length": 68.3203125, + "epoch": 5.349397590361446, + "grad_norm": 4.180078412016221, + "kl": 0.147216796875, + "learning_rate": 1.0843373493975904e-07, + "loss": 0.0059, + "reward": 2.381720542907715, + "reward_std": 0.20287376642227173, + "rewards/accuracy_reward": 1.3817205429077148, + "rewards/format_reward": 1.0, + "step": 444 + }, + { + "completion_length": 69.7421875, + "epoch": 5.36144578313253, + "grad_norm": 3.7523897150785603, + "kl": 0.12939453125, + "learning_rate": 1.0642570281124498e-07, + "loss": 0.0052, + "reward": 2.3669261932373047, + "reward_std": 0.2056456208229065, + "rewards/accuracy_reward": 1.3747385740280151, + "rewards/format_reward": 0.9921875, + "step": 445 + }, + { + "completion_length": 67.7109375, + "epoch": 5.373493975903615, + "grad_norm": 4.924758819089559, + "kl": 0.185546875, + "learning_rate": 1.0441767068273092e-07, + "loss": 0.0074, + "reward": 2.4100332260131836, + "reward_std": 0.22913093864917755, + "rewards/accuracy_reward": 1.4178457260131836, + "rewards/format_reward": 0.9921875, + "step": 446 + }, + { + "completion_length": 69.1875, + "epoch": 5.385542168674699, + "grad_norm": 3.080626056952063, + "kl": 0.122314453125, + "learning_rate": 1.0240963855421686e-07, + "loss": 0.0049, + "reward": 2.3073067665100098, + "reward_std": 0.23586007952690125, + "rewards/accuracy_reward": 1.315119206905365, + "rewards/format_reward": 0.9921875, + "step": 447 + }, + { + "completion_length": 67.59375, + "epoch": 5.397590361445783, + "grad_norm": 3.8573400804993314, + "kl": 0.128662109375, + "learning_rate": 1.0040160642570281e-07, + "loss": 0.0051, + "reward": 2.2195699214935303, + "reward_std": 0.18059836328029633, + "rewards/accuracy_reward": 1.2195698618888855, + "rewards/format_reward": 1.0, + "step": 448 + }, + { + "completion_length": 65.0078125, + "epoch": 5.409638554216867, + "grad_norm": 9.729377045307634, + "kl": 0.110107421875, + "learning_rate": 9.839357429718875e-08, + "loss": 0.0044, + "reward": 2.335146427154541, + "reward_std": 0.20962534099817276, + "rewards/accuracy_reward": 1.3429590463638306, + "rewards/format_reward": 0.9921875, + "step": 449 + }, + { + "completion_length": 76.171875, + "epoch": 5.421686746987952, + "grad_norm": 5.139417091846479, + "kl": 0.17626953125, + "learning_rate": 9.638554216867469e-08, + "loss": 0.0071, + "reward": 2.2514326572418213, + "reward_std": 0.18450473248958588, + "rewards/accuracy_reward": 1.2592450976371765, + "rewards/format_reward": 0.9921875, + "step": 450 + }, + { + "completion_length": 68.046875, + "epoch": 5.433734939759036, + "grad_norm": 3.961385062957452, + "kl": 0.10693359375, + "learning_rate": 9.437751004016063e-08, + "loss": 0.0043, + "reward": 2.328533172607422, + "reward_std": 0.18290965259075165, + "rewards/accuracy_reward": 1.3285331726074219, + "rewards/format_reward": 1.0, + "step": 451 + }, + { + "completion_length": 68.6953125, + "epoch": 5.445783132530121, + "grad_norm": 4.887519681333338, + "kl": 0.103759765625, + "learning_rate": 9.236947791164657e-08, + "loss": 0.0042, + "reward": 2.3144426345825195, + "reward_std": 0.21034369617700577, + "rewards/accuracy_reward": 1.3144426941871643, + "rewards/format_reward": 1.0, + "step": 452 + }, + { + "completion_length": 68.0, + "epoch": 5.457831325301205, + "grad_norm": 3.80893967356862, + "kl": 0.127685546875, + "learning_rate": 9.036144578313253e-08, + "loss": 0.0051, + "reward": 2.4345412254333496, + "reward_std": 0.2006332352757454, + "rewards/accuracy_reward": 1.4345412254333496, + "rewards/format_reward": 1.0, + "step": 453 + }, + { + "completion_length": 67.046875, + "epoch": 5.469879518072289, + "grad_norm": 4.2954066473287815, + "kl": 0.12841796875, + "learning_rate": 8.835341365461847e-08, + "loss": 0.0052, + "reward": 2.353352427482605, + "reward_std": 0.22566306591033936, + "rewards/accuracy_reward": 1.353352427482605, + "rewards/format_reward": 1.0, + "step": 454 + }, + { + "completion_length": 64.8984375, + "epoch": 5.481927710843373, + "grad_norm": 4.546803918905019, + "kl": 0.1337890625, + "learning_rate": 8.634538152610442e-08, + "loss": 0.0054, + "reward": 2.3113902807235718, + "reward_std": 0.20004340261220932, + "rewards/accuracy_reward": 1.3192027807235718, + "rewards/format_reward": 0.9921875, + "step": 455 + }, + { + "completion_length": 66.1640625, + "epoch": 5.493975903614458, + "grad_norm": 3.5466190382737883, + "kl": 0.123046875, + "learning_rate": 8.433734939759035e-08, + "loss": 0.0049, + "reward": 2.3270002603530884, + "reward_std": 0.21506989747285843, + "rewards/accuracy_reward": 1.3270001411437988, + "rewards/format_reward": 1.0, + "step": 456 + }, + { + "completion_length": 72.3984375, + "epoch": 5.506024096385542, + "grad_norm": 5.213818604387868, + "kl": 0.1328125, + "learning_rate": 8.23293172690763e-08, + "loss": 0.0053, + "reward": 2.4117329120635986, + "reward_std": 0.21075783669948578, + "rewards/accuracy_reward": 1.411732792854309, + "rewards/format_reward": 1.0, + "step": 457 + }, + { + "completion_length": 63.4140625, + "epoch": 5.518072289156627, + "grad_norm": 4.087135154378612, + "kl": 0.1142578125, + "learning_rate": 8.032128514056224e-08, + "loss": 0.0046, + "reward": 2.2361518144607544, + "reward_std": 0.15534771978855133, + "rewards/accuracy_reward": 1.2361518740653992, + "rewards/format_reward": 1.0, + "step": 458 + }, + { + "completion_length": 66.6796875, + "epoch": 5.530120481927711, + "grad_norm": 3.8509871084036083, + "kl": 0.12255859375, + "learning_rate": 7.83132530120482e-08, + "loss": 0.0049, + "reward": 2.402904510498047, + "reward_std": 0.18761365860700607, + "rewards/accuracy_reward": 1.4029043912887573, + "rewards/format_reward": 1.0, + "step": 459 + }, + { + "completion_length": 67.921875, + "epoch": 5.542168674698795, + "grad_norm": 3.8868143152174714, + "kl": 0.1201171875, + "learning_rate": 7.630522088353414e-08, + "loss": 0.0048, + "reward": 2.202209234237671, + "reward_std": 0.20886321365833282, + "rewards/accuracy_reward": 1.2022093534469604, + "rewards/format_reward": 1.0, + "step": 460 + }, + { + "completion_length": 69.84375, + "epoch": 5.554216867469879, + "grad_norm": 9.828452094441177, + "kl": 0.138427734375, + "learning_rate": 7.429718875502008e-08, + "loss": 0.0055, + "reward": 2.255289673805237, + "reward_std": 0.3091956526041031, + "rewards/accuracy_reward": 1.2787271738052368, + "rewards/format_reward": 0.9765625, + "step": 461 + }, + { + "completion_length": 67.7265625, + "epoch": 5.566265060240964, + "grad_norm": 3.5884325923981777, + "kl": 0.14501953125, + "learning_rate": 7.228915662650602e-08, + "loss": 0.0058, + "reward": 2.389763116836548, + "reward_std": 0.1989041194319725, + "rewards/accuracy_reward": 1.3897631168365479, + "rewards/format_reward": 1.0, + "step": 462 + }, + { + "completion_length": 63.4765625, + "epoch": 5.578313253012048, + "grad_norm": 3.943165256338966, + "kl": 0.15185546875, + "learning_rate": 7.028112449799197e-08, + "loss": 0.0061, + "reward": 2.2263519763946533, + "reward_std": 0.22419632971286774, + "rewards/accuracy_reward": 1.2341644763946533, + "rewards/format_reward": 0.9921875, + "step": 463 + }, + { + "completion_length": 67.734375, + "epoch": 5.590361445783133, + "grad_norm": 8.892123036444877, + "kl": 0.126953125, + "learning_rate": 6.827309236947791e-08, + "loss": 0.0051, + "reward": 2.3126423358917236, + "reward_std": 0.17722339183092117, + "rewards/accuracy_reward": 1.3126422762870789, + "rewards/format_reward": 1.0, + "step": 464 + }, + { + "completion_length": 75.5546875, + "epoch": 5.602409638554217, + "grad_norm": 4.229071556328315, + "kl": 0.1240234375, + "learning_rate": 6.626506024096386e-08, + "loss": 0.005, + "reward": 2.2280049324035645, + "reward_std": 0.22474994510412216, + "rewards/accuracy_reward": 1.235817551612854, + "rewards/format_reward": 0.9921875, + "step": 465 + }, + { + "completion_length": 66.9609375, + "epoch": 5.614457831325301, + "grad_norm": 4.577684554062664, + "kl": 0.12451171875, + "learning_rate": 6.425702811244979e-08, + "loss": 0.005, + "reward": 2.2235909700393677, + "reward_std": 0.22441789507865906, + "rewards/accuracy_reward": 1.2392158508300781, + "rewards/format_reward": 0.984375, + "step": 466 + }, + { + "completion_length": 70.4375, + "epoch": 5.626506024096385, + "grad_norm": 4.349159327486559, + "kl": 0.112548828125, + "learning_rate": 6.224899598393573e-08, + "loss": 0.0045, + "reward": 2.3591808080673218, + "reward_std": 0.1966349333524704, + "rewards/accuracy_reward": 1.3669933080673218, + "rewards/format_reward": 0.9921875, + "step": 467 + }, + { + "completion_length": 69.4453125, + "epoch": 5.63855421686747, + "grad_norm": 3.0423100870405437, + "kl": 0.138671875, + "learning_rate": 6.024096385542168e-08, + "loss": 0.0055, + "reward": 2.4168301820755005, + "reward_std": 0.23313428461551666, + "rewards/accuracy_reward": 1.4246427416801453, + "rewards/format_reward": 0.9921875, + "step": 468 + }, + { + "completion_length": 67.9453125, + "epoch": 5.650602409638554, + "grad_norm": 4.8492295392656075, + "kl": 0.124755859375, + "learning_rate": 5.823293172690763e-08, + "loss": 0.005, + "reward": 2.3264076709747314, + "reward_std": 0.18676774948835373, + "rewards/accuracy_reward": 1.3264076709747314, + "rewards/format_reward": 1.0, + "step": 469 + }, + { + "completion_length": 68.3984375, + "epoch": 5.662650602409639, + "grad_norm": 3.7143887896006706, + "kl": 0.118896484375, + "learning_rate": 5.622489959839357e-08, + "loss": 0.0048, + "reward": 2.275146722793579, + "reward_std": 0.23441863059997559, + "rewards/accuracy_reward": 1.2907716631889343, + "rewards/format_reward": 0.984375, + "step": 470 + }, + { + "completion_length": 69.703125, + "epoch": 5.674698795180722, + "grad_norm": 6.421818895030251, + "kl": 0.105712890625, + "learning_rate": 5.421686746987952e-08, + "loss": 0.0042, + "reward": 2.3713172674179077, + "reward_std": 0.17046835273504257, + "rewards/accuracy_reward": 1.3713172674179077, + "rewards/format_reward": 1.0, + "step": 471 + }, + { + "completion_length": 71.7578125, + "epoch": 5.686746987951807, + "grad_norm": 3.7429303333646846, + "kl": 0.17333984375, + "learning_rate": 5.220883534136546e-08, + "loss": 0.0069, + "reward": 2.21248197555542, + "reward_std": 0.1897253841161728, + "rewards/accuracy_reward": 1.2202943563461304, + "rewards/format_reward": 0.9921875, + "step": 472 + }, + { + "completion_length": 66.0625, + "epoch": 5.698795180722891, + "grad_norm": 4.6125292648898375, + "kl": 0.1171875, + "learning_rate": 5.0200803212851406e-08, + "loss": 0.0047, + "reward": 2.3862085342407227, + "reward_std": 0.14106625318527222, + "rewards/accuracy_reward": 1.3940210938453674, + "rewards/format_reward": 0.9921875, + "step": 473 + }, + { + "completion_length": 71.4296875, + "epoch": 5.710843373493976, + "grad_norm": 4.192704287374918, + "kl": 0.108642578125, + "learning_rate": 4.8192771084337347e-08, + "loss": 0.0043, + "reward": 2.3476767539978027, + "reward_std": 0.20362288504838943, + "rewards/accuracy_reward": 1.3476767539978027, + "rewards/format_reward": 1.0, + "step": 474 + }, + { + "completion_length": 67.2109375, + "epoch": 5.72289156626506, + "grad_norm": 4.1447657242460645, + "kl": 0.1298828125, + "learning_rate": 4.618473895582329e-08, + "loss": 0.0052, + "reward": 2.266420602798462, + "reward_std": 0.2129717692732811, + "rewards/accuracy_reward": 1.2664207220077515, + "rewards/format_reward": 1.0, + "step": 475 + }, + { + "completion_length": 66.546875, + "epoch": 5.734939759036145, + "grad_norm": 3.4345215566799574, + "kl": 0.106201171875, + "learning_rate": 4.4176706827309234e-08, + "loss": 0.0042, + "reward": 2.352730870246887, + "reward_std": 0.1454787813127041, + "rewards/accuracy_reward": 1.3605434894561768, + "rewards/format_reward": 0.9921875, + "step": 476 + }, + { + "completion_length": 71.828125, + "epoch": 5.746987951807229, + "grad_norm": 4.187659893839478, + "kl": 0.111328125, + "learning_rate": 4.2168674698795174e-08, + "loss": 0.0045, + "reward": 2.2670211791992188, + "reward_std": 0.22116923332214355, + "rewards/accuracy_reward": 1.267021119594574, + "rewards/format_reward": 1.0, + "step": 477 + }, + { + "completion_length": 69.1875, + "epoch": 5.759036144578313, + "grad_norm": 3.8623536023281617, + "kl": 0.114013671875, + "learning_rate": 4.016064257028112e-08, + "loss": 0.0046, + "reward": 2.222132921218872, + "reward_std": 0.23479964584112167, + "rewards/accuracy_reward": 1.2221328020095825, + "rewards/format_reward": 1.0, + "step": 478 + }, + { + "completion_length": 70.9296875, + "epoch": 5.771084337349397, + "grad_norm": 4.262446208684037, + "kl": 0.09375, + "learning_rate": 3.815261044176707e-08, + "loss": 0.0037, + "reward": 2.2334243059158325, + "reward_std": 0.21778832376003265, + "rewards/accuracy_reward": 1.2334243059158325, + "rewards/format_reward": 1.0, + "step": 479 + }, + { + "completion_length": 68.2421875, + "epoch": 5.783132530120482, + "grad_norm": 3.475197673617196, + "kl": 0.10595703125, + "learning_rate": 3.614457831325301e-08, + "loss": 0.0042, + "reward": 2.4461944103240967, + "reward_std": 0.21106188744306564, + "rewards/accuracy_reward": 1.4540069103240967, + "rewards/format_reward": 0.9921875, + "step": 480 + }, + { + "completion_length": 70.3671875, + "epoch": 5.795180722891566, + "grad_norm": 4.56883704942929, + "kl": 0.11865234375, + "learning_rate": 3.4136546184738955e-08, + "loss": 0.0047, + "reward": 2.441108226776123, + "reward_std": 0.2091435343027115, + "rewards/accuracy_reward": 1.441108226776123, + "rewards/format_reward": 1.0, + "step": 481 + }, + { + "completion_length": 69.171875, + "epoch": 5.807228915662651, + "grad_norm": 3.959761896565078, + "kl": 0.12451171875, + "learning_rate": 3.2128514056224896e-08, + "loss": 0.005, + "reward": 2.3847368955612183, + "reward_std": 0.14646587148308754, + "rewards/accuracy_reward": 1.3847368359565735, + "rewards/format_reward": 1.0, + "step": 482 + }, + { + "completion_length": 75.3125, + "epoch": 5.8192771084337345, + "grad_norm": 4.6238410926161855, + "kl": 0.108642578125, + "learning_rate": 3.012048192771084e-08, + "loss": 0.0043, + "reward": 2.2356351613998413, + "reward_std": 0.3032216280698776, + "rewards/accuracy_reward": 1.2434476613998413, + "rewards/format_reward": 0.9921875, + "step": 483 + }, + { + "completion_length": 70.921875, + "epoch": 5.831325301204819, + "grad_norm": 4.963499305554948, + "kl": 0.082275390625, + "learning_rate": 2.8112449799196786e-08, + "loss": 0.0033, + "reward": 2.3230150938034058, + "reward_std": 0.16892920434474945, + "rewards/accuracy_reward": 1.3230149745941162, + "rewards/format_reward": 1.0, + "step": 484 + }, + { + "completion_length": 69.3359375, + "epoch": 5.843373493975903, + "grad_norm": 4.069771837808966, + "kl": 0.1396484375, + "learning_rate": 2.610441767068273e-08, + "loss": 0.0056, + "reward": 2.327863335609436, + "reward_std": 0.23238816112279892, + "rewards/accuracy_reward": 1.3434883952140808, + "rewards/format_reward": 0.984375, + "step": 485 + }, + { + "completion_length": 68.875, + "epoch": 5.855421686746988, + "grad_norm": 4.471391988945464, + "kl": 0.13330078125, + "learning_rate": 2.4096385542168673e-08, + "loss": 0.0053, + "reward": 2.331111192703247, + "reward_std": 0.1987084299325943, + "rewards/accuracy_reward": 1.3389237523078918, + "rewards/format_reward": 0.9921875, + "step": 486 + }, + { + "completion_length": 72.2734375, + "epoch": 5.867469879518072, + "grad_norm": 4.3661266337784514, + "kl": 0.128173828125, + "learning_rate": 2.2088353413654617e-08, + "loss": 0.0051, + "reward": 2.2740135192871094, + "reward_std": 0.17679665982723236, + "rewards/accuracy_reward": 1.2740132808685303, + "rewards/format_reward": 1.0, + "step": 487 + }, + { + "completion_length": 69.328125, + "epoch": 5.879518072289157, + "grad_norm": 4.78815312664634, + "kl": 0.150634765625, + "learning_rate": 2.008032128514056e-08, + "loss": 0.006, + "reward": 2.2422866821289062, + "reward_std": 0.23693696409463882, + "rewards/accuracy_reward": 1.2422866821289062, + "rewards/format_reward": 1.0, + "step": 488 + }, + { + "completion_length": 71.4140625, + "epoch": 5.891566265060241, + "grad_norm": 6.245102077972556, + "kl": 0.121826171875, + "learning_rate": 1.8072289156626504e-08, + "loss": 0.0049, + "reward": 2.315194010734558, + "reward_std": 0.1885218769311905, + "rewards/accuracy_reward": 1.3230066299438477, + "rewards/format_reward": 0.9921875, + "step": 489 + }, + { + "completion_length": 63.8984375, + "epoch": 5.903614457831325, + "grad_norm": 4.510763484461414, + "kl": 0.122314453125, + "learning_rate": 1.6064257028112448e-08, + "loss": 0.0049, + "reward": 2.3149102926254272, + "reward_std": 0.1639706939458847, + "rewards/accuracy_reward": 1.3149102926254272, + "rewards/format_reward": 1.0, + "step": 490 + }, + { + "completion_length": 66.0, + "epoch": 5.9156626506024095, + "grad_norm": 4.091329557372317, + "kl": 0.1435546875, + "learning_rate": 1.4056224899598393e-08, + "loss": 0.0058, + "reward": 2.4370064735412598, + "reward_std": 0.15971215814352036, + "rewards/accuracy_reward": 1.4370064735412598, + "rewards/format_reward": 1.0, + "step": 491 + }, + { + "completion_length": 70.484375, + "epoch": 5.927710843373494, + "grad_norm": 4.3856574896033305, + "kl": 0.155029296875, + "learning_rate": 1.2048192771084337e-08, + "loss": 0.0062, + "reward": 2.351839542388916, + "reward_std": 0.2616487815976143, + "rewards/accuracy_reward": 1.359652042388916, + "rewards/format_reward": 0.9921875, + "step": 492 + }, + { + "completion_length": 74.171875, + "epoch": 5.9397590361445785, + "grad_norm": 3.3373281083458974, + "kl": 0.107177734375, + "learning_rate": 1.004016064257028e-08, + "loss": 0.0043, + "reward": 2.3034894466400146, + "reward_std": 0.12144535779953003, + "rewards/accuracy_reward": 1.3113019466400146, + "rewards/format_reward": 0.9921875, + "step": 493 + }, + { + "completion_length": 72.8515625, + "epoch": 5.951807228915663, + "grad_norm": 3.3157754210190773, + "kl": 0.097412109375, + "learning_rate": 8.032128514056224e-09, + "loss": 0.0039, + "reward": 2.421133041381836, + "reward_std": 0.16620434820652008, + "rewards/accuracy_reward": 1.421133041381836, + "rewards/format_reward": 1.0, + "step": 494 + }, + { + "completion_length": 76.1328125, + "epoch": 5.9638554216867465, + "grad_norm": 3.788575194538334, + "kl": 0.12158203125, + "learning_rate": 6.024096385542168e-09, + "loss": 0.0049, + "reward": 2.3588104248046875, + "reward_std": 0.1766229048371315, + "rewards/accuracy_reward": 1.358810544013977, + "rewards/format_reward": 1.0, + "step": 495 + }, + { + "completion_length": 71.515625, + "epoch": 5.975903614457831, + "grad_norm": 4.2730966058785835, + "kl": 0.11962890625, + "learning_rate": 4.016064257028112e-09, + "loss": 0.0048, + "reward": 2.3155951499938965, + "reward_std": 0.25304850190877914, + "rewards/accuracy_reward": 1.3234076499938965, + "rewards/format_reward": 0.9921875, + "step": 496 + }, + { + "completion_length": 68.859375, + "epoch": 5.9879518072289155, + "grad_norm": 4.371956801820215, + "kl": 0.119140625, + "learning_rate": 2.008032128514056e-09, + "loss": 0.0048, + "reward": 2.3737374544143677, + "reward_std": 0.20605729520320892, + "rewards/accuracy_reward": 1.373737394809723, + "rewards/format_reward": 1.0, + "step": 497 + }, + { + "completion_length": 60.75000190734863, + "epoch": 6.0, + "grad_norm": 3.9720317304626964, + "kl": 0.1171875, + "learning_rate": 0.0, + "loss": 0.0046, + "reward": 2.4247955083847046, + "reward_std": 0.17968511581420898, + "rewards/accuracy_reward": 1.4247953295707703, + "rewards/format_reward": 1.0, + "step": 498 + } + ], + "logging_steps": 1.0, + "max_steps": 498, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}