{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996, "eval_steps": 175, "global_step": 357, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 1463.9508056640625, "epoch": 0.014, "grad_norm": 0.08969205617904663, "kl": 4.4073377336774555e-05, "learning_rate": 4.166666666666667e-07, "loss": 0.0, "reward": 0.42698413165552274, "reward_std": 0.23091329676764352, "rewards/accuracy_reward": 0.42698413165552274, "rewards/format_reward": 0.0, "step": 5 }, { "completion_length": 1412.350807407924, "epoch": 0.028, "grad_norm": 0.15431873500347137, "kl": 7.331030709402901e-05, "learning_rate": 8.333333333333334e-07, "loss": 0.0, "reward": 0.4365079395473003, "reward_std": 0.2852135394300733, "rewards/accuracy_reward": 0.4365079395473003, "rewards/format_reward": 0.0, "step": 10 }, { "completion_length": 1309.3492135184151, "epoch": 0.042, "grad_norm": 0.1307893842458725, "kl": 8.476121085030692e-05, "learning_rate": 1.25e-06, "loss": 0.0, "reward": 0.44603175105793136, "reward_std": 0.24500442828450883, "rewards/accuracy_reward": 0.44603175105793136, "rewards/format_reward": 0.0, "step": 15 }, { "completion_length": 1416.9206386021206, "epoch": 0.056, "grad_norm": 0.11694646626710892, "kl": 0.0003507069178989955, "learning_rate": 1.6666666666666669e-06, "loss": 0.0, "reward": 0.39682540254933496, "reward_std": 0.25520533621311187, "rewards/accuracy_reward": 0.39682540254933496, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 1341.8523890904019, "epoch": 0.07, "grad_norm": 0.09343304485082626, "kl": 0.003379276820591518, "learning_rate": 2.0833333333333334e-06, "loss": 0.0001, "reward": 0.48412699103355405, "reward_std": 0.2393546883549009, "rewards/accuracy_reward": 0.48412699103355405, "rewards/format_reward": 0.0, "step": 25 }, { "completion_length": 1409.112721470424, "epoch": 0.084, "grad_norm": 0.2270783931016922, "kl": 0.011235591343470982, "learning_rate": 2.5e-06, "loss": 0.0004, "reward": 0.398412706383637, "reward_std": 0.26068197318485803, "rewards/accuracy_reward": 0.398412706383637, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 1369.4634974888393, "epoch": 0.098, "grad_norm": 0.14271539449691772, "kl": 0.004549898420061384, "learning_rate": 2.9166666666666666e-06, "loss": 0.0002, "reward": 0.4873015942318099, "reward_std": 0.2361013719013759, "rewards/accuracy_reward": 0.4873015942318099, "rewards/format_reward": 0.0, "step": 35 }, { "completion_length": 1378.576211983817, "epoch": 0.112, "grad_norm": 0.1330540031194687, "kl": 0.0018808909824916294, "learning_rate": 2.9988507474879197e-06, "loss": 0.0001, "reward": 0.3857142909296921, "reward_std": 0.2342351802757808, "rewards/accuracy_reward": 0.3857142909296921, "rewards/format_reward": 0.0, "step": 40 }, { "completion_length": 1429.8317557198661, "epoch": 0.126, "grad_norm": 0.14228956401348114, "kl": 0.003021676199776786, "learning_rate": 2.9941849271855037e-06, "loss": 0.0001, "reward": 0.42222222707101276, "reward_std": 0.26430981755256655, "rewards/accuracy_reward": 0.42222222707101276, "rewards/format_reward": 0.0, "step": 45 }, { "completion_length": 1460.6809709821428, "epoch": 0.14, "grad_norm": 0.09150154888629913, "kl": 0.010079302106584821, "learning_rate": 2.9859418726695507e-06, "loss": 0.0004, "reward": 0.3634920699255807, "reward_std": 0.20712316249098098, "rewards/accuracy_reward": 0.3634920699255807, "rewards/format_reward": 0.0, "step": 50 }, { "completion_length": 1369.206365094866, "epoch": 0.154, "grad_norm": 0.19628317654132843, "kl": 0.011896623883928571, "learning_rate": 2.974141318670415e-06, "loss": 0.0005, "reward": 0.4968254029750824, "reward_std": 0.2662729093006679, "rewards/accuracy_reward": 0.4968254029750824, "rewards/format_reward": 0.0, "step": 55 }, { "completion_length": 1447.9825474330357, "epoch": 0.168, "grad_norm": 0.19150058925151825, "kl": 0.017090715680803573, "learning_rate": 2.9588115169424383e-06, "loss": 0.0007, "reward": 0.41111111502562253, "reward_std": 0.20665936001709528, "rewards/accuracy_reward": 0.41111111502562253, "rewards/format_reward": 0.0, "step": 60 }, { "completion_length": 1421.871435546875, "epoch": 0.182, "grad_norm": 0.1986546516418457, "kl": 0.02695835658482143, "learning_rate": 2.939989168626311e-06, "loss": 0.0011, "reward": 0.38095238751598764, "reward_std": 0.24111634790897368, "rewards/accuracy_reward": 0.38095238751598764, "rewards/format_reward": 0.0, "step": 65 }, { "completion_length": 1428.890487234933, "epoch": 0.196, "grad_norm": 0.1796417534351349, "kl": 0.038760811941964284, "learning_rate": 2.9177193363827173e-06, "loss": 0.0016, "reward": 0.39365079892533167, "reward_std": 0.2555341350180762, "rewards/accuracy_reward": 0.39365079892533167, "rewards/format_reward": 0.0, "step": 70 }, { "completion_length": 1401.482550920759, "epoch": 0.21, "grad_norm": 0.5775905847549438, "kl": 0.06578194754464285, "learning_rate": 2.8920553365076412e-06, "loss": 0.0026, "reward": 0.42380953005381994, "reward_std": 0.287408532840865, "rewards/accuracy_reward": 0.42380953005381994, "rewards/format_reward": 0.0, "step": 75 }, { "completion_length": 1438.0158796037947, "epoch": 0.224, "grad_norm": 0.20122337341308594, "kl": 0.09798409598214286, "learning_rate": 2.863058611287603e-06, "loss": 0.0039, "reward": 0.4301587402820587, "reward_std": 0.24474205672740937, "rewards/accuracy_reward": 0.4301587402820587, "rewards/format_reward": 0.0, "step": 80 }, { "completion_length": 1459.1603306361608, "epoch": 0.238, "grad_norm": 0.24150224030017853, "kl": 0.1611328125, "learning_rate": 2.830798581900429e-06, "loss": 0.0064, "reward": 0.44761905201843805, "reward_std": 0.21659241148403713, "rewards/accuracy_reward": 0.44761905201843805, "rewards/format_reward": 0.0, "step": 85 }, { "completion_length": 1383.814297921317, "epoch": 0.252, "grad_norm": 0.42380255460739136, "kl": 0.3262276785714286, "learning_rate": 2.795352482213732e-06, "loss": 0.0131, "reward": 0.4603174648114613, "reward_std": 0.2674967063324792, "rewards/accuracy_reward": 0.4603174648114613, "rewards/format_reward": 0.0, "step": 90 }, { "completion_length": 1414.3174700055804, "epoch": 0.266, "grad_norm": 1.0099889039993286, "kl": 0.6118303571428572, "learning_rate": 2.7568051738789903e-06, "loss": 0.0245, "reward": 0.42698413601943425, "reward_std": 0.29205660607133593, "rewards/accuracy_reward": 0.42698413601943425, "rewards/format_reward": 0.0, "step": 95 }, { "completion_length": 1474.3333426339286, "epoch": 0.28, "grad_norm": 0.8963289260864258, "kl": 0.5459821428571429, "learning_rate": 2.7152489431639246e-06, "loss": 0.0218, "reward": 0.3698412746191025, "reward_std": 0.2749745330640248, "rewards/accuracy_reward": 0.3698412746191025, "rewards/format_reward": 0.0, "step": 100 }, { "completion_length": 1287.5079467773437, "epoch": 0.294, "grad_norm": 1.8670637607574463, "kl": 0.62109375, "learning_rate": 2.6707832800095687e-06, "loss": 0.0249, "reward": 0.447619054466486, "reward_std": 0.28052737074238915, "rewards/accuracy_reward": 0.447619054466486, "rewards/format_reward": 0.0, "step": 105 }, { "completion_length": 1249.3746198381696, "epoch": 0.308, "grad_norm": 5.589221000671387, "kl": 0.8579799107142857, "learning_rate": 2.6235146398409985e-06, "loss": 0.0343, "reward": 0.5396825455129146, "reward_std": 0.27361573789800914, "rewards/accuracy_reward": 0.5396825455129146, "rewards/format_reward": 0.0, "step": 110 }, { "completion_length": 1410.7190673828125, "epoch": 0.322, "grad_norm": 1.1053833961486816, "kl": 0.7229073660714286, "learning_rate": 2.573556188701961e-06, "loss": 0.0289, "reward": 0.3888888947665691, "reward_std": 0.27650429010391236, "rewards/accuracy_reward": 0.3888888947665691, "rewards/format_reward": 0.0, "step": 115 }, { "completion_length": 1361.2142961774553, "epoch": 0.336, "grad_norm": 1.943098783493042, "kl": 1.5775669642857142, "learning_rate": 2.5210275323235945e-06, "loss": 0.0631, "reward": 0.4698412761092186, "reward_std": 0.2894968547991344, "rewards/accuracy_reward": 0.4698412761092186, "rewards/format_reward": 0.0, "step": 120 }, { "completion_length": 1351.7809605189732, "epoch": 0.35, "grad_norm": 0.9562058448791504, "kl": 0.7444196428571429, "learning_rate": 2.4660544297758557e-06, "loss": 0.0298, "reward": 0.46190476790070534, "reward_std": 0.247197277205331, "rewards/accuracy_reward": 0.46190476790070534, "rewards/format_reward": 0.0, "step": 125 }, { "completion_length": 1461.123822893415, "epoch": 0.364, "grad_norm": 1.6370760202407837, "kl": 0.7396205357142858, "learning_rate": 2.4087684923872226e-06, "loss": 0.0296, "reward": 0.4174603264246668, "reward_std": 0.22625759754862104, "rewards/accuracy_reward": 0.4174603264246668, "rewards/format_reward": 0.0, "step": 130 }, { "completion_length": 1485.9111206054688, "epoch": 0.378, "grad_norm": 1.3553705215454102, "kl": 1.7241071428571428, "learning_rate": 2.3493068686534758e-06, "loss": 0.0689, "reward": 0.3841269900756223, "reward_std": 0.2519443878105709, "rewards/accuracy_reward": 0.3841269900756223, "rewards/format_reward": 0.0, "step": 135 }, { "completion_length": 1377.660328892299, "epoch": 0.392, "grad_norm": 2.367931365966797, "kl": 0.9994977678571428, "learning_rate": 2.2878119158899268e-06, "loss": 0.04, "reward": 0.4492063578750406, "reward_std": 0.27590762632233756, "rewards/accuracy_reward": 0.4492063578750406, "rewards/format_reward": 0.0, "step": 140 }, { "completion_length": 1383.3619140625, "epoch": 0.406, "grad_norm": 1.5969258546829224, "kl": 0.9481026785714286, "learning_rate": 2.2244308594131895e-06, "loss": 0.0379, "reward": 0.4428571517978396, "reward_std": 0.285810204914638, "rewards/accuracy_reward": 0.4428571517978396, "rewards/format_reward": 0.0, "step": 145 }, { "completion_length": 1392.4809596470425, "epoch": 0.42, "grad_norm": 1.8966355323791504, "kl": 1.8736886160714286, "learning_rate": 2.1593154400684525e-06, "loss": 0.0749, "reward": 0.4222222318606717, "reward_std": 0.2493541623864855, "rewards/accuracy_reward": 0.4222222318606717, "rewards/format_reward": 0.0, "step": 150 }, { "completion_length": 1350.588902064732, "epoch": 0.434, "grad_norm": 1.227295994758606, "kl": 1.0234375, "learning_rate": 2.092621550946103e-06, "loss": 0.0409, "reward": 0.46349206737109594, "reward_std": 0.27739928151879995, "rewards/accuracy_reward": 0.46349206737109594, "rewards/format_reward": 0.0, "step": 155 }, { "completion_length": 1292.5793788364956, "epoch": 0.448, "grad_norm": 2.2678561210632324, "kl": 1.6760602678571428, "learning_rate": 2.0245088641574454e-06, "loss": 0.0671, "reward": 0.47142857717616216, "reward_std": 0.25264558323792047, "rewards/accuracy_reward": 0.47142857717616216, "rewards/format_reward": 0.0, "step": 160 }, { "completion_length": 1530.8698486328126, "epoch": 0.462, "grad_norm": 1.4743075370788574, "kl": 1.4337890625, "learning_rate": 1.955140448563049e-06, "loss": 0.0574, "reward": 0.36031746587582997, "reward_std": 0.26739980791296275, "rewards/accuracy_reward": 0.36031746587582997, "rewards/format_reward": 0.0, "step": 165 }, { "completion_length": 1308.3254237583706, "epoch": 0.476, "grad_norm": 1.203540325164795, "kl": 0.9108956473214286, "learning_rate": 1.8846823793689261e-06, "loss": 0.0365, "reward": 0.3793650852782386, "reward_std": 0.2656762455190931, "rewards/accuracy_reward": 0.3793650852782386, "rewards/format_reward": 0.0, "step": 170 }, { "completion_length": 1323.4126970563616, "epoch": 0.49, "grad_norm": 1.7284196615219116, "kl": 0.6505022321428572, "learning_rate": 1.8133033405252056e-06, "loss": 0.026, "reward": 0.4142857197139944, "reward_std": 0.27351883692400797, "rewards/accuracy_reward": 0.4142857197139944, "rewards/format_reward": 0.0, "step": 175 }, { "epoch": 0.49, "eval_completion_length": 1369.2646067527216, "eval_kl": 1.1555292945317186, "eval_loss": 0.04623142257332802, "eval_reward": 0.43851230413517195, "eval_reward_std": 0.2747080655926062, "eval_rewards/accuracy_reward": 0.43851230413517195, "eval_rewards/format_reward": 0.0, "eval_runtime": 23113.6513, "eval_samples_per_second": 0.216, "eval_steps_per_second": 0.012, "step": 175 }, { "completion_length": 1444.2365199497767, "epoch": 0.504, "grad_norm": 1.2406812906265259, "kl": 1.5666294642857144, "learning_rate": 1.7411742208792024e-06, "loss": 0.0627, "reward": 0.4269841328263283, "reward_std": 0.3386471267257418, "rewards/accuracy_reward": 0.4269841328263283, "rewards/format_reward": 0.0, "step": 180 }, { "completion_length": 1326.390487234933, "epoch": 0.518, "grad_norm": 1.0804634094238281, "kl": 1.7071986607142857, "learning_rate": 1.6684677050497315e-06, "loss": 0.0683, "reward": 0.42857143389327185, "reward_std": 0.26005484078611646, "rewards/accuracy_reward": 0.42857143389327185, "rewards/format_reward": 0.0, "step": 185 }, { "completion_length": 1468.5063546316965, "epoch": 0.532, "grad_norm": 3.3248608112335205, "kl": 2.3198660714285713, "learning_rate": 1.5953578600021593e-06, "loss": 0.0928, "reward": 0.39841270744800567, "reward_std": 0.24905584284237453, "rewards/accuracy_reward": 0.39841270744800567, "rewards/format_reward": 0.0, "step": 190 }, { "completion_length": 1407.6492065429688, "epoch": 0.546, "grad_norm": 3.002239465713501, "kl": 1.0529017857142857, "learning_rate": 1.522019718313975e-06, "loss": 0.0421, "reward": 0.4634920714156968, "reward_std": 0.2802497374159949, "rewards/accuracy_reward": 0.4634920714156968, "rewards/format_reward": 0.0, "step": 195 }, { "completion_length": 1504.898423549107, "epoch": 0.56, "grad_norm": 1.4985986948013306, "kl": 1.0293247767857143, "learning_rate": 1.4486288591285846e-06, "loss": 0.0412, "reward": 0.37142857632466725, "reward_std": 0.23027853284563338, "rewards/accuracy_reward": 0.37142857632466725, "rewards/format_reward": 0.0, "step": 200 }, { "completion_length": 1335.2016008649553, "epoch": 0.574, "grad_norm": 1.1860501766204834, "kl": 1.445361328125, "learning_rate": 1.3753609878005669e-06, "loss": 0.0578, "reward": 0.40317460958446777, "reward_std": 0.2433777698448726, "rewards/accuracy_reward": 0.40317460958446777, "rewards/format_reward": 0.0, "step": 205 }, { "completion_length": 1292.8063668387276, "epoch": 0.588, "grad_norm": 1.619707465171814, "kl": 1.3199497767857142, "learning_rate": 1.302391515238772e-06, "loss": 0.0528, "reward": 0.45873016587325505, "reward_std": 0.2305104353598186, "rewards/accuracy_reward": 0.45873016587325505, "rewards/format_reward": 0.0, "step": 210 }, { "completion_length": 1395.8714407784598, "epoch": 0.602, "grad_norm": 1.9457758665084839, "kl": 0.99453125, "learning_rate": 1.2298951379543504e-06, "loss": 0.0398, "reward": 0.439682551579816, "reward_std": 0.26879456128392903, "rewards/accuracy_reward": 0.439682551579816, "rewards/format_reward": 0.0, "step": 215 }, { "completion_length": 1425.9460414341518, "epoch": 0.616, "grad_norm": 1.7319049835205078, "kl": 1.1224330357142858, "learning_rate": 1.1580454198191229e-06, "loss": 0.0449, "reward": 0.4142857236521585, "reward_std": 0.2562353300196784, "rewards/accuracy_reward": 0.4142857236521585, "rewards/format_reward": 0.0, "step": 220 }, { "completion_length": 1343.5031895228794, "epoch": 0.63, "grad_norm": 1.912259817123413, "kl": 1.7331752232142856, "learning_rate": 1.0870143765356105e-06, "loss": 0.0694, "reward": 0.4603174694946834, "reward_std": 0.2678995690175465, "rewards/accuracy_reward": 0.4603174694946834, "rewards/format_reward": 0.0, "step": 225 }, { "completion_length": 1309.0412789481027, "epoch": 0.644, "grad_norm": 2.9131321907043457, "kl": 1.8904575892857143, "learning_rate": 1.0169720638135414e-06, "loss": 0.0756, "reward": 0.46984127344829696, "reward_std": 0.2539434403181076, "rewards/accuracy_reward": 0.46984127344829696, "rewards/format_reward": 0.0, "step": 230 }, { "completion_length": 1323.617471749442, "epoch": 0.658, "grad_norm": 1.1515146493911743, "kl": 1.2281808035714286, "learning_rate": 9.480861702387831e-07, "loss": 0.0491, "reward": 0.46825397376503264, "reward_std": 0.28800519619669235, "rewards/accuracy_reward": 0.46825397376503264, "rewards/format_reward": 0.0, "step": 235 }, { "completion_length": 1433.8714477539063, "epoch": 0.672, "grad_norm": 1.1839704513549805, "kl": 0.83984375, "learning_rate": 8.805216158094177e-07, "loss": 0.0336, "reward": 0.4142857189689364, "reward_std": 0.2477275082043239, "rewards/accuracy_reward": 0.4142857189689364, "rewards/format_reward": 0.0, "step": 240 }, { "completion_length": 1460.298420061384, "epoch": 0.686, "grad_norm": 1.041972279548645, "kl": 0.9199776785714285, "learning_rate": 8.144401571001033e-07, "loss": 0.0368, "reward": 0.4174603233379977, "reward_std": 0.23921968766621182, "rewards/accuracy_reward": 0.4174603233379977, "rewards/format_reward": 0.0, "step": 245 }, { "completion_length": 1339.3079485212054, "epoch": 0.7, "grad_norm": 3.0681028366088867, "kl": 1.1045758928571427, "learning_rate": 7.500000000000003e-07, "loss": 0.0442, "reward": 0.4365079409309796, "reward_std": 0.2710559815168381, "rewards/accuracy_reward": 0.4365079409309796, "rewards/format_reward": 0.0, "step": 250 }, { "completion_length": 1416.1635096958705, "epoch": 0.714, "grad_norm": 2.138991355895996, "kl": 1.3815848214285715, "learning_rate": 6.873554209514085e-07, "loss": 0.0553, "reward": 0.44126984679273196, "reward_std": 0.264638621040753, "rewards/accuracy_reward": 0.44126984679273196, "rewards/format_reward": 0.0, "step": 255 }, { "completion_length": 1366.5460518973214, "epoch": 0.728, "grad_norm": 4.086883544921875, "kl": 1.617299107142857, "learning_rate": 6.266563975959089e-07, "loss": 0.0647, "reward": 0.4523809592638697, "reward_std": 0.25510843566485814, "rewards/accuracy_reward": 0.4523809592638697, "rewards/format_reward": 0.0, "step": 260 }, { "completion_length": 1370.509535435268, "epoch": 0.742, "grad_norm": 2.976264238357544, "kl": 1.5864955357142858, "learning_rate": 5.680482497122817e-07, "loss": 0.0635, "reward": 0.36031746545008253, "reward_std": 0.2660050468785422, "rewards/accuracy_reward": 0.36031746545008253, "rewards/format_reward": 0.0, "step": 265 }, { "completion_length": 1400.5809605189731, "epoch": 0.756, "grad_norm": 1.7232657670974731, "kl": 1.679799107142857, "learning_rate": 5.116712913058335e-07, "loss": 0.0672, "reward": 0.4142857189689364, "reward_std": 0.2836837832416807, "rewards/accuracy_reward": 0.4142857189689364, "rewards/format_reward": 0.0, "step": 270 }, { "completion_length": 1365.423815046038, "epoch": 0.77, "grad_norm": 3.6572115421295166, "kl": 1.396372767857143, "learning_rate": 4.576604946820648e-07, "loss": 0.0558, "reward": 0.3904761965785708, "reward_std": 0.2657045785869871, "rewards/accuracy_reward": 0.3904761965785708, "rewards/format_reward": 0.0, "step": 275 }, { "completion_length": 1463.130170549665, "epoch": 0.784, "grad_norm": 1.000345230102539, "kl": 1.047251674107143, "learning_rate": 4.061451673089254e-07, "loss": 0.0419, "reward": 0.4063492118247918, "reward_std": 0.2173240670136043, "rewards/accuracy_reward": 0.4063492118247918, "rewards/format_reward": 0.0, "step": 280 }, { "completion_length": 1421.1793736049108, "epoch": 0.798, "grad_norm": 0.7917878031730652, "kl": 1.59453125, "learning_rate": 3.5724864224127866e-07, "loss": 0.0638, "reward": 0.4174603224865028, "reward_std": 0.2862054360764367, "rewards/accuracy_reward": 0.4174603224865028, "rewards/format_reward": 0.0, "step": 285 }, { "completion_length": 1388.3381086077009, "epoch": 0.812, "grad_norm": 1.5034881830215454, "kl": 1.5661551339285715, "learning_rate": 3.1108798284873347e-07, "loss": 0.0627, "reward": 0.4492063529789448, "reward_std": 0.2819504597357341, "rewards/accuracy_reward": 0.4492063529789448, "rewards/format_reward": 0.0, "step": 290 }, { "completion_length": 1393.196834891183, "epoch": 0.826, "grad_norm": 1.458122968673706, "kl": 1.493540736607143, "learning_rate": 2.6777370255375457e-07, "loss": 0.0598, "reward": 0.42857143368039813, "reward_std": 0.27959427535533904, "rewards/accuracy_reward": 0.42857143368039813, "rewards/format_reward": 0.0, "step": 295 }, { "completion_length": 1319.044456263951, "epoch": 0.84, "grad_norm": 2.7932145595550537, "kl": 1.5229352678571428, "learning_rate": 2.2740950025102765e-07, "loss": 0.0609, "reward": 0.4492063531918185, "reward_std": 0.26381005815097264, "rewards/accuracy_reward": 0.4492063531918185, "rewards/format_reward": 0.0, "step": 300 }, { "completion_length": 1450.5984270368303, "epoch": 0.854, "grad_norm": 2.0484893321990967, "kl": 1.2668526785714285, "learning_rate": 1.9009201204151573e-07, "loss": 0.0507, "reward": 0.38412698784044813, "reward_std": 0.2635781569140298, "rewards/accuracy_reward": 0.38412698784044813, "rewards/format_reward": 0.0, "step": 305 }, { "completion_length": 1386.0809674944196, "epoch": 0.868, "grad_norm": 3.057547092437744, "kl": 1.1603794642857144, "learning_rate": 1.5591057987557362e-07, "loss": 0.0464, "reward": 0.3746031781392438, "reward_std": 0.2741459710257394, "rewards/accuracy_reward": 0.3746031781392438, "rewards/format_reward": 0.0, "step": 310 }, { "completion_length": 1493.9031877790178, "epoch": 0.882, "grad_norm": 1.3004882335662842, "kl": 1.3931082589285715, "learning_rate": 1.2494703765902339e-07, "loss": 0.0557, "reward": 0.3746031816516604, "reward_std": 0.2563322309936796, "rewards/accuracy_reward": 0.3746031816516604, "rewards/format_reward": 0.0, "step": 315 }, { "completion_length": 1391.23335484096, "epoch": 0.896, "grad_norm": 1.51276695728302, "kl": 1.2445731026785714, "learning_rate": 9.72755153342662e-08, "loss": 0.0498, "reward": 0.457142864380564, "reward_std": 0.24264610367161887, "rewards/accuracy_reward": 0.457142864380564, "rewards/format_reward": 0.0, "step": 320 }, { "completion_length": 1393.0079450334822, "epoch": 0.91, "grad_norm": 2.4274632930755615, "kl": 1.422509765625, "learning_rate": 7.296226140548657e-08, "loss": 0.0569, "reward": 0.4190476249371256, "reward_std": 0.25706389929567064, "rewards/accuracy_reward": 0.4190476249371256, "rewards/format_reward": 0.0, "step": 325 }, { "completion_length": 1336.8095319475447, "epoch": 0.924, "grad_norm": 1.4370734691619873, "kl": 1.3261369977678572, "learning_rate": 5.206548433283803e-08, "loss": 0.0531, "reward": 0.4301587348537786, "reward_std": 0.26660170682838985, "rewards/accuracy_reward": 0.4301587348537786, "rewards/format_reward": 0.0, "step": 330 }, { "completion_length": 1455.5730259486606, "epoch": 0.938, "grad_norm": 2.1063618659973145, "kl": 1.1842912946428572, "learning_rate": 3.463521317533297e-08, "loss": 0.0474, "reward": 0.3809523857065609, "reward_std": 0.2787276157311031, "rewards/accuracy_reward": 0.3809523857065609, "rewards/format_reward": 0.0, "step": 335 }, { "completion_length": 1425.9031842912946, "epoch": 0.952, "grad_norm": 1.6960781812667847, "kl": 1.1869280133928573, "learning_rate": 2.0713177816067406e-08, "loss": 0.0475, "reward": 0.37777778027313097, "reward_std": 0.25294391640595026, "rewards/accuracy_reward": 0.37777778027313097, "rewards/format_reward": 0.0, "step": 340 }, { "completion_length": 1376.774614606585, "epoch": 0.966, "grad_norm": 1.5884455442428589, "kl": 1.6287388392857143, "learning_rate": 1.0332709056539491e-08, "loss": 0.0652, "reward": 0.4126984155603817, "reward_std": 0.2797271396432604, "rewards/accuracy_reward": 0.4126984155603817, "rewards/format_reward": 0.0, "step": 345 }, { "completion_length": 1331.7777849469867, "epoch": 0.98, "grad_norm": 1.7078304290771484, "kl": 1.332421875, "learning_rate": 3.518658819241127e-09, "loss": 0.0533, "reward": 0.46666666929210937, "reward_std": 0.3140872184719358, "rewards/accuracy_reward": 0.46666666929210937, "rewards/format_reward": 0.0, "step": 350 }, { "epoch": 0.98, "eval_completion_length": 1369.8748053428865, "eval_kl": 1.3092232139509599, "eval_loss": 0.05237572267651558, "eval_reward": 0.4383123438374039, "eval_reward_std": 0.2730543685392055, "eval_rewards/accuracy_reward": 0.4383123438374039, "eval_rewards/format_reward": 0.0, "eval_runtime": 23154.2417, "eval_samples_per_second": 0.216, "eval_steps_per_second": 0.012, "step": 350 }, { "completion_length": 1367.204776436942, "epoch": 0.994, "grad_norm": 2.3219335079193115, "kl": 1.3326729910714286, "learning_rate": 2.873406495697006e-10, "loss": 0.0533, "reward": 0.42380952749933515, "reward_std": 0.2821540274790355, "rewards/accuracy_reward": 0.42380952749933515, "rewards/format_reward": 0.0, "step": 355 }, { "completion_length": 1504.730189732143, "epoch": 0.9996, "kl": 1.1441127232142858, "reward": 0.37301587340022835, "reward_std": 0.19905034665550506, "rewards/accuracy_reward": 0.37301587340022835, "rewards/format_reward": 0.0, "step": 357, "total_flos": 0.0, "train_loss": 0.038403756807618, "train_runtime": 93166.988, "train_samples_per_second": 0.081, "train_steps_per_second": 0.004 } ], "logging_steps": 5, "max_steps": 357, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }