{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8553459119496856, "eval_steps": 500, "global_step": 8500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 218.66326141357422, "epoch": 0.00010062893081761007, "grad_norm": 2.015650510787964, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.5320588201284409, "reward_std": 0.41734498739242554, "rewards/accuracy_reward": 0.25654859840869904, "rewards/format_reward": 0.27551019191741943, "step": 1 }, { "completion_length": 386.7550964355469, "epoch": 0.00020125786163522014, "grad_norm": 3.88326096534729, "kl": 0.00029754638671875, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.489426389336586, "reward_std": 0.4838973730802536, "rewards/accuracy_reward": 0.18330395594239235, "rewards/format_reward": 0.30612245202064514, "step": 2 }, { "completion_length": 225.6836700439453, "epoch": 0.0003018867924528302, "grad_norm": 1.8384490013122559, "kl": 0.0003814697265625, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.7809853553771973, "reward_std": 0.5399021804332733, "rewards/accuracy_reward": 0.25037316232919693, "rewards/format_reward": 0.5306122452020645, "step": 3 }, { "completion_length": 210.03060913085938, "epoch": 0.0004025157232704403, "grad_norm": 1.6739630699157715, "kl": 0.00046253204345703125, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.8277225196361542, "reward_std": 0.37108638882637024, "rewards/accuracy_reward": 0.2767021059989929, "rewards/format_reward": 0.5510204136371613, "step": 4 }, { "completion_length": 240.27550506591797, "epoch": 0.0005031446540880503, "grad_norm": 1.9591482877731323, "kl": 0.0005512237548828125, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.766940712928772, "reward_std": 0.4467408359050751, "rewards/accuracy_reward": 0.3383692651987076, "rewards/format_reward": 0.428571417927742, "step": 5 }, { "completion_length": 257.9183654785156, "epoch": 0.0006037735849056604, "grad_norm": 2.9487502574920654, "kl": 0.0007953643798828125, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.9472275078296661, "reward_std": 0.40865907073020935, "rewards/accuracy_reward": 0.37579889595508575, "rewards/format_reward": 0.5714285522699356, "step": 6 }, { "completion_length": 224.20408248901367, "epoch": 0.0007044025157232704, "grad_norm": 1.8854634761810303, "kl": 0.001125335693359375, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.6581787467002869, "reward_std": 0.4699059873819351, "rewards/accuracy_reward": 0.28062771260738373, "rewards/format_reward": 0.37755101919174194, "step": 7 }, { "completion_length": 194.75509643554688, "epoch": 0.0008050314465408805, "grad_norm": 4.139524936676025, "kl": 0.00130462646484375, "learning_rate": 1e-06, "loss": 0.0001, "reward": 0.8524457514286041, "reward_std": 0.5000613778829575, "rewards/accuracy_reward": 0.24020091071724892, "rewards/format_reward": 0.6122448891401291, "step": 8 }, { "completion_length": 289.2550964355469, "epoch": 0.0009056603773584906, "grad_norm": 1.3969429731369019, "kl": 0.00298309326171875, "learning_rate": 1e-06, "loss": 0.0001, "reward": 0.6265543401241302, "reward_std": 0.504256546497345, "rewards/accuracy_reward": 0.13675842434167862, "rewards/format_reward": 0.48979590833187103, "step": 9 }, { "completion_length": 299.2040710449219, "epoch": 0.0010062893081761006, "grad_norm": 1.3500438928604126, "kl": 0.001617431640625, "learning_rate": 1e-06, "loss": 0.0001, "reward": 0.9410359859466553, "reward_std": 0.5756072402000427, "rewards/accuracy_reward": 0.3083830028772354, "rewards/format_reward": 0.6326530575752258, "step": 10 }, { "completion_length": 178.17346954345703, "epoch": 0.0011069182389937106, "grad_norm": 3.5225138664245605, "kl": 0.0038299560546875, "learning_rate": 1e-06, "loss": 0.0002, "reward": 1.0817062854766846, "reward_std": 0.43187688291072845, "rewards/accuracy_reward": 0.3572165369987488, "rewards/format_reward": 0.7244897782802582, "step": 11 }, { "completion_length": 259.8673324584961, "epoch": 0.0012075471698113208, "grad_norm": 10.97664737701416, "kl": 0.00388336181640625, "learning_rate": 1e-06, "loss": 0.0002, "reward": 0.77753946185112, "reward_std": 0.5170837938785553, "rewards/accuracy_reward": 0.18570270389318466, "rewards/format_reward": 0.5918367207050323, "step": 12 }, { "completion_length": 256.9897918701172, "epoch": 0.0013081761006289308, "grad_norm": 1.432379961013794, "kl": 0.00389862060546875, "learning_rate": 1e-06, "loss": 0.0002, "reward": 1.093548595905304, "reward_std": 0.3822285830974579, "rewards/accuracy_reward": 0.3792629763484001, "rewards/format_reward": 0.7142857015132904, "step": 13 }, { "completion_length": 215.2653045654297, "epoch": 0.0014088050314465409, "grad_norm": 2.010869026184082, "kl": 0.00713348388671875, "learning_rate": 1e-06, "loss": 0.0003, "reward": 1.0191496908664703, "reward_std": 0.5536944270133972, "rewards/accuracy_reward": 0.37629254162311554, "rewards/format_reward": 0.6428571343421936, "step": 14 }, { "completion_length": 245.81632232666016, "epoch": 0.0015094339622641509, "grad_norm": 1.4804081916809082, "kl": 0.004425048828125, "learning_rate": 1e-06, "loss": 0.0002, "reward": 0.7067106366157532, "reward_std": 0.38422340154647827, "rewards/accuracy_reward": 0.21691472828388214, "rewards/format_reward": 0.48979590833187103, "step": 15 }, { "completion_length": 242.2040786743164, "epoch": 0.001610062893081761, "grad_norm": 2.97932505607605, "kl": 0.011871337890625, "learning_rate": 1e-06, "loss": 0.0005, "reward": 1.0819512009620667, "reward_std": 0.5595148056745529, "rewards/accuracy_reward": 0.36766552925109863, "rewards/format_reward": 0.7142857015132904, "step": 16 }, { "completion_length": 222.66326141357422, "epoch": 0.0017106918238993711, "grad_norm": 1.875419020652771, "kl": 0.00392913818359375, "learning_rate": 1e-06, "loss": 0.0002, "reward": 1.1463394165039062, "reward_std": 0.42661353945732117, "rewards/accuracy_reward": 0.3504209965467453, "rewards/format_reward": 0.795918345451355, "step": 17 }, { "completion_length": 248.49999237060547, "epoch": 0.0018113207547169811, "grad_norm": 1.9066600799560547, "kl": 0.0037994384765625, "learning_rate": 1e-06, "loss": 0.0002, "reward": 0.9920322597026825, "reward_std": 0.40084418654441833, "rewards/accuracy_reward": 0.21652212738990784, "rewards/format_reward": 0.7755101919174194, "step": 18 }, { "completion_length": 229.26529693603516, "epoch": 0.0019119496855345911, "grad_norm": 1.9168208837509155, "kl": 0.00411224365234375, "learning_rate": 1e-06, "loss": 0.0002, "reward": 1.203552484512329, "reward_std": 0.393087700009346, "rewards/accuracy_reward": 0.3974299877882004, "rewards/format_reward": 0.8061224222183228, "step": 19 }, { "completion_length": 188.7653045654297, "epoch": 0.002012578616352201, "grad_norm": 2.200718402862549, "kl": 0.011383056640625, "learning_rate": 1e-06, "loss": 0.0005, "reward": 1.0205991864204407, "reward_std": 0.48320214450359344, "rewards/accuracy_reward": 0.3777420371770859, "rewards/format_reward": 0.6428571194410324, "step": 20 }, { "completion_length": 186.2244873046875, "epoch": 0.0021132075471698114, "grad_norm": 1.9553349018096924, "kl": 0.015228271484375, "learning_rate": 1e-06, "loss": 0.0006, "reward": 0.8716634511947632, "reward_std": 0.5606746673583984, "rewards/accuracy_reward": 0.2492145448923111, "rewards/format_reward": 0.6224489510059357, "step": 21 }, { "completion_length": 237.65306091308594, "epoch": 0.002213836477987421, "grad_norm": 1.6654584407806396, "kl": 0.006195068359375, "learning_rate": 1e-06, "loss": 0.0002, "reward": 1.1160717010498047, "reward_std": 0.4403817653656006, "rewards/accuracy_reward": 0.2895411103963852, "rewards/format_reward": 0.8265306055545807, "step": 22 }, { "completion_length": 188.1836700439453, "epoch": 0.0023144654088050314, "grad_norm": 1.9679638147354126, "kl": 0.008026123046875, "learning_rate": 1e-06, "loss": 0.0003, "reward": 1.2749876976013184, "reward_std": 0.5086776167154312, "rewards/accuracy_reward": 0.38723261654376984, "rewards/format_reward": 0.8877550661563873, "step": 23 }, { "completion_length": 233.79591369628906, "epoch": 0.0024150943396226416, "grad_norm": 4.067342758178711, "kl": 0.015777587890625, "learning_rate": 1e-06, "loss": 0.0006, "reward": 1.1043405532836914, "reward_std": 0.4806502163410187, "rewards/accuracy_reward": 0.29821813851594925, "rewards/format_reward": 0.8061224222183228, "step": 24 }, { "completion_length": 167.75509643554688, "epoch": 0.0025157232704402514, "grad_norm": 2.552858829498291, "kl": 0.01983642578125, "learning_rate": 1e-06, "loss": 0.0008, "reward": 1.244903802871704, "reward_std": 0.41207514703273773, "rewards/accuracy_reward": 0.3163324147462845, "rewards/format_reward": 0.9285714030265808, "step": 25 }, { "completion_length": 162.8469352722168, "epoch": 0.0026163522012578617, "grad_norm": 2.061877965927124, "kl": 0.015716552734375, "learning_rate": 1e-06, "loss": 0.0006, "reward": 1.3653060793876648, "reward_std": 0.44440026581287384, "rewards/accuracy_reward": 0.5081632286310196, "rewards/format_reward": 0.857142835855484, "step": 26 }, { "completion_length": 193.6326446533203, "epoch": 0.002716981132075472, "grad_norm": 2.213667392730713, "kl": 0.014495849609375, "learning_rate": 1e-06, "loss": 0.0006, "reward": 1.295918345451355, "reward_std": 0.5167251527309418, "rewards/accuracy_reward": 0.3877550959587097, "rewards/format_reward": 0.9081632494926453, "step": 27 }, { "completion_length": 210.27550506591797, "epoch": 0.0028176100628930817, "grad_norm": 3.7090582847595215, "kl": 0.012115478515625, "learning_rate": 1e-06, "loss": 0.0005, "reward": 1.178530991077423, "reward_std": 0.38119108974933624, "rewards/accuracy_reward": 0.362204447388649, "rewards/format_reward": 0.8163264989852905, "step": 28 }, { "completion_length": 193.28571319580078, "epoch": 0.002918238993710692, "grad_norm": 2.345113515853882, "kl": 0.012054443359375, "learning_rate": 1e-06, "loss": 0.0005, "reward": 1.2076815366744995, "reward_std": 0.42760658264160156, "rewards/accuracy_reward": 0.3505387008190155, "rewards/format_reward": 0.857142835855484, "step": 29 }, { "completion_length": 176.948974609375, "epoch": 0.0030188679245283017, "grad_norm": 2.215028762817383, "kl": 0.00848388671875, "learning_rate": 1e-06, "loss": 0.0003, "reward": 1.3001383543014526, "reward_std": 0.3911035731434822, "rewards/accuracy_reward": 0.3409547209739685, "rewards/format_reward": 0.9591836631298065, "step": 30 }, { "completion_length": 215.38774871826172, "epoch": 0.003119496855345912, "grad_norm": 5.466653347015381, "kl": 0.01416015625, "learning_rate": 1e-06, "loss": 0.0006, "reward": 1.1275387406349182, "reward_std": 0.42305299639701843, "rewards/accuracy_reward": 0.2703959345817566, "rewards/format_reward": 0.857142835855484, "step": 31 }, { "completion_length": 226.26529693603516, "epoch": 0.003220125786163522, "grad_norm": 1.3063735961914062, "kl": 0.0068511962890625, "learning_rate": 1e-06, "loss": 0.0003, "reward": 1.1234276294708252, "reward_std": 0.330197811126709, "rewards/accuracy_reward": 0.22546837478876114, "rewards/format_reward": 0.8979591727256775, "step": 32 }, { "completion_length": 245.9897918701172, "epoch": 0.003320754716981132, "grad_norm": 1.238959789276123, "kl": 0.0126953125, "learning_rate": 1e-06, "loss": 0.0005, "reward": 1.2296133041381836, "reward_std": 0.44070497155189514, "rewards/accuracy_reward": 0.3928786814212799, "rewards/format_reward": 0.8367346823215485, "step": 33 }, { "completion_length": 183.31632232666016, "epoch": 0.0034213836477987422, "grad_norm": 5.499197959899902, "kl": 0.0301513671875, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.4260493516921997, "reward_std": 0.47910386323928833, "rewards/accuracy_reward": 0.5791105031967163, "rewards/format_reward": 0.8469387590885162, "step": 34 }, { "completion_length": 165.91836547851562, "epoch": 0.003522012578616352, "grad_norm": 1.5540889501571655, "kl": 0.01702880859375, "learning_rate": 1e-06, "loss": 0.0007, "reward": 1.2697466015815735, "reward_std": 0.3600970357656479, "rewards/accuracy_reward": 0.46362414956092834, "rewards/format_reward": 0.8061224222183228, "step": 35 }, { "completion_length": 216.7653045654297, "epoch": 0.0036226415094339623, "grad_norm": 2.540203809738159, "kl": 0.013641357421875, "learning_rate": 1e-06, "loss": 0.0005, "reward": 1.3603999614715576, "reward_std": 0.506675511598587, "rewards/accuracy_reward": 0.45223671197891235, "rewards/format_reward": 0.9081632494926453, "step": 36 }, { "completion_length": 221.87754821777344, "epoch": 0.0037232704402515725, "grad_norm": 1.8994758129119873, "kl": 0.0111083984375, "learning_rate": 1e-06, "loss": 0.0004, "reward": 1.0885936915874481, "reward_std": 0.33754852414131165, "rewards/accuracy_reward": 0.32328762114048004, "rewards/format_reward": 0.7653061151504517, "step": 37 }, { "completion_length": 170.54080963134766, "epoch": 0.0038238993710691823, "grad_norm": 2.0362491607666016, "kl": 0.01959228515625, "learning_rate": 1e-06, "loss": 0.0008, "reward": 1.207449734210968, "reward_std": 0.4391282647848129, "rewards/accuracy_reward": 0.4523477256298065, "rewards/format_reward": 0.7551020085811615, "step": 38 }, { "completion_length": 177.38774871826172, "epoch": 0.0039245283018867925, "grad_norm": 2.673349618911743, "kl": 0.03179931640625, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.345836102962494, "reward_std": 0.5096787661314011, "rewards/accuracy_reward": 0.43767283856868744, "rewards/format_reward": 0.9081632494926453, "step": 39 }, { "completion_length": 199.86734771728516, "epoch": 0.004025157232704402, "grad_norm": 5.830787181854248, "kl": 0.01934814453125, "learning_rate": 1e-06, "loss": 0.0008, "reward": 1.312662661075592, "reward_std": 0.3929029852151871, "rewards/accuracy_reward": 0.34327487647533417, "rewards/format_reward": 0.9693877398967743, "step": 40 }, { "completion_length": 233.78570556640625, "epoch": 0.004125786163522013, "grad_norm": 1.339140772819519, "kl": 0.01849365234375, "learning_rate": 1e-06, "loss": 0.0007, "reward": 1.4504382610321045, "reward_std": 0.48154082894325256, "rewards/accuracy_reward": 0.4912545531988144, "rewards/format_reward": 0.9591836631298065, "step": 41 }, { "completion_length": 231.1938705444336, "epoch": 0.004226415094339623, "grad_norm": 1.9599303007125854, "kl": 0.014251708984375, "learning_rate": 1e-06, "loss": 0.0006, "reward": 1.303201675415039, "reward_std": 0.3173559904098511, "rewards/accuracy_reward": 0.4562629461288452, "rewards/format_reward": 0.8469387590885162, "step": 42 }, { "completion_length": 146.78571319580078, "epoch": 0.004327044025157233, "grad_norm": 2.4373416900634766, "kl": 0.032470703125, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.3887422680854797, "reward_std": 0.434279665350914, "rewards/accuracy_reward": 0.4091504365205765, "rewards/format_reward": 0.9795918464660645, "step": 43 }, { "completion_length": 230.06121826171875, "epoch": 0.004427672955974842, "grad_norm": 1.4456373453140259, "kl": 0.02508544921875, "learning_rate": 1e-06, "loss": 0.001, "reward": 1.1824631094932556, "reward_std": 0.31812720000743866, "rewards/accuracy_reward": 0.3661365732550621, "rewards/format_reward": 0.8163265287876129, "step": 44 }, { "completion_length": 186.35713958740234, "epoch": 0.004528301886792453, "grad_norm": 4.292947292327881, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.1900931000709534, "reward_std": 0.45047585666179657, "rewards/accuracy_reward": 0.4247870445251465, "rewards/format_reward": 0.7653060853481293, "step": 45 }, { "completion_length": 152.69387817382812, "epoch": 0.004628930817610063, "grad_norm": 1.7692022323608398, "kl": 0.02880859375, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.450366199016571, "reward_std": 0.36884360015392303, "rewards/accuracy_reward": 0.5422029197216034, "rewards/format_reward": 0.9081632494926453, "step": 46 }, { "completion_length": 197.4591827392578, "epoch": 0.004729559748427673, "grad_norm": 1.2226147651672363, "kl": 0.04296875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.3553282022476196, "reward_std": 0.3244319036602974, "rewards/accuracy_reward": 0.4777772128582001, "rewards/format_reward": 0.8775510191917419, "step": 47 }, { "completion_length": 230.35713958740234, "epoch": 0.004830188679245283, "grad_norm": 1.536458969116211, "kl": 0.0254364013671875, "learning_rate": 1e-06, "loss": 0.001, "reward": 1.1587249636650085, "reward_std": 0.2506393790245056, "rewards/accuracy_reward": 0.3934188038110733, "rewards/format_reward": 0.7653060853481293, "step": 48 }, { "completion_length": 184.60203552246094, "epoch": 0.004930817610062893, "grad_norm": 1.5703017711639404, "kl": 0.01934814453125, "learning_rate": 1e-06, "loss": 0.0008, "reward": 1.3072054386138916, "reward_std": 0.37715375423431396, "rewards/accuracy_reward": 0.4092463403940201, "rewards/format_reward": 0.8979591727256775, "step": 49 }, { "completion_length": 243.9693832397461, "epoch": 0.005031446540880503, "grad_norm": 1.9244974851608276, "kl": 0.0296630859375, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.2140928506851196, "reward_std": 0.39367322623729706, "rewards/accuracy_reward": 0.3263378217816353, "rewards/format_reward": 0.8877550661563873, "step": 50 }, { "completion_length": 188.70407485961914, "epoch": 0.0051320754716981136, "grad_norm": 1.31553316116333, "kl": 0.025726318359375, "learning_rate": 1e-06, "loss": 0.001, "reward": 1.236413836479187, "reward_std": 0.28811372071504593, "rewards/accuracy_reward": 0.38947510719299316, "rewards/format_reward": 0.8469387590885162, "step": 51 }, { "completion_length": 167.9897918701172, "epoch": 0.005232704402515723, "grad_norm": 2.269272804260254, "kl": 0.0643310546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5712640285491943, "reward_std": 0.3051300197839737, "rewards/accuracy_reward": 0.6937129497528076, "rewards/format_reward": 0.8775510191917419, "step": 52 }, { "completion_length": 244.28571319580078, "epoch": 0.005333333333333333, "grad_norm": 2.3951053619384766, "kl": 0.029449462890625, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.301635503768921, "reward_std": 0.35926653444767, "rewards/accuracy_reward": 0.42408451437950134, "rewards/format_reward": 0.8775510191917419, "step": 53 }, { "completion_length": 235.54080963134766, "epoch": 0.005433962264150944, "grad_norm": 1.4144891500473022, "kl": 0.0230712890625, "learning_rate": 1e-06, "loss": 0.0009, "reward": 1.2807680368423462, "reward_std": 0.326799213886261, "rewards/accuracy_reward": 0.44403330981731415, "rewards/format_reward": 0.8367346525192261, "step": 54 }, { "completion_length": 161.3163299560547, "epoch": 0.005534591194968554, "grad_norm": 1.2677497863769531, "kl": 0.022369384765625, "learning_rate": 1e-06, "loss": 0.0009, "reward": 1.5859571695327759, "reward_std": 0.24562766402959824, "rewards/accuracy_reward": 0.6063653975725174, "rewards/format_reward": 0.9795918166637421, "step": 55 }, { "completion_length": 233.948974609375, "epoch": 0.005635220125786163, "grad_norm": 1.711936116218567, "kl": 0.0101318359375, "learning_rate": 1e-06, "loss": 0.0004, "reward": 1.1225330829620361, "reward_std": 0.39600086212158203, "rewards/accuracy_reward": 0.21436983346939087, "rewards/format_reward": 0.9081632494926453, "step": 56 }, { "completion_length": 188.4591827392578, "epoch": 0.005735849056603773, "grad_norm": 1.7512102127075195, "kl": 0.0172119140625, "learning_rate": 1e-06, "loss": 0.0007, "reward": 1.4712687730789185, "reward_std": 0.40831591188907623, "rewards/accuracy_reward": 0.5937176644802094, "rewards/format_reward": 0.8775510191917419, "step": 57 }, { "completion_length": 174.03060913085938, "epoch": 0.005836477987421384, "grad_norm": 3.158796787261963, "kl": 0.0230712890625, "learning_rate": 1e-06, "loss": 0.0009, "reward": 1.4553990960121155, "reward_std": 0.29773109406232834, "rewards/accuracy_reward": 0.5370317697525024, "rewards/format_reward": 0.9183673560619354, "step": 58 }, { "completion_length": 226.76529693603516, "epoch": 0.005937106918238994, "grad_norm": 2.1232223510742188, "kl": 0.021087646484375, "learning_rate": 1e-06, "loss": 0.0008, "reward": 1.3513818979263306, "reward_std": 0.4034799188375473, "rewards/accuracy_reward": 0.4432186037302017, "rewards/format_reward": 0.9081632494926453, "step": 59 }, { "completion_length": 202.10204315185547, "epoch": 0.0060377358490566035, "grad_norm": 1.826830267906189, "kl": 0.01727294921875, "learning_rate": 1e-06, "loss": 0.0007, "reward": 1.1981303989887238, "reward_std": 0.3026905804872513, "rewards/accuracy_reward": 0.3920079469680786, "rewards/format_reward": 0.8061224222183228, "step": 60 }, { "completion_length": 205.86734771728516, "epoch": 0.006138364779874214, "grad_norm": 1.5655802488327026, "kl": 0.02008056640625, "learning_rate": 1e-06, "loss": 0.0008, "reward": 1.24322110414505, "reward_std": 0.4466005861759186, "rewards/accuracy_reward": 0.38607820868492126, "rewards/format_reward": 0.857142835855484, "step": 61 }, { "completion_length": 179.61223602294922, "epoch": 0.006238993710691824, "grad_norm": 1.6791610717773438, "kl": 0.01947021484375, "learning_rate": 1e-06, "loss": 0.0008, "reward": 1.374707281589508, "reward_std": 0.32250964641571045, "rewards/accuracy_reward": 0.4461359530687332, "rewards/format_reward": 0.9285714328289032, "step": 62 }, { "completion_length": 148.0408172607422, "epoch": 0.006339622641509434, "grad_norm": 2.6978964805603027, "kl": 0.039794921875, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.55585116147995, "reward_std": 0.38698646426200867, "rewards/accuracy_reward": 0.5864635109901428, "rewards/format_reward": 0.9693877398967743, "step": 63 }, { "completion_length": 254.57142639160156, "epoch": 0.006440251572327044, "grad_norm": 1.2517900466918945, "kl": 0.0260009765625, "learning_rate": 1e-06, "loss": 0.001, "reward": 1.3167973756790161, "reward_std": 0.4095155894756317, "rewards/accuracy_reward": 0.4290423095226288, "rewards/format_reward": 0.8877550959587097, "step": 64 }, { "completion_length": 192.9795913696289, "epoch": 0.006540880503144654, "grad_norm": 1.4680166244506836, "kl": 0.020050048828125, "learning_rate": 1e-06, "loss": 0.0008, "reward": 1.3995471000671387, "reward_std": 0.3667837828397751, "rewards/accuracy_reward": 0.44036345183849335, "rewards/format_reward": 0.9591836631298065, "step": 65 }, { "completion_length": 222.4897918701172, "epoch": 0.006641509433962264, "grad_norm": 1.3766028881072998, "kl": 0.012420654296875, "learning_rate": 1e-06, "loss": 0.0005, "reward": 1.3032156825065613, "reward_std": 0.4107082635164261, "rewards/accuracy_reward": 0.45627695322036743, "rewards/format_reward": 0.8469387590885162, "step": 66 }, { "completion_length": 149.59183502197266, "epoch": 0.006742138364779874, "grad_norm": 1.931784987449646, "kl": 0.0303955078125, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.4193130135536194, "reward_std": 0.42195090651512146, "rewards/accuracy_reward": 0.43972113728523254, "rewards/format_reward": 0.9795918464660645, "step": 67 }, { "completion_length": 205.91836547851562, "epoch": 0.0068427672955974845, "grad_norm": 1.6071871519088745, "kl": 0.02203369140625, "learning_rate": 1e-06, "loss": 0.0009, "reward": 1.2616899013519287, "reward_std": 0.35774481296539307, "rewards/accuracy_reward": 0.2923021763563156, "rewards/format_reward": 0.9693877398967743, "step": 68 }, { "completion_length": 202.57142639160156, "epoch": 0.006943396226415094, "grad_norm": 1.4991306066513062, "kl": 0.02789306640625, "learning_rate": 1e-06, "loss": 0.0011, "reward": 1.3642263412475586, "reward_std": 0.4622096121311188, "rewards/accuracy_reward": 0.445858970284462, "rewards/format_reward": 0.918367326259613, "step": 69 }, { "completion_length": 161.2244873046875, "epoch": 0.007044025157232704, "grad_norm": 1.640189290046692, "kl": 0.022613525390625, "learning_rate": 1e-06, "loss": 0.0009, "reward": 1.338890016078949, "reward_std": 0.4491671025753021, "rewards/accuracy_reward": 0.4205225855112076, "rewards/format_reward": 0.918367326259613, "step": 70 }, { "completion_length": 198.21428680419922, "epoch": 0.007144654088050315, "grad_norm": 1.7003629207611084, "kl": 0.021575927734375, "learning_rate": 1e-06, "loss": 0.0009, "reward": 1.3938804268836975, "reward_std": 0.4147895872592926, "rewards/accuracy_reward": 0.45510485768318176, "rewards/format_reward": 0.9387754797935486, "step": 71 }, { "completion_length": 170.42856979370117, "epoch": 0.0072452830188679245, "grad_norm": 1.7991348505020142, "kl": 0.04541015625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.4359290599822998, "reward_std": 0.4439338594675064, "rewards/accuracy_reward": 0.4971535950899124, "rewards/format_reward": 0.938775509595871, "step": 72 }, { "completion_length": 193.84693145751953, "epoch": 0.007345911949685534, "grad_norm": 1.3896877765655518, "kl": 0.018310546875, "learning_rate": 1e-06, "loss": 0.0007, "reward": 1.3469387292861938, "reward_std": 0.41438667476177216, "rewards/accuracy_reward": 0.367346927523613, "rewards/format_reward": 0.9795918166637421, "step": 73 }, { "completion_length": 169.15305709838867, "epoch": 0.007446540880503145, "grad_norm": 2.177089214324951, "kl": 0.0328369140625, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.6151602864265442, "reward_std": 0.3620704412460327, "rewards/accuracy_reward": 0.6253643929958344, "rewards/format_reward": 0.9897959232330322, "step": 74 }, { "completion_length": 131.21428298950195, "epoch": 0.007547169811320755, "grad_norm": 2.4660096168518066, "kl": 0.0345458984375, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.482028841972351, "reward_std": 0.3827500492334366, "rewards/accuracy_reward": 0.49223288893699646, "rewards/format_reward": 0.9897959232330322, "step": 75 }, { "completion_length": 233.30612182617188, "epoch": 0.007647798742138365, "grad_norm": 1.2720917463302612, "kl": 0.0216064453125, "learning_rate": 1e-06, "loss": 0.0009, "reward": 1.3476522564888, "reward_std": 0.35342904925346375, "rewards/accuracy_reward": 0.368060439825058, "rewards/format_reward": 0.9795918166637421, "step": 76 }, { "completion_length": 287.6632614135742, "epoch": 0.007748427672955975, "grad_norm": 1.145920991897583, "kl": 0.0269775390625, "learning_rate": 1e-06, "loss": 0.0011, "reward": 1.311145007610321, "reward_std": 0.4035676270723343, "rewards/accuracy_reward": 0.3519614040851593, "rewards/format_reward": 0.9591836631298065, "step": 77 }, { "completion_length": 215.06121826171875, "epoch": 0.007849056603773585, "grad_norm": 2.3808581829071045, "kl": 0.02752685546875, "learning_rate": 1e-06, "loss": 0.0011, "reward": 1.3739957809448242, "reward_std": 0.27003736048936844, "rewards/accuracy_reward": 0.4148120880126953, "rewards/format_reward": 0.9591836631298065, "step": 78 }, { "completion_length": 215.6530532836914, "epoch": 0.007949685534591196, "grad_norm": 1.4775151014328003, "kl": 0.0477294921875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.2653061151504517, "reward_std": 0.30805227160453796, "rewards/accuracy_reward": 0.2857142835855484, "rewards/format_reward": 0.9795918166637421, "step": 79 }, { "completion_length": 153.39795684814453, "epoch": 0.008050314465408805, "grad_norm": 3.4179086685180664, "kl": 0.03570556640625, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.5040816068649292, "reward_std": 0.36895547807216644, "rewards/accuracy_reward": 0.5244897902011871, "rewards/format_reward": 0.9795918166637421, "step": 80 }, { "completion_length": 129.08162689208984, "epoch": 0.008150943396226415, "grad_norm": 3.115170955657959, "kl": 0.02679443359375, "learning_rate": 1e-06, "loss": 0.0011, "reward": 1.4938289523124695, "reward_std": 0.3534407317638397, "rewards/accuracy_reward": 0.4938289523124695, "rewards/format_reward": 1.0, "step": 81 }, { "completion_length": 286.4897918701172, "epoch": 0.008251572327044026, "grad_norm": 1.0171079635620117, "kl": 0.023193359375, "learning_rate": 1e-06, "loss": 0.0009, "reward": 1.3509610295295715, "reward_std": 0.3746196925640106, "rewards/accuracy_reward": 0.3611650913953781, "rewards/format_reward": 0.9897959232330322, "step": 82 }, { "completion_length": 162.65306091308594, "epoch": 0.008352201257861635, "grad_norm": 2.018052339553833, "kl": 0.0474853515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.4952380657196045, "reward_std": 0.3993731737136841, "rewards/accuracy_reward": 0.5156462490558624, "rewards/format_reward": 0.9795918464660645, "step": 83 }, { "completion_length": 163.60203552246094, "epoch": 0.008452830188679246, "grad_norm": 1.6173856258392334, "kl": 0.044189453125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.5027210712432861, "reward_std": 0.34466859698295593, "rewards/accuracy_reward": 0.5027211010456085, "rewards/format_reward": 1.0, "step": 84 }, { "completion_length": 159.7551040649414, "epoch": 0.008553459119496854, "grad_norm": 1.8835937976837158, "kl": 0.029052734375, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.414680004119873, "reward_std": 0.3570461869239807, "rewards/accuracy_reward": 0.435088187456131, "rewards/format_reward": 0.9795918166637421, "step": 85 }, { "completion_length": 157.4693832397461, "epoch": 0.008654088050314465, "grad_norm": 3.629713296890259, "kl": 0.0267333984375, "learning_rate": 1e-06, "loss": 0.0011, "reward": 1.5318419337272644, "reward_std": 0.40467900037765503, "rewards/accuracy_reward": 0.5420460402965546, "rewards/format_reward": 0.9897959232330322, "step": 86 }, { "completion_length": 193.15306091308594, "epoch": 0.008754716981132076, "grad_norm": 2.160449981689453, "kl": 0.03033447265625, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.4919986128807068, "reward_std": 0.3937499672174454, "rewards/accuracy_reward": 0.5124068558216095, "rewards/format_reward": 0.9795918166637421, "step": 87 }, { "completion_length": 220.9081573486328, "epoch": 0.008855345911949685, "grad_norm": 2.5356836318969727, "kl": 0.048583984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5852283835411072, "reward_std": 0.3513483554124832, "rewards/accuracy_reward": 0.6056365370750427, "rewards/format_reward": 0.9795918166637421, "step": 88 }, { "completion_length": 213.90816497802734, "epoch": 0.008955974842767295, "grad_norm": 5.228051662445068, "kl": 0.0389404296875, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.4694042205810547, "reward_std": 0.37774816155433655, "rewards/accuracy_reward": 0.5000164955854416, "rewards/format_reward": 0.9693877398967743, "step": 89 }, { "completion_length": 197.16326141357422, "epoch": 0.009056603773584906, "grad_norm": 1.683003544807434, "kl": 0.02923583984375, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.3483662605285645, "reward_std": 0.420497328042984, "rewards/accuracy_reward": 0.3789784610271454, "rewards/format_reward": 0.9693877398967743, "step": 90 }, { "completion_length": 174.11224365234375, "epoch": 0.009157232704402515, "grad_norm": 1.985316514968872, "kl": 0.0477294921875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5789050459861755, "reward_std": 0.32658664882183075, "rewards/accuracy_reward": 0.5993131399154663, "rewards/format_reward": 0.9795918464660645, "step": 91 }, { "completion_length": 156.43877029418945, "epoch": 0.009257861635220126, "grad_norm": 1.8593297004699707, "kl": 0.0406494140625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.4518171548843384, "reward_std": 0.30012810230255127, "rewards/accuracy_reward": 0.47222527861595154, "rewards/format_reward": 0.9795918166637421, "step": 92 }, { "completion_length": 232.12244415283203, "epoch": 0.009358490566037736, "grad_norm": 1.78206467628479, "kl": 0.0274658203125, "learning_rate": 1e-06, "loss": 0.0011, "reward": 1.5089933276176453, "reward_std": 0.3736242800951004, "rewards/accuracy_reward": 0.5294015109539032, "rewards/format_reward": 0.9795918166637421, "step": 93 }, { "completion_length": 195.62244415283203, "epoch": 0.009459119496855345, "grad_norm": 2.8148257732391357, "kl": 0.03753662109375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.3791208863258362, "reward_std": 0.4135039299726486, "rewards/accuracy_reward": 0.39952903985977173, "rewards/format_reward": 0.9795918166637421, "step": 94 }, { "completion_length": 176.87754821777344, "epoch": 0.009559748427672956, "grad_norm": 1.9326462745666504, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.4229348301887512, "reward_std": 0.2234322428703308, "rewards/accuracy_reward": 0.4229348599910736, "rewards/format_reward": 1.0, "step": 95 }, { "completion_length": 146.62244415283203, "epoch": 0.009660377358490567, "grad_norm": 1.601141095161438, "kl": 0.03106689453125, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.5714285373687744, "reward_std": 0.3158058524131775, "rewards/accuracy_reward": 0.5816326439380646, "rewards/format_reward": 0.9897959232330322, "step": 96 }, { "completion_length": 209.9285659790039, "epoch": 0.009761006289308176, "grad_norm": 2.884089708328247, "kl": 0.02459716796875, "learning_rate": 1e-06, "loss": 0.001, "reward": 1.4265261888504028, "reward_std": 0.36789292097091675, "rewards/accuracy_reward": 0.44693438708782196, "rewards/format_reward": 0.9795918166637421, "step": 97 }, { "completion_length": 187.32653045654297, "epoch": 0.009861635220125786, "grad_norm": 1.59282386302948, "kl": 0.05224609375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5001183152198792, "reward_std": 0.36488576233386993, "rewards/accuracy_reward": 0.5307305604219437, "rewards/format_reward": 0.9693877398967743, "step": 98 }, { "completion_length": 184.7244873046875, "epoch": 0.009962264150943397, "grad_norm": 2.7011594772338867, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5078230500221252, "reward_std": 0.4484402537345886, "rewards/accuracy_reward": 0.5384353697299957, "rewards/format_reward": 0.9693877398967743, "step": 99 }, { "completion_length": 209.6836700439453, "epoch": 0.010062893081761006, "grad_norm": 1.9062658548355103, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.2926797270774841, "reward_std": 0.4193524718284607, "rewards/accuracy_reward": 0.3743124157190323, "rewards/format_reward": 0.918367326259613, "step": 100 }, { "completion_length": 169.2346954345703, "epoch": 0.010163522012578616, "grad_norm": 1.856393575668335, "kl": 0.0396728515625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.5122923851013184, "reward_std": 0.39982539415359497, "rewards/accuracy_reward": 0.5224964767694473, "rewards/format_reward": 0.9897959232330322, "step": 101 }, { "completion_length": 180.85713958740234, "epoch": 0.010264150943396227, "grad_norm": 1.9084393978118896, "kl": 0.05517578125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.542857050895691, "reward_std": 0.4078316390514374, "rewards/accuracy_reward": 0.5632653087377548, "rewards/format_reward": 0.9795918464660645, "step": 102 }, { "completion_length": 178.66326141357422, "epoch": 0.010364779874213836, "grad_norm": 2.2284162044525146, "kl": 0.0487060546875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6295552849769592, "reward_std": 0.3908209502696991, "rewards/accuracy_reward": 0.6397594511508942, "rewards/format_reward": 0.9897959232330322, "step": 103 }, { "completion_length": 235.43877410888672, "epoch": 0.010465408805031447, "grad_norm": 1.225446105003357, "kl": 0.037109375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5216540098190308, "reward_std": 0.2957613468170166, "rewards/accuracy_reward": 0.5420622527599335, "rewards/format_reward": 0.9795918464660645, "step": 104 }, { "completion_length": 189.38774871826172, "epoch": 0.010566037735849057, "grad_norm": 4.283112525939941, "kl": 0.11376953125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.476283311843872, "reward_std": 0.441458061337471, "rewards/accuracy_reward": 0.5170995444059372, "rewards/format_reward": 0.9591836631298065, "step": 105 }, { "completion_length": 208.15306091308594, "epoch": 0.010666666666666666, "grad_norm": 1.4162559509277344, "kl": 0.04119873046875, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.4011029601097107, "reward_std": 0.3697906732559204, "rewards/accuracy_reward": 0.4317152500152588, "rewards/format_reward": 0.9693877398967743, "step": 106 }, { "completion_length": 157.67346954345703, "epoch": 0.010767295597484277, "grad_norm": 2.052203893661499, "kl": 0.0482177734375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.46151602268219, "reward_std": 0.2616165801882744, "rewards/accuracy_reward": 0.49212829768657684, "rewards/format_reward": 0.9693877398967743, "step": 107 }, { "completion_length": 277.8673400878906, "epoch": 0.010867924528301888, "grad_norm": 1.2249630689620972, "kl": 0.03277587890625, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.3910057544708252, "reward_std": 0.36952435970306396, "rewards/accuracy_reward": 0.4216180741786957, "rewards/format_reward": 0.9693877398967743, "step": 108 }, { "completion_length": 182.07142639160156, "epoch": 0.010968553459119497, "grad_norm": 3.1504170894622803, "kl": 0.047607421875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.417526364326477, "reward_std": 0.4146096557378769, "rewards/accuracy_reward": 0.4787507951259613, "rewards/format_reward": 0.938775509595871, "step": 109 }, { "completion_length": 220.448974609375, "epoch": 0.011069182389937107, "grad_norm": 1.4062131643295288, "kl": 0.021453857421875, "learning_rate": 1e-06, "loss": 0.0009, "reward": 1.450389802455902, "reward_std": 0.40936553478240967, "rewards/accuracy_reward": 0.4605938047170639, "rewards/format_reward": 0.9897959232330322, "step": 110 }, { "completion_length": 181.19387817382812, "epoch": 0.011169811320754716, "grad_norm": 1.4759414196014404, "kl": 0.02850341796875, "learning_rate": 1e-06, "loss": 0.0011, "reward": 1.4761905074119568, "reward_std": 0.3353823274374008, "rewards/accuracy_reward": 0.4761904925107956, "rewards/format_reward": 1.0, "step": 111 }, { "completion_length": 191.23468780517578, "epoch": 0.011270440251572327, "grad_norm": 2.338186025619507, "kl": 0.02813720703125, "learning_rate": 1e-06, "loss": 0.0011, "reward": 1.515773355960846, "reward_std": 0.425983265042305, "rewards/accuracy_reward": 0.5463855862617493, "rewards/format_reward": 0.9693877398967743, "step": 112 }, { "completion_length": 244.05101776123047, "epoch": 0.011371069182389937, "grad_norm": 5.024562835693359, "kl": 0.0462646484375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.363284945487976, "reward_std": 0.39820416271686554, "rewards/accuracy_reward": 0.4245094805955887, "rewards/format_reward": 0.9387754797935486, "step": 113 }, { "completion_length": 240.91836547851562, "epoch": 0.011471698113207546, "grad_norm": 2.197408437728882, "kl": 0.053253173828125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.4222386479377747, "reward_std": 0.3634347915649414, "rewards/accuracy_reward": 0.4324427545070648, "rewards/format_reward": 0.9897959232330322, "step": 114 }, { "completion_length": 206.70407104492188, "epoch": 0.011572327044025157, "grad_norm": 1.4912091493606567, "kl": 0.02081298828125, "learning_rate": 1e-06, "loss": 0.0008, "reward": 1.5174927115440369, "reward_std": 0.3339982330799103, "rewards/accuracy_reward": 0.5174926966428757, "rewards/format_reward": 1.0, "step": 115 }, { "completion_length": 222.21428680419922, "epoch": 0.011672955974842768, "grad_norm": 1.415444254875183, "kl": 0.02947998046875, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.450365126132965, "reward_std": 0.4279673397541046, "rewards/accuracy_reward": 0.4707733243703842, "rewards/format_reward": 0.9795918464660645, "step": 116 }, { "completion_length": 262.2346954345703, "epoch": 0.011773584905660377, "grad_norm": 1.2936384677886963, "kl": 0.021636962890625, "learning_rate": 1e-06, "loss": 0.0009, "reward": 1.3703129291534424, "reward_std": 0.4735078513622284, "rewards/accuracy_reward": 0.4417415261268616, "rewards/format_reward": 0.9285714030265808, "step": 117 }, { "completion_length": 231.88775634765625, "epoch": 0.011874213836477987, "grad_norm": 3.7409467697143555, "kl": 0.03314208984375, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.4348214864730835, "reward_std": 0.34981703758239746, "rewards/accuracy_reward": 0.4858419746160507, "rewards/format_reward": 0.9489795565605164, "step": 118 }, { "completion_length": 248.04080963134766, "epoch": 0.011974842767295598, "grad_norm": 1.5523029565811157, "kl": 0.0234375, "learning_rate": 1e-06, "loss": 0.0009, "reward": 1.4352046847343445, "reward_std": 0.455160528421402, "rewards/accuracy_reward": 0.5372455716133118, "rewards/format_reward": 0.8979591727256775, "step": 119 }, { "completion_length": 204.62245178222656, "epoch": 0.012075471698113207, "grad_norm": 1.4514662027359009, "kl": 0.04046630859375, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.5699405074119568, "reward_std": 0.36879126727581024, "rewards/accuracy_reward": 0.6005527824163437, "rewards/format_reward": 0.9693877398967743, "step": 120 }, { "completion_length": 251.62245178222656, "epoch": 0.012176100628930818, "grad_norm": 1.6343716382980347, "kl": 0.03515625, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.4826481342315674, "reward_std": 0.3750191479921341, "rewards/accuracy_reward": 0.5030563920736313, "rewards/format_reward": 0.9795918464660645, "step": 121 }, { "completion_length": 246.30611419677734, "epoch": 0.012276729559748428, "grad_norm": 1.0869419574737549, "kl": 0.0296630859375, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.5568512678146362, "reward_std": 0.20377665758132935, "rewards/accuracy_reward": 0.5670553743839264, "rewards/format_reward": 0.9897959232330322, "step": 122 }, { "completion_length": 154.88774871826172, "epoch": 0.012377358490566037, "grad_norm": 1.3178097009658813, "kl": 0.0526123046875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6328712105751038, "reward_std": 0.31180860102176666, "rewards/accuracy_reward": 0.6532793939113617, "rewards/format_reward": 0.9795918166637421, "step": 123 }, { "completion_length": 245.5408172607422, "epoch": 0.012477987421383648, "grad_norm": 1.2893092632293701, "kl": 0.02374267578125, "learning_rate": 1e-06, "loss": 0.001, "reward": 1.3700224161148071, "reward_std": 0.3297482579946518, "rewards/accuracy_reward": 0.4414509981870651, "rewards/format_reward": 0.9285714030265808, "step": 124 }, { "completion_length": 183.99999237060547, "epoch": 0.012578616352201259, "grad_norm": 2.6214048862457275, "kl": 0.04541015625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.4920634627342224, "reward_std": 0.4623621702194214, "rewards/accuracy_reward": 0.5328797996044159, "rewards/format_reward": 0.9591836631298065, "step": 125 }, { "completion_length": 169.83673095703125, "epoch": 0.012679245283018867, "grad_norm": 1.8131129741668701, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.4098113179206848, "reward_std": 0.3551645427942276, "rewards/accuracy_reward": 0.4812399744987488, "rewards/format_reward": 0.9285714030265808, "step": 126 }, { "completion_length": 250.4795913696289, "epoch": 0.012779874213836478, "grad_norm": 1.356493592262268, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5837076902389526, "reward_std": 0.29582685232162476, "rewards/accuracy_reward": 0.604115903377533, "rewards/format_reward": 0.9795918166637421, "step": 127 }, { "completion_length": 288.45916748046875, "epoch": 0.012880503144654089, "grad_norm": 2.1139416694641113, "kl": 0.04852294921875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.3962435126304626, "reward_std": 0.361464262008667, "rewards/accuracy_reward": 0.4268558397889137, "rewards/format_reward": 0.9693877398967743, "step": 128 }, { "completion_length": 207.35713958740234, "epoch": 0.012981132075471698, "grad_norm": 1.516575813293457, "kl": 0.0399169921875, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.4523965120315552, "reward_std": 0.37182775139808655, "rewards/accuracy_reward": 0.4830087721347809, "rewards/format_reward": 0.9693877398967743, "step": 129 }, { "completion_length": 245.11224365234375, "epoch": 0.013081761006289308, "grad_norm": 1.986663818359375, "kl": 0.02764892578125, "learning_rate": 1e-06, "loss": 0.0011, "reward": 1.5517985224723816, "reward_std": 0.4598006457090378, "rewards/accuracy_reward": 0.5926147997379303, "rewards/format_reward": 0.9591836631298065, "step": 130 }, { "completion_length": 272.9591751098633, "epoch": 0.013182389937106919, "grad_norm": 2.034482717514038, "kl": 0.02362060546875, "learning_rate": 1e-06, "loss": 0.0009, "reward": 1.4065791964530945, "reward_std": 0.42826618254184723, "rewards/accuracy_reward": 0.4371914565563202, "rewards/format_reward": 0.9693877398967743, "step": 131 }, { "completion_length": 225.53060913085938, "epoch": 0.013283018867924528, "grad_norm": 1.4059189558029175, "kl": 0.02191162109375, "learning_rate": 1e-06, "loss": 0.0009, "reward": 1.5705944895744324, "reward_std": 0.16564694792032242, "rewards/accuracy_reward": 0.5705945491790771, "rewards/format_reward": 1.0, "step": 132 }, { "completion_length": 268.4591751098633, "epoch": 0.013383647798742139, "grad_norm": 1.647763729095459, "kl": 0.027435302734375, "learning_rate": 1e-06, "loss": 0.0011, "reward": 1.4182372689247131, "reward_std": 0.49757710099220276, "rewards/accuracy_reward": 0.4692576676607132, "rewards/format_reward": 0.9489795565605164, "step": 133 }, { "completion_length": 303.32653045654297, "epoch": 0.013484276729559748, "grad_norm": 1.882201075553894, "kl": 0.0433349609375, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.4475135803222656, "reward_std": 0.39644575119018555, "rewards/accuracy_reward": 0.4883299469947815, "rewards/format_reward": 0.9591836333274841, "step": 134 }, { "completion_length": 218.56121826171875, "epoch": 0.013584905660377358, "grad_norm": 1.8356863260269165, "kl": 0.04345703125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5731291770935059, "reward_std": 0.3894544839859009, "rewards/accuracy_reward": 0.6139455735683441, "rewards/format_reward": 0.9591836631298065, "step": 135 }, { "completion_length": 222.35713958740234, "epoch": 0.013685534591194969, "grad_norm": 2.3738315105438232, "kl": 0.032958984375, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.5477405190467834, "reward_std": 0.40568891167640686, "rewards/accuracy_reward": 0.5987609028816223, "rewards/format_reward": 0.9489795863628387, "step": 136 }, { "completion_length": 206.27550506591797, "epoch": 0.013786163522012578, "grad_norm": 1.5954742431640625, "kl": 0.042724609375, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.4981634616851807, "reward_std": 0.3208349347114563, "rewards/accuracy_reward": 0.5287757366895676, "rewards/format_reward": 0.9693877398967743, "step": 137 }, { "completion_length": 241.2448959350586, "epoch": 0.013886792452830189, "grad_norm": 1.397955298423767, "kl": 0.03564453125, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.4725573658943176, "reward_std": 0.2751447707414627, "rewards/accuracy_reward": 0.4827614426612854, "rewards/format_reward": 0.9897959232330322, "step": 138 }, { "completion_length": 159.4693832397461, "epoch": 0.0139874213836478, "grad_norm": 1.7167046070098877, "kl": 0.0494384765625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.632652997970581, "reward_std": 0.294442281126976, "rewards/accuracy_reward": 0.6530612111091614, "rewards/format_reward": 0.9795918166637421, "step": 139 }, { "completion_length": 219.47958374023438, "epoch": 0.014088050314465408, "grad_norm": 1.4060781002044678, "kl": 0.051025390625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5997615456581116, "reward_std": 0.37977585196495056, "rewards/accuracy_reward": 0.6303738057613373, "rewards/format_reward": 0.9693877398967743, "step": 140 }, { "completion_length": 270.52040100097656, "epoch": 0.014188679245283019, "grad_norm": 1.3270539045333862, "kl": 0.036865234375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.3108924627304077, "reward_std": 0.2267511785030365, "rewards/accuracy_reward": 0.3210965394973755, "rewards/format_reward": 0.9897959232330322, "step": 141 }, { "completion_length": 197.86734008789062, "epoch": 0.01428930817610063, "grad_norm": 3.6983816623687744, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5829088687896729, "reward_std": 0.32486075907945633, "rewards/accuracy_reward": 0.5931130349636078, "rewards/format_reward": 0.9897959232330322, "step": 142 }, { "completion_length": 182.69387817382812, "epoch": 0.014389937106918238, "grad_norm": 2.256887912750244, "kl": 0.0540771484375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5714285373687744, "reward_std": 0.4080834835767746, "rewards/accuracy_reward": 0.5816326290369034, "rewards/format_reward": 0.9897959232330322, "step": 143 }, { "completion_length": 209.83673095703125, "epoch": 0.014490566037735849, "grad_norm": 1.3738785982131958, "kl": 0.0380859375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5306122303009033, "reward_std": 0.28778909891843796, "rewards/accuracy_reward": 0.5408163070678711, "rewards/format_reward": 0.9897959232330322, "step": 144 }, { "completion_length": 186.49999237060547, "epoch": 0.01459119496855346, "grad_norm": 1.190528154373169, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.548857569694519, "reward_std": 0.2943621277809143, "rewards/accuracy_reward": 0.548857569694519, "rewards/format_reward": 1.0, "step": 145 }, { "completion_length": 244.1326446533203, "epoch": 0.014691823899371069, "grad_norm": 1.2075672149658203, "kl": 0.0509033203125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.440837323665619, "reward_std": 0.33812977373600006, "rewards/accuracy_reward": 0.4714495688676834, "rewards/format_reward": 0.9693877398967743, "step": 146 }, { "completion_length": 244.75509643554688, "epoch": 0.01479245283018868, "grad_norm": 1.343095302581787, "kl": 0.038330078125, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.4081631898880005, "reward_std": 0.36316512525081635, "rewards/accuracy_reward": 0.44897958636283875, "rewards/format_reward": 0.9591836631298065, "step": 147 }, { "completion_length": 217.31632232666016, "epoch": 0.01489308176100629, "grad_norm": 3.4040379524230957, "kl": 0.049072265625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5516122579574585, "reward_std": 0.42073947191238403, "rewards/accuracy_reward": 0.561816394329071, "rewards/format_reward": 0.9897959232330322, "step": 148 }, { "completion_length": 124.63264846801758, "epoch": 0.014993710691823899, "grad_norm": 2.1890456676483154, "kl": 0.055419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7213920950889587, "reward_std": 0.36890026926994324, "rewards/accuracy_reward": 0.7315961718559265, "rewards/format_reward": 0.9897959232330322, "step": 149 }, { "completion_length": 188.9897918701172, "epoch": 0.01509433962264151, "grad_norm": 3.622816801071167, "kl": 0.0352783203125, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.4107494354248047, "reward_std": 0.3009645417332649, "rewards/accuracy_reward": 0.42095354199409485, "rewards/format_reward": 0.9897959232330322, "step": 150 }, { "completion_length": 226.34693908691406, "epoch": 0.01519496855345912, "grad_norm": 1.9172476530075073, "kl": 0.0313720703125, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.4746728539466858, "reward_std": 0.3625407889485359, "rewards/accuracy_reward": 0.48487699031829834, "rewards/format_reward": 0.9897959232330322, "step": 151 }, { "completion_length": 138.35713958740234, "epoch": 0.01529559748427673, "grad_norm": 2.6378111839294434, "kl": 0.03741455078125, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5678302645683289, "reward_std": 0.1562303751707077, "rewards/accuracy_reward": 0.5678302347660065, "rewards/format_reward": 1.0, "step": 152 }, { "completion_length": 195.1938705444336, "epoch": 0.01539622641509434, "grad_norm": 1.5018402338027954, "kl": 0.056640625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.600097119808197, "reward_std": 0.3398319184780121, "rewards/accuracy_reward": 0.6307094097137451, "rewards/format_reward": 0.9693877398967743, "step": 153 }, { "completion_length": 222.89795684814453, "epoch": 0.01549685534591195, "grad_norm": 1.8275996446609497, "kl": 0.04718017578125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.3738353252410889, "reward_std": 0.34228289127349854, "rewards/accuracy_reward": 0.38403937220573425, "rewards/format_reward": 0.9897959232330322, "step": 154 }, { "completion_length": 246.6938705444336, "epoch": 0.01559748427672956, "grad_norm": 1.6539638042449951, "kl": 0.0389404296875, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.4526239037513733, "reward_std": 0.40473298728466034, "rewards/accuracy_reward": 0.4832361489534378, "rewards/format_reward": 0.9693877398967743, "step": 155 }, { "completion_length": 220.62244415283203, "epoch": 0.01569811320754717, "grad_norm": 1.290424108505249, "kl": 0.0457763671875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.5237890481948853, "reward_std": 0.2153947502374649, "rewards/accuracy_reward": 0.5441971719264984, "rewards/format_reward": 0.9795918166637421, "step": 156 }, { "completion_length": 165.4897918701172, "epoch": 0.01579874213836478, "grad_norm": 1.4359794855117798, "kl": 0.04888916015625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5155490636825562, "reward_std": 0.26141832023859024, "rewards/accuracy_reward": 0.5257531702518463, "rewards/format_reward": 0.9897959232330322, "step": 157 }, { "completion_length": 242.4285659790039, "epoch": 0.01589937106918239, "grad_norm": 1.37777578830719, "kl": 0.04150390625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.4790350198745728, "reward_std": 0.38973772525787354, "rewards/accuracy_reward": 0.5198514461517334, "rewards/format_reward": 0.9591836333274841, "step": 158 }, { "completion_length": 181.69387817382812, "epoch": 0.016, "grad_norm": 1.7750186920166016, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5106413960456848, "reward_std": 0.29370446503162384, "rewards/accuracy_reward": 0.5412535965442657, "rewards/format_reward": 0.9693877398967743, "step": 159 }, { "completion_length": 167.93877410888672, "epoch": 0.01610062893081761, "grad_norm": 2.1319186687469482, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6356055736541748, "reward_std": 0.3709573596715927, "rewards/accuracy_reward": 0.6560137271881104, "rewards/format_reward": 0.9795918166637421, "step": 160 }, { "completion_length": 233.98978424072266, "epoch": 0.01620125786163522, "grad_norm": 4.792313098907471, "kl": 0.040771484375, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.3457725644111633, "reward_std": 0.41575150191783905, "rewards/accuracy_reward": 0.396793007850647, "rewards/format_reward": 0.9489795863628387, "step": 161 }, { "completion_length": 181.39795684814453, "epoch": 0.01630188679245283, "grad_norm": 1.5325912237167358, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5128539204597473, "reward_std": 0.21048802137374878, "rewards/accuracy_reward": 0.5128539651632309, "rewards/format_reward": 1.0, "step": 162 }, { "completion_length": 197.82652282714844, "epoch": 0.01640251572327044, "grad_norm": 1.8279829025268555, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5234693884849548, "reward_std": 0.2677358612418175, "rewards/accuracy_reward": 0.5438775420188904, "rewards/format_reward": 0.9795918166637421, "step": 163 }, { "completion_length": 227.29591369628906, "epoch": 0.016503144654088052, "grad_norm": 1.6243494749069214, "kl": 0.0574951171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.465378999710083, "reward_std": 0.4134708493947983, "rewards/accuracy_reward": 0.5061953216791153, "rewards/format_reward": 0.9591836631298065, "step": 164 }, { "completion_length": 206.08162689208984, "epoch": 0.01660377358490566, "grad_norm": 2.054898738861084, "kl": 0.0401611328125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.5677370429039001, "reward_std": 0.3481898903846741, "rewards/accuracy_reward": 0.5779411792755127, "rewards/format_reward": 0.9897959232330322, "step": 165 }, { "completion_length": 127.79591369628906, "epoch": 0.01670440251572327, "grad_norm": 3.9170122146606445, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7551019787788391, "reward_std": 0.30325527489185333, "rewards/accuracy_reward": 0.7755101919174194, "rewards/format_reward": 0.9795918166637421, "step": 166 }, { "completion_length": 189.86734008789062, "epoch": 0.016805031446540882, "grad_norm": 2.183093547821045, "kl": 0.0625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6122449040412903, "reward_std": 0.4122529625892639, "rewards/accuracy_reward": 0.6530612111091614, "rewards/format_reward": 0.9591836631298065, "step": 167 }, { "completion_length": 221.79591369628906, "epoch": 0.01690566037735849, "grad_norm": 1.6184570789337158, "kl": 0.0604248046875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.3446910977363586, "reward_std": 0.37068483233451843, "rewards/accuracy_reward": 0.3957115113735199, "rewards/format_reward": 0.9489795863628387, "step": 168 }, { "completion_length": 201.9999885559082, "epoch": 0.0170062893081761, "grad_norm": 1.0953717231750488, "kl": 0.0546875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5510540008544922, "reward_std": 0.28899873048067093, "rewards/accuracy_reward": 0.5918703824281693, "rewards/format_reward": 0.9591836631298065, "step": 169 }, { "completion_length": 186.32652282714844, "epoch": 0.01710691823899371, "grad_norm": 1.4920436143875122, "kl": 0.0460205078125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6596916317939758, "reward_std": 0.2809006795287132, "rewards/accuracy_reward": 0.6698956787586212, "rewards/format_reward": 0.9897959232330322, "step": 170 }, { "completion_length": 154.12244415283203, "epoch": 0.01720754716981132, "grad_norm": 2.2478151321411133, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.5578653216362, "reward_std": 0.3439750671386719, "rewards/accuracy_reward": 0.5986816883087158, "rewards/format_reward": 0.9591836631298065, "step": 171 }, { "completion_length": 160.14285278320312, "epoch": 0.01730817610062893, "grad_norm": 13.525318145751953, "kl": 0.1104736328125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.5707029700279236, "reward_std": 0.32795779407024384, "rewards/accuracy_reward": 0.5809070914983749, "rewards/format_reward": 0.9897959232330322, "step": 172 }, { "completion_length": 238.59182739257812, "epoch": 0.01740880503144654, "grad_norm": 1.3484450578689575, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.4727891087532043, "reward_std": 0.30905191600322723, "rewards/accuracy_reward": 0.50340136885643, "rewards/format_reward": 0.9693877398967743, "step": 173 }, { "completion_length": 231.29591369628906, "epoch": 0.01750943396226415, "grad_norm": 4.796051025390625, "kl": 0.05078125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.450923204421997, "reward_std": 0.47082144021987915, "rewards/accuracy_reward": 0.48153547942638397, "rewards/format_reward": 0.9693877398967743, "step": 174 }, { "completion_length": 177.81632232666016, "epoch": 0.01761006289308176, "grad_norm": 1.291987419128418, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5074968934059143, "reward_std": 0.24368866533041, "rewards/accuracy_reward": 0.5279050320386887, "rewards/format_reward": 0.9795918166637421, "step": 175 }, { "completion_length": 197.6836700439453, "epoch": 0.01771069182389937, "grad_norm": 1.2850292921066284, "kl": 0.03778076171875, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.4252803325653076, "reward_std": 0.36464686691761017, "rewards/accuracy_reward": 0.44568854570388794, "rewards/format_reward": 0.9795918166637421, "step": 176 }, { "completion_length": 222.25509643554688, "epoch": 0.017811320754716982, "grad_norm": 1.0152888298034668, "kl": 0.0633544921875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.4656181931495667, "reward_std": 0.2852150574326515, "rewards/accuracy_reward": 0.5268427133560181, "rewards/format_reward": 0.938775509595871, "step": 177 }, { "completion_length": 210.4591827392578, "epoch": 0.01791194968553459, "grad_norm": 1.7092416286468506, "kl": 0.03631591796875, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.441852867603302, "reward_std": 0.37435051798820496, "rewards/accuracy_reward": 0.4724651873111725, "rewards/format_reward": 0.9693877398967743, "step": 178 }, { "completion_length": 237.4897918701172, "epoch": 0.0180125786163522, "grad_norm": 3.3830313682556152, "kl": 0.0509033203125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.4535146355628967, "reward_std": 0.4048677533864975, "rewards/accuracy_reward": 0.46371880173683167, "rewards/format_reward": 0.9897959232330322, "step": 179 }, { "completion_length": 186.07142639160156, "epoch": 0.018113207547169812, "grad_norm": 1.2673259973526, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.4396662712097168, "reward_std": 0.3253837376832962, "rewards/accuracy_reward": 0.4600744843482971, "rewards/format_reward": 0.9795918166637421, "step": 180 }, { "completion_length": 188.88775634765625, "epoch": 0.01821383647798742, "grad_norm": 1.43186616897583, "kl": 0.049560546875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5758340954780579, "reward_std": 0.30691472440958023, "rewards/accuracy_reward": 0.606446385383606, "rewards/format_reward": 0.9693877398967743, "step": 181 }, { "completion_length": 245.99999237060547, "epoch": 0.01831446540880503, "grad_norm": 1.7411636114120483, "kl": 0.047607421875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.445785641670227, "reward_std": 0.318904384970665, "rewards/accuracy_reward": 0.466193825006485, "rewards/format_reward": 0.9795918464660645, "step": 182 }, { "completion_length": 241.41836547851562, "epoch": 0.018415094339622642, "grad_norm": 1.839848518371582, "kl": 0.0338134765625, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.4299557209014893, "reward_std": 0.3068293035030365, "rewards/accuracy_reward": 0.46056798100471497, "rewards/format_reward": 0.9693877398967743, "step": 183 }, { "completion_length": 224.06121826171875, "epoch": 0.01851572327044025, "grad_norm": 1.6720441579818726, "kl": 0.055419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.3457286357879639, "reward_std": 0.29811991751194, "rewards/accuracy_reward": 0.35593266785144806, "rewards/format_reward": 0.9897959232330322, "step": 184 }, { "completion_length": 281.27550506591797, "epoch": 0.01861635220125786, "grad_norm": 1.3148736953735352, "kl": 0.0369873046875, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.1612244248390198, "reward_std": 0.2250632792711258, "rewards/accuracy_reward": 0.16122448816895485, "rewards/format_reward": 1.0, "step": 185 }, { "completion_length": 157.41836547851562, "epoch": 0.018716981132075473, "grad_norm": 2.349407434463501, "kl": 0.0550537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6823371648788452, "reward_std": 0.2510022595524788, "rewards/accuracy_reward": 0.6823371648788452, "rewards/format_reward": 1.0, "step": 186 }, { "completion_length": 130.40816116333008, "epoch": 0.01881761006289308, "grad_norm": 1.5343153476715088, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7653060555458069, "reward_std": 0.2428746223449707, "rewards/accuracy_reward": 0.775510162115097, "rewards/format_reward": 0.9897959232330322, "step": 187 }, { "completion_length": 144.16326141357422, "epoch": 0.01891823899371069, "grad_norm": 5.847264289855957, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.5924744606018066, "reward_std": 0.34435756504535675, "rewards/accuracy_reward": 0.6128826439380646, "rewards/format_reward": 0.9795918166637421, "step": 188 }, { "completion_length": 209.37754821777344, "epoch": 0.019018867924528303, "grad_norm": 3.2360150814056396, "kl": 0.052734375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.3924521803855896, "reward_std": 0.3595912754535675, "rewards/accuracy_reward": 0.40265631675720215, "rewards/format_reward": 0.9897959232330322, "step": 189 }, { "completion_length": 206.46937561035156, "epoch": 0.019119496855345912, "grad_norm": 1.5041025876998901, "kl": 0.05303955078125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.48763906955719, "reward_std": 0.39191681146621704, "rewards/accuracy_reward": 0.5284554362297058, "rewards/format_reward": 0.9591836631298065, "step": 190 }, { "completion_length": 234.7142791748047, "epoch": 0.01922012578616352, "grad_norm": 1.1817384958267212, "kl": 0.0506591796875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6092295050621033, "reward_std": 0.3622943162918091, "rewards/accuracy_reward": 0.6194335520267487, "rewards/format_reward": 0.9897959232330322, "step": 191 }, { "completion_length": 208.78570556640625, "epoch": 0.019320754716981133, "grad_norm": 1.744893193244934, "kl": 0.05206298828125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.4081632494926453, "reward_std": 0.3741292506456375, "rewards/accuracy_reward": 0.4285714328289032, "rewards/format_reward": 0.9795918464660645, "step": 192 }, { "completion_length": 208.9081573486328, "epoch": 0.019421383647798742, "grad_norm": 1.420930027961731, "kl": 0.054443359375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.4659356474876404, "reward_std": 0.4084123522043228, "rewards/accuracy_reward": 0.4965479075908661, "rewards/format_reward": 0.9693877398967743, "step": 193 }, { "completion_length": 250.9693832397461, "epoch": 0.01952201257861635, "grad_norm": 1.2491899728775024, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.4387754797935486, "reward_std": 0.3719244748353958, "rewards/accuracy_reward": 0.47959183156490326, "rewards/format_reward": 0.9591836333274841, "step": 194 }, { "completion_length": 211.32652282714844, "epoch": 0.019622641509433963, "grad_norm": 1.4305967092514038, "kl": 0.0380859375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.595395028591156, "reward_std": 0.4062698036432266, "rewards/accuracy_reward": 0.5953950583934784, "rewards/format_reward": 1.0, "step": 195 }, { "completion_length": 172.74488830566406, "epoch": 0.019723270440251572, "grad_norm": 4.31199312210083, "kl": 0.206787109375, "learning_rate": 1e-06, "loss": 0.0083, "reward": 1.7346938252449036, "reward_std": 0.3306012898683548, "rewards/accuracy_reward": 0.7551020085811615, "rewards/format_reward": 0.9795918166637421, "step": 196 }, { "completion_length": 163.32653045654297, "epoch": 0.01982389937106918, "grad_norm": 1.803178071975708, "kl": 0.0704345703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.454384982585907, "reward_std": 0.2712782509624958, "rewards/accuracy_reward": 0.46458902955055237, "rewards/format_reward": 0.9897959232330322, "step": 197 }, { "completion_length": 201.95918655395508, "epoch": 0.019924528301886794, "grad_norm": 8.49641227722168, "kl": 0.0706787109375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.3682492971420288, "reward_std": 0.33723750710487366, "rewards/accuracy_reward": 0.4090656191110611, "rewards/format_reward": 0.9591836631298065, "step": 198 }, { "completion_length": 193.36734008789062, "epoch": 0.020025157232704403, "grad_norm": 3.081144094467163, "kl": 0.04718017578125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.4779260754585266, "reward_std": 0.43183037638664246, "rewards/accuracy_reward": 0.5085383355617523, "rewards/format_reward": 0.9693877398967743, "step": 199 }, { "completion_length": 218.61224365234375, "epoch": 0.02012578616352201, "grad_norm": 1.6491172313690186, "kl": 0.03436279296875, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.4420366883277893, "reward_std": 0.3511282801628113, "rewards/accuracy_reward": 0.4522407650947571, "rewards/format_reward": 0.9897959232330322, "step": 200 }, { "completion_length": 175.4081573486328, "epoch": 0.020226415094339624, "grad_norm": 2.8936684131622314, "kl": 0.05224609375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.4661202430725098, "reward_std": 0.30288371443748474, "rewards/accuracy_reward": 0.49673251807689667, "rewards/format_reward": 0.9693877398967743, "step": 201 }, { "completion_length": 183.55101776123047, "epoch": 0.020327044025157233, "grad_norm": 2.0622658729553223, "kl": 0.0633544921875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6669419407844543, "reward_std": 0.38652582466602325, "rewards/accuracy_reward": 0.6873501241207123, "rewards/format_reward": 0.9795918166637421, "step": 202 }, { "completion_length": 212.31632232666016, "epoch": 0.020427672955974842, "grad_norm": 1.2897238731384277, "kl": 0.0556640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.414820909500122, "reward_std": 0.28086480498313904, "rewards/accuracy_reward": 0.43522903323173523, "rewards/format_reward": 0.9795918166637421, "step": 203 }, { "completion_length": 176.05101776123047, "epoch": 0.020528301886792454, "grad_norm": 1.8253178596496582, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5284886360168457, "reward_std": 0.30904001742601395, "rewards/accuracy_reward": 0.559100866317749, "rewards/format_reward": 0.9693877398967743, "step": 204 }, { "completion_length": 194.15306091308594, "epoch": 0.020628930817610063, "grad_norm": 1.1486434936523438, "kl": 0.047607421875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.4298915266990662, "reward_std": 0.29349614679813385, "rewards/accuracy_reward": 0.4502996653318405, "rewards/format_reward": 0.9795918464660645, "step": 205 }, { "completion_length": 203.31632232666016, "epoch": 0.020729559748427672, "grad_norm": 1.9534318447113037, "kl": 0.033935546875, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.4401360154151917, "reward_std": 0.38162706792354584, "rewards/accuracy_reward": 0.4503401592373848, "rewards/format_reward": 0.9897959232330322, "step": 206 }, { "completion_length": 247.02040100097656, "epoch": 0.020830188679245284, "grad_norm": 2.0852441787719727, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.606813669204712, "reward_std": 0.31564460694789886, "rewards/accuracy_reward": 0.63742595911026, "rewards/format_reward": 0.9693877398967743, "step": 207 }, { "completion_length": 171.80612182617188, "epoch": 0.020930817610062893, "grad_norm": 1.4466040134429932, "kl": 0.0615234375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6763848066329956, "reward_std": 0.3324142023921013, "rewards/accuracy_reward": 0.7069970667362213, "rewards/format_reward": 0.9693877398967743, "step": 208 }, { "completion_length": 202.63265228271484, "epoch": 0.021031446540880502, "grad_norm": 1.145763874053955, "kl": 0.04376220703125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5035192966461182, "reward_std": 0.31092528998851776, "rewards/accuracy_reward": 0.5239274799823761, "rewards/format_reward": 0.9795918464660645, "step": 209 }, { "completion_length": 211.56121826171875, "epoch": 0.021132075471698115, "grad_norm": 1.1505582332611084, "kl": 0.032470703125, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.5714285373687744, "reward_std": 0.24277209490537643, "rewards/accuracy_reward": 0.5816326439380646, "rewards/format_reward": 0.9897959232330322, "step": 210 }, { "completion_length": 184.17346954345703, "epoch": 0.021232704402515724, "grad_norm": 0.831329882144928, "kl": 0.05517578125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.684948980808258, "reward_std": 0.2466888278722763, "rewards/accuracy_reward": 0.6951530277729034, "rewards/format_reward": 0.9897959232330322, "step": 211 }, { "completion_length": 198.62244415283203, "epoch": 0.021333333333333333, "grad_norm": 1.103936791419983, "kl": 0.03466796875, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.5244468450546265, "reward_std": 0.21394824236631393, "rewards/accuracy_reward": 0.5448549538850784, "rewards/format_reward": 0.9795918166637421, "step": 212 }, { "completion_length": 242.80611419677734, "epoch": 0.021433962264150945, "grad_norm": 1.4925191402435303, "kl": 0.033203125, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.5493832230567932, "reward_std": 0.36937104165554047, "rewards/accuracy_reward": 0.5493832528591156, "rewards/format_reward": 1.0, "step": 213 }, { "completion_length": 234.448974609375, "epoch": 0.021534591194968554, "grad_norm": 1.5202181339263916, "kl": 0.05615234375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.4239786863327026, "reward_std": 0.3798091560602188, "rewards/accuracy_reward": 0.46479499340057373, "rewards/format_reward": 0.9591836333274841, "step": 214 }, { "completion_length": 255.4081573486328, "epoch": 0.021635220125786163, "grad_norm": 1.6591095924377441, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.4989148378372192, "reward_std": 0.40872015058994293, "rewards/accuracy_reward": 0.4989148825407028, "rewards/format_reward": 1.0, "step": 215 }, { "completion_length": 171.1938705444336, "epoch": 0.021735849056603775, "grad_norm": 2.0313808917999268, "kl": 0.0394287109375, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.5331632494926453, "reward_std": 0.39173008501529694, "rewards/accuracy_reward": 0.5535714030265808, "rewards/format_reward": 0.9795918464660645, "step": 216 }, { "completion_length": 191.85713958740234, "epoch": 0.021836477987421384, "grad_norm": 1.286941647529602, "kl": 0.057861328125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6109461188316345, "reward_std": 0.270639568567276, "rewards/accuracy_reward": 0.6211502850055695, "rewards/format_reward": 0.9897959232330322, "step": 217 }, { "completion_length": 260.6938781738281, "epoch": 0.021937106918238993, "grad_norm": 1.4940221309661865, "kl": 0.0408935546875, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.4455782175064087, "reward_std": 0.3727252334356308, "rewards/accuracy_reward": 0.45578229427337646, "rewards/format_reward": 0.9897959232330322, "step": 218 }, { "completion_length": 163.948974609375, "epoch": 0.022037735849056602, "grad_norm": 1.6930210590362549, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5526222586631775, "reward_std": 0.2178930565714836, "rewards/accuracy_reward": 0.5628263503313065, "rewards/format_reward": 0.9897959232330322, "step": 219 }, { "completion_length": 216.80612182617188, "epoch": 0.022138364779874214, "grad_norm": 4.4900736808776855, "kl": 0.0357666015625, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.4191219210624695, "reward_std": 0.40106378495693207, "rewards/accuracy_reward": 0.4395300894975662, "rewards/format_reward": 0.9795918166637421, "step": 220 }, { "completion_length": 154.12244415283203, "epoch": 0.022238993710691823, "grad_norm": 1.5986279249191284, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.591123104095459, "reward_std": 0.25516271591186523, "rewards/accuracy_reward": 0.6115313172340393, "rewards/format_reward": 0.9795918464660645, "step": 221 }, { "completion_length": 196.1530532836914, "epoch": 0.022339622641509432, "grad_norm": 1.6921132802963257, "kl": 0.04083251953125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.5442177057266235, "reward_std": 0.3874721974134445, "rewards/accuracy_reward": 0.5442176908254623, "rewards/format_reward": 1.0, "step": 222 }, { "completion_length": 176.15306091308594, "epoch": 0.022440251572327045, "grad_norm": 1.3001519441604614, "kl": 0.0377197265625, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5815111994743347, "reward_std": 0.29468323290348053, "rewards/accuracy_reward": 0.6019193530082703, "rewards/format_reward": 0.9795918166637421, "step": 223 }, { "completion_length": 211.2142791748047, "epoch": 0.022540880503144654, "grad_norm": 0.9768163561820984, "kl": 0.0435791015625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.504593312740326, "reward_std": 0.22502975910902023, "rewards/accuracy_reward": 0.5045933872461319, "rewards/format_reward": 1.0, "step": 224 }, { "completion_length": 206.39795684814453, "epoch": 0.022641509433962263, "grad_norm": 1.3759135007858276, "kl": 0.0411376953125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.5923997163772583, "reward_std": 0.27630362659692764, "rewards/accuracy_reward": 0.6026038527488708, "rewards/format_reward": 0.9897959232330322, "step": 225 }, { "completion_length": 293.61224365234375, "epoch": 0.022742138364779875, "grad_norm": 1.2450898885726929, "kl": 0.02349853515625, "learning_rate": 1e-06, "loss": 0.0009, "reward": 1.3931331634521484, "reward_std": 0.3747707009315491, "rewards/accuracy_reward": 0.4237454682588577, "rewards/format_reward": 0.9693877398967743, "step": 226 }, { "completion_length": 227.36734008789062, "epoch": 0.022842767295597484, "grad_norm": 1.0906825065612793, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.471397042274475, "reward_std": 0.32335682213306427, "rewards/accuracy_reward": 0.5020092874765396, "rewards/format_reward": 0.9693877398967743, "step": 227 }, { "completion_length": 245.76529693603516, "epoch": 0.022943396226415093, "grad_norm": 3.9644269943237305, "kl": 0.05059814453125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5673606395721436, "reward_std": 0.4110766500234604, "rewards/accuracy_reward": 0.5877687931060791, "rewards/format_reward": 0.9795918464660645, "step": 228 }, { "completion_length": 225.54080963134766, "epoch": 0.023044025157232705, "grad_norm": 1.7464959621429443, "kl": 0.0567626953125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6108410954475403, "reward_std": 0.33047954738140106, "rewards/accuracy_reward": 0.6108411550521851, "rewards/format_reward": 1.0, "step": 229 }, { "completion_length": 211.9591827392578, "epoch": 0.023144654088050314, "grad_norm": 1.8671456575393677, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5207683444023132, "reward_std": 0.4708293080329895, "rewards/accuracy_reward": 0.5615846365690231, "rewards/format_reward": 0.9591836333274841, "step": 230 }, { "completion_length": 235.5, "epoch": 0.023245283018867923, "grad_norm": 1.3002753257751465, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5248015522956848, "reward_std": 0.311226062476635, "rewards/accuracy_reward": 0.5656179040670395, "rewards/format_reward": 0.9591836631298065, "step": 231 }, { "completion_length": 194.60203552246094, "epoch": 0.023345911949685536, "grad_norm": 1.7968554496765137, "kl": 0.05419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5510203838348389, "reward_std": 0.2585868537425995, "rewards/accuracy_reward": 0.5510203987360001, "rewards/format_reward": 1.0, "step": 232 }, { "completion_length": 215.67346954345703, "epoch": 0.023446540880503144, "grad_norm": 1.4275659322738647, "kl": 0.111572265625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.5732800960540771, "reward_std": 0.28734396398067474, "rewards/accuracy_reward": 0.614096462726593, "rewards/format_reward": 0.9591836333274841, "step": 233 }, { "completion_length": 259.06121826171875, "epoch": 0.023547169811320753, "grad_norm": 0.8454795479774475, "kl": 0.0570068359375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.4592822790145874, "reward_std": 0.3095555603504181, "rewards/accuracy_reward": 0.46948640048503876, "rewards/format_reward": 0.9897959232330322, "step": 234 }, { "completion_length": 216.64285278320312, "epoch": 0.023647798742138366, "grad_norm": 2.6139965057373047, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.4562381505966187, "reward_std": 0.37688587605953217, "rewards/accuracy_reward": 0.4664422422647476, "rewards/format_reward": 0.9897959232330322, "step": 235 }, { "completion_length": 215.39794921875, "epoch": 0.023748427672955975, "grad_norm": 1.4040168523788452, "kl": 0.0743408203125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.4956915378570557, "reward_std": 0.35681410133838654, "rewards/accuracy_reward": 0.516099750995636, "rewards/format_reward": 0.9795918166637421, "step": 236 }, { "completion_length": 194.06121826171875, "epoch": 0.023849056603773584, "grad_norm": 1.5017261505126953, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5840785503387451, "reward_std": 0.27086637914180756, "rewards/accuracy_reward": 0.6044866740703583, "rewards/format_reward": 0.9795918166637421, "step": 237 }, { "completion_length": 209.948974609375, "epoch": 0.023949685534591196, "grad_norm": 1.8807759284973145, "kl": 0.0682373046875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.4223638772964478, "reward_std": 0.25648196786642075, "rewards/accuracy_reward": 0.42236393690109253, "rewards/format_reward": 1.0, "step": 238 }, { "completion_length": 239.07142639160156, "epoch": 0.024050314465408805, "grad_norm": 1.7096936702728271, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.46588933467865, "reward_std": 0.3037087172269821, "rewards/accuracy_reward": 0.486297607421875, "rewards/format_reward": 0.9795918464660645, "step": 239 }, { "completion_length": 227.31632232666016, "epoch": 0.024150943396226414, "grad_norm": 1.0308592319488525, "kl": 0.0391845703125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6264951825141907, "reward_std": 0.35434944927692413, "rewards/accuracy_reward": 0.6469033658504486, "rewards/format_reward": 0.9795918166637421, "step": 240 }, { "completion_length": 264.6734619140625, "epoch": 0.024251572327044026, "grad_norm": 1.3256287574768066, "kl": 0.0533447265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.528166115283966, "reward_std": 0.33851584792137146, "rewards/accuracy_reward": 0.5485743135213852, "rewards/format_reward": 0.9795918166637421, "step": 241 }, { "completion_length": 237.43877410888672, "epoch": 0.024352201257861635, "grad_norm": 1.9376411437988281, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5331507921218872, "reward_std": 0.18593022227287292, "rewards/accuracy_reward": 0.5331508219242096, "rewards/format_reward": 1.0, "step": 242 }, { "completion_length": 251.30612182617188, "epoch": 0.024452830188679244, "grad_norm": 1.7197574377059937, "kl": 0.0574951171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5398944020271301, "reward_std": 0.3683670163154602, "rewards/accuracy_reward": 0.5398943722248077, "rewards/format_reward": 1.0, "step": 243 }, { "completion_length": 198.17346954345703, "epoch": 0.024553459119496857, "grad_norm": 0.9377825856208801, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5376765727996826, "reward_std": 0.2676790654659271, "rewards/accuracy_reward": 0.537676602602005, "rewards/format_reward": 1.0, "step": 244 }, { "completion_length": 160.16326141357422, "epoch": 0.024654088050314465, "grad_norm": 1.5950536727905273, "kl": 0.117431640625, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.615476131439209, "reward_std": 0.2803475707769394, "rewards/accuracy_reward": 0.6460884213447571, "rewards/format_reward": 0.9693877398967743, "step": 245 }, { "completion_length": 198.31632232666016, "epoch": 0.024754716981132074, "grad_norm": 2.0941150188446045, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.5469825267791748, "reward_std": 0.36537590622901917, "rewards/accuracy_reward": 0.5775947570800781, "rewards/format_reward": 0.9693877398967743, "step": 246 }, { "completion_length": 208.80611419677734, "epoch": 0.024855345911949687, "grad_norm": 3.854766368865967, "kl": 0.1041259765625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.5309422612190247, "reward_std": 0.2537536770105362, "rewards/accuracy_reward": 0.5615545064210892, "rewards/format_reward": 0.9693877398967743, "step": 247 }, { "completion_length": 228.37754821777344, "epoch": 0.024955974842767296, "grad_norm": 0.8412731885910034, "kl": 0.05908203125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5957725644111633, "reward_std": 0.3133610337972641, "rewards/accuracy_reward": 0.6263847947120667, "rewards/format_reward": 0.9693877398967743, "step": 248 }, { "completion_length": 264.55101013183594, "epoch": 0.025056603773584905, "grad_norm": 0.8883137106895447, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.453154742717743, "reward_std": 0.2911250591278076, "rewards/accuracy_reward": 0.5041751712560654, "rewards/format_reward": 0.9489795565605164, "step": 249 }, { "completion_length": 152.34693908691406, "epoch": 0.025157232704402517, "grad_norm": 1.134968638420105, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.656605303287506, "reward_std": 0.2554771229624748, "rewards/accuracy_reward": 0.6668093800544739, "rewards/format_reward": 0.9897959232330322, "step": 250 }, { "completion_length": 210.71428680419922, "epoch": 0.025257861635220126, "grad_norm": 41.935604095458984, "kl": 0.991455078125, "learning_rate": 1e-06, "loss": 0.0399, "reward": 1.5257359147071838, "reward_std": 0.381788894534111, "rewards/accuracy_reward": 0.5563481450080872, "rewards/format_reward": 0.9693877398967743, "step": 251 }, { "completion_length": 147.43877410888672, "epoch": 0.025358490566037735, "grad_norm": 7.223467826843262, "kl": 0.272705078125, "learning_rate": 1e-06, "loss": 0.011, "reward": 1.7967686653137207, "reward_std": 0.2691649794578552, "rewards/accuracy_reward": 0.817176878452301, "rewards/format_reward": 0.9795918166637421, "step": 252 }, { "completion_length": 195.34693145751953, "epoch": 0.025459119496855347, "grad_norm": 2.3127684593200684, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5491441488265991, "reward_std": 0.2099238932132721, "rewards/accuracy_reward": 0.5593482702970505, "rewards/format_reward": 0.9897959232330322, "step": 253 }, { "completion_length": 178.53060913085938, "epoch": 0.025559748427672956, "grad_norm": 1.1452337503433228, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6961450576782227, "reward_std": 0.30203065276145935, "rewards/accuracy_reward": 0.6961451172828674, "rewards/format_reward": 1.0, "step": 254 }, { "completion_length": 182.03060913085938, "epoch": 0.025660377358490565, "grad_norm": 1.8533351421356201, "kl": 0.10205078125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6869613528251648, "reward_std": 0.2049485258758068, "rewards/accuracy_reward": 0.6869614124298096, "rewards/format_reward": 1.0, "step": 255 }, { "completion_length": 153.7551040649414, "epoch": 0.025761006289308178, "grad_norm": 1.266753911972046, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.695397973060608, "reward_std": 0.3375290334224701, "rewards/accuracy_reward": 0.7056021094322205, "rewards/format_reward": 0.9897959232330322, "step": 256 }, { "completion_length": 174.33673095703125, "epoch": 0.025861635220125787, "grad_norm": 2.3942153453826904, "kl": 0.047119140625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.4908902049064636, "reward_std": 0.2862359881401062, "rewards/accuracy_reward": 0.5317066013813019, "rewards/format_reward": 0.9591836631298065, "step": 257 }, { "completion_length": 237.6938705444336, "epoch": 0.025962264150943395, "grad_norm": 2.769270658493042, "kl": 0.04766845703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5893865823745728, "reward_std": 0.329324372112751, "rewards/accuracy_reward": 0.6097947657108307, "rewards/format_reward": 0.9795918464660645, "step": 258 }, { "completion_length": 228.48979949951172, "epoch": 0.026062893081761008, "grad_norm": 1.3427060842514038, "kl": 0.05963134765625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5430623292922974, "reward_std": 0.30009645223617554, "rewards/accuracy_reward": 0.5736745893955231, "rewards/format_reward": 0.9693877398967743, "step": 259 }, { "completion_length": 211.43877410888672, "epoch": 0.026163522012578617, "grad_norm": 1.5841524600982666, "kl": 0.0579833984375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5207644701004028, "reward_std": 0.2586216554045677, "rewards/accuracy_reward": 0.5207644999027252, "rewards/format_reward": 1.0, "step": 260 }, { "completion_length": 172.65306091308594, "epoch": 0.026264150943396226, "grad_norm": 1.0054014921188354, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.4844515919685364, "reward_std": 0.2661963254213333, "rewards/accuracy_reward": 0.49465568363666534, "rewards/format_reward": 0.9897959232330322, "step": 261 }, { "completion_length": 202.448974609375, "epoch": 0.026364779874213838, "grad_norm": 1.0298690795898438, "kl": 0.0550537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6458211541175842, "reward_std": 0.2787811607122421, "rewards/accuracy_reward": 0.6458211541175842, "rewards/format_reward": 1.0, "step": 262 }, { "completion_length": 216.9693832397461, "epoch": 0.026465408805031447, "grad_norm": 0.9447999596595764, "kl": 0.04736328125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.655889630317688, "reward_std": 0.27258263528347015, "rewards/accuracy_reward": 0.6660937070846558, "rewards/format_reward": 0.9897959232330322, "step": 263 }, { "completion_length": 202.41836547851562, "epoch": 0.026566037735849056, "grad_norm": 1.3479151725769043, "kl": 0.04052734375, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.3539405465126038, "reward_std": 0.3138662874698639, "rewards/accuracy_reward": 0.4151649624109268, "rewards/format_reward": 0.9387754797935486, "step": 264 }, { "completion_length": 213.6836700439453, "epoch": 0.02666666666666667, "grad_norm": 1.933054804801941, "kl": 0.04736328125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5385947227478027, "reward_std": 0.2949916422367096, "rewards/accuracy_reward": 0.5385947227478027, "rewards/format_reward": 1.0, "step": 265 }, { "completion_length": 246.56122589111328, "epoch": 0.026767295597484277, "grad_norm": 1.2654718160629272, "kl": 0.037933349609375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.4565645456314087, "reward_std": 0.3259216919541359, "rewards/accuracy_reward": 0.4769727736711502, "rewards/format_reward": 0.9795918464660645, "step": 266 }, { "completion_length": 169.46939086914062, "epoch": 0.026867924528301886, "grad_norm": 1.653281331062317, "kl": 0.0513916015625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6167952418327332, "reward_std": 0.3028011918067932, "rewards/accuracy_reward": 0.6269993782043457, "rewards/format_reward": 0.9897959232330322, "step": 267 }, { "completion_length": 210.49999237060547, "epoch": 0.026968553459119495, "grad_norm": 0.6555625796318054, "kl": 0.0391845703125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.7244897484779358, "reward_std": 0.1652088463306427, "rewards/accuracy_reward": 0.7244897782802582, "rewards/format_reward": 1.0, "step": 268 }, { "completion_length": 273.5102005004883, "epoch": 0.027069182389937108, "grad_norm": 1.621829867362976, "kl": 0.048583984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5341594219207764, "reward_std": 0.36592648923397064, "rewards/accuracy_reward": 0.5545675158500671, "rewards/format_reward": 0.9795918464660645, "step": 269 }, { "completion_length": 223.1326446533203, "epoch": 0.027169811320754716, "grad_norm": 1.0262044668197632, "kl": 0.0439453125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.5494742393493652, "reward_std": 0.37094584107398987, "rewards/accuracy_reward": 0.5800865441560745, "rewards/format_reward": 0.9693877398967743, "step": 270 }, { "completion_length": 226.78570556640625, "epoch": 0.027270440251572325, "grad_norm": 1.517886996269226, "kl": 0.0511474609375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6137269139289856, "reward_std": 0.36376453936100006, "rewards/accuracy_reward": 0.6137269139289856, "rewards/format_reward": 1.0, "step": 271 }, { "completion_length": 166.6530532836914, "epoch": 0.027371069182389938, "grad_norm": 1.295241117477417, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6190476417541504, "reward_std": 0.23788218200206757, "rewards/accuracy_reward": 0.6190476417541504, "rewards/format_reward": 1.0, "step": 272 }, { "completion_length": 188.36734771728516, "epoch": 0.027471698113207547, "grad_norm": 1.7688010931015015, "kl": 0.0516357421875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5634920001029968, "reward_std": 0.3201025128364563, "rewards/accuracy_reward": 0.5634920448064804, "rewards/format_reward": 1.0, "step": 273 }, { "completion_length": 152.9285659790039, "epoch": 0.027572327044025156, "grad_norm": 8.186936378479004, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.714285671710968, "reward_std": 0.26856663823127747, "rewards/accuracy_reward": 0.7142857015132904, "rewards/format_reward": 1.0, "step": 274 }, { "completion_length": 157.67346954345703, "epoch": 0.027672955974842768, "grad_norm": 1.002650260925293, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7031096816062927, "reward_std": 0.2596295475959778, "rewards/accuracy_reward": 0.733722060918808, "rewards/format_reward": 0.9693877398967743, "step": 275 }, { "completion_length": 174.2346954345703, "epoch": 0.027773584905660377, "grad_norm": 1.354985237121582, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7459720373153687, "reward_std": 0.26174378395080566, "rewards/accuracy_reward": 0.7663802206516266, "rewards/format_reward": 0.9795918166637421, "step": 276 }, { "completion_length": 223.12244415283203, "epoch": 0.027874213836477986, "grad_norm": 1.1065088510513306, "kl": 0.0506591796875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6445578336715698, "reward_std": 0.2558666840195656, "rewards/accuracy_reward": 0.6445578038692474, "rewards/format_reward": 1.0, "step": 277 }, { "completion_length": 199.03060913085938, "epoch": 0.0279748427672956, "grad_norm": 1.7871431112289429, "kl": 0.0540771484375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5911288261413574, "reward_std": 0.25926416367292404, "rewards/accuracy_reward": 0.6013330221176147, "rewards/format_reward": 0.9897959232330322, "step": 278 }, { "completion_length": 240.36734008789062, "epoch": 0.028075471698113207, "grad_norm": 1.1229931116104126, "kl": 0.0494384765625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.4957449436187744, "reward_std": 0.24711260199546814, "rewards/accuracy_reward": 0.5263572037220001, "rewards/format_reward": 0.9693877398967743, "step": 279 }, { "completion_length": 151.1836700439453, "epoch": 0.028176100628930816, "grad_norm": 0.9105086326599121, "kl": 0.0487060546875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7575315833091736, "reward_std": 0.14756937325000763, "rewards/accuracy_reward": 0.7575315833091736, "rewards/format_reward": 1.0, "step": 280 }, { "completion_length": 222.7551040649414, "epoch": 0.02827672955974843, "grad_norm": 1.30797278881073, "kl": 0.02874755859375, "learning_rate": 1e-06, "loss": 0.0011, "reward": 1.4944823384284973, "reward_std": 0.3300415128469467, "rewards/accuracy_reward": 0.4944823533296585, "rewards/format_reward": 1.0, "step": 281 }, { "completion_length": 237.9897918701172, "epoch": 0.028377358490566038, "grad_norm": 3.6075997352600098, "kl": 0.037353515625, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5421714782714844, "reward_std": 0.2822411060333252, "rewards/accuracy_reward": 0.542171522974968, "rewards/format_reward": 1.0, "step": 282 }, { "completion_length": 205.33673858642578, "epoch": 0.028477987421383646, "grad_norm": 1.1478800773620605, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.552235186100006, "reward_std": 0.26622921973466873, "rewards/accuracy_reward": 0.5624392628669739, "rewards/format_reward": 0.9897959232330322, "step": 283 }, { "completion_length": 214.31632232666016, "epoch": 0.02857861635220126, "grad_norm": 2539.360107421875, "kl": 22.140625, "learning_rate": 1e-06, "loss": 0.8875, "reward": 1.4335069060325623, "reward_std": 0.25979240983724594, "rewards/accuracy_reward": 0.44371098279953003, "rewards/format_reward": 0.9897959232330322, "step": 284 }, { "completion_length": 253.4897918701172, "epoch": 0.028679245283018868, "grad_norm": 0.8661548495292664, "kl": 0.0447998046875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.5880951285362244, "reward_std": 0.24055861681699753, "rewards/accuracy_reward": 0.5982992947101593, "rewards/format_reward": 0.9897959232330322, "step": 285 }, { "completion_length": 224.88774871826172, "epoch": 0.028779874213836477, "grad_norm": 1.6213070154190063, "kl": 0.0499267578125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.4513267278671265, "reward_std": 0.4059562385082245, "rewards/accuracy_reward": 0.4819388836622238, "rewards/format_reward": 0.9693877398967743, "step": 286 }, { "completion_length": 181.33673095703125, "epoch": 0.02888050314465409, "grad_norm": 1.452121615409851, "kl": 0.03778076171875, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5899094939231873, "reward_std": 0.3181476965546608, "rewards/accuracy_reward": 0.5899094641208649, "rewards/format_reward": 1.0, "step": 287 }, { "completion_length": 174.03060913085938, "epoch": 0.028981132075471698, "grad_norm": 3.6198854446411133, "kl": 0.0416259765625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.618010938167572, "reward_std": 0.2928866297006607, "rewards/accuracy_reward": 0.6282150745391846, "rewards/format_reward": 0.9897959232330322, "step": 288 }, { "completion_length": 226.42857360839844, "epoch": 0.029081761006289307, "grad_norm": 1.2914307117462158, "kl": 0.03955078125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6351842880249023, "reward_std": 0.28131359815597534, "rewards/accuracy_reward": 0.6453884243965149, "rewards/format_reward": 0.9897959232330322, "step": 289 }, { "completion_length": 174.25509643554688, "epoch": 0.02918238993710692, "grad_norm": 1.2254911661148071, "kl": 0.0428466796875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6093443632125854, "reward_std": 0.26560230553150177, "rewards/accuracy_reward": 0.6093443930149078, "rewards/format_reward": 1.0, "step": 290 }, { "completion_length": 221.78571319580078, "epoch": 0.02928301886792453, "grad_norm": 1.1489965915679932, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7448979020118713, "reward_std": 0.18887970596551895, "rewards/accuracy_reward": 0.7448979318141937, "rewards/format_reward": 1.0, "step": 291 }, { "completion_length": 172.0204086303711, "epoch": 0.029383647798742137, "grad_norm": 1.3796651363372803, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5919905304908752, "reward_std": 0.29735660552978516, "rewards/accuracy_reward": 0.6021946668624878, "rewards/format_reward": 0.9897959232330322, "step": 292 }, { "completion_length": 207.36734008789062, "epoch": 0.02948427672955975, "grad_norm": 0.7993505001068115, "kl": 0.0399169921875, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6082202792167664, "reward_std": 0.21752876043319702, "rewards/accuracy_reward": 0.6286284029483795, "rewards/format_reward": 0.9795918166637421, "step": 293 }, { "completion_length": 171.81632232666016, "epoch": 0.02958490566037736, "grad_norm": 0.9576517343521118, "kl": 0.049560546875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6417222619056702, "reward_std": 0.1923503503203392, "rewards/accuracy_reward": 0.6519263684749603, "rewards/format_reward": 0.9897959232330322, "step": 294 }, { "completion_length": 223.4897918701172, "epoch": 0.029685534591194968, "grad_norm": 1.101180911064148, "kl": 0.035888671875, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.510204017162323, "reward_std": 0.21763086318969727, "rewards/accuracy_reward": 0.5204081684350967, "rewards/format_reward": 0.9897959232330322, "step": 295 }, { "completion_length": 251.66326141357422, "epoch": 0.02978616352201258, "grad_norm": 2.393232822418213, "kl": 0.0511474609375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.3265305757522583, "reward_std": 0.29344040155410767, "rewards/accuracy_reward": 0.3265306130051613, "rewards/format_reward": 1.0, "step": 296 }, { "completion_length": 190.36734771728516, "epoch": 0.02988679245283019, "grad_norm": 1.190024495124817, "kl": 0.0362548828125, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.7015857100486755, "reward_std": 0.21452732384204865, "rewards/accuracy_reward": 0.7015856206417084, "rewards/format_reward": 1.0, "step": 297 }, { "completion_length": 181.59182739257812, "epoch": 0.029987421383647798, "grad_norm": 1.6108084917068481, "kl": 0.0733642578125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.528571367263794, "reward_std": 0.36711809039115906, "rewards/accuracy_reward": 0.5489795506000519, "rewards/format_reward": 0.9795918464660645, "step": 298 }, { "completion_length": 138.47958374023438, "epoch": 0.03008805031446541, "grad_norm": 2.769042491912842, "kl": 0.0474853515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.730806589126587, "reward_std": 0.16429410502314568, "rewards/accuracy_reward": 0.7308066189289093, "rewards/format_reward": 1.0, "step": 299 }, { "completion_length": 260.8571319580078, "epoch": 0.03018867924528302, "grad_norm": 1.2665867805480957, "kl": 0.03369140625, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.4838678240776062, "reward_std": 0.2498823031783104, "rewards/accuracy_reward": 0.49407193064689636, "rewards/format_reward": 0.9897959232330322, "step": 300 }, { "completion_length": 317.948974609375, "epoch": 0.030289308176100628, "grad_norm": 0.8863894939422607, "kl": 0.02423095703125, "learning_rate": 1e-06, "loss": 0.001, "reward": 1.3410006165504456, "reward_std": 0.27373720705509186, "rewards/accuracy_reward": 0.3614088296890259, "rewards/format_reward": 0.9795918464660645, "step": 301 }, { "completion_length": 184.1326446533203, "epoch": 0.03038993710691824, "grad_norm": 3.345491886138916, "kl": 0.0411376953125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6276915669441223, "reward_std": 0.3376290276646614, "rewards/accuracy_reward": 0.6276916563510895, "rewards/format_reward": 1.0, "step": 302 }, { "completion_length": 160.63265228271484, "epoch": 0.03049056603773585, "grad_norm": 1.8548709154129028, "kl": 0.0546875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7746363282203674, "reward_std": 0.2796761095523834, "rewards/accuracy_reward": 0.7950445115566254, "rewards/format_reward": 0.9795918464660645, "step": 303 }, { "completion_length": 207.86734771728516, "epoch": 0.03059119496855346, "grad_norm": 1.2331475019454956, "kl": 0.041259765625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6344372034072876, "reward_std": 0.24884840101003647, "rewards/accuracy_reward": 0.6548454165458679, "rewards/format_reward": 0.9795918464660645, "step": 304 }, { "completion_length": 132.81632232666016, "epoch": 0.03069182389937107, "grad_norm": 1.2163764238357544, "kl": 0.056884765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.74253249168396, "reward_std": 0.2113840989768505, "rewards/accuracy_reward": 0.7629407644271851, "rewards/format_reward": 0.9795918464660645, "step": 305 }, { "completion_length": 198.7040786743164, "epoch": 0.03079245283018868, "grad_norm": 4.700756549835205, "kl": 0.03692626953125, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.6695970296859741, "reward_std": 0.20583373308181763, "rewards/accuracy_reward": 0.6798011362552643, "rewards/format_reward": 0.9897959232330322, "step": 306 }, { "completion_length": 223.2959213256836, "epoch": 0.03089308176100629, "grad_norm": 0.7250779271125793, "kl": 0.03125, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.6335034370422363, "reward_std": 0.271258607506752, "rewards/accuracy_reward": 0.6437075138092041, "rewards/format_reward": 0.9897959232330322, "step": 307 }, { "completion_length": 216.49999237060547, "epoch": 0.0309937106918239, "grad_norm": 1.0911568403244019, "kl": 0.0400390625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6325498223304749, "reward_std": 0.2207450494170189, "rewards/accuracy_reward": 0.632549911737442, "rewards/format_reward": 1.0, "step": 308 }, { "completion_length": 203.65306091308594, "epoch": 0.03109433962264151, "grad_norm": 1.6139248609542847, "kl": 0.05322265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5990375876426697, "reward_std": 0.3232436627149582, "rewards/accuracy_reward": 0.6092417240142822, "rewards/format_reward": 0.9897959232330322, "step": 309 }, { "completion_length": 162.77550506591797, "epoch": 0.03119496855345912, "grad_norm": 1.8661608695983887, "kl": 0.048095703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6850876808166504, "reward_std": 0.26692594587802887, "rewards/accuracy_reward": 0.6850877702236176, "rewards/format_reward": 1.0, "step": 310 }, { "completion_length": 226.47958374023438, "epoch": 0.03129559748427673, "grad_norm": 0.8982725739479065, "kl": 0.0372314453125, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.4038176536560059, "reward_std": 0.15818333253264427, "rewards/accuracy_reward": 0.40381768345832825, "rewards/format_reward": 1.0, "step": 311 }, { "completion_length": 258.36734771728516, "epoch": 0.03139622641509434, "grad_norm": 1.2810972929000854, "kl": 0.0498046875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.4257676601409912, "reward_std": 0.2202993482351303, "rewards/accuracy_reward": 0.42576761543750763, "rewards/format_reward": 1.0, "step": 312 }, { "completion_length": 252.21428680419922, "epoch": 0.03149685534591195, "grad_norm": 1.5396931171417236, "kl": 0.0419921875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.4988821148872375, "reward_std": 0.3120696395635605, "rewards/accuracy_reward": 0.5090862363576889, "rewards/format_reward": 0.9897959232330322, "step": 313 }, { "completion_length": 318.46937561035156, "epoch": 0.03159748427672956, "grad_norm": 0.5989140868186951, "kl": 0.03289794921875, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.5251535773277283, "reward_std": 0.26676853001117706, "rewards/accuracy_reward": 0.5353576689958572, "rewards/format_reward": 0.9897959232330322, "step": 314 }, { "completion_length": 227.84693908691406, "epoch": 0.03169811320754717, "grad_norm": 2.0445141792297363, "kl": 0.042236328125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5432608127593994, "reward_std": 0.23348551988601685, "rewards/accuracy_reward": 0.5534648597240448, "rewards/format_reward": 0.9897959232330322, "step": 315 }, { "completion_length": 257.17346954345703, "epoch": 0.03179874213836478, "grad_norm": 1.3158107995986938, "kl": 0.03277587890625, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.4094205498695374, "reward_std": 0.2586745023727417, "rewards/accuracy_reward": 0.4196246862411499, "rewards/format_reward": 0.9897959232330322, "step": 316 }, { "completion_length": 207.12244415283203, "epoch": 0.03189937106918239, "grad_norm": 1.7749074697494507, "kl": 0.0482177734375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7401509881019592, "reward_std": 0.32953259348869324, "rewards/accuracy_reward": 0.7605591416358948, "rewards/format_reward": 0.9795918166637421, "step": 317 }, { "completion_length": 273.77549743652344, "epoch": 0.032, "grad_norm": 1.5743038654327393, "kl": 0.0323486328125, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.5019804239273071, "reward_std": 0.3685754984617233, "rewards/accuracy_reward": 0.5019804388284683, "rewards/format_reward": 1.0, "step": 318 }, { "completion_length": 202.54080963134766, "epoch": 0.03210062893081761, "grad_norm": 12.278429985046387, "kl": 0.0450439453125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6797861456871033, "reward_std": 0.30073586106300354, "rewards/accuracy_reward": 0.6797861754894257, "rewards/format_reward": 1.0, "step": 319 }, { "completion_length": 191.3163299560547, "epoch": 0.03220125786163522, "grad_norm": 1.5893596410751343, "kl": 0.03753662109375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.582568883895874, "reward_std": 0.37275558710098267, "rewards/accuracy_reward": 0.6029770523309708, "rewards/format_reward": 0.9795918464660645, "step": 320 }, { "completion_length": 146.2244873046875, "epoch": 0.03230188679245283, "grad_norm": 1.7518746852874756, "kl": 0.052001953125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6177237629890442, "reward_std": 0.2402820661664009, "rewards/accuracy_reward": 0.617723822593689, "rewards/format_reward": 1.0, "step": 321 }, { "completion_length": 200.9285659790039, "epoch": 0.03240251572327044, "grad_norm": 1.4273287057876587, "kl": 0.05859375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6415200233459473, "reward_std": 0.2265104092657566, "rewards/accuracy_reward": 0.6415200531482697, "rewards/format_reward": 1.0, "step": 322 }, { "completion_length": 209.93877410888672, "epoch": 0.03250314465408805, "grad_norm": 0.8671693801879883, "kl": 0.0467529296875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5761087536811829, "reward_std": 0.25817834585905075, "rewards/accuracy_reward": 0.5965169668197632, "rewards/format_reward": 0.9795918166637421, "step": 323 }, { "completion_length": 210.32653045654297, "epoch": 0.03260377358490566, "grad_norm": 1.7793934345245361, "kl": 0.045654296875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6020408272743225, "reward_std": 0.2985507547855377, "rewards/accuracy_reward": 0.6326530575752258, "rewards/format_reward": 0.9693877398967743, "step": 324 }, { "completion_length": 208.37754821777344, "epoch": 0.03270440251572327, "grad_norm": 1.1824871301651, "kl": 0.044189453125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.642063856124878, "reward_std": 0.32671305537223816, "rewards/accuracy_reward": 0.6522680222988129, "rewards/format_reward": 0.9897959232330322, "step": 325 }, { "completion_length": 202.07142639160156, "epoch": 0.03280503144654088, "grad_norm": 0.8470100164413452, "kl": 0.0396728515625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6597755551338196, "reward_std": 0.23096469044685364, "rewards/accuracy_reward": 0.659775584936142, "rewards/format_reward": 1.0, "step": 326 }, { "completion_length": 191.4693832397461, "epoch": 0.03290566037735849, "grad_norm": 0.7690261006355286, "kl": 0.04833984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6828028559684753, "reward_std": 0.13443829491734505, "rewards/accuracy_reward": 0.6828028857707977, "rewards/format_reward": 1.0, "step": 327 }, { "completion_length": 272.448974609375, "epoch": 0.033006289308176104, "grad_norm": 0.867473304271698, "kl": 0.0330810546875, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.6366174221038818, "reward_std": 0.28003351390361786, "rewards/accuracy_reward": 0.646821528673172, "rewards/format_reward": 0.9897959232330322, "step": 328 }, { "completion_length": 215.76529693603516, "epoch": 0.03310691823899371, "grad_norm": 1.7343281507492065, "kl": 0.04193115234375, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.55147385597229, "reward_std": 0.2688369154930115, "rewards/accuracy_reward": 0.5718820840120316, "rewards/format_reward": 0.9795918464660645, "step": 329 }, { "completion_length": 206.3571319580078, "epoch": 0.03320754716981132, "grad_norm": 1.5205153226852417, "kl": 0.038330078125, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.6530611515045166, "reward_std": 0.40012483298778534, "rewards/accuracy_reward": 0.6632652878761292, "rewards/format_reward": 0.9897959232330322, "step": 330 }, { "completion_length": 213.9387664794922, "epoch": 0.03330817610062893, "grad_norm": 0.9799218773841858, "kl": 0.0399169921875, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.50477933883667, "reward_std": 0.27648236602544785, "rewards/accuracy_reward": 0.5047793686389923, "rewards/format_reward": 1.0, "step": 331 }, { "completion_length": 203.4693832397461, "epoch": 0.03340880503144654, "grad_norm": 0.7898527979850769, "kl": 0.041748046875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5919488072395325, "reward_std": 0.1976219117641449, "rewards/accuracy_reward": 0.5919488370418549, "rewards/format_reward": 1.0, "step": 332 }, { "completion_length": 280.1734619140625, "epoch": 0.03350943396226415, "grad_norm": 1.0007644891738892, "kl": 0.0333251953125, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.5665852427482605, "reward_std": 0.2035464569926262, "rewards/accuracy_reward": 0.5665852129459381, "rewards/format_reward": 1.0, "step": 333 }, { "completion_length": 209.25509643554688, "epoch": 0.033610062893081764, "grad_norm": 0.8472474813461304, "kl": 0.03826904296875, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5933658480644226, "reward_std": 0.19004035741090775, "rewards/accuracy_reward": 0.6137740612030029, "rewards/format_reward": 0.9795918166637421, "step": 334 }, { "completion_length": 248.3571319580078, "epoch": 0.03371069182389937, "grad_norm": 0.9994526505470276, "kl": 0.029541015625, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.5789116024971008, "reward_std": 0.302063912153244, "rewards/accuracy_reward": 0.5789115726947784, "rewards/format_reward": 1.0, "step": 335 }, { "completion_length": 247.39795684814453, "epoch": 0.03381132075471698, "grad_norm": 0.7158417701721191, "kl": 0.03094482421875, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.7204295992851257, "reward_std": 0.15342948213219643, "rewards/accuracy_reward": 0.7204296886920929, "rewards/format_reward": 1.0, "step": 336 }, { "completion_length": 260.14286041259766, "epoch": 0.03391194968553459, "grad_norm": 0.6985768675804138, "kl": 0.0491943359375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.418384611606598, "reward_std": 0.2055024653673172, "rewards/accuracy_reward": 0.4489968493580818, "rewards/format_reward": 0.9693877398967743, "step": 337 }, { "completion_length": 253.23468780517578, "epoch": 0.0340125786163522, "grad_norm": 0.9023293256759644, "kl": 0.044677734375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.5607309341430664, "reward_std": 0.1829308271408081, "rewards/accuracy_reward": 0.5709350407123566, "rewards/format_reward": 0.9897959232330322, "step": 338 }, { "completion_length": 176.448974609375, "epoch": 0.03411320754716981, "grad_norm": 7.655227184295654, "kl": 0.0426025390625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6810790300369263, "reward_std": 0.19738443568348885, "rewards/accuracy_reward": 0.681079089641571, "rewards/format_reward": 1.0, "step": 339 }, { "completion_length": 202.1938705444336, "epoch": 0.03421383647798742, "grad_norm": 1.5638774633407593, "kl": 0.0491943359375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.673469364643097, "reward_std": 0.2777281850576401, "rewards/accuracy_reward": 0.6836734712123871, "rewards/format_reward": 0.9897959232330322, "step": 340 }, { "completion_length": 249.48979949951172, "epoch": 0.034314465408805034, "grad_norm": 0.8749297857284546, "kl": 0.0299072265625, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.5090408325195312, "reward_std": 0.30127204209566116, "rewards/accuracy_reward": 0.5294489860534668, "rewards/format_reward": 0.9795918166637421, "step": 341 }, { "completion_length": 276.09183502197266, "epoch": 0.03441509433962264, "grad_norm": 1.2626649141311646, "kl": 0.0325927734375, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.5408162474632263, "reward_std": 0.3349471837282181, "rewards/accuracy_reward": 0.5510203987360001, "rewards/format_reward": 0.9897959232330322, "step": 342 }, { "completion_length": 257.1734619140625, "epoch": 0.03451572327044025, "grad_norm": 1.0075628757476807, "kl": 0.0330810546875, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.5835276246070862, "reward_std": 0.23093076795339584, "rewards/accuracy_reward": 0.5937317311763763, "rewards/format_reward": 0.9897959232330322, "step": 343 }, { "completion_length": 227.3775405883789, "epoch": 0.03461635220125786, "grad_norm": 1.6852235794067383, "kl": 0.02880859375, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.6250242590904236, "reward_std": 0.2830858379602432, "rewards/accuracy_reward": 0.6250242590904236, "rewards/format_reward": 1.0, "step": 344 }, { "completion_length": 235.58162689208984, "epoch": 0.03471698113207547, "grad_norm": 1.3654571771621704, "kl": 0.0469970703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.4999600052833557, "reward_std": 0.39135271310806274, "rewards/accuracy_reward": 0.520368218421936, "rewards/format_reward": 0.9795918166637421, "step": 345 }, { "completion_length": 260.30611419677734, "epoch": 0.03481761006289308, "grad_norm": 1.1690601110458374, "kl": 0.03302001953125, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.497335135936737, "reward_std": 0.2928498238325119, "rewards/accuracy_reward": 0.4973351061344147, "rewards/format_reward": 1.0, "step": 346 }, { "completion_length": 219.13265228271484, "epoch": 0.034918238993710694, "grad_norm": 0.8244197368621826, "kl": 0.03289794921875, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.5, "reward_std": 0.2428746297955513, "rewards/accuracy_reward": 0.5102040618658066, "rewards/format_reward": 0.9897959232330322, "step": 347 }, { "completion_length": 285.4795837402344, "epoch": 0.0350188679245283, "grad_norm": 0.8693699240684509, "kl": 0.03021240234375, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.4377551078796387, "reward_std": 0.3272630423307419, "rewards/accuracy_reward": 0.44795916974544525, "rewards/format_reward": 0.9897959232330322, "step": 348 }, { "completion_length": 185.38774871826172, "epoch": 0.03511949685534591, "grad_norm": 0.9466422200202942, "kl": 0.043701171875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6234384179115295, "reward_std": 0.22527365386486053, "rewards/accuracy_reward": 0.6234384477138519, "rewards/format_reward": 1.0, "step": 349 }, { "completion_length": 208.32652282714844, "epoch": 0.03522012578616352, "grad_norm": 2.513374090194702, "kl": 0.0380859375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5872274041175842, "reward_std": 0.30057893693447113, "rewards/accuracy_reward": 0.5974315404891968, "rewards/format_reward": 0.9897959232330322, "step": 350 }, { "completion_length": 164.82653045654297, "epoch": 0.03532075471698113, "grad_norm": 1.5979398488998413, "kl": 0.0509033203125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7806931734085083, "reward_std": 0.31336842477321625, "rewards/accuracy_reward": 0.8011013567447662, "rewards/format_reward": 0.9795918166637421, "step": 351 }, { "completion_length": 213.86734008789062, "epoch": 0.03542138364779874, "grad_norm": 1.0528541803359985, "kl": 0.0447998046875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6851310729980469, "reward_std": 0.27397914975881577, "rewards/accuracy_reward": 0.6953352391719818, "rewards/format_reward": 0.9897959232330322, "step": 352 }, { "completion_length": 195.34693145751953, "epoch": 0.035522012578616355, "grad_norm": 1.552303433418274, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7494897246360779, "reward_std": 0.25321054458618164, "rewards/accuracy_reward": 0.759693831205368, "rewards/format_reward": 0.9897959232330322, "step": 353 }, { "completion_length": 233.92857360839844, "epoch": 0.035622641509433964, "grad_norm": 1.894070029258728, "kl": 0.0391845703125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.5099587440490723, "reward_std": 0.37978291511535645, "rewards/accuracy_reward": 0.5099587291479111, "rewards/format_reward": 1.0, "step": 354 }, { "completion_length": 220.1530532836914, "epoch": 0.03572327044025157, "grad_norm": 1.1465917825698853, "kl": 0.0499267578125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5034583806991577, "reward_std": 0.25867899507284164, "rewards/accuracy_reward": 0.5238665193319321, "rewards/format_reward": 0.9795918464660645, "step": 355 }, { "completion_length": 164.51020050048828, "epoch": 0.03582389937106918, "grad_norm": 1.4889384508132935, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6580860018730164, "reward_std": 0.2351205125451088, "rewards/accuracy_reward": 0.6784941107034683, "rewards/format_reward": 0.9795918166637421, "step": 356 }, { "completion_length": 145.44897842407227, "epoch": 0.03592452830188679, "grad_norm": 3.2025582790374756, "kl": 0.0633544921875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6244897246360779, "reward_std": 0.29140226542949677, "rewards/accuracy_reward": 0.6448979377746582, "rewards/format_reward": 0.9795918166637421, "step": 357 }, { "completion_length": 226.12244415283203, "epoch": 0.0360251572327044, "grad_norm": 1.4894888401031494, "kl": 0.033935546875, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.5606406927108765, "reward_std": 0.3302336558699608, "rewards/accuracy_reward": 0.5912529677152634, "rewards/format_reward": 0.9693877398967743, "step": 358 }, { "completion_length": 201.58162689208984, "epoch": 0.036125786163522015, "grad_norm": 1.9058735370635986, "kl": 0.0411376953125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.5894818902015686, "reward_std": 0.2563936710357666, "rewards/accuracy_reward": 0.6098901331424713, "rewards/format_reward": 0.9795918464660645, "step": 359 }, { "completion_length": 241.20407104492188, "epoch": 0.036226415094339624, "grad_norm": 1.0027248859405518, "kl": 0.0235595703125, "learning_rate": 1e-06, "loss": 0.0009, "reward": 1.6120955348014832, "reward_std": 0.39161159098148346, "rewards/accuracy_reward": 0.6222996711730957, "rewards/format_reward": 0.9897959232330322, "step": 360 }, { "completion_length": 245.28570556640625, "epoch": 0.03632704402515723, "grad_norm": 1.1923812627792358, "kl": 0.04486083984375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.3860544562339783, "reward_std": 0.4038238376379013, "rewards/accuracy_reward": 0.40646257996559143, "rewards/format_reward": 0.9795918166637421, "step": 361 }, { "completion_length": 243.46939086914062, "epoch": 0.03642767295597484, "grad_norm": 0.6966853737831116, "kl": 0.03521728515625, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.5374150276184082, "reward_std": 0.24441854655742645, "rewards/accuracy_reward": 0.5476190596818924, "rewards/format_reward": 0.9897959232330322, "step": 362 }, { "completion_length": 254.54080963134766, "epoch": 0.03652830188679245, "grad_norm": 1.4433263540267944, "kl": 0.0311279296875, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.5422739386558533, "reward_std": 0.3948972523212433, "rewards/accuracy_reward": 0.5524781495332718, "rewards/format_reward": 0.9897959232330322, "step": 363 }, { "completion_length": 347.91835021972656, "epoch": 0.03662893081761006, "grad_norm": 0.9544913172721863, "kl": 0.0245361328125, "learning_rate": 1e-06, "loss": 0.001, "reward": 1.3503401279449463, "reward_std": 0.3638933300971985, "rewards/accuracy_reward": 0.380952388048172, "rewards/format_reward": 0.9693877398967743, "step": 364 }, { "completion_length": 196.24488830566406, "epoch": 0.036729559748427676, "grad_norm": 0.944810688495636, "kl": 0.04736328125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.8857142925262451, "reward_std": 0.17846444249153137, "rewards/accuracy_reward": 0.8857142627239227, "rewards/format_reward": 1.0, "step": 365 }, { "completion_length": 177.09183502197266, "epoch": 0.036830188679245285, "grad_norm": 1.4670798778533936, "kl": 0.0711669921875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7890021800994873, "reward_std": 0.22890817373991013, "rewards/accuracy_reward": 0.7992063164710999, "rewards/format_reward": 0.9897959232330322, "step": 366 }, { "completion_length": 205.59182739257812, "epoch": 0.036930817610062894, "grad_norm": 0.773955225944519, "kl": 0.03179931640625, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.5845191478729248, "reward_std": 0.250836081802845, "rewards/accuracy_reward": 0.6049274504184723, "rewards/format_reward": 0.9795918166637421, "step": 367 }, { "completion_length": 270.07142639160156, "epoch": 0.0370314465408805, "grad_norm": 2.0806169509887695, "kl": 0.02752685546875, "learning_rate": 1e-06, "loss": 0.0011, "reward": 1.5089589357376099, "reward_std": 0.27405285090208054, "rewards/accuracy_reward": 0.5089588612318039, "rewards/format_reward": 1.0, "step": 368 }, { "completion_length": 302.6836700439453, "epoch": 0.03713207547169811, "grad_norm": 0.8699954748153687, "kl": 0.02587890625, "learning_rate": 1e-06, "loss": 0.001, "reward": 1.6208256483078003, "reward_std": 0.23757977783679962, "rewards/accuracy_reward": 0.6310296654701233, "rewards/format_reward": 0.9897959232330322, "step": 369 }, { "completion_length": 265.7142868041992, "epoch": 0.03723270440251572, "grad_norm": 1.5766732692718506, "kl": 0.06085205078125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5365784764289856, "reward_std": 0.37176623940467834, "rewards/accuracy_reward": 0.5467825829982758, "rewards/format_reward": 0.9897959232330322, "step": 370 }, { "completion_length": 206.2040786743164, "epoch": 0.037333333333333336, "grad_norm": 0.8604740500450134, "kl": 0.044677734375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.641156554222107, "reward_std": 0.201597698032856, "rewards/accuracy_reward": 0.6615645885467529, "rewards/format_reward": 0.9795918464660645, "step": 371 }, { "completion_length": 201.04080963134766, "epoch": 0.037433962264150945, "grad_norm": 0.709321141242981, "kl": 0.03509521484375, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.5038450360298157, "reward_std": 0.21523433923721313, "rewards/accuracy_reward": 0.5038450062274933, "rewards/format_reward": 1.0, "step": 372 }, { "completion_length": 244.6632537841797, "epoch": 0.037534591194968554, "grad_norm": 1.233547568321228, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.549934983253479, "reward_std": 0.2849665954709053, "rewards/accuracy_reward": 0.5703431218862534, "rewards/format_reward": 0.9795918166637421, "step": 373 }, { "completion_length": 215.29591369628906, "epoch": 0.03763522012578616, "grad_norm": 0.9752773642539978, "kl": 0.033935546875, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.726723551750183, "reward_std": 0.23095638304948807, "rewards/accuracy_reward": 0.7267235219478607, "rewards/format_reward": 1.0, "step": 374 }, { "completion_length": 168.89795684814453, "epoch": 0.03773584905660377, "grad_norm": 0.5676239132881165, "kl": 0.0487060546875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6127426624298096, "reward_std": 0.1233854778110981, "rewards/accuracy_reward": 0.6229467391967773, "rewards/format_reward": 0.9897959232330322, "step": 375 }, { "completion_length": 208.79591369628906, "epoch": 0.03783647798742138, "grad_norm": 2.0126726627349854, "kl": 0.0555419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7848638892173767, "reward_std": 0.21619706973433495, "rewards/accuracy_reward": 0.7848639488220215, "rewards/format_reward": 1.0, "step": 376 }, { "completion_length": 228.5204086303711, "epoch": 0.037937106918239, "grad_norm": 1.7231454849243164, "kl": 0.02935791015625, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.522997498512268, "reward_std": 0.2807261645793915, "rewards/accuracy_reward": 0.543405681848526, "rewards/format_reward": 0.9795918166637421, "step": 377 }, { "completion_length": 192.5, "epoch": 0.038037735849056606, "grad_norm": 1.316260814666748, "kl": 0.04791259765625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.8160732984542847, "reward_std": 0.2708076685667038, "rewards/accuracy_reward": 0.8160732984542847, "rewards/format_reward": 1.0, "step": 378 }, { "completion_length": 193.68366241455078, "epoch": 0.038138364779874215, "grad_norm": 1.3260279893875122, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.622165560722351, "reward_std": 0.23694218695163727, "rewards/accuracy_reward": 0.6221655458211899, "rewards/format_reward": 1.0, "step": 379 }, { "completion_length": 218.80611419677734, "epoch": 0.038238993710691824, "grad_norm": 1.252877950668335, "kl": 0.0323486328125, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.6377550959587097, "reward_std": 0.2800697609782219, "rewards/accuracy_reward": 0.6377550661563873, "rewards/format_reward": 1.0, "step": 380 }, { "completion_length": 260.9897918701172, "epoch": 0.03833962264150943, "grad_norm": 0.791221022605896, "kl": 0.0345458984375, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.6228060126304626, "reward_std": 0.2991415113210678, "rewards/accuracy_reward": 0.6432141959667206, "rewards/format_reward": 0.9795918464660645, "step": 381 }, { "completion_length": 186.9387664794922, "epoch": 0.03844025157232704, "grad_norm": 1.122750163078308, "kl": 0.046630859375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.690306007862091, "reward_std": 0.18943045288324356, "rewards/accuracy_reward": 0.7005102038383484, "rewards/format_reward": 0.9897959232330322, "step": 382 }, { "completion_length": 196.30611419677734, "epoch": 0.03854088050314466, "grad_norm": 1.192777156829834, "kl": 0.033447265625, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.6993330717086792, "reward_std": 0.16217897832393646, "rewards/accuracy_reward": 0.699333131313324, "rewards/format_reward": 1.0, "step": 383 }, { "completion_length": 167.05101776123047, "epoch": 0.038641509433962266, "grad_norm": 1.2862550020217896, "kl": 0.048095703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7451571226119995, "reward_std": 0.278512604534626, "rewards/accuracy_reward": 0.7553612291812897, "rewards/format_reward": 0.9897959232330322, "step": 384 }, { "completion_length": 237.07142639160156, "epoch": 0.038742138364779875, "grad_norm": 201.18807983398438, "kl": 0.869873046875, "learning_rate": 1e-06, "loss": 0.0348, "reward": 1.4806679487228394, "reward_std": 0.3759430944919586, "rewards/accuracy_reward": 0.5112802386283875, "rewards/format_reward": 0.9693877398967743, "step": 385 }, { "completion_length": 288.4693908691406, "epoch": 0.038842767295597484, "grad_norm": 0.991107165813446, "kl": 0.02850341796875, "learning_rate": 1e-06, "loss": 0.0011, "reward": 1.56297367811203, "reward_std": 0.2816944420337677, "rewards/accuracy_reward": 0.5731777995824814, "rewards/format_reward": 0.9897959232330322, "step": 386 }, { "completion_length": 239.5204086303711, "epoch": 0.03894339622641509, "grad_norm": 0.6900619268417358, "kl": 0.02197265625, "learning_rate": 1e-06, "loss": 0.0009, "reward": 1.5424964427947998, "reward_std": 0.2114153802394867, "rewards/accuracy_reward": 0.5424964278936386, "rewards/format_reward": 1.0, "step": 387 }, { "completion_length": 273.8571319580078, "epoch": 0.0390440251572327, "grad_norm": 1.2429834604263306, "kl": 0.0361328125, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.5019616484642029, "reward_std": 0.24342454969882965, "rewards/accuracy_reward": 0.522369846701622, "rewards/format_reward": 0.9795918166637421, "step": 388 }, { "completion_length": 223.2959213256836, "epoch": 0.03914465408805031, "grad_norm": 0.8895801305770874, "kl": 0.028564453125, "learning_rate": 1e-06, "loss": 0.0011, "reward": 1.5351917147636414, "reward_std": 0.2766787260770798, "rewards/accuracy_reward": 0.5453958213329315, "rewards/format_reward": 0.9897959232330322, "step": 389 }, { "completion_length": 274.3775405883789, "epoch": 0.03924528301886793, "grad_norm": 0.931897759437561, "kl": 0.042724609375, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.570699691772461, "reward_std": 0.2792956605553627, "rewards/accuracy_reward": 0.5809037983417511, "rewards/format_reward": 0.9897959232330322, "step": 390 }, { "completion_length": 176.94897842407227, "epoch": 0.039345911949685536, "grad_norm": 1.60930597782135, "kl": 0.044921875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7809587717056274, "reward_std": 0.2212038617581129, "rewards/accuracy_reward": 0.7911629378795624, "rewards/format_reward": 0.9897959232330322, "step": 391 }, { "completion_length": 136.51019668579102, "epoch": 0.039446540880503145, "grad_norm": 8.711913108825684, "kl": 0.0447998046875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7261903882026672, "reward_std": 0.20381193608045578, "rewards/accuracy_reward": 0.726190447807312, "rewards/format_reward": 1.0, "step": 392 }, { "completion_length": 217.99999237060547, "epoch": 0.039547169811320754, "grad_norm": 1.2131041288375854, "kl": 0.03155517578125, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.4999999403953552, "reward_std": 0.2854219228029251, "rewards/accuracy_reward": 0.5102040618658066, "rewards/format_reward": 0.9897959232330322, "step": 393 }, { "completion_length": 258.9387664794922, "epoch": 0.03964779874213836, "grad_norm": 1.9182833433151245, "kl": 0.04644775390625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5366557836532593, "reward_std": 0.2730342149734497, "rewards/accuracy_reward": 0.5570639818906784, "rewards/format_reward": 0.9795918166637421, "step": 394 }, { "completion_length": 214.54080963134766, "epoch": 0.03974842767295597, "grad_norm": 2.411756753921509, "kl": 0.0343017578125, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.5840033292770386, "reward_std": 0.3334144055843353, "rewards/accuracy_reward": 0.5840033292770386, "rewards/format_reward": 1.0, "step": 395 }, { "completion_length": 249.90816497802734, "epoch": 0.03984905660377359, "grad_norm": 1.1057673692703247, "kl": 0.02972412109375, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.4698724150657654, "reward_std": 0.2253032699227333, "rewards/accuracy_reward": 0.46987245976924896, "rewards/format_reward": 1.0, "step": 396 }, { "completion_length": 170.82653045654297, "epoch": 0.039949685534591196, "grad_norm": 1.4357892274856567, "kl": 0.04693603515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6969387531280518, "reward_std": 0.31646043062210083, "rewards/accuracy_reward": 0.7071428894996643, "rewards/format_reward": 0.9897959232330322, "step": 397 }, { "completion_length": 289.3163146972656, "epoch": 0.040050314465408805, "grad_norm": 0.9083858132362366, "kl": 0.04266357421875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6429307460784912, "reward_std": 0.22904998809099197, "rewards/accuracy_reward": 0.6735430359840393, "rewards/format_reward": 0.9693877398967743, "step": 398 }, { "completion_length": 223.2551040649414, "epoch": 0.040150943396226414, "grad_norm": 2.5391929149627686, "kl": 0.046630859375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5902531743049622, "reward_std": 0.23984985798597336, "rewards/accuracy_reward": 0.5902532190084457, "rewards/format_reward": 1.0, "step": 399 }, { "completion_length": 242.01020050048828, "epoch": 0.04025157232704402, "grad_norm": 1.2611008882522583, "kl": 0.04296875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6412550210952759, "reward_std": 0.26137053966522217, "rewards/accuracy_reward": 0.6718672811985016, "rewards/format_reward": 0.9693877398967743, "step": 400 }, { "completion_length": 214.32652282714844, "epoch": 0.04035220125786163, "grad_norm": 1.2376362085342407, "kl": 0.0421142578125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6013604998588562, "reward_std": 0.23732461035251617, "rewards/accuracy_reward": 0.6013604998588562, "rewards/format_reward": 1.0, "step": 401 }, { "completion_length": 203.61224365234375, "epoch": 0.04045283018867925, "grad_norm": 1.1356291770935059, "kl": 0.03265380859375, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.778036892414093, "reward_std": 0.228745736181736, "rewards/accuracy_reward": 0.778036892414093, "rewards/format_reward": 1.0, "step": 402 }, { "completion_length": 228.48978424072266, "epoch": 0.04055345911949686, "grad_norm": 0.7882019877433777, "kl": 0.0501708984375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7040815949440002, "reward_std": 0.21920377016067505, "rewards/accuracy_reward": 0.7244897782802582, "rewards/format_reward": 0.9795918464660645, "step": 403 }, { "completion_length": 228.448974609375, "epoch": 0.040654088050314466, "grad_norm": 4.796632289886475, "kl": 0.03662109375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.4915515780448914, "reward_std": 0.15635668486356735, "rewards/accuracy_reward": 0.5017556548118591, "rewards/format_reward": 0.9897959232330322, "step": 404 }, { "completion_length": 193.09182739257812, "epoch": 0.040754716981132075, "grad_norm": 1.5677821636199951, "kl": 0.0517578125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6656153798103333, "reward_std": 0.20662909001111984, "rewards/accuracy_reward": 0.6758194267749786, "rewards/format_reward": 0.9897959232330322, "step": 405 }, { "completion_length": 222.6734619140625, "epoch": 0.040855345911949684, "grad_norm": 0.8841682076454163, "kl": 0.0452880859375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.652025282382965, "reward_std": 0.19368688017129898, "rewards/accuracy_reward": 0.6622294485569, "rewards/format_reward": 0.9897959232330322, "step": 406 }, { "completion_length": 211.82653045654297, "epoch": 0.04095597484276729, "grad_norm": 1.0093152523040771, "kl": 0.055908203125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6510204076766968, "reward_std": 0.32471613585948944, "rewards/accuracy_reward": 0.6510204374790192, "rewards/format_reward": 1.0, "step": 407 }, { "completion_length": 161.66326141357422, "epoch": 0.04105660377358491, "grad_norm": 1.429966926574707, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5704566836357117, "reward_std": 0.24678420275449753, "rewards/accuracy_reward": 0.5704567432403564, "rewards/format_reward": 1.0, "step": 408 }, { "completion_length": 326.84693908691406, "epoch": 0.04115723270440252, "grad_norm": 0.8364593982696533, "kl": 0.03289794921875, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.4505919814109802, "reward_std": 0.26525457948446274, "rewards/accuracy_reward": 0.47100017964839935, "rewards/format_reward": 0.9795918166637421, "step": 409 }, { "completion_length": 156.2040786743164, "epoch": 0.041257861635220126, "grad_norm": 2.516920804977417, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7548299431800842, "reward_std": 0.3304380923509598, "rewards/accuracy_reward": 0.7548299133777618, "rewards/format_reward": 1.0, "step": 410 }, { "completion_length": 293.11224365234375, "epoch": 0.041358490566037735, "grad_norm": 0.9809574484825134, "kl": 0.0245361328125, "learning_rate": 1e-06, "loss": 0.001, "reward": 1.5765305757522583, "reward_std": 0.38675418496131897, "rewards/accuracy_reward": 0.5765306055545807, "rewards/format_reward": 1.0, "step": 411 }, { "completion_length": 180.65306091308594, "epoch": 0.041459119496855344, "grad_norm": 2.5179271697998047, "kl": 0.0399169921875, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.7894547581672668, "reward_std": 0.17964263260364532, "rewards/accuracy_reward": 0.7894548177719116, "rewards/format_reward": 1.0, "step": 412 }, { "completion_length": 281.0306091308594, "epoch": 0.04155974842767295, "grad_norm": 0.9447323083877563, "kl": 0.03338623046875, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.5840246081352234, "reward_std": 0.2903495728969574, "rewards/accuracy_reward": 0.6044327914714813, "rewards/format_reward": 0.9795918464660645, "step": 413 }, { "completion_length": 164.4285659790039, "epoch": 0.04166037735849057, "grad_norm": 1.7296277284622192, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7059815526008606, "reward_std": 0.18119346350431442, "rewards/accuracy_reward": 0.7059816420078278, "rewards/format_reward": 1.0, "step": 414 }, { "completion_length": 250.62245178222656, "epoch": 0.04176100628930818, "grad_norm": 0.6932798027992249, "kl": 0.0372314453125, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.6918419003486633, "reward_std": 0.21546786278486252, "rewards/accuracy_reward": 0.7122502326965332, "rewards/format_reward": 0.9795918464660645, "step": 415 }, { "completion_length": 232.95917510986328, "epoch": 0.04186163522012579, "grad_norm": 1.3365272283554077, "kl": 0.0501708984375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6325550079345703, "reward_std": 0.3229721933603287, "rewards/accuracy_reward": 0.6427591443061829, "rewards/format_reward": 0.9897959232330322, "step": 416 }, { "completion_length": 178.63265228271484, "epoch": 0.041962264150943396, "grad_norm": 1.5990670919418335, "kl": 0.0574951171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.674038827419281, "reward_std": 0.21902252733707428, "rewards/accuracy_reward": 0.6740389168262482, "rewards/format_reward": 1.0, "step": 417 }, { "completion_length": 228.72447967529297, "epoch": 0.042062893081761005, "grad_norm": 1.0417078733444214, "kl": 0.0601806640625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6377550959587097, "reward_std": 0.21789834648370743, "rewards/accuracy_reward": 0.6377550959587097, "rewards/format_reward": 1.0, "step": 418 }, { "completion_length": 204.7142791748047, "epoch": 0.042163522012578614, "grad_norm": 1.642745852470398, "kl": 0.0565185546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5128279328346252, "reward_std": 0.3182232528924942, "rewards/accuracy_reward": 0.5332361161708832, "rewards/format_reward": 0.9795918166637421, "step": 419 }, { "completion_length": 203.89795684814453, "epoch": 0.04226415094339623, "grad_norm": 0.750477135181427, "kl": 0.06103515625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7017267346382141, "reward_std": 0.17388036847114563, "rewards/accuracy_reward": 0.7017267942428589, "rewards/format_reward": 1.0, "step": 420 }, { "completion_length": 262.4591751098633, "epoch": 0.04236477987421384, "grad_norm": 0.9113112092018127, "kl": 0.0460205078125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6477127075195312, "reward_std": 0.29884183406829834, "rewards/accuracy_reward": 0.6477126777172089, "rewards/format_reward": 1.0, "step": 421 }, { "completion_length": 218.6938705444336, "epoch": 0.04246540880503145, "grad_norm": 0.916228175163269, "kl": 0.0494384765625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6661250591278076, "reward_std": 0.372503399848938, "rewards/accuracy_reward": 0.6865333318710327, "rewards/format_reward": 0.9795918166637421, "step": 422 }, { "completion_length": 185.05101776123047, "epoch": 0.042566037735849056, "grad_norm": 7.762082576751709, "kl": 0.0758056640625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.709759771823883, "reward_std": 0.17958243191242218, "rewards/accuracy_reward": 0.7199638783931732, "rewards/format_reward": 0.9897959232330322, "step": 423 }, { "completion_length": 250.9693832397461, "epoch": 0.042666666666666665, "grad_norm": 1.2010889053344727, "kl": 0.038330078125, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5831735134124756, "reward_std": 0.3306109756231308, "rewards/accuracy_reward": 0.583173543214798, "rewards/format_reward": 1.0, "step": 424 }, { "completion_length": 192.1530532836914, "epoch": 0.042767295597484274, "grad_norm": 0.7057455778121948, "kl": 0.051513671875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5229591727256775, "reward_std": 0.1537434346973896, "rewards/accuracy_reward": 0.5331632643938065, "rewards/format_reward": 0.9897959232330322, "step": 425 }, { "completion_length": 172.60203552246094, "epoch": 0.04286792452830189, "grad_norm": 0.8350086212158203, "kl": 0.057373046875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7787414193153381, "reward_std": 0.23400932550430298, "rewards/accuracy_reward": 0.7787415087223053, "rewards/format_reward": 1.0, "step": 426 }, { "completion_length": 269.0612106323242, "epoch": 0.0429685534591195, "grad_norm": 0.970820963382721, "kl": 0.0372314453125, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5160777568817139, "reward_std": 0.32018430531024933, "rewards/accuracy_reward": 0.5160778015851974, "rewards/format_reward": 1.0, "step": 427 }, { "completion_length": 154.0, "epoch": 0.04306918238993711, "grad_norm": 3.943016767501831, "kl": 0.055419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7350582480430603, "reward_std": 0.2524019926786423, "rewards/accuracy_reward": 0.7452623546123505, "rewards/format_reward": 0.9897959232330322, "step": 428 }, { "completion_length": 146.84693908691406, "epoch": 0.04316981132075472, "grad_norm": 0.6962029933929443, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7754591703414917, "reward_std": 0.12554500997066498, "rewards/accuracy_reward": 0.7754592299461365, "rewards/format_reward": 1.0, "step": 429 }, { "completion_length": 199.29591369628906, "epoch": 0.043270440251572326, "grad_norm": 1.2469412088394165, "kl": 0.045166015625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.5772595405578613, "reward_std": 0.2302502989768982, "rewards/accuracy_reward": 0.5772594511508942, "rewards/format_reward": 1.0, "step": 430 }, { "completion_length": 246.91836547851562, "epoch": 0.043371069182389935, "grad_norm": 0.8515167832374573, "kl": 0.0413818359375, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5479591488838196, "reward_std": 0.27027102559804916, "rewards/accuracy_reward": 0.547959178686142, "rewards/format_reward": 1.0, "step": 431 }, { "completion_length": 184.0204086303711, "epoch": 0.04347169811320755, "grad_norm": 1.0281320810317993, "kl": 0.0816650390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7653060555458069, "reward_std": 0.26524004340171814, "rewards/accuracy_reward": 0.7653060853481293, "rewards/format_reward": 1.0, "step": 432 }, { "completion_length": 195.05101776123047, "epoch": 0.04357232704402516, "grad_norm": 1.2094025611877441, "kl": 0.0460205078125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6325654983520508, "reward_std": 0.21410004049539566, "rewards/accuracy_reward": 0.6427696049213409, "rewards/format_reward": 0.9897959232330322, "step": 433 }, { "completion_length": 214.64285278320312, "epoch": 0.04367295597484277, "grad_norm": 1.088992714881897, "kl": 0.0506591796875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.4819251894950867, "reward_std": 0.24614696949720383, "rewards/accuracy_reward": 0.5125373750925064, "rewards/format_reward": 0.9693877398967743, "step": 434 }, { "completion_length": 135.6020393371582, "epoch": 0.04377358490566038, "grad_norm": 1.281840205192566, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.880519449710846, "reward_std": 0.12954771146178246, "rewards/accuracy_reward": 0.8805194795131683, "rewards/format_reward": 1.0, "step": 435 }, { "completion_length": 174.92856979370117, "epoch": 0.043874213836477986, "grad_norm": 1.4939730167388916, "kl": 0.0458984375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.77268648147583, "reward_std": 0.17745322734117508, "rewards/accuracy_reward": 0.7726865708827972, "rewards/format_reward": 1.0, "step": 436 }, { "completion_length": 258.7346954345703, "epoch": 0.043974842767295595, "grad_norm": 0.7799019813537598, "kl": 0.05615234375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5455623269081116, "reward_std": 0.2575414851307869, "rewards/accuracy_reward": 0.5455623418092728, "rewards/format_reward": 1.0, "step": 437 }, { "completion_length": 195.62244415283203, "epoch": 0.044075471698113204, "grad_norm": 1.2203173637390137, "kl": 0.0567626953125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5401313304901123, "reward_std": 0.20136727392673492, "rewards/accuracy_reward": 0.5503354221582413, "rewards/format_reward": 0.9897959232330322, "step": 438 }, { "completion_length": 210.1326446533203, "epoch": 0.04417610062893082, "grad_norm": 1.0127344131469727, "kl": 0.03533935546875, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.6898990869522095, "reward_std": 0.2590012401342392, "rewards/accuracy_reward": 0.6898991167545319, "rewards/format_reward": 1.0, "step": 439 }, { "completion_length": 182.13265228271484, "epoch": 0.04427672955974843, "grad_norm": 2.5882019996643066, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7089953422546387, "reward_std": 0.2474483624100685, "rewards/accuracy_reward": 0.7089953720569611, "rewards/format_reward": 1.0, "step": 440 }, { "completion_length": 223.22447967529297, "epoch": 0.04437735849056604, "grad_norm": 1.5921272039413452, "kl": 0.048583984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5920156240463257, "reward_std": 0.31562794744968414, "rewards/accuracy_reward": 0.6022197008132935, "rewards/format_reward": 0.9897959232330322, "step": 441 }, { "completion_length": 197.01020050048828, "epoch": 0.04447798742138365, "grad_norm": 0.7567514181137085, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6122448444366455, "reward_std": 0.19220630079507828, "rewards/accuracy_reward": 0.6122449040412903, "rewards/format_reward": 1.0, "step": 442 }, { "completion_length": 242.89795684814453, "epoch": 0.044578616352201256, "grad_norm": 1.0162668228149414, "kl": 0.048828125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5880616307258606, "reward_std": 0.22324105352163315, "rewards/accuracy_reward": 0.5880616009235382, "rewards/format_reward": 1.0, "step": 443 }, { "completion_length": 186.38774871826172, "epoch": 0.044679245283018865, "grad_norm": 0.6885946989059448, "kl": 0.0538330078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.764455795288086, "reward_std": 0.15798020362854004, "rewards/accuracy_reward": 0.7644557654857635, "rewards/format_reward": 1.0, "step": 444 }, { "completion_length": 177.7142791748047, "epoch": 0.04477987421383648, "grad_norm": 0.8140802383422852, "kl": 0.052001953125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5738722085952759, "reward_std": 0.2312658578157425, "rewards/accuracy_reward": 0.584076315164566, "rewards/format_reward": 0.9897959232330322, "step": 445 }, { "completion_length": 262.27550506591797, "epoch": 0.04488050314465409, "grad_norm": 1.2903730869293213, "kl": 0.0372314453125, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5990646481513977, "reward_std": 0.34000013023614883, "rewards/accuracy_reward": 0.6092687249183655, "rewards/format_reward": 0.9897959232330322, "step": 446 }, { "completion_length": 225.69387817382812, "epoch": 0.0449811320754717, "grad_norm": 3.1019771099090576, "kl": 0.065185546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6246472597122192, "reward_std": 0.3360922038555145, "rewards/accuracy_reward": 0.6246472597122192, "rewards/format_reward": 1.0, "step": 447 }, { "completion_length": 211.9693832397461, "epoch": 0.04508176100628931, "grad_norm": 1.1136345863342285, "kl": 0.03729248046875, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.6394663453102112, "reward_std": 0.18330146372318268, "rewards/accuracy_reward": 0.6394663751125336, "rewards/format_reward": 1.0, "step": 448 }, { "completion_length": 199.89795684814453, "epoch": 0.045182389937106916, "grad_norm": 0.9000609517097473, "kl": 0.045166015625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.583957850933075, "reward_std": 0.23878392577171326, "rewards/accuracy_reward": 0.5839579105377197, "rewards/format_reward": 1.0, "step": 449 }, { "completion_length": 157.32653045654297, "epoch": 0.045283018867924525, "grad_norm": 2.0235910415649414, "kl": 0.103759765625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.6427842378616333, "reward_std": 0.24131646752357483, "rewards/accuracy_reward": 0.6529883146286011, "rewards/format_reward": 0.9897959232330322, "step": 450 }, { "completion_length": 188.56122589111328, "epoch": 0.04538364779874214, "grad_norm": 0.8310626745223999, "kl": 0.04296875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.805445909500122, "reward_std": 0.22329504787921906, "rewards/accuracy_reward": 0.8360581696033478, "rewards/format_reward": 0.9693877398967743, "step": 451 }, { "completion_length": 224.76531219482422, "epoch": 0.04548427672955975, "grad_norm": 1.393441915512085, "kl": 0.048828125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6929057240486145, "reward_std": 0.3194476068019867, "rewards/accuracy_reward": 0.7031097412109375, "rewards/format_reward": 0.9897959232330322, "step": 452 }, { "completion_length": 299.9693832397461, "epoch": 0.04558490566037736, "grad_norm": 0.8871423602104187, "kl": 0.0469970703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.4470157027244568, "reward_std": 0.2505965158343315, "rewards/accuracy_reward": 0.4470156878232956, "rewards/format_reward": 1.0, "step": 453 }, { "completion_length": 240.58162689208984, "epoch": 0.04568553459119497, "grad_norm": 1.3437827825546265, "kl": 0.049560546875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.622874677181244, "reward_std": 0.3472769558429718, "rewards/accuracy_reward": 0.6228746175765991, "rewards/format_reward": 1.0, "step": 454 }, { "completion_length": 191.36734008789062, "epoch": 0.04578616352201258, "grad_norm": 1.5549781322479248, "kl": 0.066162109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.704315423965454, "reward_std": 0.2745325416326523, "rewards/accuracy_reward": 0.7145195007324219, "rewards/format_reward": 0.9897959232330322, "step": 455 }, { "completion_length": 295.37754821777344, "epoch": 0.045886792452830186, "grad_norm": 0.6863526105880737, "kl": 0.04058837890625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.555224597454071, "reward_std": 0.23503050953149796, "rewards/accuracy_reward": 0.5654286295175552, "rewards/format_reward": 0.9897959232330322, "step": 456 }, { "completion_length": 281.0816345214844, "epoch": 0.0459874213836478, "grad_norm": 0.8162129521369934, "kl": 0.03570556640625, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.5792961120605469, "reward_std": 0.2618565559387207, "rewards/accuracy_reward": 0.5895003080368042, "rewards/format_reward": 0.9897959232330322, "step": 457 }, { "completion_length": 243.58162689208984, "epoch": 0.04608805031446541, "grad_norm": 0.7931280136108398, "kl": 0.045654296875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6662915349006653, "reward_std": 0.28063400089740753, "rewards/accuracy_reward": 0.6866996586322784, "rewards/format_reward": 0.9795918464660645, "step": 458 }, { "completion_length": 222.04080963134766, "epoch": 0.04618867924528302, "grad_norm": 1.270127296447754, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6122448444366455, "reward_std": 0.26497258991003036, "rewards/accuracy_reward": 0.6326530575752258, "rewards/format_reward": 0.9795918166637421, "step": 459 }, { "completion_length": 293.51019287109375, "epoch": 0.04628930817610063, "grad_norm": 0.9558162689208984, "kl": 0.0460205078125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6326530575752258, "reward_std": 0.19673581421375275, "rewards/accuracy_reward": 0.6326530575752258, "rewards/format_reward": 1.0, "step": 460 }, { "completion_length": 217.10203552246094, "epoch": 0.04638993710691824, "grad_norm": 1.0579527616500854, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.4778425693511963, "reward_std": 0.28499482572078705, "rewards/accuracy_reward": 0.4880466163158417, "rewards/format_reward": 0.9897959232330322, "step": 461 }, { "completion_length": 186.76529693603516, "epoch": 0.046490566037735846, "grad_norm": 1.1207361221313477, "kl": 0.0380859375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.7198331356048584, "reward_std": 0.23512297868728638, "rewards/accuracy_reward": 0.7300372421741486, "rewards/format_reward": 0.9897959232330322, "step": 462 }, { "completion_length": 129.43877410888672, "epoch": 0.04659119496855346, "grad_norm": 2.059990167617798, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6045918464660645, "reward_std": 0.17541727237403393, "rewards/accuracy_reward": 0.6045918315649033, "rewards/format_reward": 1.0, "step": 463 }, { "completion_length": 225.58162689208984, "epoch": 0.04669182389937107, "grad_norm": 1.5816500186920166, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.622668206691742, "reward_std": 0.287349209189415, "rewards/accuracy_reward": 0.6226681619882584, "rewards/format_reward": 1.0, "step": 464 }, { "completion_length": 266.0, "epoch": 0.04679245283018868, "grad_norm": 1.209191918373108, "kl": 0.0499267578125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6514818668365479, "reward_std": 0.29253095388412476, "rewards/accuracy_reward": 0.66168612241745, "rewards/format_reward": 0.9897959232330322, "step": 465 }, { "completion_length": 173.67346572875977, "epoch": 0.04689308176100629, "grad_norm": 1.5156649351119995, "kl": 0.0517578125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7725947499275208, "reward_std": 0.12667856365442276, "rewards/accuracy_reward": 0.7725947499275208, "rewards/format_reward": 1.0, "step": 466 }, { "completion_length": 224.11224365234375, "epoch": 0.0469937106918239, "grad_norm": 0.6744065284729004, "kl": 0.04278564453125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6882166266441345, "reward_std": 0.164799053221941, "rewards/accuracy_reward": 0.718828946352005, "rewards/format_reward": 0.9693877398967743, "step": 467 }, { "completion_length": 226.3877410888672, "epoch": 0.04709433962264151, "grad_norm": 1.6007424592971802, "kl": 0.0482177734375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.643926203250885, "reward_std": 0.3046176955103874, "rewards/accuracy_reward": 0.6439261436462402, "rewards/format_reward": 1.0, "step": 468 }, { "completion_length": 217.15306091308594, "epoch": 0.04719496855345912, "grad_norm": 0.7187586426734924, "kl": 0.0457763671875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.5390669703483582, "reward_std": 0.18608583509922028, "rewards/accuracy_reward": 0.5696793049573898, "rewards/format_reward": 0.9693877398967743, "step": 469 }, { "completion_length": 241.08162689208984, "epoch": 0.04729559748427673, "grad_norm": 28.86932945251465, "kl": 0.8558349609375, "learning_rate": 1e-06, "loss": 0.0344, "reward": 1.479174017906189, "reward_std": 0.3295276015996933, "rewards/accuracy_reward": 0.4995822012424469, "rewards/format_reward": 0.9795918464660645, "step": 470 }, { "completion_length": 265.3367233276367, "epoch": 0.04739622641509434, "grad_norm": 0.6856458187103271, "kl": 0.0367431640625, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5612244606018066, "reward_std": 0.2797493636608124, "rewards/accuracy_reward": 0.5612244755029678, "rewards/format_reward": 1.0, "step": 471 }, { "completion_length": 191.22447967529297, "epoch": 0.04749685534591195, "grad_norm": 1.2074438333511353, "kl": 0.0672607421875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6462584733963013, "reward_std": 0.20575617998838425, "rewards/accuracy_reward": 0.656462550163269, "rewards/format_reward": 0.9897959232330322, "step": 472 }, { "completion_length": 222.37754821777344, "epoch": 0.04759748427672956, "grad_norm": 0.8018175363540649, "kl": 0.03863525390625, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5933823585510254, "reward_std": 0.2833126485347748, "rewards/accuracy_reward": 0.5933823585510254, "rewards/format_reward": 1.0, "step": 473 }, { "completion_length": 162.29591369628906, "epoch": 0.04769811320754717, "grad_norm": 8.33061408996582, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6967201232910156, "reward_std": 0.2764175236225128, "rewards/accuracy_reward": 0.7171282172203064, "rewards/format_reward": 0.9795918166637421, "step": 474 }, { "completion_length": 186.39794921875, "epoch": 0.04779874213836478, "grad_norm": 0.7544052600860596, "kl": 0.050048828125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.719293236732483, "reward_std": 0.18462588638067245, "rewards/accuracy_reward": 0.7192932963371277, "rewards/format_reward": 1.0, "step": 475 }, { "completion_length": 217.1530532836914, "epoch": 0.04789937106918239, "grad_norm": 1.0864946842193604, "kl": 0.05078125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5558794736862183, "reward_std": 0.29777538776397705, "rewards/accuracy_reward": 0.5558795034885406, "rewards/format_reward": 1.0, "step": 476 }, { "completion_length": 193.03060913085938, "epoch": 0.048, "grad_norm": 1.5090595483779907, "kl": 0.05517578125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.636533260345459, "reward_std": 0.26599104702472687, "rewards/accuracy_reward": 0.6569414436817169, "rewards/format_reward": 0.9795918166637421, "step": 477 }, { "completion_length": 206.85713958740234, "epoch": 0.04810062893081761, "grad_norm": 3.170783042907715, "kl": 0.039306640625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6487326622009277, "reward_std": 0.2707487642765045, "rewards/accuracy_reward": 0.6589367389678955, "rewards/format_reward": 0.9897959232330322, "step": 478 }, { "completion_length": 182.71428680419922, "epoch": 0.04820125786163522, "grad_norm": 1.367881178855896, "kl": 0.0484619140625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.589117407798767, "reward_std": 0.18599818646907806, "rewards/accuracy_reward": 0.5891173779964447, "rewards/format_reward": 1.0, "step": 479 }, { "completion_length": 299.74488830566406, "epoch": 0.04830188679245283, "grad_norm": 0.9450684189796448, "kl": 0.03704833984375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.3795117735862732, "reward_std": 0.33365123718976974, "rewards/accuracy_reward": 0.4407362937927246, "rewards/format_reward": 0.9387754797935486, "step": 480 }, { "completion_length": 180.85713958740234, "epoch": 0.048402515723270444, "grad_norm": 1.8157556056976318, "kl": 0.0408935546875, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6681243181228638, "reward_std": 0.1905330866575241, "rewards/accuracy_reward": 0.6681243479251862, "rewards/format_reward": 1.0, "step": 481 }, { "completion_length": 290.78570556640625, "epoch": 0.04850314465408805, "grad_norm": 0.8542895317077637, "kl": 0.0286865234375, "learning_rate": 1e-06, "loss": 0.0011, "reward": 1.3589547276496887, "reward_std": 0.20802249014377594, "rewards/accuracy_reward": 0.3793628513813019, "rewards/format_reward": 0.9795918166637421, "step": 482 }, { "completion_length": 184.85713958740234, "epoch": 0.04860377358490566, "grad_norm": 1.897443175315857, "kl": 0.0489501953125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.681498944759369, "reward_std": 0.21774417161941528, "rewards/accuracy_reward": 0.7019071877002716, "rewards/format_reward": 0.9795918464660645, "step": 483 }, { "completion_length": 230.90816497802734, "epoch": 0.04870440251572327, "grad_norm": 1.1263720989227295, "kl": 0.044921875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.621492087841034, "reward_std": 0.2331894114613533, "rewards/accuracy_reward": 0.6316961944103241, "rewards/format_reward": 0.9897959232330322, "step": 484 }, { "completion_length": 196.54080963134766, "epoch": 0.04880503144654088, "grad_norm": 1.0755656957626343, "kl": 0.04443359375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6625971794128418, "reward_std": 0.25590604543685913, "rewards/accuracy_reward": 0.6932094395160675, "rewards/format_reward": 0.9693877398967743, "step": 485 }, { "completion_length": 202.3571319580078, "epoch": 0.04890566037735849, "grad_norm": 0.8146214485168457, "kl": 0.0439453125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.66379976272583, "reward_std": 0.17237555235624313, "rewards/accuracy_reward": 0.6740038692951202, "rewards/format_reward": 0.9897959232330322, "step": 486 }, { "completion_length": 255.07142639160156, "epoch": 0.0490062893081761, "grad_norm": 0.998040497303009, "kl": 0.0345458984375, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.6017979383468628, "reward_std": 0.3294399380683899, "rewards/accuracy_reward": 0.6426141858100891, "rewards/format_reward": 0.9591836631298065, "step": 487 }, { "completion_length": 219.49999237060547, "epoch": 0.04910691823899371, "grad_norm": 0.8967046737670898, "kl": 0.0374755859375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.4285714030265808, "reward_std": 0.22705987840890884, "rewards/accuracy_reward": 0.4387754946947098, "rewards/format_reward": 0.9897959232330322, "step": 488 }, { "completion_length": 213.88774871826172, "epoch": 0.04920754716981132, "grad_norm": 1.5518419742584229, "kl": 0.0479736328125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.614741027355194, "reward_std": 0.24222103506326675, "rewards/accuracy_reward": 0.6249450892210007, "rewards/format_reward": 0.9897959232330322, "step": 489 }, { "completion_length": 189.2653045654297, "epoch": 0.04930817610062893, "grad_norm": 1.2731974124908447, "kl": 0.0589599609375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6480580568313599, "reward_std": 0.17041579633951187, "rewards/accuracy_reward": 0.6480580270290375, "rewards/format_reward": 1.0, "step": 490 }, { "completion_length": 240.10203552246094, "epoch": 0.04940880503144654, "grad_norm": 1.0473086833953857, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.540816307067871, "reward_std": 0.2585868313908577, "rewards/accuracy_reward": 0.5714285373687744, "rewards/format_reward": 0.9693877398967743, "step": 491 }, { "completion_length": 227.39794921875, "epoch": 0.04950943396226415, "grad_norm": 0.9080423712730408, "kl": 0.0390625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6176394820213318, "reward_std": 0.23540716618299484, "rewards/accuracy_reward": 0.627843588590622, "rewards/format_reward": 0.9897959232330322, "step": 492 }, { "completion_length": 239.06121826171875, "epoch": 0.04961006289308176, "grad_norm": 1.4035974740982056, "kl": 0.037841796875, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.4909921288490295, "reward_std": 0.24147501587867737, "rewards/accuracy_reward": 0.5011962950229645, "rewards/format_reward": 0.9897959232330322, "step": 493 }, { "completion_length": 189.9591827392578, "epoch": 0.049710691823899374, "grad_norm": 1.2055820226669312, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.4954283833503723, "reward_std": 0.23923387378454208, "rewards/accuracy_reward": 0.5056324601173401, "rewards/format_reward": 0.9897959232330322, "step": 494 }, { "completion_length": 224.41836547851562, "epoch": 0.04981132075471698, "grad_norm": 1.1614551544189453, "kl": 0.0465087890625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6632652878761292, "reward_std": 0.2888083755970001, "rewards/accuracy_reward": 0.6632653176784515, "rewards/format_reward": 1.0, "step": 495 }, { "completion_length": 171.6836700439453, "epoch": 0.04991194968553459, "grad_norm": 3.3588545322418213, "kl": 0.0653076171875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.582362413406372, "reward_std": 0.2278861626982689, "rewards/accuracy_reward": 0.5925664454698563, "rewards/format_reward": 0.9897959232330322, "step": 496 }, { "completion_length": 246.09183502197266, "epoch": 0.0500125786163522, "grad_norm": 0.9489980340003967, "kl": 0.0401611328125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.5870747566223145, "reward_std": 0.27718549966812134, "rewards/accuracy_reward": 0.597278892993927, "rewards/format_reward": 0.9897959232330322, "step": 497 }, { "completion_length": 246.72447967529297, "epoch": 0.05011320754716981, "grad_norm": 1.063209056854248, "kl": 0.0550537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.4817824363708496, "reward_std": 0.3242812603712082, "rewards/accuracy_reward": 0.5021905899047852, "rewards/format_reward": 0.9795918464660645, "step": 498 }, { "completion_length": 173.6938705444336, "epoch": 0.05021383647798742, "grad_norm": 1.8783752918243408, "kl": 0.0400390625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.4660099148750305, "reward_std": 0.22068942338228226, "rewards/accuracy_reward": 0.4762140363454819, "rewards/format_reward": 0.9897959232330322, "step": 499 }, { "completion_length": 229.55101776123047, "epoch": 0.050314465408805034, "grad_norm": 0.8152766227722168, "kl": 0.0406494140625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.5850340127944946, "reward_std": 0.18681053817272186, "rewards/accuracy_reward": 0.5850340127944946, "rewards/format_reward": 1.0, "step": 500 }, { "completion_length": 199.8775405883789, "epoch": 0.05041509433962264, "grad_norm": 0.8109144568443298, "kl": 0.0614013671875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5442176461219788, "reward_std": 0.22252143919467926, "rewards/accuracy_reward": 0.5646258145570755, "rewards/format_reward": 0.9795918166637421, "step": 501 }, { "completion_length": 198.61224365234375, "epoch": 0.05051572327044025, "grad_norm": 0.6794519424438477, "kl": 0.0531005859375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7943983674049377, "reward_std": 0.18503019213676453, "rewards/accuracy_reward": 0.8046024441719055, "rewards/format_reward": 0.9897959232330322, "step": 502 }, { "completion_length": 219.75509643554688, "epoch": 0.05061635220125786, "grad_norm": 1.1762524843215942, "kl": 0.0416259765625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5558839440345764, "reward_std": 0.3281470984220505, "rewards/accuracy_reward": 0.566088080406189, "rewards/format_reward": 0.9897959232330322, "step": 503 }, { "completion_length": 256.2142868041992, "epoch": 0.05071698113207547, "grad_norm": 0.7850807309150696, "kl": 0.0386962890625, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5461317896842957, "reward_std": 0.20396920293569565, "rewards/accuracy_reward": 0.5461318492889404, "rewards/format_reward": 1.0, "step": 504 }, { "completion_length": 180.448974609375, "epoch": 0.05081761006289308, "grad_norm": 0.7456995844841003, "kl": 0.03076171875, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.7237908244132996, "reward_std": 0.2033155970275402, "rewards/accuracy_reward": 0.7339949011802673, "rewards/format_reward": 0.9897959232330322, "step": 505 }, { "completion_length": 199.95917510986328, "epoch": 0.050918238993710695, "grad_norm": 1.826258897781372, "kl": 0.043212890625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7431262731552124, "reward_std": 0.2334059476852417, "rewards/accuracy_reward": 0.7431263029575348, "rewards/format_reward": 1.0, "step": 506 }, { "completion_length": 225.97958374023438, "epoch": 0.051018867924528304, "grad_norm": 0.6263205409049988, "kl": 0.0477294921875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.510235846042633, "reward_std": 0.15842144936323166, "rewards/accuracy_reward": 0.530643954873085, "rewards/format_reward": 0.9795918464660645, "step": 507 }, { "completion_length": 182.41836547851562, "epoch": 0.05111949685534591, "grad_norm": 1.2088226079940796, "kl": 0.0555419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5378968119621277, "reward_std": 0.15734181553125381, "rewards/accuracy_reward": 0.5378968119621277, "rewards/format_reward": 1.0, "step": 508 }, { "completion_length": 210.448974609375, "epoch": 0.05122012578616352, "grad_norm": 0.968625545501709, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6657422184944153, "reward_std": 0.2640323042869568, "rewards/accuracy_reward": 0.6759462952613831, "rewards/format_reward": 0.9897959232330322, "step": 509 }, { "completion_length": 154.55101776123047, "epoch": 0.05132075471698113, "grad_norm": 0.8448255062103271, "kl": 0.0521240234375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7651846408843994, "reward_std": 0.1660684458911419, "rewards/accuracy_reward": 0.7651846706867218, "rewards/format_reward": 1.0, "step": 510 }, { "completion_length": 275.63265228271484, "epoch": 0.05142138364779874, "grad_norm": 1.2814768552780151, "kl": 0.045654296875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.667161762714386, "reward_std": 0.3230176120996475, "rewards/accuracy_reward": 0.7181822061538696, "rewards/format_reward": 0.9489795565605164, "step": 511 }, { "completion_length": 279.34693908691406, "epoch": 0.051522012578616355, "grad_norm": 1.3378461599349976, "kl": 0.03717041015625, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5102040767669678, "reward_std": 0.28427888453006744, "rewards/accuracy_reward": 0.5204081535339355, "rewards/format_reward": 0.9897959232330322, "step": 512 }, { "completion_length": 183.68366241455078, "epoch": 0.051622641509433964, "grad_norm": 1.0557585954666138, "kl": 0.0526123046875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7675994634628296, "reward_std": 0.1680484488606453, "rewards/accuracy_reward": 0.7675994336605072, "rewards/format_reward": 1.0, "step": 513 }, { "completion_length": 282.6122360229492, "epoch": 0.05172327044025157, "grad_norm": 1.0541088581085205, "kl": 0.02740478515625, "learning_rate": 1e-06, "loss": 0.0011, "reward": 1.527103304862976, "reward_std": 0.29848776012659073, "rewards/accuracy_reward": 0.5577154755592346, "rewards/format_reward": 0.9693877398967743, "step": 514 }, { "completion_length": 167.948974609375, "epoch": 0.05182389937106918, "grad_norm": 1.7154979705810547, "kl": 0.0440673828125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7176870703697205, "reward_std": 0.24620123952627182, "rewards/accuracy_reward": 0.7176871001720428, "rewards/format_reward": 1.0, "step": 515 }, { "completion_length": 221.87754821777344, "epoch": 0.05192452830188679, "grad_norm": 0.9696309566497803, "kl": 0.0496826171875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.4820213913917542, "reward_std": 0.0943220667541027, "rewards/accuracy_reward": 0.5024295300245285, "rewards/format_reward": 0.9795918166637421, "step": 516 }, { "completion_length": 275.1122360229492, "epoch": 0.0520251572327044, "grad_norm": 0.640284538269043, "kl": 0.0374755859375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5345577597618103, "reward_std": 0.22652758657932281, "rewards/accuracy_reward": 0.5345577448606491, "rewards/format_reward": 1.0, "step": 517 }, { "completion_length": 205.27550506591797, "epoch": 0.052125786163522016, "grad_norm": 2.2810745239257812, "kl": 0.0478515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.706316888332367, "reward_std": 0.12587671726942062, "rewards/accuracy_reward": 0.7165209054946899, "rewards/format_reward": 0.9897959232330322, "step": 518 }, { "completion_length": 190.38775634765625, "epoch": 0.052226415094339625, "grad_norm": 1.146325945854187, "kl": 0.0419921875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6929057240486145, "reward_std": 0.2865114212036133, "rewards/accuracy_reward": 0.7031098008155823, "rewards/format_reward": 0.9897959232330322, "step": 519 }, { "completion_length": 181.8775405883789, "epoch": 0.052327044025157234, "grad_norm": 1.4205118417739868, "kl": 0.043212890625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.59040105342865, "reward_std": 0.2714220732450485, "rewards/accuracy_reward": 0.621013343334198, "rewards/format_reward": 0.9693877398967743, "step": 520 }, { "completion_length": 173.57142639160156, "epoch": 0.05242767295597484, "grad_norm": 0.9800917506217957, "kl": 0.0506591796875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7346938252449036, "reward_std": 0.22373328357934952, "rewards/accuracy_reward": 0.734693855047226, "rewards/format_reward": 1.0, "step": 521 }, { "completion_length": 277.29591369628906, "epoch": 0.05252830188679245, "grad_norm": 1.3761385679244995, "kl": 0.03759765625, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.4891332983970642, "reward_std": 0.2027726024389267, "rewards/accuracy_reward": 0.4993373900651932, "rewards/format_reward": 0.9897959232330322, "step": 522 }, { "completion_length": 205.9693832397461, "epoch": 0.05262893081761006, "grad_norm": 3.3362603187561035, "kl": 0.0474853515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5897958874702454, "reward_std": 0.282269611954689, "rewards/accuracy_reward": 0.6204081475734711, "rewards/format_reward": 0.9693877398967743, "step": 523 }, { "completion_length": 222.22447967529297, "epoch": 0.052729559748427676, "grad_norm": 0.865077793598175, "kl": 0.038330078125, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.7386142015457153, "reward_std": 0.20557411760091782, "rewards/accuracy_reward": 0.7386142313480377, "rewards/format_reward": 1.0, "step": 524 }, { "completion_length": 187.7551040649414, "epoch": 0.052830188679245285, "grad_norm": 0.732351541519165, "kl": 0.0528564453125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6632652878761292, "reward_std": 0.1783822700381279, "rewards/accuracy_reward": 0.6632653027772903, "rewards/format_reward": 1.0, "step": 525 }, { "completion_length": 269.40816497802734, "epoch": 0.052930817610062894, "grad_norm": 0.7266910076141357, "kl": 0.0439453125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.526484191417694, "reward_std": 0.20934822410345078, "rewards/accuracy_reward": 0.5468923896551132, "rewards/format_reward": 0.9795918166637421, "step": 526 }, { "completion_length": 343.4897918701172, "epoch": 0.0530314465408805, "grad_norm": 1.440658450126648, "kl": 0.03436279296875, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.384183645248413, "reward_std": 0.3953660875558853, "rewards/accuracy_reward": 0.42499998211860657, "rewards/format_reward": 0.9591836333274841, "step": 527 }, { "completion_length": 195.38774871826172, "epoch": 0.05313207547169811, "grad_norm": 2.524853229522705, "kl": 0.048095703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.623015820980072, "reward_std": 0.2949402630329132, "rewards/accuracy_reward": 0.6332199275493622, "rewards/format_reward": 0.9897959232330322, "step": 528 }, { "completion_length": 209.1836700439453, "epoch": 0.05323270440251572, "grad_norm": 1.022301435470581, "kl": 0.0516357421875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6658366918563843, "reward_std": 0.20583681762218475, "rewards/accuracy_reward": 0.6658366918563843, "rewards/format_reward": 1.0, "step": 529 }, { "completion_length": 221.14285278320312, "epoch": 0.05333333333333334, "grad_norm": 1.0251598358154297, "kl": 0.0421142578125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.71933913230896, "reward_std": 0.2981230840086937, "rewards/accuracy_reward": 0.7397473156452179, "rewards/format_reward": 0.9795918464660645, "step": 530 }, { "completion_length": 208.25509643554688, "epoch": 0.053433962264150946, "grad_norm": 0.9649690389633179, "kl": 0.0543212890625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6355834007263184, "reward_std": 0.23377200961112976, "rewards/accuracy_reward": 0.6559916138648987, "rewards/format_reward": 0.9795918464660645, "step": 531 }, { "completion_length": 233.7244873046875, "epoch": 0.053534591194968555, "grad_norm": 1.0756847858428955, "kl": 0.04345703125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5184549689292908, "reward_std": 0.3044579029083252, "rewards/accuracy_reward": 0.5184550732374191, "rewards/format_reward": 1.0, "step": 532 }, { "completion_length": 235.33673858642578, "epoch": 0.053635220125786164, "grad_norm": 1.1251142024993896, "kl": 0.04327392578125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6292484402656555, "reward_std": 0.24625862389802933, "rewards/accuracy_reward": 0.6394525170326233, "rewards/format_reward": 0.9897959232330322, "step": 533 }, { "completion_length": 154.79591369628906, "epoch": 0.05373584905660377, "grad_norm": 1.1178349256515503, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7983478903770447, "reward_std": 0.22111549228429794, "rewards/accuracy_reward": 0.7983478903770447, "rewards/format_reward": 1.0, "step": 534 }, { "completion_length": 250.84693145751953, "epoch": 0.05383647798742138, "grad_norm": 1.1718964576721191, "kl": 0.04638671875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6967929601669312, "reward_std": 0.25433704257011414, "rewards/accuracy_reward": 0.6967930197715759, "rewards/format_reward": 1.0, "step": 535 }, { "completion_length": 269.5408020019531, "epoch": 0.05393710691823899, "grad_norm": 2.3373630046844482, "kl": 0.0631103515625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7207286953926086, "reward_std": 0.3450760245323181, "rewards/accuracy_reward": 0.7309328019618988, "rewards/format_reward": 0.9897959232330322, "step": 536 }, { "completion_length": 257.62244415283203, "epoch": 0.054037735849056606, "grad_norm": 0.9943492412567139, "kl": 0.0340576171875, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.5054664015769958, "reward_std": 0.23456206917762756, "rewards/accuracy_reward": 0.5360787361860275, "rewards/format_reward": 0.9693877398967743, "step": 537 }, { "completion_length": 221.1938705444336, "epoch": 0.054138364779874215, "grad_norm": 1.1157132387161255, "kl": 0.045654296875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6224489212036133, "reward_std": 0.3176620602607727, "rewards/accuracy_reward": 0.6632653176784515, "rewards/format_reward": 0.9591836631298065, "step": 538 }, { "completion_length": 202.4897918701172, "epoch": 0.054238993710691824, "grad_norm": 1.1226099729537964, "kl": 0.0565185546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6635892391204834, "reward_std": 0.16721510887145996, "rewards/accuracy_reward": 0.6635892391204834, "rewards/format_reward": 1.0, "step": 539 }, { "completion_length": 208.4285659790039, "epoch": 0.05433962264150943, "grad_norm": 0.7350931763648987, "kl": 0.05712890625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6827311515808105, "reward_std": 0.27788205444812775, "rewards/accuracy_reward": 0.7031392753124237, "rewards/format_reward": 0.9795918464660645, "step": 540 }, { "completion_length": 226.36734771728516, "epoch": 0.05444025157232704, "grad_norm": 3.302635669708252, "kl": 0.0404052734375, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.7769923210144043, "reward_std": 0.33250879496335983, "rewards/accuracy_reward": 0.8178086280822754, "rewards/format_reward": 0.9591836333274841, "step": 541 }, { "completion_length": 274.5408172607422, "epoch": 0.05454088050314465, "grad_norm": 1.0871574878692627, "kl": 0.04046630859375, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.313015103340149, "reward_std": 0.3481514900922775, "rewards/accuracy_reward": 0.364035502076149, "rewards/format_reward": 0.9489795565605164, "step": 542 }, { "completion_length": 242.85714721679688, "epoch": 0.05464150943396227, "grad_norm": 1.7596056461334229, "kl": 0.0391845703125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6523322463035583, "reward_std": 0.12694789096713066, "rewards/accuracy_reward": 0.6523323357105255, "rewards/format_reward": 1.0, "step": 543 }, { "completion_length": 235.1224365234375, "epoch": 0.054742138364779876, "grad_norm": 1.1691126823425293, "kl": 0.04248046875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7346938252449036, "reward_std": 0.2843813896179199, "rewards/accuracy_reward": 0.734693855047226, "rewards/format_reward": 1.0, "step": 544 }, { "completion_length": 242.77549743652344, "epoch": 0.054842767295597485, "grad_norm": 1.6135002374649048, "kl": 0.03466796875, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.6922816038131714, "reward_std": 0.3004837706685066, "rewards/accuracy_reward": 0.7126897871494293, "rewards/format_reward": 0.9795918166637421, "step": 545 }, { "completion_length": 219.6326446533203, "epoch": 0.054943396226415094, "grad_norm": 1.083217978477478, "kl": 0.0399169921875, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.395146906375885, "reward_std": 0.1809234321117401, "rewards/accuracy_reward": 0.41555511951446533, "rewards/format_reward": 0.9795918166637421, "step": 546 }, { "completion_length": 226.91836547851562, "epoch": 0.0550440251572327, "grad_norm": 1.6989130973815918, "kl": 0.037353515625, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.582652986049652, "reward_std": 0.344615638256073, "rewards/accuracy_reward": 0.6234693825244904, "rewards/format_reward": 0.9591836333274841, "step": 547 }, { "completion_length": 251.89795684814453, "epoch": 0.05514465408805031, "grad_norm": 1.218298316001892, "kl": 0.030517578125, "learning_rate": 1e-06, "loss": 0.0012, "reward": 1.6326655745506287, "reward_std": 0.3266024589538574, "rewards/accuracy_reward": 0.6836859881877899, "rewards/format_reward": 0.9489795863628387, "step": 548 }, { "completion_length": 247.07142639160156, "epoch": 0.05524528301886793, "grad_norm": 1.2621062994003296, "kl": 0.0418701171875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.4387754797935486, "reward_std": 0.32581567764282227, "rewards/accuracy_reward": 0.4591836780309677, "rewards/format_reward": 0.9795918166637421, "step": 549 }, { "completion_length": 150.5510139465332, "epoch": 0.055345911949685536, "grad_norm": 1.5702775716781616, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7738094925880432, "reward_std": 0.1647867038846016, "rewards/accuracy_reward": 0.7738094925880432, "rewards/format_reward": 1.0, "step": 550 }, { "completion_length": 332.1632537841797, "epoch": 0.055446540880503145, "grad_norm": 0.7747080326080322, "kl": 0.035400390625, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.4876946210861206, "reward_std": 0.25005778670310974, "rewards/accuracy_reward": 0.5081027299165726, "rewards/format_reward": 0.9795918166637421, "step": 551 }, { "completion_length": 197.57141876220703, "epoch": 0.055547169811320754, "grad_norm": 1.3340266942977905, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7077097296714783, "reward_std": 0.18113499879837036, "rewards/accuracy_reward": 0.7077097296714783, "rewards/format_reward": 1.0, "step": 552 }, { "completion_length": 205.6836700439453, "epoch": 0.05564779874213836, "grad_norm": 1.1890196800231934, "kl": 0.0445556640625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.839002251625061, "reward_std": 0.18106478452682495, "rewards/accuracy_reward": 0.839002251625061, "rewards/format_reward": 1.0, "step": 553 }, { "completion_length": 226.5714340209961, "epoch": 0.05574842767295597, "grad_norm": 0.8524710536003113, "kl": 0.0478515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7332912683486938, "reward_std": 0.25241950154304504, "rewards/accuracy_reward": 0.7639034688472748, "rewards/format_reward": 0.9693877398967743, "step": 554 }, { "completion_length": 276.11224365234375, "epoch": 0.05584905660377359, "grad_norm": 0.9280441403388977, "kl": 0.0330810546875, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.4101189970970154, "reward_std": 0.3651117533445358, "rewards/accuracy_reward": 0.4407312721014023, "rewards/format_reward": 0.9693877398967743, "step": 555 }, { "completion_length": 240.7653045654297, "epoch": 0.0559496855345912, "grad_norm": 1.1500756740570068, "kl": 0.0416259765625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7303206324577332, "reward_std": 0.25803717225790024, "rewards/accuracy_reward": 0.7507288157939911, "rewards/format_reward": 0.9795918464660645, "step": 556 }, { "completion_length": 193.12245178222656, "epoch": 0.056050314465408806, "grad_norm": 1.1187970638275146, "kl": 0.0513916015625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6548280715942383, "reward_std": 0.2521967589855194, "rewards/accuracy_reward": 0.6650322079658508, "rewards/format_reward": 0.9897959232330322, "step": 557 }, { "completion_length": 209.65306091308594, "epoch": 0.056150943396226415, "grad_norm": 0.8743786215782166, "kl": 0.02874755859375, "learning_rate": 1e-06, "loss": 0.0011, "reward": 1.6612962484359741, "reward_std": 0.25901348143815994, "rewards/accuracy_reward": 0.6612962782382965, "rewards/format_reward": 1.0, "step": 558 }, { "completion_length": 152.51020050048828, "epoch": 0.056251572327044023, "grad_norm": 1.645678162574768, "kl": 0.0523681640625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6417086124420166, "reward_std": 0.28117550909519196, "rewards/accuracy_reward": 0.641708642244339, "rewards/format_reward": 1.0, "step": 559 }, { "completion_length": 182.11223602294922, "epoch": 0.05635220125786163, "grad_norm": 1.3571704626083374, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7024754285812378, "reward_std": 0.27022770792245865, "rewards/accuracy_reward": 0.7432917058467865, "rewards/format_reward": 0.9591836333274841, "step": 560 }, { "completion_length": 308.5816345214844, "epoch": 0.05645283018867925, "grad_norm": 0.8710061311721802, "kl": 0.0355224609375, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.5209750533103943, "reward_std": 0.27834688127040863, "rewards/accuracy_reward": 0.5209750533103943, "rewards/format_reward": 1.0, "step": 561 }, { "completion_length": 210.6530532836914, "epoch": 0.05655345911949686, "grad_norm": 1.5166618824005127, "kl": 0.046630859375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6253079175949097, "reward_std": 0.22926808893680573, "rewards/accuracy_reward": 0.6355120241641998, "rewards/format_reward": 0.9897959232330322, "step": 562 }, { "completion_length": 186.6938705444336, "epoch": 0.056654088050314466, "grad_norm": 1.2781797647476196, "kl": 0.0601806640625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6270650029182434, "reward_std": 0.15618659555912018, "rewards/accuracy_reward": 0.6270650923252106, "rewards/format_reward": 1.0, "step": 563 }, { "completion_length": 239.19387817382812, "epoch": 0.056754716981132075, "grad_norm": 1.1773837804794312, "kl": 0.059814453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5015924572944641, "reward_std": 0.1904071867465973, "rewards/accuracy_reward": 0.5117966532707214, "rewards/format_reward": 0.9897959232330322, "step": 564 }, { "completion_length": 219.04080963134766, "epoch": 0.056855345911949684, "grad_norm": 0.9522802829742432, "kl": 0.0518798828125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.739552915096283, "reward_std": 0.2137538120150566, "rewards/accuracy_reward": 0.7497570216655731, "rewards/format_reward": 0.9897959232330322, "step": 565 }, { "completion_length": 160.4693832397461, "epoch": 0.05695597484276729, "grad_norm": 1.6391141414642334, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6575091481208801, "reward_std": 0.17904648184776306, "rewards/accuracy_reward": 0.6677131652832031, "rewards/format_reward": 0.9897959232330322, "step": 566 }, { "completion_length": 179.53060913085938, "epoch": 0.05705660377358491, "grad_norm": 1.3023920059204102, "kl": 0.052734375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.646258533000946, "reward_std": 0.22601937502622604, "rewards/accuracy_reward": 0.656462550163269, "rewards/format_reward": 0.9897959232330322, "step": 567 }, { "completion_length": 289.1428451538086, "epoch": 0.05715723270440252, "grad_norm": 2.0223891735076904, "kl": 0.0509033203125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.4031230211257935, "reward_std": 0.3996752202510834, "rewards/accuracy_reward": 0.413327157497406, "rewards/format_reward": 0.9897959232330322, "step": 568 }, { "completion_length": 263.1530532836914, "epoch": 0.05725786163522013, "grad_norm": 0.9342160820960999, "kl": 0.0465087890625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6114861965179443, "reward_std": 0.3957188278436661, "rewards/accuracy_reward": 0.6523025631904602, "rewards/format_reward": 0.9591836631298065, "step": 569 }, { "completion_length": 270.1428527832031, "epoch": 0.057358490566037736, "grad_norm": 0.8424357175827026, "kl": 0.050048828125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6562776565551758, "reward_std": 0.2624949514865875, "rewards/accuracy_reward": 0.6868899613618851, "rewards/format_reward": 0.9693877398967743, "step": 570 }, { "completion_length": 298.9387664794922, "epoch": 0.057459119496855345, "grad_norm": 0.45346108078956604, "kl": 0.0667724609375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.537012755870819, "reward_std": 0.16883019357919693, "rewards/accuracy_reward": 0.5676249861717224, "rewards/format_reward": 0.9693877398967743, "step": 571 }, { "completion_length": 231.51020050048828, "epoch": 0.05755974842767295, "grad_norm": 1.8941878080368042, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6280148029327393, "reward_std": 0.24248303472995758, "rewards/accuracy_reward": 0.6382189393043518, "rewards/format_reward": 0.9897959232330322, "step": 572 }, { "completion_length": 256.06121826171875, "epoch": 0.05766037735849057, "grad_norm": 0.8801611661911011, "kl": 0.062255859375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6199420094490051, "reward_std": 0.2569033354520798, "rewards/accuracy_reward": 0.6301460415124893, "rewards/format_reward": 0.9897959232330322, "step": 573 }, { "completion_length": 238.77550506591797, "epoch": 0.05776100628930818, "grad_norm": 1.3415331840515137, "kl": 0.0533447265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5866982340812683, "reward_std": 0.14988253265619278, "rewards/accuracy_reward": 0.5866982340812683, "rewards/format_reward": 1.0, "step": 574 }, { "completion_length": 197.3571319580078, "epoch": 0.05786163522012579, "grad_norm": 0.7254865169525146, "kl": 0.0572509765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7527520060539246, "reward_std": 0.23291490226984024, "rewards/accuracy_reward": 0.752752035856247, "rewards/format_reward": 1.0, "step": 575 }, { "completion_length": 241.63265228271484, "epoch": 0.057962264150943396, "grad_norm": 0.7909625768661499, "kl": 0.0435791015625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6781843900680542, "reward_std": 0.2278912588953972, "rewards/accuracy_reward": 0.6985925436019897, "rewards/format_reward": 0.9795918464660645, "step": 576 }, { "completion_length": 204.88775634765625, "epoch": 0.058062893081761005, "grad_norm": 0.7977965474128723, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.715249478816986, "reward_std": 0.24112117290496826, "rewards/accuracy_reward": 0.7254535257816315, "rewards/format_reward": 0.9897959232330322, "step": 577 }, { "completion_length": 258.8775405883789, "epoch": 0.058163522012578614, "grad_norm": 0.748950719833374, "kl": 0.0404052734375, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.5416477918624878, "reward_std": 0.29696860909461975, "rewards/accuracy_reward": 0.5824640691280365, "rewards/format_reward": 0.9591836333274841, "step": 578 }, { "completion_length": 199.41836547851562, "epoch": 0.05826415094339623, "grad_norm": 0.7269613146781921, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6816635131835938, "reward_std": 0.18900080025196075, "rewards/accuracy_reward": 0.6816635727882385, "rewards/format_reward": 1.0, "step": 579 }, { "completion_length": 215.35713958740234, "epoch": 0.05836477987421384, "grad_norm": 0.9984932541847229, "kl": 0.05908203125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6141663789749146, "reward_std": 0.21663975715637207, "rewards/accuracy_reward": 0.6345745921134949, "rewards/format_reward": 0.9795918166637421, "step": 580 }, { "completion_length": 205.63265228271484, "epoch": 0.05846540880503145, "grad_norm": 1.4841222763061523, "kl": 0.0421142578125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.589726984500885, "reward_std": 0.19491031765937805, "rewards/accuracy_reward": 0.5897270143032074, "rewards/format_reward": 1.0, "step": 581 }, { "completion_length": 217.01020050048828, "epoch": 0.05856603773584906, "grad_norm": 0.9081935882568359, "kl": 0.0418701171875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.4468334913253784, "reward_std": 0.28951554000377655, "rewards/accuracy_reward": 0.46724164485931396, "rewards/format_reward": 0.9795918464660645, "step": 582 }, { "completion_length": 205.25509643554688, "epoch": 0.058666666666666666, "grad_norm": 1.3216997385025024, "kl": 0.045166015625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.613227665424347, "reward_std": 0.17805609107017517, "rewards/accuracy_reward": 0.6234317421913147, "rewards/format_reward": 0.9897959232330322, "step": 583 }, { "completion_length": 252.38775634765625, "epoch": 0.058767295597484275, "grad_norm": 0.871753990650177, "kl": 0.0516357421875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6499459147453308, "reward_std": 0.19684380292892456, "rewards/accuracy_reward": 0.6601499319076538, "rewards/format_reward": 0.9897959232330322, "step": 584 }, { "completion_length": 196.16326141357422, "epoch": 0.05886792452830188, "grad_norm": 1.7882424592971802, "kl": 0.044677734375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7351797223091125, "reward_std": 0.2804352566599846, "rewards/accuracy_reward": 0.7453838288784027, "rewards/format_reward": 0.9897959232330322, "step": 585 }, { "completion_length": 332.39794921875, "epoch": 0.0589685534591195, "grad_norm": 0.7408169507980347, "kl": 0.0379638671875, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5586734414100647, "reward_std": 0.2581512853503227, "rewards/accuracy_reward": 0.5586734712123871, "rewards/format_reward": 1.0, "step": 586 }, { "completion_length": 264.8163299560547, "epoch": 0.05906918238993711, "grad_norm": 0.7079954743385315, "kl": 0.046630859375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7282062768936157, "reward_std": 0.22300201654434204, "rewards/accuracy_reward": 0.7282063364982605, "rewards/format_reward": 1.0, "step": 587 }, { "completion_length": 130.44897842407227, "epoch": 0.05916981132075472, "grad_norm": 3.2323055267333984, "kl": 0.0556640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8510203957557678, "reward_std": 0.12418832257390022, "rewards/accuracy_reward": 0.8510203957557678, "rewards/format_reward": 1.0, "step": 588 }, { "completion_length": 199.63265228271484, "epoch": 0.059270440251572326, "grad_norm": 2.3962202072143555, "kl": 0.062744140625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7465985417366028, "reward_std": 0.3162122815847397, "rewards/accuracy_reward": 0.7465986013412476, "rewards/format_reward": 1.0, "step": 589 }, { "completion_length": 170.2959213256836, "epoch": 0.059371069182389935, "grad_norm": 1.7629530429840088, "kl": 0.05712890625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6576772928237915, "reward_std": 0.3037860542535782, "rewards/accuracy_reward": 0.7189018428325653, "rewards/format_reward": 0.938775509595871, "step": 590 }, { "completion_length": 152.30612182617188, "epoch": 0.059471698113207544, "grad_norm": 1.2676259279251099, "kl": 0.0670166015625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8166007995605469, "reward_std": 0.18901799619197845, "rewards/accuracy_reward": 0.8166008591651917, "rewards/format_reward": 1.0, "step": 591 }, { "completion_length": 257.9591751098633, "epoch": 0.05957232704402516, "grad_norm": 1.0549476146697998, "kl": 0.0430908203125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6738095879554749, "reward_std": 0.2828284353017807, "rewards/accuracy_reward": 0.6738095283508301, "rewards/format_reward": 1.0, "step": 592 }, { "completion_length": 236.61223602294922, "epoch": 0.05967295597484277, "grad_norm": 2.913694381713867, "kl": 0.03179931640625, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.6229590773582458, "reward_std": 0.34625913202762604, "rewards/accuracy_reward": 0.6331633031368256, "rewards/format_reward": 0.9897959232330322, "step": 593 }, { "completion_length": 237.7551040649414, "epoch": 0.05977358490566038, "grad_norm": 1.352452039718628, "kl": 0.04638671875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5494871735572815, "reward_std": 0.316531777381897, "rewards/accuracy_reward": 0.5698954164981842, "rewards/format_reward": 0.9795918166637421, "step": 594 }, { "completion_length": 245.9897918701172, "epoch": 0.05987421383647799, "grad_norm": 0.6939805150032043, "kl": 0.049072265625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5485251545906067, "reward_std": 0.17666541785001755, "rewards/accuracy_reward": 0.558729276061058, "rewards/format_reward": 0.9897959232330322, "step": 595 }, { "completion_length": 210.4081573486328, "epoch": 0.059974842767295596, "grad_norm": 1.0405529737472534, "kl": 0.0548095703125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.53316330909729, "reward_std": 0.21207620948553085, "rewards/accuracy_reward": 0.5331632643938065, "rewards/format_reward": 1.0, "step": 596 }, { "completion_length": 151.68366622924805, "epoch": 0.060075471698113204, "grad_norm": 1.300233006477356, "kl": 0.052490234375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.787569522857666, "reward_std": 0.2710425332188606, "rewards/accuracy_reward": 0.7977736294269562, "rewards/format_reward": 0.9897959232330322, "step": 597 }, { "completion_length": 175.41836547851562, "epoch": 0.06017610062893082, "grad_norm": 0.7575361132621765, "kl": 0.04345703125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7448979020118713, "reward_std": 0.18887970596551895, "rewards/accuracy_reward": 0.7448979318141937, "rewards/format_reward": 1.0, "step": 598 }, { "completion_length": 214.6836700439453, "epoch": 0.06027672955974843, "grad_norm": 7.225508689880371, "kl": 0.053955078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.638893485069275, "reward_std": 0.2509370967745781, "rewards/accuracy_reward": 0.6695058047771454, "rewards/format_reward": 0.9693877398967743, "step": 599 }, { "completion_length": 167.38774871826172, "epoch": 0.06037735849056604, "grad_norm": 1.0116561651229858, "kl": 0.056396484375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.795918345451355, "reward_std": 0.20006242021918297, "rewards/accuracy_reward": 0.8061224520206451, "rewards/format_reward": 0.9897959232330322, "step": 600 }, { "completion_length": 276.28570556640625, "epoch": 0.06047798742138365, "grad_norm": 0.8444428443908691, "kl": 0.045654296875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.5363326668739319, "reward_std": 0.26247961819171906, "rewards/accuracy_reward": 0.5465367883443832, "rewards/format_reward": 0.9897959232330322, "step": 601 }, { "completion_length": 238.30612182617188, "epoch": 0.060578616352201256, "grad_norm": 0.7837564945220947, "kl": 0.0426025390625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6882195472717285, "reward_std": 0.24806922674179077, "rewards/accuracy_reward": 0.7086276710033417, "rewards/format_reward": 0.9795918464660645, "step": 602 }, { "completion_length": 161.4795913696289, "epoch": 0.060679245283018865, "grad_norm": 1.1902313232421875, "kl": 0.0601806640625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8167282938957214, "reward_std": 0.1769126020371914, "rewards/accuracy_reward": 0.8269323706626892, "rewards/format_reward": 0.9897959232330322, "step": 603 }, { "completion_length": 175.3775405883789, "epoch": 0.06077987421383648, "grad_norm": 0.6536528468132019, "kl": 0.0489501953125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.8226289749145508, "reward_std": 0.17855096980929375, "rewards/accuracy_reward": 0.8226290047168732, "rewards/format_reward": 1.0, "step": 604 }, { "completion_length": 224.26529693603516, "epoch": 0.06088050314465409, "grad_norm": 0.7453655004501343, "kl": 0.0384521484375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.7088594436645508, "reward_std": 0.1642460934817791, "rewards/accuracy_reward": 0.7088595032691956, "rewards/format_reward": 1.0, "step": 605 }, { "completion_length": 157.73468780517578, "epoch": 0.0609811320754717, "grad_norm": 1.2026736736297607, "kl": 0.0516357421875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6609116792678833, "reward_std": 0.15215105563402176, "rewards/accuracy_reward": 0.6609116941690445, "rewards/format_reward": 1.0, "step": 606 }, { "completion_length": 251.95917510986328, "epoch": 0.06108176100628931, "grad_norm": 1.3836373090744019, "kl": 0.04541015625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6409488320350647, "reward_std": 0.29800087213516235, "rewards/accuracy_reward": 0.6511529088020325, "rewards/format_reward": 0.9897959232330322, "step": 607 }, { "completion_length": 219.1938705444336, "epoch": 0.06118238993710692, "grad_norm": 0.7695648074150085, "kl": 0.0439453125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6292517185211182, "reward_std": 0.18534985929727554, "rewards/accuracy_reward": 0.6496598720550537, "rewards/format_reward": 0.9795918166637421, "step": 608 }, { "completion_length": 179.45917892456055, "epoch": 0.061283018867924526, "grad_norm": 1.5296813249588013, "kl": 0.0606689453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6717007756233215, "reward_std": 0.22948335111141205, "rewards/accuracy_reward": 0.6819049119949341, "rewards/format_reward": 0.9897959232330322, "step": 609 }, { "completion_length": 189.47958374023438, "epoch": 0.06138364779874214, "grad_norm": 0.9325540065765381, "kl": 0.043701171875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5545674562454224, "reward_std": 0.21840369701385498, "rewards/accuracy_reward": 0.5647715926170349, "rewards/format_reward": 0.9897959232330322, "step": 610 }, { "completion_length": 212.05101776123047, "epoch": 0.06148427672955975, "grad_norm": 1.1194623708724976, "kl": 0.0526123046875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7356964349746704, "reward_std": 0.19415920227766037, "rewards/accuracy_reward": 0.745900422334671, "rewards/format_reward": 0.9897959232330322, "step": 611 }, { "completion_length": 210.69387817382812, "epoch": 0.06158490566037736, "grad_norm": 0.9515805244445801, "kl": 0.04681396484375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.548861801624298, "reward_std": 0.1542324721813202, "rewards/accuracy_reward": 0.5488617271184921, "rewards/format_reward": 1.0, "step": 612 }, { "completion_length": 231.06121826171875, "epoch": 0.06168553459119497, "grad_norm": 2.0208516120910645, "kl": 0.0416259765625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6690961718559265, "reward_std": 0.2998085170984268, "rewards/accuracy_reward": 0.6895043551921844, "rewards/format_reward": 0.9795918464660645, "step": 613 }, { "completion_length": 186.52040100097656, "epoch": 0.06178616352201258, "grad_norm": 0.9749804735183716, "kl": 0.0438232421875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.825375497341156, "reward_std": 0.1694789007306099, "rewards/accuracy_reward": 0.8253755867481232, "rewards/format_reward": 1.0, "step": 614 }, { "completion_length": 222.4285659790039, "epoch": 0.061886792452830186, "grad_norm": 1.291024923324585, "kl": 0.056884765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.574343979358673, "reward_std": 0.29838877171278, "rewards/accuracy_reward": 0.5845480561256409, "rewards/format_reward": 0.9897959232330322, "step": 615 }, { "completion_length": 252.6326446533203, "epoch": 0.0619874213836478, "grad_norm": 0.8187521696090698, "kl": 0.0380859375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5452787280082703, "reward_std": 0.28779877722263336, "rewards/accuracy_reward": 0.5554827749729156, "rewards/format_reward": 0.9897959232330322, "step": 616 }, { "completion_length": 262.05101013183594, "epoch": 0.06208805031446541, "grad_norm": 0.8906151056289673, "kl": 0.0479736328125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.549106776714325, "reward_std": 0.2286851368844509, "rewards/accuracy_reward": 0.5797191560268402, "rewards/format_reward": 0.9693877398967743, "step": 617 }, { "completion_length": 164.10203552246094, "epoch": 0.06218867924528302, "grad_norm": 0.75602126121521, "kl": 0.042724609375, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6836734414100647, "reward_std": 0.1652088463306427, "rewards/accuracy_reward": 0.7040816247463226, "rewards/format_reward": 0.9795918166637421, "step": 618 }, { "completion_length": 215.16326141357422, "epoch": 0.06228930817610063, "grad_norm": 1.1629854440689087, "kl": 0.0477294921875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6293487548828125, "reward_std": 0.16499963402748108, "rewards/accuracy_reward": 0.6497570276260376, "rewards/format_reward": 0.9795918166637421, "step": 619 }, { "completion_length": 246.4081573486328, "epoch": 0.06238993710691824, "grad_norm": 1.3104926347732544, "kl": 0.045166015625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.5427643656730652, "reward_std": 0.19909356534481049, "rewards/accuracy_reward": 0.5631725192070007, "rewards/format_reward": 0.9795918464660645, "step": 620 }, { "completion_length": 127.37754440307617, "epoch": 0.06249056603773585, "grad_norm": 0.8751658201217651, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8407907485961914, "reward_std": 0.1611941158771515, "rewards/accuracy_reward": 0.8509949147701263, "rewards/format_reward": 0.9897959232330322, "step": 621 }, { "completion_length": 223.94898223876953, "epoch": 0.06259119496855346, "grad_norm": 1.4346580505371094, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5926057696342468, "reward_std": 0.26968298852443695, "rewards/accuracy_reward": 0.592605784535408, "rewards/format_reward": 1.0, "step": 622 }, { "completion_length": 228.60204315185547, "epoch": 0.06269182389937107, "grad_norm": 1.1447936296463013, "kl": 0.0614013671875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6122448444366455, "reward_std": 0.23954802751541138, "rewards/accuracy_reward": 0.6224489808082581, "rewards/format_reward": 0.9897959232330322, "step": 623 }, { "completion_length": 207.5, "epoch": 0.06279245283018868, "grad_norm": 1.1250125169754028, "kl": 0.03564453125, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.5654228329658508, "reward_std": 0.2577954903244972, "rewards/accuracy_reward": 0.5654228925704956, "rewards/format_reward": 1.0, "step": 624 }, { "completion_length": 150.1836700439453, "epoch": 0.06289308176100629, "grad_norm": 3.098036289215088, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7304988503456116, "reward_std": 0.2036053091287613, "rewards/accuracy_reward": 0.7304988503456116, "rewards/format_reward": 1.0, "step": 625 }, { "completion_length": 212.03060913085938, "epoch": 0.0629937106918239, "grad_norm": 1.129137635231018, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5876646637916565, "reward_std": 0.2768736183643341, "rewards/accuracy_reward": 0.5876646637916565, "rewards/format_reward": 1.0, "step": 626 }, { "completion_length": 188.1836700439453, "epoch": 0.06309433962264151, "grad_norm": 0.9160040616989136, "kl": 0.048095703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.71683669090271, "reward_std": 0.17455783858895302, "rewards/accuracy_reward": 0.7168367207050323, "rewards/format_reward": 1.0, "step": 627 }, { "completion_length": 203.32652282714844, "epoch": 0.06319496855345912, "grad_norm": 0.8473353385925293, "kl": 0.0496826171875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6679264903068542, "reward_std": 0.22440577298402786, "rewards/accuracy_reward": 0.6679264903068542, "rewards/format_reward": 1.0, "step": 628 }, { "completion_length": 255.82652282714844, "epoch": 0.06329559748427672, "grad_norm": 1.5553635358810425, "kl": 0.0477294921875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5698785185813904, "reward_std": 0.26472947001457214, "rewards/accuracy_reward": 0.5698785185813904, "rewards/format_reward": 1.0, "step": 629 }, { "completion_length": 221.9081573486328, "epoch": 0.06339622641509433, "grad_norm": 1.9211862087249756, "kl": 0.10205078125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.634750485420227, "reward_std": 0.163607906550169, "rewards/accuracy_reward": 0.644954651594162, "rewards/format_reward": 0.9897959232330322, "step": 630 }, { "completion_length": 262.67346954345703, "epoch": 0.06349685534591194, "grad_norm": 1.429965853691101, "kl": 0.0555419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5280147790908813, "reward_std": 0.3097486048936844, "rewards/accuracy_reward": 0.5484230071306229, "rewards/format_reward": 0.9795918464660645, "step": 631 }, { "completion_length": 212.83673095703125, "epoch": 0.06359748427672957, "grad_norm": 3.0430831909179688, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7020943760871887, "reward_std": 0.17621804773807526, "rewards/accuracy_reward": 0.7122985124588013, "rewards/format_reward": 0.9897959232330322, "step": 632 }, { "completion_length": 226.78570556640625, "epoch": 0.06369811320754717, "grad_norm": 2.9523470401763916, "kl": 0.042236328125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.656210720539093, "reward_std": 0.2518479749560356, "rewards/accuracy_reward": 0.6562106609344482, "rewards/format_reward": 1.0, "step": 633 }, { "completion_length": 245.26529693603516, "epoch": 0.06379874213836478, "grad_norm": 0.8441362977027893, "kl": 0.04638671875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5714285373687744, "reward_std": 0.23038646578788757, "rewards/accuracy_reward": 0.5714285522699356, "rewards/format_reward": 1.0, "step": 634 }, { "completion_length": 143.83673477172852, "epoch": 0.06389937106918239, "grad_norm": 3.2597835063934326, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7325624227523804, "reward_std": 0.18637292087078094, "rewards/accuracy_reward": 0.732562392950058, "rewards/format_reward": 1.0, "step": 635 }, { "completion_length": 286.8571472167969, "epoch": 0.064, "grad_norm": 0.9820572733879089, "kl": 0.0469970703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.584183692932129, "reward_std": 0.27758076041936874, "rewards/accuracy_reward": 0.6045918315649033, "rewards/format_reward": 0.9795918464660645, "step": 636 }, { "completion_length": 212.83673095703125, "epoch": 0.06410062893081761, "grad_norm": 1.1416425704956055, "kl": 0.059326171875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7414860129356384, "reward_std": 0.190762497484684, "rewards/accuracy_reward": 0.7414860129356384, "rewards/format_reward": 1.0, "step": 637 }, { "completion_length": 283.1428527832031, "epoch": 0.06420125786163522, "grad_norm": 1.020917296409607, "kl": 0.05126953125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5656772255897522, "reward_std": 0.2725513428449631, "rewards/accuracy_reward": 0.586085319519043, "rewards/format_reward": 0.9795918464660645, "step": 638 }, { "completion_length": 239.1326446533203, "epoch": 0.06430188679245283, "grad_norm": 1.3747550249099731, "kl": 0.048095703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5609569549560547, "reward_std": 0.11091002821922302, "rewards/accuracy_reward": 0.5609569549560547, "rewards/format_reward": 1.0, "step": 639 }, { "completion_length": 266.6122360229492, "epoch": 0.06440251572327044, "grad_norm": 0.73558109998703, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5827149152755737, "reward_std": 0.24642712622880936, "rewards/accuracy_reward": 0.5929190069437027, "rewards/format_reward": 0.9897959232330322, "step": 640 }, { "completion_length": 229.54080963134766, "epoch": 0.06450314465408805, "grad_norm": 0.9223865270614624, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5966553092002869, "reward_std": 0.28493088483810425, "rewards/accuracy_reward": 0.6170634627342224, "rewards/format_reward": 0.9795918166637421, "step": 641 }, { "completion_length": 229.5, "epoch": 0.06460377358490565, "grad_norm": 1.6562703847885132, "kl": 0.046630859375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.574691653251648, "reward_std": 0.28560158610343933, "rewards/accuracy_reward": 0.574691653251648, "rewards/format_reward": 1.0, "step": 642 }, { "completion_length": 218.55101776123047, "epoch": 0.06470440251572326, "grad_norm": 3.8347489833831787, "kl": 0.03515625, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.7767394185066223, "reward_std": 0.18256894126534462, "rewards/accuracy_reward": 0.7767394185066223, "rewards/format_reward": 1.0, "step": 643 }, { "completion_length": 210.48978424072266, "epoch": 0.06480503144654089, "grad_norm": 0.9217144250869751, "kl": 0.051025390625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6804340481758118, "reward_std": 0.172087199985981, "rewards/accuracy_reward": 0.6906381249427795, "rewards/format_reward": 0.9897959232330322, "step": 644 }, { "completion_length": 188.67346954345703, "epoch": 0.0649056603773585, "grad_norm": 0.9189055562019348, "kl": 0.062255859375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7040815949440002, "reward_std": 0.20791853964328766, "rewards/accuracy_reward": 0.7142857015132904, "rewards/format_reward": 0.9897959232330322, "step": 645 }, { "completion_length": 241.3775405883789, "epoch": 0.0650062893081761, "grad_norm": 1.344831943511963, "kl": 0.115478515625, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.5520716905593872, "reward_std": 0.20721201598644257, "rewards/accuracy_reward": 0.5622758120298386, "rewards/format_reward": 0.9897959232330322, "step": 646 }, { "completion_length": 227.2653045654297, "epoch": 0.06510691823899371, "grad_norm": 1.2658963203430176, "kl": 0.0472412109375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5416909456253052, "reward_std": 0.16115843504667282, "rewards/accuracy_reward": 0.5416909605264664, "rewards/format_reward": 1.0, "step": 647 }, { "completion_length": 180.28570556640625, "epoch": 0.06520754716981132, "grad_norm": 3.6243181228637695, "kl": 0.047607421875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.572741448879242, "reward_std": 0.10787662491202354, "rewards/accuracy_reward": 0.5727413594722748, "rewards/format_reward": 1.0, "step": 648 }, { "completion_length": 207.7142791748047, "epoch": 0.06530817610062893, "grad_norm": 1.3266823291778564, "kl": 0.052490234375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7338435649871826, "reward_std": 0.23492302745580673, "rewards/accuracy_reward": 0.7440475821495056, "rewards/format_reward": 0.9897959232330322, "step": 649 }, { "completion_length": 224.14285278320312, "epoch": 0.06540880503144654, "grad_norm": 0.770022451877594, "kl": 0.0400390625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6631957292556763, "reward_std": 0.1185486949980259, "rewards/accuracy_reward": 0.6631958186626434, "rewards/format_reward": 1.0, "step": 650 }, { "completion_length": 151.5408172607422, "epoch": 0.06550943396226415, "grad_norm": 1.1983436346054077, "kl": 0.04833984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7897310853004456, "reward_std": 0.2330954521894455, "rewards/accuracy_reward": 0.7897311449050903, "rewards/format_reward": 1.0, "step": 651 }, { "completion_length": 246.4081573486328, "epoch": 0.06561006289308176, "grad_norm": 1.2567065954208374, "kl": 0.0625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.458132266998291, "reward_std": 0.3051834851503372, "rewards/accuracy_reward": 0.4989486485719681, "rewards/format_reward": 0.9591836333274841, "step": 652 }, { "completion_length": 254.73468780517578, "epoch": 0.06571069182389937, "grad_norm": 0.8115875720977783, "kl": 0.0428466796875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.4803427457809448, "reward_std": 0.28078388422727585, "rewards/accuracy_reward": 0.5109550058841705, "rewards/format_reward": 0.9693877398967743, "step": 653 }, { "completion_length": 266.8163299560547, "epoch": 0.06581132075471698, "grad_norm": 1.3023743629455566, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5285714268684387, "reward_std": 0.3368034213781357, "rewards/accuracy_reward": 0.5489795804023743, "rewards/format_reward": 0.9795918464660645, "step": 654 }, { "completion_length": 231.75509643554688, "epoch": 0.06591194968553458, "grad_norm": 1.428227424621582, "kl": 0.052001953125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5910459756851196, "reward_std": 0.2619079351425171, "rewards/accuracy_reward": 0.5910460352897644, "rewards/format_reward": 1.0, "step": 655 }, { "completion_length": 227.0, "epoch": 0.06601257861635221, "grad_norm": 0.6970523595809937, "kl": 0.049560546875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6438966989517212, "reward_std": 0.179672509431839, "rewards/accuracy_reward": 0.6541007608175278, "rewards/format_reward": 0.9897959232330322, "step": 656 }, { "completion_length": 170.5204086303711, "epoch": 0.06611320754716982, "grad_norm": 1.3814505338668823, "kl": 0.0428466796875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7664016485214233, "reward_std": 0.16261335089802742, "rewards/accuracy_reward": 0.7766056656837463, "rewards/format_reward": 0.9897959232330322, "step": 657 }, { "completion_length": 183.10203552246094, "epoch": 0.06621383647798743, "grad_norm": 1.5888592004776, "kl": 0.048095703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6428571343421936, "reward_std": 0.257116474211216, "rewards/accuracy_reward": 0.6530612111091614, "rewards/format_reward": 0.9897959232330322, "step": 658 }, { "completion_length": 127.31632614135742, "epoch": 0.06631446540880503, "grad_norm": 1.5618621110916138, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7521070837974548, "reward_std": 0.1854408048093319, "rewards/accuracy_reward": 0.7521070539951324, "rewards/format_reward": 1.0, "step": 659 }, { "completion_length": 214.82653045654297, "epoch": 0.06641509433962264, "grad_norm": 1.3208829164505005, "kl": 0.0511474609375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.684280812740326, "reward_std": 0.25471457093954086, "rewards/accuracy_reward": 0.6944848895072937, "rewards/format_reward": 0.9897959232330322, "step": 660 }, { "completion_length": 237.04080963134766, "epoch": 0.06651572327044025, "grad_norm": 1.4107530117034912, "kl": 0.0494384765625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6994411945343018, "reward_std": 0.26228031516075134, "rewards/accuracy_reward": 0.6994412839412689, "rewards/format_reward": 1.0, "step": 661 }, { "completion_length": 190.2040786743164, "epoch": 0.06661635220125786, "grad_norm": 0.7155662775039673, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7346938252449036, "reward_std": 0.1348847858607769, "rewards/accuracy_reward": 0.734693855047226, "rewards/format_reward": 1.0, "step": 662 }, { "completion_length": 226.1734619140625, "epoch": 0.06671698113207547, "grad_norm": 0.6217595934867859, "kl": 0.04052734375, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.739646077156067, "reward_std": 0.167621448636055, "rewards/accuracy_reward": 0.7396462261676788, "rewards/format_reward": 1.0, "step": 663 }, { "completion_length": 233.60204315185547, "epoch": 0.06681761006289308, "grad_norm": 1.3978074789047241, "kl": 0.0498046875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.597707450389862, "reward_std": 0.2580036297440529, "rewards/accuracy_reward": 0.6283197104930878, "rewards/format_reward": 0.9693877398967743, "step": 664 }, { "completion_length": 242.59182739257812, "epoch": 0.06691823899371069, "grad_norm": 1.4066390991210938, "kl": 0.052734375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5322643518447876, "reward_std": 0.28123943507671356, "rewards/accuracy_reward": 0.5526724755764008, "rewards/format_reward": 0.9795918464660645, "step": 665 }, { "completion_length": 227.7142791748047, "epoch": 0.0670188679245283, "grad_norm": 1.2136627435684204, "kl": 0.0408935546875, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.7697764039039612, "reward_std": 0.29487377405166626, "rewards/accuracy_reward": 0.7901846170425415, "rewards/format_reward": 0.9795918464660645, "step": 666 }, { "completion_length": 200.06122589111328, "epoch": 0.0671194968553459, "grad_norm": 1.7433042526245117, "kl": 0.037109375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.7091566920280457, "reward_std": 0.22639130055904388, "rewards/accuracy_reward": 0.729564905166626, "rewards/format_reward": 0.9795918464660645, "step": 667 }, { "completion_length": 220.09182739257812, "epoch": 0.06722012578616353, "grad_norm": 1.0753329992294312, "kl": 0.0643310546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.427304208278656, "reward_std": 0.21167374402284622, "rewards/accuracy_reward": 0.4273042231798172, "rewards/format_reward": 1.0, "step": 668 }, { "completion_length": 214.65306091308594, "epoch": 0.06732075471698114, "grad_norm": 1.2847342491149902, "kl": 0.0528564453125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.4093294739723206, "reward_std": 0.2761451378464699, "rewards/accuracy_reward": 0.41953352093696594, "rewards/format_reward": 0.9897959232330322, "step": 669 }, { "completion_length": 207.9081573486328, "epoch": 0.06742138364779875, "grad_norm": 0.7691369652748108, "kl": 0.0416259765625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.679786205291748, "reward_std": 0.2325768657028675, "rewards/accuracy_reward": 0.6899902522563934, "rewards/format_reward": 0.9897959232330322, "step": 670 }, { "completion_length": 168.28571319580078, "epoch": 0.06752201257861636, "grad_norm": 2.9625487327575684, "kl": 0.0572509765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6938774585723877, "reward_std": 0.13035526871681213, "rewards/accuracy_reward": 0.6938775181770325, "rewards/format_reward": 1.0, "step": 671 }, { "completion_length": 233.66326141357422, "epoch": 0.06762264150943396, "grad_norm": 0.8796616792678833, "kl": 0.048095703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.4961127042770386, "reward_std": 0.17333883047103882, "rewards/accuracy_reward": 0.5063168108463287, "rewards/format_reward": 0.9897959232330322, "step": 672 }, { "completion_length": 245.2653045654297, "epoch": 0.06772327044025157, "grad_norm": 1.4569802284240723, "kl": 0.05572509765625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.656089961528778, "reward_std": 0.26021796464920044, "rewards/accuracy_reward": 0.6560900509357452, "rewards/format_reward": 1.0, "step": 673 }, { "completion_length": 256.25508880615234, "epoch": 0.06782389937106918, "grad_norm": 0.8101058602333069, "kl": 0.0445556640625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6829931735992432, "reward_std": 0.20657069236040115, "rewards/accuracy_reward": 0.6931972503662109, "rewards/format_reward": 0.9897959232330322, "step": 674 }, { "completion_length": 158.9591827392578, "epoch": 0.06792452830188679, "grad_norm": 1.2254451513290405, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5286686420440674, "reward_std": 0.13811438530683517, "rewards/accuracy_reward": 0.5286686569452286, "rewards/format_reward": 1.0, "step": 675 }, { "completion_length": 221.6734619140625, "epoch": 0.0680251572327044, "grad_norm": 1.1027588844299316, "kl": 0.047607421875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7653496861457825, "reward_std": 0.19557232409715652, "rewards/accuracy_reward": 0.775553822517395, "rewards/format_reward": 0.9897959232330322, "step": 676 }, { "completion_length": 203.61224365234375, "epoch": 0.06812578616352201, "grad_norm": 0.8578269481658936, "kl": 0.0531005859375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7653403878211975, "reward_std": 0.1809326931834221, "rewards/accuracy_reward": 0.7959526777267456, "rewards/format_reward": 0.9693877398967743, "step": 677 }, { "completion_length": 263.24488830566406, "epoch": 0.06822641509433962, "grad_norm": 0.723158597946167, "kl": 0.0435791015625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7938775420188904, "reward_std": 0.16193947196006775, "rewards/accuracy_reward": 0.793877512216568, "rewards/format_reward": 1.0, "step": 678 }, { "completion_length": 125.78570938110352, "epoch": 0.06832704402515723, "grad_norm": 1.3770027160644531, "kl": 0.0633544921875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7798833847045898, "reward_std": 0.21413689851760864, "rewards/accuracy_reward": 0.7900874316692352, "rewards/format_reward": 0.9897959232330322, "step": 679 }, { "completion_length": 238.59183502197266, "epoch": 0.06842767295597484, "grad_norm": 1.1633793115615845, "kl": 0.0435791015625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.490414321422577, "reward_std": 0.2383635938167572, "rewards/accuracy_reward": 0.4904143512248993, "rewards/format_reward": 1.0, "step": 680 }, { "completion_length": 236.4183578491211, "epoch": 0.06852830188679246, "grad_norm": 0.5698215961456299, "kl": 0.037841796875, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.6673468947410583, "reward_std": 0.08383273333311081, "rewards/accuracy_reward": 0.6673469245433807, "rewards/format_reward": 1.0, "step": 681 }, { "completion_length": 211.03060913085938, "epoch": 0.06862893081761007, "grad_norm": 1.0712405443191528, "kl": 0.0438232421875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.762755036354065, "reward_std": 0.14246393740177155, "rewards/accuracy_reward": 0.7831632494926453, "rewards/format_reward": 0.9795918166637421, "step": 682 }, { "completion_length": 288.23468017578125, "epoch": 0.06872955974842768, "grad_norm": 0.6354060769081116, "kl": 0.02685546875, "learning_rate": 1e-06, "loss": 0.0011, "reward": 1.565139889717102, "reward_std": 0.18712979555130005, "rewards/accuracy_reward": 0.5651399493217468, "rewards/format_reward": 1.0, "step": 683 }, { "completion_length": 191.1734619140625, "epoch": 0.06883018867924529, "grad_norm": 1.2581950426101685, "kl": 0.0489501953125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6877551078796387, "reward_std": 0.18386393785476685, "rewards/accuracy_reward": 0.6877550780773163, "rewards/format_reward": 1.0, "step": 684 }, { "completion_length": 214.81632232666016, "epoch": 0.0689308176100629, "grad_norm": 1.5352354049682617, "kl": 0.04437255859375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.5433858036994934, "reward_std": 0.2890883758664131, "rewards/accuracy_reward": 0.5739981085062027, "rewards/format_reward": 0.9693877398967743, "step": 685 }, { "completion_length": 189.48979949951172, "epoch": 0.0690314465408805, "grad_norm": 1.7435126304626465, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6578586101531982, "reward_std": 0.270507775247097, "rewards/accuracy_reward": 0.6680627465248108, "rewards/format_reward": 0.9897959232330322, "step": 686 }, { "completion_length": 248.2244873046875, "epoch": 0.06913207547169811, "grad_norm": 0.8626840710639954, "kl": 0.0352783203125, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.5203258991241455, "reward_std": 0.11565091647207737, "rewards/accuracy_reward": 0.5407340973615646, "rewards/format_reward": 0.9795918464660645, "step": 687 }, { "completion_length": 246.12244415283203, "epoch": 0.06923270440251572, "grad_norm": 0.8313063979148865, "kl": 0.047119140625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5896500945091248, "reward_std": 0.1735193468630314, "rewards/accuracy_reward": 0.5896501243114471, "rewards/format_reward": 1.0, "step": 688 }, { "completion_length": 326.39794921875, "epoch": 0.06933333333333333, "grad_norm": 1.2486337423324585, "kl": 0.0347900390625, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.4522307515144348, "reward_std": 0.29169680923223495, "rewards/accuracy_reward": 0.47263897955417633, "rewards/format_reward": 0.9795918166637421, "step": 689 }, { "completion_length": 283.4387664794922, "epoch": 0.06943396226415094, "grad_norm": 1.0893895626068115, "kl": 0.03955078125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6188045740127563, "reward_std": 0.21079561859369278, "rewards/accuracy_reward": 0.6188046336174011, "rewards/format_reward": 1.0, "step": 690 }, { "completion_length": 209.06121826171875, "epoch": 0.06953459119496855, "grad_norm": 0.771465539932251, "kl": 0.046142578125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6410431265830994, "reward_std": 0.20154938846826553, "rewards/accuracy_reward": 0.6512471735477448, "rewards/format_reward": 0.9897959232330322, "step": 691 }, { "completion_length": 166.2244873046875, "epoch": 0.06963522012578616, "grad_norm": 3.1745591163635254, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6861456632614136, "reward_std": 0.1592569276690483, "rewards/accuracy_reward": 0.6963498294353485, "rewards/format_reward": 0.9897959232330322, "step": 692 }, { "completion_length": 244.63265228271484, "epoch": 0.06973584905660378, "grad_norm": 1.5937020778656006, "kl": 0.0478515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.485148012638092, "reward_std": 0.2716085761785507, "rewards/accuracy_reward": 0.4953521639108658, "rewards/format_reward": 0.9897959232330322, "step": 693 }, { "completion_length": 217.89795684814453, "epoch": 0.06983647798742139, "grad_norm": 0.8890781402587891, "kl": 0.0711669921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7061223983764648, "reward_std": 0.30594048649072647, "rewards/accuracy_reward": 0.7265306115150452, "rewards/format_reward": 0.9795918464660645, "step": 694 }, { "completion_length": 175.78571319580078, "epoch": 0.069937106918239, "grad_norm": 0.9294074773788452, "kl": 0.0496826171875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.779522716999054, "reward_std": 0.12357144430279732, "rewards/accuracy_reward": 0.779522716999054, "rewards/format_reward": 1.0, "step": 695 }, { "completion_length": 329.5918426513672, "epoch": 0.0700377358490566, "grad_norm": 0.8897565603256226, "kl": 0.04254150390625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.3841086626052856, "reward_std": 0.2688257694244385, "rewards/accuracy_reward": 0.39431270956993103, "rewards/format_reward": 0.9897959232330322, "step": 696 }, { "completion_length": 173.68366622924805, "epoch": 0.07013836477987422, "grad_norm": 0.961010754108429, "kl": 0.048583984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7312925457954407, "reward_std": 0.20382416620850563, "rewards/accuracy_reward": 0.7414965927600861, "rewards/format_reward": 0.9897959232330322, "step": 697 }, { "completion_length": 265.37754821777344, "epoch": 0.07023899371069182, "grad_norm": 0.7715185880661011, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6587300896644592, "reward_std": 0.18527401983737946, "rewards/accuracy_reward": 0.658730149269104, "rewards/format_reward": 1.0, "step": 698 }, { "completion_length": 216.2142791748047, "epoch": 0.07033962264150943, "grad_norm": 1.0476585626602173, "kl": 0.0439453125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6028343439102173, "reward_std": 0.295586422085762, "rewards/accuracy_reward": 0.6028344333171844, "rewards/format_reward": 1.0, "step": 699 }, { "completion_length": 228.04080963134766, "epoch": 0.07044025157232704, "grad_norm": 0.5944713950157166, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6456631422042847, "reward_std": 0.18261761963367462, "rewards/accuracy_reward": 0.6456632614135742, "rewards/format_reward": 1.0, "step": 700 }, { "completion_length": 217.4795913696289, "epoch": 0.07054088050314465, "grad_norm": 0.8113008737564087, "kl": 0.05169677734375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.663143813610077, "reward_std": 0.16099806409329176, "rewards/accuracy_reward": 0.6631438136100769, "rewards/format_reward": 1.0, "step": 701 }, { "completion_length": 252.26529693603516, "epoch": 0.07064150943396226, "grad_norm": 1.126932978630066, "kl": 0.05078125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5287900567054749, "reward_std": 0.22281023114919662, "rewards/accuracy_reward": 0.5594023168087006, "rewards/format_reward": 0.9693877398967743, "step": 702 }, { "completion_length": 229.96939086914062, "epoch": 0.07074213836477987, "grad_norm": 0.9852256178855896, "kl": 0.05078125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5493847727775574, "reward_std": 0.2002788707613945, "rewards/accuracy_reward": 0.5595889687538147, "rewards/format_reward": 0.9897959232330322, "step": 703 }, { "completion_length": 271.15306091308594, "epoch": 0.07084276729559748, "grad_norm": 0.4984405040740967, "kl": 0.0546875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7922077775001526, "reward_std": 0.21118272095918655, "rewards/accuracy_reward": 0.802411824464798, "rewards/format_reward": 0.9897959232330322, "step": 704 }, { "completion_length": 236.948974609375, "epoch": 0.0709433962264151, "grad_norm": 1.1124261617660522, "kl": 0.0570068359375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7546381950378418, "reward_std": 0.1946568265557289, "rewards/accuracy_reward": 0.7546381950378418, "rewards/format_reward": 1.0, "step": 705 }, { "completion_length": 216.59182739257812, "epoch": 0.07104402515723271, "grad_norm": 0.8912032246589661, "kl": 0.052001953125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6176604628562927, "reward_std": 0.19620900601148605, "rewards/accuracy_reward": 0.6176605224609375, "rewards/format_reward": 1.0, "step": 706 }, { "completion_length": 195.61224365234375, "epoch": 0.07114465408805032, "grad_norm": 0.8968697786331177, "kl": 0.04248046875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.878884494304657, "reward_std": 0.17754638940095901, "rewards/accuracy_reward": 0.8788845241069794, "rewards/format_reward": 1.0, "step": 707 }, { "completion_length": 228.1938705444336, "epoch": 0.07124528301886793, "grad_norm": 0.9258435368537903, "kl": 0.039306640625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.4215116500854492, "reward_std": 0.2388445995748043, "rewards/accuracy_reward": 0.4317156970500946, "rewards/format_reward": 0.9897959232330322, "step": 708 }, { "completion_length": 272.7040710449219, "epoch": 0.07134591194968554, "grad_norm": 0.7346752882003784, "kl": 0.038818359375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.4375988841056824, "reward_std": 0.22317558526992798, "rewards/accuracy_reward": 0.4478031098842621, "rewards/format_reward": 0.9897959232330322, "step": 709 }, { "completion_length": 203.34693145751953, "epoch": 0.07144654088050315, "grad_norm": 0.9293739199638367, "kl": 0.0455322265625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7201862931251526, "reward_std": 0.0553523451089859, "rewards/accuracy_reward": 0.7201863527297974, "rewards/format_reward": 1.0, "step": 710 }, { "completion_length": 261.08162689208984, "epoch": 0.07154716981132075, "grad_norm": 0.6209505796432495, "kl": 0.0457763671875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.585387647151947, "reward_std": 0.21585600078105927, "rewards/accuracy_reward": 0.6057959198951721, "rewards/format_reward": 0.9795918166637421, "step": 711 }, { "completion_length": 172.948974609375, "epoch": 0.07164779874213836, "grad_norm": 1.401841640472412, "kl": 0.037841796875, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.6929901838302612, "reward_std": 0.22798896580934525, "rewards/accuracy_reward": 0.7031942307949066, "rewards/format_reward": 0.9897959232330322, "step": 712 }, { "completion_length": 203.52040100097656, "epoch": 0.07174842767295597, "grad_norm": 1.4904059171676636, "kl": 0.049072265625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5908458232879639, "reward_std": 0.286621555685997, "rewards/accuracy_reward": 0.601049929857254, "rewards/format_reward": 0.9897959232330322, "step": 713 }, { "completion_length": 208.45917510986328, "epoch": 0.07184905660377358, "grad_norm": 1.5964620113372803, "kl": 0.05029296875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6445268392562866, "reward_std": 0.28634484112262726, "rewards/accuracy_reward": 0.6547309756278992, "rewards/format_reward": 0.9897959232330322, "step": 714 }, { "completion_length": 223.64285278320312, "epoch": 0.07194968553459119, "grad_norm": 1.541381597518921, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5729963183403015, "reward_std": 0.1978636234998703, "rewards/accuracy_reward": 0.5832004845142365, "rewards/format_reward": 0.9897959232330322, "step": 715 }, { "completion_length": 217.53060913085938, "epoch": 0.0720503144654088, "grad_norm": 0.8687487244606018, "kl": 0.043701171875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7165168523788452, "reward_std": 0.20828182995319366, "rewards/accuracy_reward": 0.7165168225765228, "rewards/format_reward": 1.0, "step": 716 }, { "completion_length": 218.10203552246094, "epoch": 0.07215094339622642, "grad_norm": 0.915959894657135, "kl": 0.0526123046875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.729752242565155, "reward_std": 0.2757623791694641, "rewards/accuracy_reward": 0.729752242565155, "rewards/format_reward": 1.0, "step": 717 }, { "completion_length": 221.39794921875, "epoch": 0.07225157232704403, "grad_norm": 0.869407594203949, "kl": 0.0513916015625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6157153844833374, "reward_std": 0.1885705068707466, "rewards/accuracy_reward": 0.62591952085495, "rewards/format_reward": 0.9897959232330322, "step": 718 }, { "completion_length": 246.9897918701172, "epoch": 0.07235220125786164, "grad_norm": 1.054971694946289, "kl": 0.03125, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.8061224222183228, "reward_std": 0.2967670261859894, "rewards/accuracy_reward": 0.8061224222183228, "rewards/format_reward": 1.0, "step": 719 }, { "completion_length": 229.7448959350586, "epoch": 0.07245283018867925, "grad_norm": 0.9559988975524902, "kl": 0.048828125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.504286766052246, "reward_std": 0.21144986897706985, "rewards/accuracy_reward": 0.5144908130168915, "rewards/format_reward": 0.9897959232330322, "step": 720 }, { "completion_length": 165.9591827392578, "epoch": 0.07255345911949686, "grad_norm": 1.0980579853057861, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7100340127944946, "reward_std": 0.19133341312408447, "rewards/accuracy_reward": 0.740646243095398, "rewards/format_reward": 0.9693877398967743, "step": 721 }, { "completion_length": 275.60203552246094, "epoch": 0.07265408805031447, "grad_norm": 1.8010473251342773, "kl": 0.0447998046875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6360707879066467, "reward_std": 0.3318144232034683, "rewards/accuracy_reward": 0.6462748944759369, "rewards/format_reward": 0.9897959232330322, "step": 722 }, { "completion_length": 186.64285278320312, "epoch": 0.07275471698113208, "grad_norm": 0.8176466226577759, "kl": 0.065185546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8017426133155823, "reward_std": 0.16576708108186722, "rewards/accuracy_reward": 0.8017426133155823, "rewards/format_reward": 1.0, "step": 723 }, { "completion_length": 224.56121826171875, "epoch": 0.07285534591194968, "grad_norm": 1.2201218605041504, "kl": 0.064697265625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5408162474632263, "reward_std": 0.15069951862096786, "rewards/accuracy_reward": 0.5510203987360001, "rewards/format_reward": 0.9897959232330322, "step": 724 }, { "completion_length": 231.57142639160156, "epoch": 0.0729559748427673, "grad_norm": 0.8405968546867371, "kl": 0.03759765625, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5908944010734558, "reward_std": 0.3096584230661392, "rewards/accuracy_reward": 0.6010983884334564, "rewards/format_reward": 0.9897959232330322, "step": 725 }, { "completion_length": 191.02040100097656, "epoch": 0.0730566037735849, "grad_norm": 0.6585449576377869, "kl": 0.0574951171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7585033774375916, "reward_std": 0.14967603981494904, "rewards/accuracy_reward": 0.7585033774375916, "rewards/format_reward": 1.0, "step": 726 }, { "completion_length": 267.60203552246094, "epoch": 0.07315723270440251, "grad_norm": 0.9810383319854736, "kl": 0.04986572265625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7039660215377808, "reward_std": 0.2689041644334793, "rewards/accuracy_reward": 0.7141701579093933, "rewards/format_reward": 0.9897959232330322, "step": 727 }, { "completion_length": 181.83673095703125, "epoch": 0.07325786163522012, "grad_norm": 1.105580449104309, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5918367505073547, "reward_std": 0.18887970596551895, "rewards/accuracy_reward": 0.5918367356061935, "rewards/format_reward": 1.0, "step": 728 }, { "completion_length": 190.41836547851562, "epoch": 0.07335849056603773, "grad_norm": 0.8729532957077026, "kl": 0.0706787109375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7857142686843872, "reward_std": 0.10003121197223663, "rewards/accuracy_reward": 0.795918345451355, "rewards/format_reward": 0.9897959232330322, "step": 729 }, { "completion_length": 230.57141876220703, "epoch": 0.07345911949685535, "grad_norm": 0.9403661489486694, "kl": 0.05712890625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6128629446029663, "reward_std": 0.17945878393948078, "rewards/accuracy_reward": 0.6230670213699341, "rewards/format_reward": 0.9897959232330322, "step": 730 }, { "completion_length": 236.61223602294922, "epoch": 0.07355974842767296, "grad_norm": 0.7581003308296204, "kl": 0.05029296875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6359937191009521, "reward_std": 0.21745234727859497, "rewards/accuracy_reward": 0.6461977064609528, "rewards/format_reward": 0.9897959232330322, "step": 731 }, { "completion_length": 230.2346954345703, "epoch": 0.07366037735849057, "grad_norm": 1.2333297729492188, "kl": 0.0721435546875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.625364363193512, "reward_std": 0.2427174672484398, "rewards/accuracy_reward": 0.6253644227981567, "rewards/format_reward": 1.0, "step": 732 }, { "completion_length": 203.06121826171875, "epoch": 0.07376100628930818, "grad_norm": 0.93631911277771, "kl": 0.0635986328125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.697086215019226, "reward_std": 0.23420803248882294, "rewards/accuracy_reward": 0.6970862448215485, "rewards/format_reward": 1.0, "step": 733 }, { "completion_length": 220.84693908691406, "epoch": 0.07386163522012579, "grad_norm": 1.0579257011413574, "kl": 0.0457763671875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.519841730594635, "reward_std": 0.22470807284116745, "rewards/accuracy_reward": 0.5300458669662476, "rewards/format_reward": 0.9897959232330322, "step": 734 }, { "completion_length": 297.9285583496094, "epoch": 0.0739622641509434, "grad_norm": 0.754994809627533, "kl": 0.04345703125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5979591608047485, "reward_std": 0.2363893836736679, "rewards/accuracy_reward": 0.5979591310024261, "rewards/format_reward": 1.0, "step": 735 }, { "completion_length": 270.76529693603516, "epoch": 0.074062893081761, "grad_norm": 1.088557481765747, "kl": 0.053466796875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.673469364643097, "reward_std": 0.24952784180641174, "rewards/accuracy_reward": 0.6734693944454193, "rewards/format_reward": 1.0, "step": 736 }, { "completion_length": 226.30611419677734, "epoch": 0.07416352201257861, "grad_norm": 1.0617057085037231, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5637563467025757, "reward_std": 0.28010421991348267, "rewards/accuracy_reward": 0.5739603936672211, "rewards/format_reward": 0.9897959232330322, "step": 737 }, { "completion_length": 220.6734619140625, "epoch": 0.07426415094339622, "grad_norm": 1.2623885869979858, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.773345708847046, "reward_std": 0.244972862303257, "rewards/accuracy_reward": 0.8039579391479492, "rewards/format_reward": 0.9693877398967743, "step": 738 }, { "completion_length": 214.08162689208984, "epoch": 0.07436477987421383, "grad_norm": 1.037089228630066, "kl": 0.05126953125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6630521416664124, "reward_std": 0.19847723841667175, "rewards/accuracy_reward": 0.6630522012710571, "rewards/format_reward": 1.0, "step": 739 }, { "completion_length": 232.56122589111328, "epoch": 0.07446540880503144, "grad_norm": 2.064046621322632, "kl": 0.0450439453125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6256084442138672, "reward_std": 0.22722502797842026, "rewards/accuracy_reward": 0.6358125805854797, "rewards/format_reward": 0.9897959232330322, "step": 740 }, { "completion_length": 253.93877410888672, "epoch": 0.07456603773584905, "grad_norm": 0.8070952892303467, "kl": 0.0430908203125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6355684995651245, "reward_std": 0.26291924715042114, "rewards/accuracy_reward": 0.6355684846639633, "rewards/format_reward": 1.0, "step": 741 }, { "completion_length": 267.54080963134766, "epoch": 0.07466666666666667, "grad_norm": 2.973308801651001, "kl": 0.042724609375, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.589447796344757, "reward_std": 0.2701553553342819, "rewards/accuracy_reward": 0.5894477665424347, "rewards/format_reward": 1.0, "step": 742 }, { "completion_length": 297.2653045654297, "epoch": 0.07476729559748428, "grad_norm": 30.18510627746582, "kl": 1.5218505859375, "learning_rate": 1e-06, "loss": 0.0608, "reward": 1.5409014821052551, "reward_std": 0.2514525018632412, "rewards/accuracy_reward": 0.5613096207380295, "rewards/format_reward": 0.9795918166637421, "step": 743 }, { "completion_length": 180.38774871826172, "epoch": 0.07486792452830189, "grad_norm": 0.7876823544502258, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.782021403312683, "reward_std": 0.14402127638459206, "rewards/accuracy_reward": 0.7820213437080383, "rewards/format_reward": 1.0, "step": 744 }, { "completion_length": 200.10204315185547, "epoch": 0.0749685534591195, "grad_norm": 1.0395005941390991, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7249815464019775, "reward_std": 0.2018023133277893, "rewards/accuracy_reward": 0.7249815165996552, "rewards/format_reward": 1.0, "step": 745 }, { "completion_length": 217.57142639160156, "epoch": 0.07506918238993711, "grad_norm": 1.4068859815597534, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7053375244140625, "reward_std": 0.18801679462194443, "rewards/accuracy_reward": 0.7053374946117401, "rewards/format_reward": 1.0, "step": 746 }, { "completion_length": 208.80611419677734, "epoch": 0.07516981132075472, "grad_norm": 0.9464669227600098, "kl": 0.03851318359375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.6262977123260498, "reward_std": 0.15334022417664528, "rewards/accuracy_reward": 0.6262976229190826, "rewards/format_reward": 1.0, "step": 747 }, { "completion_length": 258.9081497192383, "epoch": 0.07527044025157233, "grad_norm": 2.119476556777954, "kl": 0.04534912109375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.651669681072235, "reward_std": 0.1615234538912773, "rewards/accuracy_reward": 0.6516697406768799, "rewards/format_reward": 1.0, "step": 748 }, { "completion_length": 180.43877410888672, "epoch": 0.07537106918238994, "grad_norm": 0.7255302667617798, "kl": 0.038330078125, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.6642481088638306, "reward_std": 0.12454583495855331, "rewards/accuracy_reward": 0.6642481684684753, "rewards/format_reward": 1.0, "step": 749 }, { "completion_length": 189.2040786743164, "epoch": 0.07547169811320754, "grad_norm": 1.865682601928711, "kl": 0.0521240234375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6865707039833069, "reward_std": 0.10485878214240074, "rewards/accuracy_reward": 0.6865707635879517, "rewards/format_reward": 1.0, "step": 750 }, { "completion_length": 214.7551040649414, "epoch": 0.07557232704402515, "grad_norm": 5.659226894378662, "kl": 0.3287353515625, "learning_rate": 1e-06, "loss": 0.0132, "reward": 1.8064261078834534, "reward_std": 0.2137417495250702, "rewards/accuracy_reward": 0.8166302144527435, "rewards/format_reward": 0.9897959232330322, "step": 751 }, { "completion_length": 225.07141876220703, "epoch": 0.07567295597484276, "grad_norm": 1.1000335216522217, "kl": 0.0482177734375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6062520146369934, "reward_std": 0.15658395737409592, "rewards/accuracy_reward": 0.6062520146369934, "rewards/format_reward": 1.0, "step": 752 }, { "completion_length": 208.07142639160156, "epoch": 0.07577358490566037, "grad_norm": 0.8717836737632751, "kl": 0.0552978515625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7764378190040588, "reward_std": 0.17122966796159744, "rewards/accuracy_reward": 0.7764378190040588, "rewards/format_reward": 1.0, "step": 753 }, { "completion_length": 257.4081573486328, "epoch": 0.075874213836478, "grad_norm": 0.6881804466247559, "kl": 0.03729248046875, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.56210058927536, "reward_std": 0.21543137729167938, "rewards/accuracy_reward": 0.5723046362400055, "rewards/format_reward": 0.9897959232330322, "step": 754 }, { "completion_length": 225.23469161987305, "epoch": 0.0759748427672956, "grad_norm": 6.040635108947754, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.672983467578888, "reward_std": 0.25709356367588043, "rewards/accuracy_reward": 0.6933916211128235, "rewards/format_reward": 0.9795918166637421, "step": 755 }, { "completion_length": 257.31632232666016, "epoch": 0.07607547169811321, "grad_norm": 0.8815358877182007, "kl": 0.042724609375, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.4604188799858093, "reward_std": 0.22186838905327022, "rewards/accuracy_reward": 0.5216434001922607, "rewards/format_reward": 0.938775509595871, "step": 756 }, { "completion_length": 249.948974609375, "epoch": 0.07617610062893082, "grad_norm": 1.0328155755996704, "kl": 0.045654296875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.5633573532104492, "reward_std": 0.20954875648021698, "rewards/accuracy_reward": 0.573561429977417, "rewards/format_reward": 0.9897959232330322, "step": 757 }, { "completion_length": 219.59182739257812, "epoch": 0.07627672955974843, "grad_norm": 0.7947759032249451, "kl": 0.0504150390625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6400133967399597, "reward_std": 0.1378120444715023, "rewards/accuracy_reward": 0.6604216396808624, "rewards/format_reward": 0.9795918464660645, "step": 758 }, { "completion_length": 273.22447204589844, "epoch": 0.07637735849056604, "grad_norm": 1.22933828830719, "kl": 0.048583984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.3590526580810547, "reward_std": 0.3110163062810898, "rewards/accuracy_reward": 0.389664925634861, "rewards/format_reward": 0.9693877398967743, "step": 759 }, { "completion_length": 197.57143020629883, "epoch": 0.07647798742138365, "grad_norm": 1.0648747682571411, "kl": 0.043701171875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.8163264393806458, "reward_std": 0.22926463931798935, "rewards/accuracy_reward": 0.8877550661563873, "rewards/format_reward": 0.9285714328289032, "step": 760 }, { "completion_length": 273.9285583496094, "epoch": 0.07657861635220126, "grad_norm": 1.1773186922073364, "kl": 0.0400390625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6330503821372986, "reward_std": 0.2903982698917389, "rewards/accuracy_reward": 0.6636626124382019, "rewards/format_reward": 0.9693877398967743, "step": 761 }, { "completion_length": 219.16326141357422, "epoch": 0.07667924528301887, "grad_norm": 2.083859920501709, "kl": 0.055908203125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6020407676696777, "reward_std": 0.2697695642709732, "rewards/accuracy_reward": 0.6326530426740646, "rewards/format_reward": 0.9693877398967743, "step": 762 }, { "completion_length": 210.29591369628906, "epoch": 0.07677987421383647, "grad_norm": 1.2529940605163574, "kl": 0.0743408203125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6551947593688965, "reward_std": 0.20265815407037735, "rewards/accuracy_reward": 0.6551947891712189, "rewards/format_reward": 1.0, "step": 763 }, { "completion_length": 161.32653045654297, "epoch": 0.07688050314465408, "grad_norm": 1.3502296209335327, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7963600754737854, "reward_std": 0.20583199709653854, "rewards/accuracy_reward": 0.8269723355770111, "rewards/format_reward": 0.9693877398967743, "step": 764 }, { "completion_length": 248.14286041259766, "epoch": 0.07698113207547169, "grad_norm": 1.2307437658309937, "kl": 0.0460205078125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.637850821018219, "reward_std": 0.19704709947109222, "rewards/accuracy_reward": 0.6582589745521545, "rewards/format_reward": 0.9795918464660645, "step": 765 }, { "completion_length": 190.4285659790039, "epoch": 0.07708176100628931, "grad_norm": 0.8440259695053101, "kl": 0.038818359375, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6967929005622864, "reward_std": 0.11674392223358154, "rewards/accuracy_reward": 0.7172011137008667, "rewards/format_reward": 0.9795918464660645, "step": 766 }, { "completion_length": 246.448974609375, "epoch": 0.07718238993710692, "grad_norm": 0.9314135909080505, "kl": 0.0482177734375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7303206324577332, "reward_std": 0.23615073412656784, "rewards/accuracy_reward": 0.7609329223632812, "rewards/format_reward": 0.9693877398967743, "step": 767 }, { "completion_length": 235.73468017578125, "epoch": 0.07728301886792453, "grad_norm": 0.9916264414787292, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7488661408424377, "reward_std": 0.22928724437952042, "rewards/accuracy_reward": 0.7692743539810181, "rewards/format_reward": 0.9795918166637421, "step": 768 }, { "completion_length": 265.7040786743164, "epoch": 0.07738364779874214, "grad_norm": 5.676779270172119, "kl": 0.054443359375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5601049661636353, "reward_std": 0.364993616938591, "rewards/accuracy_reward": 0.6009213030338287, "rewards/format_reward": 0.9591836333274841, "step": 769 }, { "completion_length": 292.39795684814453, "epoch": 0.07748427672955975, "grad_norm": 1.0256872177124023, "kl": 0.04315185546875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5709426999092102, "reward_std": 0.388669028878212, "rewards/accuracy_reward": 0.6117589771747589, "rewards/format_reward": 0.9591836631298065, "step": 770 }, { "completion_length": 215.7653045654297, "epoch": 0.07758490566037736, "grad_norm": 0.7322142720222473, "kl": 0.04296875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6778424978256226, "reward_std": 0.1914769560098648, "rewards/accuracy_reward": 0.6880466341972351, "rewards/format_reward": 0.9897959232330322, "step": 771 }, { "completion_length": 253.51020050048828, "epoch": 0.07768553459119497, "grad_norm": 1.4304100275039673, "kl": 0.055419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5846161246299744, "reward_std": 0.2488591969013214, "rewards/accuracy_reward": 0.5846161097288132, "rewards/format_reward": 1.0, "step": 772 }, { "completion_length": 254.88775634765625, "epoch": 0.07778616352201258, "grad_norm": 0.9433024525642395, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.5680493712425232, "reward_std": 0.3155386447906494, "rewards/accuracy_reward": 0.5782533586025238, "rewards/format_reward": 0.9897959232330322, "step": 773 }, { "completion_length": 220.01019287109375, "epoch": 0.07788679245283019, "grad_norm": 0.4197593629360199, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7522727251052856, "reward_std": 0.05729444697499275, "rewards/accuracy_reward": 0.7522727251052856, "rewards/format_reward": 1.0, "step": 774 }, { "completion_length": 260.9795837402344, "epoch": 0.0779874213836478, "grad_norm": 0.7428535223007202, "kl": 0.0498046875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5111758708953857, "reward_std": 0.1697264388203621, "rewards/accuracy_reward": 0.5315840691328049, "rewards/format_reward": 0.9795918464660645, "step": 775 }, { "completion_length": 252.88775634765625, "epoch": 0.0780880503144654, "grad_norm": 0.7605223655700684, "kl": 0.062744140625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6466190814971924, "reward_std": 0.19691771268844604, "rewards/accuracy_reward": 0.6670272946357727, "rewards/format_reward": 0.9795918166637421, "step": 776 }, { "completion_length": 254.63265228271484, "epoch": 0.07818867924528301, "grad_norm": 1.0155388116836548, "kl": 0.04669189453125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.610632061958313, "reward_std": 0.18643953278660774, "rewards/accuracy_reward": 0.6310402154922485, "rewards/format_reward": 0.9795918166637421, "step": 777 }, { "completion_length": 236.34693908691406, "epoch": 0.07828930817610062, "grad_norm": 0.7821791172027588, "kl": 0.04638671875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6775509715080261, "reward_std": 0.17264417558908463, "rewards/accuracy_reward": 0.6877550780773163, "rewards/format_reward": 0.9897959232330322, "step": 778 }, { "completion_length": 211.89795684814453, "epoch": 0.07838993710691824, "grad_norm": 0.7120704650878906, "kl": 0.0604248046875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7186588644981384, "reward_std": 0.22587460279464722, "rewards/accuracy_reward": 0.739067018032074, "rewards/format_reward": 0.9795918166637421, "step": 779 }, { "completion_length": 268.2142868041992, "epoch": 0.07849056603773585, "grad_norm": 1.0214192867279053, "kl": 0.0428466796875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6630336046218872, "reward_std": 0.1981365978717804, "rewards/accuracy_reward": 0.6732376217842102, "rewards/format_reward": 0.9897959232330322, "step": 780 }, { "completion_length": 243.9693832397461, "epoch": 0.07859119496855346, "grad_norm": 0.615574061870575, "kl": 0.048583984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.628344714641571, "reward_std": 0.24139627069234848, "rewards/accuracy_reward": 0.6385487616062164, "rewards/format_reward": 0.9897959232330322, "step": 781 }, { "completion_length": 268.84693145751953, "epoch": 0.07869182389937107, "grad_norm": 1.200311303138733, "kl": 0.0335693359375, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.5869370698928833, "reward_std": 0.2396194264292717, "rewards/accuracy_reward": 0.6175493597984314, "rewards/format_reward": 0.9693877398967743, "step": 782 }, { "completion_length": 213.9081573486328, "epoch": 0.07879245283018868, "grad_norm": 1.4800158739089966, "kl": 0.0762939453125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5950034856796265, "reward_std": 0.23232509195804596, "rewards/accuracy_reward": 0.6052076816558838, "rewards/format_reward": 0.9897959232330322, "step": 783 }, { "completion_length": 250.5408172607422, "epoch": 0.07889308176100629, "grad_norm": 0.87638258934021, "kl": 0.0430908203125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.4873398542404175, "reward_std": 0.19602636247873306, "rewards/accuracy_reward": 0.5077480673789978, "rewards/format_reward": 0.9795918464660645, "step": 784 }, { "completion_length": 271.3673400878906, "epoch": 0.0789937106918239, "grad_norm": 1.9926539659500122, "kl": 0.0462646484375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.589941680431366, "reward_std": 0.34971320629119873, "rewards/accuracy_reward": 0.6205539405345917, "rewards/format_reward": 0.9693877398967743, "step": 785 }, { "completion_length": 289.0102005004883, "epoch": 0.07909433962264151, "grad_norm": 0.7197906374931335, "kl": 0.03955078125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.5022311806678772, "reward_std": 0.2668965384364128, "rewards/accuracy_reward": 0.5328434705734253, "rewards/format_reward": 0.9693877398967743, "step": 786 }, { "completion_length": 199.9795913696289, "epoch": 0.07919496855345912, "grad_norm": 0.905303418636322, "kl": 0.0582275390625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7565862536430359, "reward_std": 0.12127770110964775, "rewards/accuracy_reward": 0.7565861940383911, "rewards/format_reward": 1.0, "step": 787 }, { "completion_length": 219.31632232666016, "epoch": 0.07929559748427673, "grad_norm": 0.8634675741195679, "kl": 0.056640625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5438053607940674, "reward_std": 0.25021763145923615, "rewards/accuracy_reward": 0.5642135441303253, "rewards/format_reward": 0.9795918464660645, "step": 788 }, { "completion_length": 205.52040100097656, "epoch": 0.07939622641509433, "grad_norm": 0.8989050984382629, "kl": 0.066162109375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7448979020118713, "reward_std": 0.22024428099393845, "rewards/accuracy_reward": 0.7551020383834839, "rewards/format_reward": 0.9897959232330322, "step": 789 }, { "completion_length": 181.61224365234375, "epoch": 0.07949685534591194, "grad_norm": 0.5313494801521301, "kl": 0.0535888671875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8787201642990112, "reward_std": 0.1316949538886547, "rewards/accuracy_reward": 0.8889241814613342, "rewards/format_reward": 0.9897959232330322, "step": 790 }, { "completion_length": 154.93877029418945, "epoch": 0.07959748427672957, "grad_norm": 3.961494207382202, "kl": 0.12744140625, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.7529551982879639, "reward_std": 0.18813134729862213, "rewards/accuracy_reward": 0.7631592750549316, "rewards/format_reward": 0.9897959232330322, "step": 791 }, { "completion_length": 245.3673324584961, "epoch": 0.07969811320754717, "grad_norm": 0.7986770868301392, "kl": 0.03338623046875, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.707057774066925, "reward_std": 0.23305818438529968, "rewards/accuracy_reward": 0.707057774066925, "rewards/format_reward": 1.0, "step": 792 }, { "completion_length": 248.2653045654297, "epoch": 0.07979874213836478, "grad_norm": 0.6940059661865234, "kl": 0.039306640625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.7465986609458923, "reward_std": 0.24102631956338882, "rewards/accuracy_reward": 0.7772108912467957, "rewards/format_reward": 0.9693877398967743, "step": 793 }, { "completion_length": 212.0408172607422, "epoch": 0.07989937106918239, "grad_norm": 1.834753394126892, "kl": 0.0589599609375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7022027373313904, "reward_std": 0.28027087450027466, "rewards/accuracy_reward": 0.7124068439006805, "rewards/format_reward": 0.9897959232330322, "step": 794 }, { "completion_length": 196.3775405883789, "epoch": 0.08, "grad_norm": 0.9576225876808167, "kl": 0.0465087890625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6594104170799255, "reward_std": 0.1733284369111061, "rewards/accuracy_reward": 0.6594104170799255, "rewards/format_reward": 1.0, "step": 795 }, { "completion_length": 191.9183578491211, "epoch": 0.08010062893081761, "grad_norm": 1.9198901653289795, "kl": 0.0604248046875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5290113687515259, "reward_std": 0.1664571650326252, "rewards/accuracy_reward": 0.5290113985538483, "rewards/format_reward": 1.0, "step": 796 }, { "completion_length": 253.7653045654297, "epoch": 0.08020125786163522, "grad_norm": 1.0392982959747314, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6093382239341736, "reward_std": 0.2833357900381088, "rewards/accuracy_reward": 0.6297464370727539, "rewards/format_reward": 0.9795918166637421, "step": 797 }, { "completion_length": 204.51020050048828, "epoch": 0.08030188679245283, "grad_norm": 0.7205880880355835, "kl": 0.0379638671875, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.673469364643097, "reward_std": 0.16973835229873657, "rewards/accuracy_reward": 0.6734693646430969, "rewards/format_reward": 1.0, "step": 798 }, { "completion_length": 178.82652282714844, "epoch": 0.08040251572327044, "grad_norm": 4.529298782348633, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.666180670261383, "reward_std": 0.21692464500665665, "rewards/accuracy_reward": 0.6865888833999634, "rewards/format_reward": 0.9795918464660645, "step": 799 }, { "completion_length": 227.80612182617188, "epoch": 0.08050314465408805, "grad_norm": 0.7784883379936218, "kl": 0.0631103515625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.610612392425537, "reward_std": 0.23749911040067673, "rewards/accuracy_reward": 0.6106123626232147, "rewards/format_reward": 1.0, "step": 800 }, { "completion_length": 133.9285659790039, "epoch": 0.08060377358490566, "grad_norm": 0.7253788709640503, "kl": 0.0631103515625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7816524505615234, "reward_std": 0.11236719228327274, "rewards/accuracy_reward": 0.7816525399684906, "rewards/format_reward": 1.0, "step": 801 }, { "completion_length": 210.28570556640625, "epoch": 0.08070440251572326, "grad_norm": 10.037849426269531, "kl": 0.0491943359375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.657920241355896, "reward_std": 0.22026710957288742, "rewards/accuracy_reward": 0.6681243777275085, "rewards/format_reward": 0.9897959232330322, "step": 802 }, { "completion_length": 267.79591369628906, "epoch": 0.08080503144654089, "grad_norm": 1.8546888828277588, "kl": 0.0489501953125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.574343979358673, "reward_std": 0.2966897636651993, "rewards/accuracy_reward": 0.5845480859279633, "rewards/format_reward": 0.9897959232330322, "step": 803 }, { "completion_length": 187.948974609375, "epoch": 0.0809056603773585, "grad_norm": 0.9801518321037292, "kl": 0.05615234375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6936821341514587, "reward_std": 0.11349829658865929, "rewards/accuracy_reward": 0.6936821341514587, "rewards/format_reward": 1.0, "step": 804 }, { "completion_length": 234.2040786743164, "epoch": 0.0810062893081761, "grad_norm": 1.4321757555007935, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7527828216552734, "reward_std": 0.1210319846868515, "rewards/accuracy_reward": 0.7527828812599182, "rewards/format_reward": 1.0, "step": 805 }, { "completion_length": 235.96937561035156, "epoch": 0.08110691823899371, "grad_norm": 1.0758355855941772, "kl": 0.0556640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.583673357963562, "reward_std": 0.22419482469558716, "rewards/accuracy_reward": 0.5836734473705292, "rewards/format_reward": 1.0, "step": 806 }, { "completion_length": 202.62244415283203, "epoch": 0.08120754716981132, "grad_norm": 0.8834856152534485, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6825377345085144, "reward_std": 0.2696230411529541, "rewards/accuracy_reward": 0.7131499648094177, "rewards/format_reward": 0.9693877398967743, "step": 807 }, { "completion_length": 199.91836547851562, "epoch": 0.08130817610062893, "grad_norm": 0.7921033501625061, "kl": 0.0540771484375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6687074899673462, "reward_std": 0.24573630094528198, "rewards/accuracy_reward": 0.6687074899673462, "rewards/format_reward": 1.0, "step": 808 }, { "completion_length": 242.4081573486328, "epoch": 0.08140880503144654, "grad_norm": 1.4839457273483276, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.642431914806366, "reward_std": 0.34685830771923065, "rewards/accuracy_reward": 0.6526360511779785, "rewards/format_reward": 0.9897959232330322, "step": 809 }, { "completion_length": 159.2653045654297, "epoch": 0.08150943396226415, "grad_norm": 1.2320785522460938, "kl": 0.0478515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6758075952529907, "reward_std": 0.17238383367657661, "rewards/accuracy_reward": 0.6758075952529907, "rewards/format_reward": 1.0, "step": 810 }, { "completion_length": 242.23468017578125, "epoch": 0.08161006289308176, "grad_norm": 1.8081374168395996, "kl": 0.059814453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.586467444896698, "reward_std": 0.26966848224401474, "rewards/accuracy_reward": 0.5966715514659882, "rewards/format_reward": 0.9897959232330322, "step": 811 }, { "completion_length": 153.16326141357422, "epoch": 0.08171069182389937, "grad_norm": 0.8495372533798218, "kl": 0.0506591796875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7243683338165283, "reward_std": 0.12402348592877388, "rewards/accuracy_reward": 0.7243682742118835, "rewards/format_reward": 1.0, "step": 812 }, { "completion_length": 206.37755584716797, "epoch": 0.08181132075471698, "grad_norm": 0.8091540932655334, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6836734414100647, "reward_std": 0.06517763808369637, "rewards/accuracy_reward": 0.6938775181770325, "rewards/format_reward": 0.9897959232330322, "step": 813 }, { "completion_length": 183.9795913696289, "epoch": 0.08191194968553459, "grad_norm": 0.8876581192016602, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7258513569831848, "reward_std": 0.12911416217684746, "rewards/accuracy_reward": 0.7258513867855072, "rewards/format_reward": 1.0, "step": 814 }, { "completion_length": 205.81632232666016, "epoch": 0.08201257861635221, "grad_norm": 1.599245548248291, "kl": 0.0479736328125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.8268199563026428, "reward_std": 0.18835296854376793, "rewards/accuracy_reward": 0.837024062871933, "rewards/format_reward": 0.9897959232330322, "step": 815 }, { "completion_length": 140.6836700439453, "epoch": 0.08211320754716982, "grad_norm": 1.133841872215271, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7445376515388489, "reward_std": 0.17573731392621994, "rewards/accuracy_reward": 0.7547417283058167, "rewards/format_reward": 0.9897959232330322, "step": 816 }, { "completion_length": 220.2653045654297, "epoch": 0.08221383647798743, "grad_norm": 0.9283222556114197, "kl": 0.057861328125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6698214411735535, "reward_std": 0.33549581468105316, "rewards/accuracy_reward": 0.7106376588344574, "rewards/format_reward": 0.9591836333274841, "step": 817 }, { "completion_length": 169.39795684814453, "epoch": 0.08231446540880503, "grad_norm": 4.021496295928955, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8394557237625122, "reward_std": 0.10929933562874794, "rewards/accuracy_reward": 0.8496598303318024, "rewards/format_reward": 0.9897959232330322, "step": 818 }, { "completion_length": 278.4387664794922, "epoch": 0.08241509433962264, "grad_norm": 0.9388066530227661, "kl": 0.03662109375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.4266277551651, "reward_std": 0.307175412774086, "rewards/accuracy_reward": 0.44703593850135803, "rewards/format_reward": 0.9795918166637421, "step": 819 }, { "completion_length": 243.61224365234375, "epoch": 0.08251572327044025, "grad_norm": 0.7307436466217041, "kl": 0.0439453125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.5612244009971619, "reward_std": 0.21587716042995453, "rewards/accuracy_reward": 0.5816326290369034, "rewards/format_reward": 0.9795918464660645, "step": 820 }, { "completion_length": 205.27550506591797, "epoch": 0.08261635220125786, "grad_norm": 0.8259570598602295, "kl": 0.0531005859375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6532570719718933, "reward_std": 0.23831652849912643, "rewards/accuracy_reward": 0.6634611487388611, "rewards/format_reward": 0.9897959232330322, "step": 821 }, { "completion_length": 149.51020050048828, "epoch": 0.08271698113207547, "grad_norm": 1.4278966188430786, "kl": 0.0487060546875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.8112244606018066, "reward_std": 0.18641380965709686, "rewards/accuracy_reward": 0.811224490404129, "rewards/format_reward": 1.0, "step": 822 }, { "completion_length": 240.23468780517578, "epoch": 0.08281761006289308, "grad_norm": 0.9496552348136902, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.723760962486267, "reward_std": 0.2201230600476265, "rewards/accuracy_reward": 0.7339649796485901, "rewards/format_reward": 0.9897959232330322, "step": 823 }, { "completion_length": 241.58162689208984, "epoch": 0.08291823899371069, "grad_norm": 0.7172396779060364, "kl": 0.037109375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.64463871717453, "reward_std": 0.15332765132188797, "rewards/accuracy_reward": 0.6446388065814972, "rewards/format_reward": 1.0, "step": 824 }, { "completion_length": 240.53060150146484, "epoch": 0.0830188679245283, "grad_norm": 0.7364143133163452, "kl": 0.0517578125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5792441964149475, "reward_std": 0.2438550814986229, "rewards/accuracy_reward": 0.5894481986761093, "rewards/format_reward": 0.9897959232330322, "step": 825 }, { "completion_length": 246.15306091308594, "epoch": 0.0831194968553459, "grad_norm": 0.7670943737030029, "kl": 0.04339599609375, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5014577507972717, "reward_std": 0.19510336220264435, "rewards/accuracy_reward": 0.5218658745288849, "rewards/format_reward": 0.9795918464660645, "step": 826 }, { "completion_length": 230.4693832397461, "epoch": 0.08322012578616352, "grad_norm": 2.9126996994018555, "kl": 0.0546875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6841754913330078, "reward_std": 0.1709442250430584, "rewards/accuracy_reward": 0.6841755509376526, "rewards/format_reward": 1.0, "step": 827 }, { "completion_length": 186.60203552246094, "epoch": 0.08332075471698114, "grad_norm": 1.0056990385055542, "kl": 0.0555419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7621645331382751, "reward_std": 0.14499930292367935, "rewards/accuracy_reward": 0.7621646225452423, "rewards/format_reward": 1.0, "step": 828 }, { "completion_length": 236.6734619140625, "epoch": 0.08342138364779875, "grad_norm": 2.5004680156707764, "kl": 0.0411376953125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.57383531332016, "reward_std": 0.22828016430139542, "rewards/accuracy_reward": 0.5840393602848053, "rewards/format_reward": 0.9897959232330322, "step": 829 }, { "completion_length": 277.1632537841797, "epoch": 0.08352201257861636, "grad_norm": 0.8790431618690491, "kl": 0.0379638671875, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.590406894683838, "reward_std": 0.4058893024921417, "rewards/accuracy_reward": 0.6516313552856445, "rewards/format_reward": 0.9387754797935486, "step": 830 }, { "completion_length": 321.2653045654297, "epoch": 0.08362264150943396, "grad_norm": 1.0066419839859009, "kl": 0.0474853515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5241429209709167, "reward_std": 0.28594161570072174, "rewards/accuracy_reward": 0.5547551065683365, "rewards/format_reward": 0.9693877398967743, "step": 831 }, { "completion_length": 294.05101776123047, "epoch": 0.08372327044025157, "grad_norm": 1.7229136228561401, "kl": 0.0662841796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.433701753616333, "reward_std": 0.29657140374183655, "rewards/accuracy_reward": 0.4643140435218811, "rewards/format_reward": 0.9693877398967743, "step": 832 }, { "completion_length": 255.8163299560547, "epoch": 0.08382389937106918, "grad_norm": 0.8889840841293335, "kl": 0.03955078125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6575315594673157, "reward_std": 0.21544740349054337, "rewards/accuracy_reward": 0.6575315594673157, "rewards/format_reward": 1.0, "step": 833 }, { "completion_length": 191.61224365234375, "epoch": 0.08392452830188679, "grad_norm": 2.7142269611358643, "kl": 0.0677490234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7049723267555237, "reward_std": 0.22219545394182205, "rewards/accuracy_reward": 0.7049724459648132, "rewards/format_reward": 1.0, "step": 834 }, { "completion_length": 239.06121826171875, "epoch": 0.0840251572327044, "grad_norm": 1.2660372257232666, "kl": 0.062255859375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5846732258796692, "reward_std": 0.2648536041378975, "rewards/accuracy_reward": 0.5948773622512817, "rewards/format_reward": 0.9897959232330322, "step": 835 }, { "completion_length": 245.78570556640625, "epoch": 0.08412578616352201, "grad_norm": 0.7730669379234314, "kl": 0.0416259765625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7353427410125732, "reward_std": 0.07467595860362053, "rewards/accuracy_reward": 0.7353427112102509, "rewards/format_reward": 1.0, "step": 836 }, { "completion_length": 280.87754821777344, "epoch": 0.08422641509433962, "grad_norm": 0.9654404520988464, "kl": 0.048583984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.4107244610786438, "reward_std": 0.2705277279019356, "rewards/accuracy_reward": 0.4311326593160629, "rewards/format_reward": 0.9795918166637421, "step": 837 }, { "completion_length": 227.9285659790039, "epoch": 0.08432704402515723, "grad_norm": 1.930525779724121, "kl": 0.0697021484375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6853740811347961, "reward_std": 0.27632199227809906, "rewards/accuracy_reward": 0.7057822644710541, "rewards/format_reward": 0.9795918166637421, "step": 838 }, { "completion_length": 237.47958374023438, "epoch": 0.08442767295597484, "grad_norm": 0.7542005181312561, "kl": 0.059814453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6038939356803894, "reward_std": 0.1649937443435192, "rewards/accuracy_reward": 0.6038939952850342, "rewards/format_reward": 1.0, "step": 839 }, { "completion_length": 200.34693908691406, "epoch": 0.08452830188679246, "grad_norm": 1.6089861392974854, "kl": 0.0540771484375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.680757999420166, "reward_std": 0.21414539963006973, "rewards/accuracy_reward": 0.680757999420166, "rewards/format_reward": 1.0, "step": 840 }, { "completion_length": 187.25509643554688, "epoch": 0.08462893081761007, "grad_norm": 2.0712966918945312, "kl": 0.0604248046875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6239254474639893, "reward_std": 0.17784612625837326, "rewards/accuracy_reward": 0.6341295540332794, "rewards/format_reward": 0.9897959232330322, "step": 841 }, { "completion_length": 244.4081573486328, "epoch": 0.08472955974842768, "grad_norm": 0.6261221766471863, "kl": 0.03302001953125, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.6317867040634155, "reward_std": 0.21464785188436508, "rewards/accuracy_reward": 0.6317867338657379, "rewards/format_reward": 1.0, "step": 842 }, { "completion_length": 213.9081573486328, "epoch": 0.08483018867924529, "grad_norm": 1.1590545177459717, "kl": 0.062255859375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.4880467653274536, "reward_std": 0.18870516121387482, "rewards/accuracy_reward": 0.48804664611816406, "rewards/format_reward": 1.0, "step": 843 }, { "completion_length": 195.73468780517578, "epoch": 0.0849308176100629, "grad_norm": 0.6639349460601807, "kl": 0.0504150390625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.8289116024971008, "reward_std": 0.11457088589668274, "rewards/accuracy_reward": 0.8391156792640686, "rewards/format_reward": 0.9897959232330322, "step": 844 }, { "completion_length": 205.39795684814453, "epoch": 0.0850314465408805, "grad_norm": 2.3630359172821045, "kl": 0.125, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.6895930171012878, "reward_std": 0.26243406534194946, "rewards/accuracy_reward": 0.6997972130775452, "rewards/format_reward": 0.9897959232330322, "step": 845 }, { "completion_length": 208.31632232666016, "epoch": 0.08513207547169811, "grad_norm": 3.078758716583252, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5660172700881958, "reward_std": 0.18909330666065216, "rewards/accuracy_reward": 0.5660172700881958, "rewards/format_reward": 1.0, "step": 846 }, { "completion_length": 205.5408172607422, "epoch": 0.08523270440251572, "grad_norm": 1.156077265739441, "kl": 0.0406494140625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6083255410194397, "reward_std": 0.17254731804132462, "rewards/accuracy_reward": 0.6083255708217621, "rewards/format_reward": 1.0, "step": 847 }, { "completion_length": 255.2653045654297, "epoch": 0.08533333333333333, "grad_norm": 0.8253063559532166, "kl": 0.04296875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7249433398246765, "reward_std": 0.230966217815876, "rewards/accuracy_reward": 0.7453514635562897, "rewards/format_reward": 0.9795918464660645, "step": 848 }, { "completion_length": 288.2653045654297, "epoch": 0.08543396226415094, "grad_norm": 0.7869397401809692, "kl": 0.04150390625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5987167358398438, "reward_std": 0.22291987389326096, "rewards/accuracy_reward": 0.5987167656421661, "rewards/format_reward": 1.0, "step": 849 }, { "completion_length": 186.4795913696289, "epoch": 0.08553459119496855, "grad_norm": 1.7910001277923584, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.691359281539917, "reward_std": 0.13077084347605705, "rewards/accuracy_reward": 0.6913593411445618, "rewards/format_reward": 1.0, "step": 850 }, { "completion_length": 206.7755126953125, "epoch": 0.08563522012578616, "grad_norm": 0.8659568428993225, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.459836483001709, "reward_std": 0.12729085236787796, "rewards/accuracy_reward": 0.4700406491756439, "rewards/format_reward": 0.9897959232330322, "step": 851 }, { "completion_length": 194.29591369628906, "epoch": 0.08573584905660378, "grad_norm": 2.056478977203369, "kl": 0.0592041015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7327690124511719, "reward_std": 0.30789658427238464, "rewards/accuracy_reward": 0.7531770765781403, "rewards/format_reward": 0.9795918464660645, "step": 852 }, { "completion_length": 210.45917510986328, "epoch": 0.08583647798742139, "grad_norm": 1.0405389070510864, "kl": 0.0478515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6609752774238586, "reward_std": 0.2177921012043953, "rewards/accuracy_reward": 0.681383490562439, "rewards/format_reward": 0.9795918464660645, "step": 853 }, { "completion_length": 241.94898223876953, "epoch": 0.085937106918239, "grad_norm": 1.1213735342025757, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7035534977912903, "reward_std": 0.2547962963581085, "rewards/accuracy_reward": 0.7239616215229034, "rewards/format_reward": 0.9795918166637421, "step": 854 }, { "completion_length": 219.28570556640625, "epoch": 0.0860377358490566, "grad_norm": 0.9121930003166199, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6526320576667786, "reward_std": 0.21443431079387665, "rewards/accuracy_reward": 0.6628361493349075, "rewards/format_reward": 0.9897959232330322, "step": 855 }, { "completion_length": 209.44898223876953, "epoch": 0.08613836477987422, "grad_norm": 1.4509941339492798, "kl": 0.0728759765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7258049845695496, "reward_std": 0.1901458278298378, "rewards/accuracy_reward": 0.7258049547672272, "rewards/format_reward": 1.0, "step": 856 }, { "completion_length": 290.9591751098633, "epoch": 0.08623899371069182, "grad_norm": 0.4998001754283905, "kl": 0.03778076171875, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5557741522789001, "reward_std": 0.1836806945502758, "rewards/accuracy_reward": 0.5761823654174805, "rewards/format_reward": 0.9795918166637421, "step": 857 }, { "completion_length": 267.6734619140625, "epoch": 0.08633962264150943, "grad_norm": 0.5544843673706055, "kl": 0.05859375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7040895819664001, "reward_std": 0.19506392627954483, "rewards/accuracy_reward": 0.7142936885356903, "rewards/format_reward": 0.9897959232330322, "step": 858 }, { "completion_length": 228.448974609375, "epoch": 0.08644025157232704, "grad_norm": 0.5501435399055481, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5972240567207336, "reward_std": 0.23069293797016144, "rewards/accuracy_reward": 0.6380403637886047, "rewards/format_reward": 0.9591836631298065, "step": 859 }, { "completion_length": 243.69387817382812, "epoch": 0.08654088050314465, "grad_norm": 0.5342289805412292, "kl": 0.0516357421875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6067476272583008, "reward_std": 0.19273757189512253, "rewards/accuracy_reward": 0.6271558105945587, "rewards/format_reward": 0.9795918464660645, "step": 860 }, { "completion_length": 285.9183654785156, "epoch": 0.08664150943396226, "grad_norm": 0.6621696949005127, "kl": 0.059326171875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.73702073097229, "reward_std": 0.30623985081911087, "rewards/accuracy_reward": 0.7880411446094513, "rewards/format_reward": 0.9489795565605164, "step": 861 }, { "completion_length": 205.7653045654297, "epoch": 0.08674213836477987, "grad_norm": 0.9004980325698853, "kl": 0.0491943359375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7578825950622559, "reward_std": 0.21735603362321854, "rewards/accuracy_reward": 0.7680867314338684, "rewards/format_reward": 0.9897959232330322, "step": 862 }, { "completion_length": 225.25509643554688, "epoch": 0.08684276729559748, "grad_norm": 2.047431468963623, "kl": 0.0584716796875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6272595524787903, "reward_std": 0.24967582523822784, "rewards/accuracy_reward": 0.6374635398387909, "rewards/format_reward": 0.9897959232330322, "step": 863 }, { "completion_length": 246.2040786743164, "epoch": 0.0869433962264151, "grad_norm": 1.0008245706558228, "kl": 0.0484619140625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5426291227340698, "reward_std": 0.24860120564699173, "rewards/accuracy_reward": 0.5528331995010376, "rewards/format_reward": 0.9897959232330322, "step": 864 }, { "completion_length": 201.25509643554688, "epoch": 0.08704402515723271, "grad_norm": 0.9778475165367126, "kl": 0.05859375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.680391252040863, "reward_std": 0.2725956067442894, "rewards/accuracy_reward": 0.6905953884124756, "rewards/format_reward": 0.9897959232330322, "step": 865 }, { "completion_length": 221.57142639160156, "epoch": 0.08714465408805032, "grad_norm": 1.535807490348816, "kl": 0.066162109375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5718128681182861, "reward_std": 0.3576429709792137, "rewards/accuracy_reward": 0.5922210812568665, "rewards/format_reward": 0.9795918464660645, "step": 866 }, { "completion_length": 224.12244415283203, "epoch": 0.08724528301886793, "grad_norm": 0.8133891224861145, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7294724583625793, "reward_std": 0.2087610363960266, "rewards/accuracy_reward": 0.7396764755249023, "rewards/format_reward": 0.9897959232330322, "step": 867 }, { "completion_length": 254.73468017578125, "epoch": 0.08734591194968554, "grad_norm": 1.0070725679397583, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7346938848495483, "reward_std": 0.1348847821354866, "rewards/accuracy_reward": 0.734693855047226, "rewards/format_reward": 1.0, "step": 868 }, { "completion_length": 204.59183502197266, "epoch": 0.08744654088050315, "grad_norm": 0.8997864127159119, "kl": 0.071533203125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8045557141304016, "reward_std": 0.11592771019786596, "rewards/accuracy_reward": 0.8249639570713043, "rewards/format_reward": 0.9795918166637421, "step": 869 }, { "completion_length": 286.8571319580078, "epoch": 0.08754716981132075, "grad_norm": 1.1430702209472656, "kl": 0.0452880859375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.388264775276184, "reward_std": 0.3092566728591919, "rewards/accuracy_reward": 0.43928518891334534, "rewards/format_reward": 0.9489795565605164, "step": 870 }, { "completion_length": 186.78570556640625, "epoch": 0.08764779874213836, "grad_norm": 0.9316595792770386, "kl": 0.06689453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7244898080825806, "reward_std": 0.16973835974931717, "rewards/accuracy_reward": 0.7244897782802582, "rewards/format_reward": 1.0, "step": 871 }, { "completion_length": 325.83673095703125, "epoch": 0.08774842767295597, "grad_norm": 4.880774974822998, "kl": 0.055908203125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.4209250211715698, "reward_std": 0.2742765173316002, "rewards/accuracy_reward": 0.44133318960666656, "rewards/format_reward": 0.9795918166637421, "step": 872 }, { "completion_length": 224.3775405883789, "epoch": 0.08784905660377358, "grad_norm": 0.6635849475860596, "kl": 0.0482177734375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6224489212036133, "reward_std": 0.18413827195763588, "rewards/accuracy_reward": 0.6224489808082581, "rewards/format_reward": 1.0, "step": 873 }, { "completion_length": 201.99999237060547, "epoch": 0.08794968553459119, "grad_norm": 1.7412370443344116, "kl": 0.0538330078125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6898444294929504, "reward_std": 0.2390686199069023, "rewards/accuracy_reward": 0.7102526724338531, "rewards/format_reward": 0.9795918166637421, "step": 874 }, { "completion_length": 214.38774871826172, "epoch": 0.0880503144654088, "grad_norm": 0.6236991286277771, "kl": 0.0682373046875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7398751378059387, "reward_std": 0.21941671520471573, "rewards/accuracy_reward": 0.7602833211421967, "rewards/format_reward": 0.9795918166637421, "step": 875 }, { "completion_length": 235.9285659790039, "epoch": 0.08815094339622641, "grad_norm": 1.960957646369934, "kl": 0.04278564453125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6292517185211182, "reward_std": 0.195150725543499, "rewards/accuracy_reward": 0.6292516589164734, "rewards/format_reward": 1.0, "step": 876 }, { "completion_length": 345.6836700439453, "epoch": 0.08825157232704403, "grad_norm": 0.8793219923973083, "kl": 0.048095703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.3164869546890259, "reward_std": 0.19837574288249016, "rewards/accuracy_reward": 0.32669101655483246, "rewards/format_reward": 0.9897959232330322, "step": 877 }, { "completion_length": 271.28570556640625, "epoch": 0.08835220125786164, "grad_norm": 0.6738544702529907, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6145904064178467, "reward_std": 0.21275970339775085, "rewards/accuracy_reward": 0.6145904958248138, "rewards/format_reward": 1.0, "step": 878 }, { "completion_length": 211.07142639160156, "epoch": 0.08845283018867925, "grad_norm": 0.7350566387176514, "kl": 0.036865234375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.639831304550171, "reward_std": 0.16740333288908005, "rewards/accuracy_reward": 0.6602394878864288, "rewards/format_reward": 0.9795918464660645, "step": 879 }, { "completion_length": 249.2448959350586, "epoch": 0.08855345911949686, "grad_norm": 0.8427582383155823, "kl": 0.048828125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7129656672477722, "reward_std": 0.21960216760635376, "rewards/accuracy_reward": 0.7129656374454498, "rewards/format_reward": 1.0, "step": 880 }, { "completion_length": 214.39795684814453, "epoch": 0.08865408805031447, "grad_norm": 0.6077181696891785, "kl": 0.0784912109375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.666958212852478, "reward_std": 0.1441105492413044, "rewards/accuracy_reward": 0.666958212852478, "rewards/format_reward": 1.0, "step": 881 }, { "completion_length": 297.12245178222656, "epoch": 0.08875471698113208, "grad_norm": 0.5160326361656189, "kl": 0.033935546875, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.5991315841674805, "reward_std": 0.1652422994375229, "rewards/accuracy_reward": 0.6297438144683838, "rewards/format_reward": 0.9693877398967743, "step": 882 }, { "completion_length": 201.65306091308594, "epoch": 0.08885534591194968, "grad_norm": 0.5147208571434021, "kl": 0.03857421875, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.761318325996399, "reward_std": 0.12060107290744781, "rewards/accuracy_reward": 0.7715223729610443, "rewards/format_reward": 0.9897959232330322, "step": 883 }, { "completion_length": 244.12245178222656, "epoch": 0.0889559748427673, "grad_norm": 0.4857829809188843, "kl": 0.0347900390625, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.6622275114059448, "reward_std": 0.11265519633889198, "rewards/accuracy_reward": 0.6724315881729126, "rewards/format_reward": 0.9897959232330322, "step": 884 }, { "completion_length": 249.10203552246094, "epoch": 0.0890566037735849, "grad_norm": 0.7393012046813965, "kl": 0.05224609375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.689244270324707, "reward_std": 0.23012571036815643, "rewards/accuracy_reward": 0.6994484066963196, "rewards/format_reward": 0.9897959232330322, "step": 885 }, { "completion_length": 237.62244415283203, "epoch": 0.08915723270440251, "grad_norm": 15.187448501586914, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.732699990272522, "reward_std": 0.13418199867010117, "rewards/accuracy_reward": 0.7429040670394897, "rewards/format_reward": 0.9897959232330322, "step": 886 }, { "completion_length": 262.87754821777344, "epoch": 0.08925786163522012, "grad_norm": 0.7164854407310486, "kl": 0.05078125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6469387412071228, "reward_std": 0.21293430030345917, "rewards/accuracy_reward": 0.6469387710094452, "rewards/format_reward": 1.0, "step": 887 }, { "completion_length": 245.54080963134766, "epoch": 0.08935849056603773, "grad_norm": 0.6665762662887573, "kl": 0.052001953125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7699708342552185, "reward_std": 0.21441636234521866, "rewards/accuracy_reward": 0.7801749408245087, "rewards/format_reward": 0.9897959232330322, "step": 888 }, { "completion_length": 250.6734619140625, "epoch": 0.08945911949685535, "grad_norm": 1.6195968389511108, "kl": 0.0450439453125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6938775181770325, "reward_std": 0.23491598665714264, "rewards/accuracy_reward": 0.6938775479793549, "rewards/format_reward": 1.0, "step": 889 }, { "completion_length": 248.89795684814453, "epoch": 0.08955974842767296, "grad_norm": 0.7968849539756775, "kl": 0.0518798828125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6034847497940063, "reward_std": 0.11759207397699356, "rewards/accuracy_reward": 0.603484719991684, "rewards/format_reward": 1.0, "step": 890 }, { "completion_length": 263.1326446533203, "epoch": 0.08966037735849057, "grad_norm": 0.8111340403556824, "kl": 0.046875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6267491579055786, "reward_std": 0.27233756333589554, "rewards/accuracy_reward": 0.6267492771148682, "rewards/format_reward": 1.0, "step": 891 }, { "completion_length": 283.63265228271484, "epoch": 0.08976100628930818, "grad_norm": 0.7511139512062073, "kl": 0.0445556640625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.5636575818061829, "reward_std": 0.19585898891091347, "rewards/accuracy_reward": 0.5636577010154724, "rewards/format_reward": 1.0, "step": 892 }, { "completion_length": 251.59183502197266, "epoch": 0.08986163522012579, "grad_norm": 0.9403900504112244, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7018433809280396, "reward_std": 0.3204573839902878, "rewards/accuracy_reward": 0.7120474278926849, "rewards/format_reward": 0.9897959232330322, "step": 893 }, { "completion_length": 218.448974609375, "epoch": 0.0899622641509434, "grad_norm": 5.095252990722656, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7265305519104004, "reward_std": 0.14667000621557236, "rewards/accuracy_reward": 0.7571428418159485, "rewards/format_reward": 0.9693877398967743, "step": 894 }, { "completion_length": 208.7040786743164, "epoch": 0.090062893081761, "grad_norm": 1.047179102897644, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7557510733604431, "reward_std": 0.21225948631763458, "rewards/accuracy_reward": 0.7761591970920563, "rewards/format_reward": 0.9795918464660645, "step": 895 }, { "completion_length": 242.23468780517578, "epoch": 0.09016352201257861, "grad_norm": 0.8529000282287598, "kl": 0.066162109375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6003401279449463, "reward_std": 0.17340492829680443, "rewards/accuracy_reward": 0.6003400981426239, "rewards/format_reward": 1.0, "step": 896 }, { "completion_length": 262.39794921875, "epoch": 0.09026415094339622, "grad_norm": 0.8046742677688599, "kl": 0.04718017578125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6241981983184814, "reward_std": 0.2597734332084656, "rewards/accuracy_reward": 0.6446064114570618, "rewards/format_reward": 0.9795918166637421, "step": 897 }, { "completion_length": 228.6326446533203, "epoch": 0.09036477987421383, "grad_norm": 2.6233575344085693, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.588948130607605, "reward_std": 0.23952243477106094, "rewards/accuracy_reward": 0.6093563139438629, "rewards/format_reward": 0.9795918166637421, "step": 898 }, { "completion_length": 168.48979949951172, "epoch": 0.09046540880503144, "grad_norm": 1.4481614828109741, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.5384154319763184, "reward_std": 0.1637556292116642, "rewards/accuracy_reward": 0.5384155064821243, "rewards/format_reward": 1.0, "step": 899 }, { "completion_length": 252.4897918701172, "epoch": 0.09056603773584905, "grad_norm": 0.9614699482917786, "kl": 0.0687255859375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.691801130771637, "reward_std": 0.18122725933790207, "rewards/accuracy_reward": 0.7020052373409271, "rewards/format_reward": 0.9897959232330322, "step": 900 }, { "completion_length": 206.31632232666016, "epoch": 0.09066666666666667, "grad_norm": 0.8865187764167786, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6384353637695312, "reward_std": 0.21662871539592743, "rewards/accuracy_reward": 0.648639440536499, "rewards/format_reward": 0.9897959232330322, "step": 901 }, { "completion_length": 211.37754821777344, "epoch": 0.09076729559748428, "grad_norm": 1.2863200902938843, "kl": 0.04052734375, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.7145373821258545, "reward_std": 0.13167384639382362, "rewards/accuracy_reward": 0.7145373523235321, "rewards/format_reward": 1.0, "step": 902 }, { "completion_length": 241.78570556640625, "epoch": 0.09086792452830189, "grad_norm": 0.7853947877883911, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8030778765678406, "reward_std": 0.17423007637262344, "rewards/accuracy_reward": 0.8234860599040985, "rewards/format_reward": 0.9795918166637421, "step": 903 }, { "completion_length": 279.74488830566406, "epoch": 0.0909685534591195, "grad_norm": 0.7240082025527954, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5177356600761414, "reward_std": 0.21219895780086517, "rewards/accuracy_reward": 0.5279397666454315, "rewards/format_reward": 0.9897959232330322, "step": 904 }, { "completion_length": 268.54080963134766, "epoch": 0.09106918238993711, "grad_norm": 0.7931084632873535, "kl": 0.049072265625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5254607796669006, "reward_std": 0.2463872730731964, "rewards/accuracy_reward": 0.5254607051610947, "rewards/format_reward": 1.0, "step": 905 }, { "completion_length": 223.34693145751953, "epoch": 0.09116981132075472, "grad_norm": 0.8037495613098145, "kl": 0.0457763671875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.676403522491455, "reward_std": 0.26502998918294907, "rewards/accuracy_reward": 0.6764034926891327, "rewards/format_reward": 1.0, "step": 906 }, { "completion_length": 243.551025390625, "epoch": 0.09127044025157233, "grad_norm": 0.8852136135101318, "kl": 0.03338623046875, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.5603954195976257, "reward_std": 0.2682941257953644, "rewards/accuracy_reward": 0.5603954344987869, "rewards/format_reward": 1.0, "step": 907 }, { "completion_length": 192.52040100097656, "epoch": 0.09137106918238994, "grad_norm": 2.0003833770751953, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.686813235282898, "reward_std": 0.20408735424280167, "rewards/accuracy_reward": 0.686813235282898, "rewards/format_reward": 1.0, "step": 908 }, { "completion_length": 222.76529693603516, "epoch": 0.09147169811320754, "grad_norm": 0.8323714733123779, "kl": 0.0545654296875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7575315833091736, "reward_std": 0.19708926230669022, "rewards/accuracy_reward": 0.7575315833091736, "rewards/format_reward": 1.0, "step": 909 }, { "completion_length": 210.4081573486328, "epoch": 0.09157232704402515, "grad_norm": 0.8760356903076172, "kl": 0.0716552734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6209641098976135, "reward_std": 0.27211252599954605, "rewards/accuracy_reward": 0.6209640800952911, "rewards/format_reward": 1.0, "step": 910 }, { "completion_length": 237.98979949951172, "epoch": 0.09167295597484276, "grad_norm": 1.2426841259002686, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6881614923477173, "reward_std": 0.2619655579328537, "rewards/accuracy_reward": 0.6983655691146851, "rewards/format_reward": 0.9897959232330322, "step": 911 }, { "completion_length": 212.25509643554688, "epoch": 0.09177358490566037, "grad_norm": 1.4395055770874023, "kl": 0.0589599609375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5663280487060547, "reward_std": 0.3530150353908539, "rewards/accuracy_reward": 0.596940353512764, "rewards/format_reward": 0.9693877398967743, "step": 912 }, { "completion_length": 230.35713958740234, "epoch": 0.091874213836478, "grad_norm": 1.1608928442001343, "kl": 0.0677490234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7906854748725891, "reward_std": 0.25655561685562134, "rewards/accuracy_reward": 0.8008895516395569, "rewards/format_reward": 0.9897959232330322, "step": 913 }, { "completion_length": 176.64285278320312, "epoch": 0.0919748427672956, "grad_norm": 1.1230387687683105, "kl": 0.0697021484375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.769364595413208, "reward_std": 0.2347715198993683, "rewards/accuracy_reward": 0.7897728085517883, "rewards/format_reward": 0.9795918166637421, "step": 914 }, { "completion_length": 274.84693145751953, "epoch": 0.09207547169811321, "grad_norm": 1.1121940612792969, "kl": 0.032470703125, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.7510967254638672, "reward_std": 0.3210863396525383, "rewards/accuracy_reward": 0.7715049386024475, "rewards/format_reward": 0.9795918166637421, "step": 915 }, { "completion_length": 231.57141876220703, "epoch": 0.09217610062893082, "grad_norm": 0.8412212133407593, "kl": 0.0445556640625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7422366738319397, "reward_std": 0.17160462960600853, "rewards/accuracy_reward": 0.7422367036342621, "rewards/format_reward": 1.0, "step": 916 }, { "completion_length": 232.36734008789062, "epoch": 0.09227672955974843, "grad_norm": 1.1741979122161865, "kl": 0.0694580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5527979135513306, "reward_std": 0.18594343587756157, "rewards/accuracy_reward": 0.5630019903182983, "rewards/format_reward": 0.9897959232330322, "step": 917 }, { "completion_length": 311.99998474121094, "epoch": 0.09237735849056604, "grad_norm": 1.5993320941925049, "kl": 0.0369873046875, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.434465765953064, "reward_std": 0.33387628197669983, "rewards/accuracy_reward": 0.4548740088939667, "rewards/format_reward": 0.9795918166637421, "step": 918 }, { "completion_length": 187.6530532836914, "epoch": 0.09247798742138365, "grad_norm": 1.0228118896484375, "kl": 0.065185546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6738580465316772, "reward_std": 0.13402467593550682, "rewards/accuracy_reward": 0.6738581657409668, "rewards/format_reward": 1.0, "step": 919 }, { "completion_length": 284.2857131958008, "epoch": 0.09257861635220126, "grad_norm": 0.807532548904419, "kl": 0.048583984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.564625859260559, "reward_std": 0.2393343597650528, "rewards/accuracy_reward": 0.6054421961307526, "rewards/format_reward": 0.9591836333274841, "step": 920 }, { "completion_length": 216.32652282714844, "epoch": 0.09267924528301887, "grad_norm": 1.2368793487548828, "kl": 0.0440673828125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6337684392929077, "reward_std": 0.3079911768436432, "rewards/accuracy_reward": 0.6337684988975525, "rewards/format_reward": 1.0, "step": 921 }, { "completion_length": 190.60203552246094, "epoch": 0.09277987421383647, "grad_norm": 1.158417820930481, "kl": 0.05908203125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7602084279060364, "reward_std": 0.2661292999982834, "rewards/accuracy_reward": 0.7704124450683594, "rewards/format_reward": 0.9897959232330322, "step": 922 }, { "completion_length": 239.6836700439453, "epoch": 0.09288050314465408, "grad_norm": 1.180692434310913, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6304891109466553, "reward_std": 0.2640368491411209, "rewards/accuracy_reward": 0.6406931579113007, "rewards/format_reward": 0.9897959232330322, "step": 923 }, { "completion_length": 273.28570556640625, "epoch": 0.09298113207547169, "grad_norm": 0.6946415901184082, "kl": 0.051025390625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5812547206878662, "reward_std": 0.15403539314866066, "rewards/accuracy_reward": 0.591458797454834, "rewards/format_reward": 0.9897959232330322, "step": 924 }, { "completion_length": 311.5102081298828, "epoch": 0.0930817610062893, "grad_norm": 0.8506745100021362, "kl": 0.0394287109375, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.571795105934143, "reward_std": 0.363990917801857, "rewards/accuracy_reward": 0.6126113831996918, "rewards/format_reward": 0.9591836631298065, "step": 925 }, { "completion_length": 179.57142639160156, "epoch": 0.09318238993710692, "grad_norm": 0.7806074023246765, "kl": 0.0496826171875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7433106899261475, "reward_std": 0.11785290390253067, "rewards/accuracy_reward": 0.7433106303215027, "rewards/format_reward": 1.0, "step": 926 }, { "completion_length": 237.73468017578125, "epoch": 0.09328301886792453, "grad_norm": 0.7498762607574463, "kl": 0.05462646484375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6430197358131409, "reward_std": 0.16381589323282242, "rewards/accuracy_reward": 0.653223842382431, "rewards/format_reward": 0.9897959232330322, "step": 927 }, { "completion_length": 230.948974609375, "epoch": 0.09338364779874214, "grad_norm": 0.58657306432724, "kl": 0.0472412109375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7800231575965881, "reward_std": 0.07800853252410889, "rewards/accuracy_reward": 0.7800232470035553, "rewards/format_reward": 1.0, "step": 928 }, { "completion_length": 253.37754821777344, "epoch": 0.09348427672955975, "grad_norm": 1.153808355331421, "kl": 0.0499267578125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.4810243248939514, "reward_std": 0.3007735162973404, "rewards/accuracy_reward": 0.5014324933290482, "rewards/format_reward": 0.9795918464660645, "step": 929 }, { "completion_length": 191.77550506591797, "epoch": 0.09358490566037736, "grad_norm": 0.7790164351463318, "kl": 0.0550537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.732430100440979, "reward_std": 0.14855307526886463, "rewards/accuracy_reward": 0.7324301302433014, "rewards/format_reward": 1.0, "step": 930 }, { "completion_length": 210.9795913696289, "epoch": 0.09368553459119497, "grad_norm": 0.46461525559425354, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6179189682006836, "reward_std": 0.12191035225987434, "rewards/accuracy_reward": 0.617918998003006, "rewards/format_reward": 1.0, "step": 931 }, { "completion_length": 235.43877410888672, "epoch": 0.09378616352201258, "grad_norm": 1.0585540533065796, "kl": 0.0574951171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5020408630371094, "reward_std": 0.24264001846313477, "rewards/accuracy_reward": 0.5122449100017548, "rewards/format_reward": 0.9897959232330322, "step": 932 }, { "completion_length": 256.1428451538086, "epoch": 0.09388679245283019, "grad_norm": 0.6218725442886353, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6030715703964233, "reward_std": 0.13452672958374023, "rewards/accuracy_reward": 0.6132756471633911, "rewards/format_reward": 0.9897959232330322, "step": 933 }, { "completion_length": 298.96937561035156, "epoch": 0.0939874213836478, "grad_norm": 0.936805248260498, "kl": 0.0435791015625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5538994073867798, "reward_std": 0.14578072726726532, "rewards/accuracy_reward": 0.5641034841537476, "rewards/format_reward": 0.9897959232330322, "step": 934 }, { "completion_length": 247.1326446533203, "epoch": 0.0940880503144654, "grad_norm": 0.7870685458183289, "kl": 0.0516357421875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7526130080223083, "reward_std": 0.23182744532823563, "rewards/accuracy_reward": 0.7628171443939209, "rewards/format_reward": 0.9897959232330322, "step": 935 }, { "completion_length": 183.4897918701172, "epoch": 0.09418867924528301, "grad_norm": 0.9446008205413818, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8049886226654053, "reward_std": 0.2576056867837906, "rewards/accuracy_reward": 0.8049886524677277, "rewards/format_reward": 1.0, "step": 936 }, { "completion_length": 230.948974609375, "epoch": 0.09428930817610062, "grad_norm": 0.7829163074493408, "kl": 0.04736328125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.8469387292861938, "reward_std": 0.15069952607154846, "rewards/accuracy_reward": 0.8469387590885162, "rewards/format_reward": 1.0, "step": 937 }, { "completion_length": 186.80612182617188, "epoch": 0.09438993710691825, "grad_norm": 0.9975273013114929, "kl": 0.0523681640625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6971514225006104, "reward_std": 0.22577552497386932, "rewards/accuracy_reward": 0.7175595462322235, "rewards/format_reward": 0.9795918464660645, "step": 938 }, { "completion_length": 285.55101013183594, "epoch": 0.09449056603773585, "grad_norm": 1.3229514360427856, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.554012656211853, "reward_std": 0.2768019139766693, "rewards/accuracy_reward": 0.5744208097457886, "rewards/format_reward": 0.9795918464660645, "step": 939 }, { "completion_length": 260.5612106323242, "epoch": 0.09459119496855346, "grad_norm": 4.977051734924316, "kl": 0.1429443359375, "learning_rate": 1e-06, "loss": 0.0057, "reward": 1.8089671730995178, "reward_std": 0.14067328721284866, "rewards/accuracy_reward": 0.8089672327041626, "rewards/format_reward": 1.0, "step": 940 }, { "completion_length": 204.58163452148438, "epoch": 0.09469182389937107, "grad_norm": 1.2206058502197266, "kl": 0.09375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7753314971923828, "reward_std": 0.1921900361776352, "rewards/accuracy_reward": 0.795739620923996, "rewards/format_reward": 0.9795918464660645, "step": 941 }, { "completion_length": 215.9183578491211, "epoch": 0.09479245283018868, "grad_norm": 1.1276397705078125, "kl": 0.03448486328125, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.65597665309906, "reward_std": 0.19922291487455368, "rewards/accuracy_reward": 0.6559766530990601, "rewards/format_reward": 1.0, "step": 942 }, { "completion_length": 209.05101776123047, "epoch": 0.09489308176100629, "grad_norm": 1.9981858730316162, "kl": 0.0687255859375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6125710010528564, "reward_std": 0.20813453197479248, "rewards/accuracy_reward": 0.6227751076221466, "rewards/format_reward": 0.9897959232330322, "step": 943 }, { "completion_length": 218.90816497802734, "epoch": 0.0949937106918239, "grad_norm": 1.4440075159072876, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7427602410316467, "reward_std": 0.2631742060184479, "rewards/accuracy_reward": 0.7529643774032593, "rewards/format_reward": 0.9897959232330322, "step": 944 }, { "completion_length": 197.2346954345703, "epoch": 0.09509433962264151, "grad_norm": 0.6188681721687317, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.728243887424469, "reward_std": 0.13247238844633102, "rewards/accuracy_reward": 0.7282439172267914, "rewards/format_reward": 1.0, "step": 945 }, { "completion_length": 225.64285278320312, "epoch": 0.09519496855345912, "grad_norm": 0.9082074165344238, "kl": 0.04345703125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7090378999710083, "reward_std": 0.1908634752035141, "rewards/accuracy_reward": 0.7192419469356537, "rewards/format_reward": 0.9897959232330322, "step": 946 }, { "completion_length": 177.81632232666016, "epoch": 0.09529559748427673, "grad_norm": 2.087998628616333, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.669387698173523, "reward_std": 0.27375099807977676, "rewards/accuracy_reward": 0.6795918047428131, "rewards/format_reward": 0.9897959232330322, "step": 947 }, { "completion_length": 264.3061218261719, "epoch": 0.09539622641509433, "grad_norm": 1.6752976179122925, "kl": 0.0498046875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6778756380081177, "reward_std": 0.22525569796562195, "rewards/accuracy_reward": 0.6778756976127625, "rewards/format_reward": 1.0, "step": 948 }, { "completion_length": 204.60204315185547, "epoch": 0.09549685534591194, "grad_norm": 0.6625308394432068, "kl": 0.04278564453125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6999204754829407, "reward_std": 0.16187924146652222, "rewards/accuracy_reward": 0.7101245820522308, "rewards/format_reward": 0.9897959232330322, "step": 949 }, { "completion_length": 182.7040786743164, "epoch": 0.09559748427672957, "grad_norm": 1.4377686977386475, "kl": 0.0604248046875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7398208379745483, "reward_std": 0.14278017356991768, "rewards/accuracy_reward": 0.7398209273815155, "rewards/format_reward": 1.0, "step": 950 }, { "completion_length": 226.5306167602539, "epoch": 0.09569811320754718, "grad_norm": 0.8922439217567444, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.621372401714325, "reward_std": 0.2364855408668518, "rewards/accuracy_reward": 0.6315764486789703, "rewards/format_reward": 0.9897959232330322, "step": 951 }, { "completion_length": 172.10204315185547, "epoch": 0.09579874213836478, "grad_norm": 1.1641069650650024, "kl": 0.0653076171875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7376802563667297, "reward_std": 0.15887406840920448, "rewards/accuracy_reward": 0.7376802861690521, "rewards/format_reward": 1.0, "step": 952 }, { "completion_length": 197.27550506591797, "epoch": 0.0958993710691824, "grad_norm": 1.3062129020690918, "kl": 0.094970703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7944959998130798, "reward_std": 0.17799173295497894, "rewards/accuracy_reward": 0.8047000467777252, "rewards/format_reward": 0.9897959232330322, "step": 953 }, { "completion_length": 158.64285278320312, "epoch": 0.096, "grad_norm": 4.365025997161865, "kl": 0.0555419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8163264989852905, "reward_std": 0.16302528232336044, "rewards/accuracy_reward": 0.8265305757522583, "rewards/format_reward": 0.9897959232330322, "step": 954 }, { "completion_length": 168.9693832397461, "epoch": 0.09610062893081761, "grad_norm": 0.5818960070610046, "kl": 0.0548095703125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8469387292861938, "reward_std": 0.1270286738872528, "rewards/accuracy_reward": 0.8469387590885162, "rewards/format_reward": 1.0, "step": 955 }, { "completion_length": 226.03060913085938, "epoch": 0.09620125786163522, "grad_norm": 0.642296552658081, "kl": 0.0731201171875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.658253252506256, "reward_std": 0.23288630694150925, "rewards/accuracy_reward": 0.678661435842514, "rewards/format_reward": 0.9795918464660645, "step": 956 }, { "completion_length": 209.38774871826172, "epoch": 0.09630188679245283, "grad_norm": 1.3646646738052368, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8951246738433838, "reward_std": 0.20138927549123764, "rewards/accuracy_reward": 0.905328780412674, "rewards/format_reward": 0.9897959232330322, "step": 957 }, { "completion_length": 236.53060913085938, "epoch": 0.09640251572327044, "grad_norm": 0.9614511132240295, "kl": 0.0562744140625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5229714512825012, "reward_std": 0.22360632568597794, "rewards/accuracy_reward": 0.5229714512825012, "rewards/format_reward": 1.0, "step": 958 }, { "completion_length": 211.9693832397461, "epoch": 0.09650314465408805, "grad_norm": 0.994245707988739, "kl": 0.0494384765625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6816326379776, "reward_std": 0.19083648175001144, "rewards/accuracy_reward": 0.6918367147445679, "rewards/format_reward": 0.9897959232330322, "step": 959 }, { "completion_length": 203.11224365234375, "epoch": 0.09660377358490566, "grad_norm": 1.322145700454712, "kl": 0.0733642578125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.5244898200035095, "reward_std": 0.20649219304323196, "rewards/accuracy_reward": 0.5653060972690582, "rewards/format_reward": 0.9591836631298065, "step": 960 }, { "completion_length": 173.14285278320312, "epoch": 0.09670440251572326, "grad_norm": 1.3800655603408813, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7418270111083984, "reward_std": 0.24446451663970947, "rewards/accuracy_reward": 0.7826433181762695, "rewards/format_reward": 0.9591836631298065, "step": 961 }, { "completion_length": 260.1428527832031, "epoch": 0.09680503144654089, "grad_norm": 0.471605122089386, "kl": 0.0545654296875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6850340366363525, "reward_std": 0.2355017364025116, "rewards/accuracy_reward": 0.7156462371349335, "rewards/format_reward": 0.9693877398967743, "step": 962 }, { "completion_length": 212.15306091308594, "epoch": 0.0969056603773585, "grad_norm": 1.3412365913391113, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6423711776733398, "reward_std": 0.2936323806643486, "rewards/accuracy_reward": 0.6627793908119202, "rewards/format_reward": 0.9795918166637421, "step": 963 }, { "completion_length": 159.9387664794922, "epoch": 0.0970062893081761, "grad_norm": 1.2910951375961304, "kl": 0.0450439453125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.8244897723197937, "reward_std": 0.18642308562994003, "rewards/accuracy_reward": 0.8244897723197937, "rewards/format_reward": 1.0, "step": 964 }, { "completion_length": 198.05101013183594, "epoch": 0.09710691823899371, "grad_norm": 8.321893692016602, "kl": 0.055908203125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.735251545906067, "reward_std": 0.27427829802036285, "rewards/accuracy_reward": 0.7454556524753571, "rewards/format_reward": 0.9897959232330322, "step": 965 }, { "completion_length": 238.948974609375, "epoch": 0.09720754716981132, "grad_norm": 1.2712323665618896, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.577551007270813, "reward_std": 0.22049149870872498, "rewards/accuracy_reward": 0.5775510370731354, "rewards/format_reward": 1.0, "step": 966 }, { "completion_length": 294.05101013183594, "epoch": 0.09730817610062893, "grad_norm": 0.8999652862548828, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5031465888023376, "reward_std": 0.2708842232823372, "rewards/accuracy_reward": 0.5031466782093048, "rewards/format_reward": 1.0, "step": 967 }, { "completion_length": 224.75509643554688, "epoch": 0.09740880503144654, "grad_norm": 1.4563145637512207, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.689980924129486, "reward_std": 0.30204570293426514, "rewards/accuracy_reward": 0.6899808645248413, "rewards/format_reward": 1.0, "step": 968 }, { "completion_length": 234.38774871826172, "epoch": 0.09750943396226415, "grad_norm": 0.8879889845848083, "kl": 0.0509033203125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7512677907943726, "reward_std": 0.23449905216693878, "rewards/accuracy_reward": 0.7614718377590179, "rewards/format_reward": 0.9897959232330322, "step": 969 }, { "completion_length": 207.448974609375, "epoch": 0.09761006289308176, "grad_norm": 0.8406742811203003, "kl": 0.0394287109375, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.7708556652069092, "reward_std": 0.20876939594745636, "rewards/accuracy_reward": 0.7708556652069092, "rewards/format_reward": 1.0, "step": 970 }, { "completion_length": 198.1938705444336, "epoch": 0.09771069182389937, "grad_norm": 0.7793022394180298, "kl": 0.0452880859375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6521620154380798, "reward_std": 0.16060733795166016, "rewards/accuracy_reward": 0.6725701987743378, "rewards/format_reward": 0.9795918166637421, "step": 971 }, { "completion_length": 185.79591369628906, "epoch": 0.09781132075471698, "grad_norm": 1.0794398784637451, "kl": 0.0460205078125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6749271750450134, "reward_std": 0.23080143332481384, "rewards/accuracy_reward": 0.6851311922073364, "rewards/format_reward": 0.9897959232330322, "step": 972 }, { "completion_length": 338.551025390625, "epoch": 0.09791194968553459, "grad_norm": 2.4073281288146973, "kl": 0.056884765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.4102040529251099, "reward_std": 0.2903100848197937, "rewards/accuracy_reward": 0.430612251162529, "rewards/format_reward": 0.9795918464660645, "step": 973 }, { "completion_length": 219.88774871826172, "epoch": 0.0980125786163522, "grad_norm": 0.6986329555511475, "kl": 0.0584716796875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7180206775665283, "reward_std": 0.1012127548456192, "rewards/accuracy_reward": 0.7282247841358185, "rewards/format_reward": 0.9897959232330322, "step": 974 }, { "completion_length": 272.49999237060547, "epoch": 0.09811320754716982, "grad_norm": 0.5290104746818542, "kl": 0.0469970703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7894569039344788, "reward_std": 0.19760526344180107, "rewards/accuracy_reward": 0.7996610403060913, "rewards/format_reward": 0.9897959232330322, "step": 975 }, { "completion_length": 251.81631469726562, "epoch": 0.09821383647798743, "grad_norm": 1.048566460609436, "kl": 0.048095703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.626660168170929, "reward_std": 0.31234803795814514, "rewards/accuracy_reward": 0.6572723984718323, "rewards/format_reward": 0.9693877398967743, "step": 976 }, { "completion_length": 227.23468780517578, "epoch": 0.09831446540880504, "grad_norm": 0.7925608158111572, "kl": 0.052490234375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6231778264045715, "reward_std": 0.18003105372190475, "rewards/accuracy_reward": 0.6333818882703781, "rewards/format_reward": 0.9897959232330322, "step": 977 }, { "completion_length": 234.9285659790039, "epoch": 0.09841509433962264, "grad_norm": 0.6964252591133118, "kl": 0.0487060546875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7070489525794983, "reward_std": 0.1953403651714325, "rewards/accuracy_reward": 0.7172530293464661, "rewards/format_reward": 0.9897959232330322, "step": 978 }, { "completion_length": 290.6632537841797, "epoch": 0.09851572327044025, "grad_norm": 0.5419672727584839, "kl": 0.04248046875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.665306031703949, "reward_std": 0.16032027080655098, "rewards/accuracy_reward": 0.6755101680755615, "rewards/format_reward": 0.9897959232330322, "step": 979 }, { "completion_length": 273.7142868041992, "epoch": 0.09861635220125786, "grad_norm": 0.5729218125343323, "kl": 0.04052734375, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.740524709224701, "reward_std": 0.24179081618785858, "rewards/accuracy_reward": 0.7507288455963135, "rewards/format_reward": 0.9897959232330322, "step": 980 }, { "completion_length": 237.51020050048828, "epoch": 0.09871698113207547, "grad_norm": 1.4427727460861206, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6608927249908447, "reward_std": 0.2736624553799629, "rewards/accuracy_reward": 0.6710967719554901, "rewards/format_reward": 0.9897959232330322, "step": 981 }, { "completion_length": 240.16326141357422, "epoch": 0.09881761006289308, "grad_norm": 0.9661228656768799, "kl": 0.0455322265625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6321443915367126, "reward_std": 0.17809298262000084, "rewards/accuracy_reward": 0.6423485279083252, "rewards/format_reward": 0.9897959232330322, "step": 982 }, { "completion_length": 234.24488830566406, "epoch": 0.09891823899371069, "grad_norm": 0.9877350330352783, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5139208436012268, "reward_std": 0.24612388759851456, "rewards/accuracy_reward": 0.5241248905658722, "rewards/format_reward": 0.9897959232330322, "step": 983 }, { "completion_length": 200.58163452148438, "epoch": 0.0990188679245283, "grad_norm": 0.4018634855747223, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7857142090797424, "reward_std": 0.09437987208366394, "rewards/accuracy_reward": 0.8061224222183228, "rewards/format_reward": 0.9795918166637421, "step": 984 }, { "completion_length": 265.7142791748047, "epoch": 0.0991194968553459, "grad_norm": 1.246738314628601, "kl": 0.052734375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6175143718719482, "reward_std": 0.23837263137102127, "rewards/accuracy_reward": 0.6583306789398193, "rewards/format_reward": 0.9591836631298065, "step": 985 }, { "completion_length": 240.6938705444336, "epoch": 0.09922012578616352, "grad_norm": 0.9364804029464722, "kl": 0.042236328125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.455862045288086, "reward_std": 0.22188221663236618, "rewards/accuracy_reward": 0.4660661518573761, "rewards/format_reward": 0.9897959232330322, "step": 986 }, { "completion_length": 239.78571319580078, "epoch": 0.09932075471698114, "grad_norm": 2.441499710083008, "kl": 0.0928955078125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.4415037631988525, "reward_std": 0.3101746290922165, "rewards/accuracy_reward": 0.4517078548669815, "rewards/format_reward": 0.9897959232330322, "step": 987 }, { "completion_length": 263.7040710449219, "epoch": 0.09942138364779875, "grad_norm": 1.5627021789550781, "kl": 0.0423583984375, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.49885094165802, "reward_std": 0.2891492247581482, "rewards/accuracy_reward": 0.5090550780296326, "rewards/format_reward": 0.9897959232330322, "step": 988 }, { "completion_length": 153.25509643554688, "epoch": 0.09952201257861636, "grad_norm": 4.473246097564697, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6692676544189453, "reward_std": 0.2417593151330948, "rewards/accuracy_reward": 0.6794717609882355, "rewards/format_reward": 0.9897959232330322, "step": 989 }, { "completion_length": 266.2346878051758, "epoch": 0.09962264150943397, "grad_norm": 1.1956872940063477, "kl": 0.0877685546875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6843537092208862, "reward_std": 0.2827950790524483, "rewards/accuracy_reward": 0.7047618925571442, "rewards/format_reward": 0.9795918464660645, "step": 990 }, { "completion_length": 217.59182739257812, "epoch": 0.09972327044025157, "grad_norm": 1.3953561782836914, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6207174062728882, "reward_std": 0.23219048231840134, "rewards/accuracy_reward": 0.6207173764705658, "rewards/format_reward": 1.0, "step": 991 }, { "completion_length": 210.34693145751953, "epoch": 0.09982389937106918, "grad_norm": 0.8145104050636292, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.700370967388153, "reward_std": 0.1402660757303238, "rewards/accuracy_reward": 0.7003710269927979, "rewards/format_reward": 1.0, "step": 992 }, { "completion_length": 278.27549743652344, "epoch": 0.09992452830188679, "grad_norm": 1.0045194625854492, "kl": 0.0484619140625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5421584248542786, "reward_std": 0.2321646809577942, "rewards/accuracy_reward": 0.5421584695577621, "rewards/format_reward": 1.0, "step": 993 }, { "completion_length": 310.1530456542969, "epoch": 0.1000251572327044, "grad_norm": 0.8764633536338806, "kl": 0.049560546875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5510203838348389, "reward_std": 0.3247663378715515, "rewards/accuracy_reward": 0.5714285373687744, "rewards/format_reward": 0.9795918166637421, "step": 994 }, { "completion_length": 195.16326141357422, "epoch": 0.10012578616352201, "grad_norm": 1.301651120185852, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7154493927955627, "reward_std": 0.19950930029153824, "rewards/accuracy_reward": 0.7154494225978851, "rewards/format_reward": 1.0, "step": 995 }, { "completion_length": 286.4693832397461, "epoch": 0.10022641509433962, "grad_norm": 0.8186428546905518, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6326587796211243, "reward_std": 0.2158619686961174, "rewards/accuracy_reward": 0.66327103972435, "rewards/format_reward": 0.9693877398967743, "step": 996 }, { "completion_length": 241.86734771728516, "epoch": 0.10032704402515723, "grad_norm": 1.534914255142212, "kl": 0.0562744140625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6701247692108154, "reward_std": 0.3466052711009979, "rewards/accuracy_reward": 0.7109410166740417, "rewards/format_reward": 0.9591836631298065, "step": 997 }, { "completion_length": 223.80611419677734, "epoch": 0.10042767295597484, "grad_norm": 1.1962194442749023, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7503401041030884, "reward_std": 0.10930342227220535, "rewards/accuracy_reward": 0.7605442404747009, "rewards/format_reward": 0.9897959232330322, "step": 998 }, { "completion_length": 271.60203552246094, "epoch": 0.10052830188679246, "grad_norm": 0.9760859608650208, "kl": 0.0465087890625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6613237857818604, "reward_std": 0.20724325627088547, "rewards/accuracy_reward": 0.6919361352920532, "rewards/format_reward": 0.9693877398967743, "step": 999 }, { "completion_length": 203.79591369628906, "epoch": 0.10062893081761007, "grad_norm": 1.1527886390686035, "kl": 0.0474853515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.8278910517692566, "reward_std": 0.18794985115528107, "rewards/accuracy_reward": 0.8482992947101593, "rewards/format_reward": 0.9795918166637421, "step": 1000 }, { "completion_length": 253.25509643554688, "epoch": 0.10072955974842768, "grad_norm": 3.4504776000976562, "kl": 0.107666015625, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.5217642188072205, "reward_std": 0.31700053811073303, "rewards/accuracy_reward": 0.5319682955741882, "rewards/format_reward": 0.9897959232330322, "step": 1001 }, { "completion_length": 270.33673095703125, "epoch": 0.10083018867924529, "grad_norm": 1.1323031187057495, "kl": 0.0369873046875, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.548899233341217, "reward_std": 0.3834957778453827, "rewards/accuracy_reward": 0.5693073868751526, "rewards/format_reward": 0.9795918166637421, "step": 1002 }, { "completion_length": 255.06122589111328, "epoch": 0.1009308176100629, "grad_norm": 0.9165955185890198, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6510552763938904, "reward_std": 0.22339371591806412, "rewards/accuracy_reward": 0.6612593531608582, "rewards/format_reward": 0.9897959232330322, "step": 1003 }, { "completion_length": 222.34693145751953, "epoch": 0.1010314465408805, "grad_norm": 0.7545350790023804, "kl": 0.0499267578125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7360543608665466, "reward_std": 0.20999371260404587, "rewards/accuracy_reward": 0.736054390668869, "rewards/format_reward": 1.0, "step": 1004 }, { "completion_length": 272.55101776123047, "epoch": 0.10113207547169811, "grad_norm": 1.170790672302246, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.5061758756637573, "reward_std": 0.2133755013346672, "rewards/accuracy_reward": 0.5163799971342087, "rewards/format_reward": 0.9897959232330322, "step": 1005 }, { "completion_length": 216.84693908691406, "epoch": 0.10123270440251572, "grad_norm": 0.5773947238922119, "kl": 0.0404052734375, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6122448444366455, "reward_std": 0.1270286738872528, "rewards/accuracy_reward": 0.6122448742389679, "rewards/format_reward": 1.0, "step": 1006 }, { "completion_length": 261.7244873046875, "epoch": 0.10133333333333333, "grad_norm": 0.5699846148490906, "kl": 0.0582275390625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6598873734474182, "reward_std": 0.19860567897558212, "rewards/accuracy_reward": 0.6802955567836761, "rewards/format_reward": 0.9795918464660645, "step": 1007 }, { "completion_length": 274.10203552246094, "epoch": 0.10143396226415094, "grad_norm": 0.9973979592323303, "kl": 0.059326171875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.4897959232330322, "reward_std": 0.152210034430027, "rewards/accuracy_reward": 0.48979590833187103, "rewards/format_reward": 1.0, "step": 1008 }, { "completion_length": 206.99999237060547, "epoch": 0.10153459119496855, "grad_norm": 1.0118119716644287, "kl": 0.0513916015625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6908405423164368, "reward_std": 0.1037081815302372, "rewards/accuracy_reward": 0.6908406019210815, "rewards/format_reward": 1.0, "step": 1009 }, { "completion_length": 259.0306091308594, "epoch": 0.10163522012578616, "grad_norm": 2.367161512374878, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5838677287101746, "reward_std": 0.1874590367078781, "rewards/accuracy_reward": 0.5940718948841095, "rewards/format_reward": 0.9897959232330322, "step": 1010 }, { "completion_length": 275.83673095703125, "epoch": 0.10173584905660378, "grad_norm": 0.5766984224319458, "kl": 0.03790283203125, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.6428571343421936, "reward_std": 0.19834674149751663, "rewards/accuracy_reward": 0.6632652878761292, "rewards/format_reward": 0.9795918166637421, "step": 1011 }, { "completion_length": 275.52040100097656, "epoch": 0.10183647798742139, "grad_norm": 0.6909803748130798, "kl": 0.0316162109375, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.6569411754608154, "reward_std": 0.209748312830925, "rewards/accuracy_reward": 0.6671452522277832, "rewards/format_reward": 0.9897959232330322, "step": 1012 }, { "completion_length": 229.66326141357422, "epoch": 0.101937106918239, "grad_norm": 2.033867120742798, "kl": 0.0537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7197279334068298, "reward_std": 0.20221781730651855, "rewards/accuracy_reward": 0.7197278439998627, "rewards/format_reward": 1.0, "step": 1013 }, { "completion_length": 295.551025390625, "epoch": 0.10203773584905661, "grad_norm": 1.1397457122802734, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6309523582458496, "reward_std": 0.2531791478395462, "rewards/accuracy_reward": 0.6411564648151398, "rewards/format_reward": 0.9897959232330322, "step": 1014 }, { "completion_length": 263.55101013183594, "epoch": 0.10213836477987422, "grad_norm": 0.782468855381012, "kl": 0.0369873046875, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.6976653337478638, "reward_std": 0.2531367763876915, "rewards/accuracy_reward": 0.7078694701194763, "rewards/format_reward": 0.9897959232330322, "step": 1015 }, { "completion_length": 233.20407104492188, "epoch": 0.10223899371069183, "grad_norm": 1.7181217670440674, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.692510962486267, "reward_std": 0.1929338052868843, "rewards/accuracy_reward": 0.7027150690555573, "rewards/format_reward": 0.9897959232330322, "step": 1016 }, { "completion_length": 209.08162689208984, "epoch": 0.10233962264150943, "grad_norm": 0.7207564115524292, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7694278359413147, "reward_std": 0.20626383647322655, "rewards/accuracy_reward": 0.7694278657436371, "rewards/format_reward": 1.0, "step": 1017 }, { "completion_length": 206.53060913085938, "epoch": 0.10244025157232704, "grad_norm": 1.313743233680725, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7611611485481262, "reward_std": 0.1528782844543457, "rewards/accuracy_reward": 0.7611612677574158, "rewards/format_reward": 1.0, "step": 1018 }, { "completion_length": 258.7244873046875, "epoch": 0.10254088050314465, "grad_norm": 1.7955472469329834, "kl": 0.0576171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.680272102355957, "reward_std": 0.20731844007968903, "rewards/accuracy_reward": 0.7108843326568604, "rewards/format_reward": 0.9693877398967743, "step": 1019 }, { "completion_length": 189.02040100097656, "epoch": 0.10264150943396226, "grad_norm": 1.751122236251831, "kl": 0.0877685546875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6978044509887695, "reward_std": 0.20477762073278427, "rewards/accuracy_reward": 0.6978045403957367, "rewards/format_reward": 1.0, "step": 1020 }, { "completion_length": 268.05101013183594, "epoch": 0.10274213836477987, "grad_norm": 0.6351332664489746, "kl": 0.07415771484375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6715584993362427, "reward_std": 0.15126977488398552, "rewards/accuracy_reward": 0.6715584397315979, "rewards/format_reward": 1.0, "step": 1021 }, { "completion_length": 215.2551040649414, "epoch": 0.10284276729559748, "grad_norm": 1.0309464931488037, "kl": 0.0499267578125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.779220700263977, "reward_std": 0.20058126747608185, "rewards/accuracy_reward": 0.7792207598686218, "rewards/format_reward": 1.0, "step": 1022 }, { "completion_length": 217.2244873046875, "epoch": 0.10294339622641509, "grad_norm": 0.9338682293891907, "kl": 0.0565185546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7470076084136963, "reward_std": 0.2349373996257782, "rewards/accuracy_reward": 0.7776198387145996, "rewards/format_reward": 0.9693877398967743, "step": 1023 }, { "completion_length": 251.78571319580078, "epoch": 0.10304402515723271, "grad_norm": 1.001433253288269, "kl": 0.0635986328125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.553813099861145, "reward_std": 0.1365418299101293, "rewards/accuracy_reward": 0.5640172064304352, "rewards/format_reward": 0.9897959232330322, "step": 1024 }, { "completion_length": 240.1530532836914, "epoch": 0.10314465408805032, "grad_norm": 1.5394295454025269, "kl": 0.0606689453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5120593309402466, "reward_std": 0.2662455290555954, "rewards/accuracy_reward": 0.5222634077072144, "rewards/format_reward": 0.9897959232330322, "step": 1025 }, { "completion_length": 296.2755126953125, "epoch": 0.10324528301886793, "grad_norm": 0.6810087561607361, "kl": 0.05078125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6181582808494568, "reward_std": 0.19223853200674057, "rewards/accuracy_reward": 0.6283623576164246, "rewards/format_reward": 0.9897959232330322, "step": 1026 }, { "completion_length": 230.4285659790039, "epoch": 0.10334591194968554, "grad_norm": 1.411124587059021, "kl": 0.0531005859375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5727970600128174, "reward_std": 0.2686118111014366, "rewards/accuracy_reward": 0.5830011963844299, "rewards/format_reward": 0.9897959232330322, "step": 1027 }, { "completion_length": 263.15306091308594, "epoch": 0.10344654088050315, "grad_norm": 1.0971214771270752, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6600399017333984, "reward_std": 0.22358879446983337, "rewards/accuracy_reward": 0.6702440679073334, "rewards/format_reward": 0.9897959232330322, "step": 1028 }, { "completion_length": 176.4285659790039, "epoch": 0.10354716981132076, "grad_norm": 2.432762622833252, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.5975663661956787, "reward_std": 0.28450894355773926, "rewards/accuracy_reward": 0.617974579334259, "rewards/format_reward": 0.9795918464660645, "step": 1029 }, { "completion_length": 219.69387817382812, "epoch": 0.10364779874213836, "grad_norm": 1.2040445804595947, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7298235893249512, "reward_std": 0.1880200430750847, "rewards/accuracy_reward": 0.7400278151035309, "rewards/format_reward": 0.9897959232330322, "step": 1030 }, { "completion_length": 320.6326446533203, "epoch": 0.10374842767295597, "grad_norm": 3.5764541625976562, "kl": 0.1925048828125, "learning_rate": 1e-06, "loss": 0.0077, "reward": 1.657780408859253, "reward_std": 0.22177453339099884, "rewards/accuracy_reward": 0.6985968053340912, "rewards/format_reward": 0.9591836631298065, "step": 1031 }, { "completion_length": 206.26529693603516, "epoch": 0.10384905660377358, "grad_norm": 0.6766352653503418, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.4728954434394836, "reward_std": 0.15672528743743896, "rewards/accuracy_reward": 0.4830995500087738, "rewards/format_reward": 0.9897959232330322, "step": 1032 }, { "completion_length": 244.27549743652344, "epoch": 0.10394968553459119, "grad_norm": 0.7966241836547852, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7703242301940918, "reward_std": 0.15272490680217743, "rewards/accuracy_reward": 0.7805283069610596, "rewards/format_reward": 0.9897959232330322, "step": 1033 }, { "completion_length": 237.52040100097656, "epoch": 0.1040503144654088, "grad_norm": 1.653734803199768, "kl": 0.0595703125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5591468214988708, "reward_std": 0.23201309889554977, "rewards/accuracy_reward": 0.5795549154281616, "rewards/format_reward": 0.9795918464660645, "step": 1034 }, { "completion_length": 270.02040100097656, "epoch": 0.10415094339622641, "grad_norm": 0.7951254844665527, "kl": 0.053955078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.700880229473114, "reward_std": 0.1402314119040966, "rewards/accuracy_reward": 0.700880229473114, "rewards/format_reward": 1.0, "step": 1035 }, { "completion_length": 228.31632232666016, "epoch": 0.10425157232704403, "grad_norm": 0.8410582542419434, "kl": 0.0428466796875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7690476179122925, "reward_std": 0.17019392549991608, "rewards/accuracy_reward": 0.7792516946792603, "rewards/format_reward": 0.9897959232330322, "step": 1036 }, { "completion_length": 257.9693832397461, "epoch": 0.10435220125786164, "grad_norm": 1.5043590068817139, "kl": 0.04559326171875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.586734652519226, "reward_std": 0.26275530457496643, "rewards/accuracy_reward": 0.5969387590885162, "rewards/format_reward": 0.9897959232330322, "step": 1037 }, { "completion_length": 235.44898223876953, "epoch": 0.10445283018867925, "grad_norm": 0.7304278612136841, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7244897484779358, "reward_std": 0.189375102519989, "rewards/accuracy_reward": 0.7448979616165161, "rewards/format_reward": 0.9795918166637421, "step": 1038 }, { "completion_length": 247.74488830566406, "epoch": 0.10455345911949686, "grad_norm": 2.02422833442688, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.5862993597984314, "reward_std": 0.20780719816684723, "rewards/accuracy_reward": 0.6067074239253998, "rewards/format_reward": 0.9795918166637421, "step": 1039 }, { "completion_length": 210.11224365234375, "epoch": 0.10465408805031447, "grad_norm": 0.765052080154419, "kl": 0.04718017578125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7130708694458008, "reward_std": 0.18286235630512238, "rewards/accuracy_reward": 0.7130708992481232, "rewards/format_reward": 1.0, "step": 1040 }, { "completion_length": 267.06121826171875, "epoch": 0.10475471698113208, "grad_norm": 1.428918719291687, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.4732504487037659, "reward_std": 0.3488420695066452, "rewards/accuracy_reward": 0.48345451056957245, "rewards/format_reward": 0.9897959232330322, "step": 1041 }, { "completion_length": 295.3163299560547, "epoch": 0.10485534591194969, "grad_norm": 0.7588297724723816, "kl": 0.048095703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.4742615818977356, "reward_std": 0.23634684830904007, "rewards/accuracy_reward": 0.49466970562934875, "rewards/format_reward": 0.9795918166637421, "step": 1042 }, { "completion_length": 256.1020278930664, "epoch": 0.1049559748427673, "grad_norm": 0.8684857487678528, "kl": 0.0416259765625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7140766978263855, "reward_std": 0.30007657408714294, "rewards/accuracy_reward": 0.7242807745933533, "rewards/format_reward": 0.9897959232330322, "step": 1043 }, { "completion_length": 265.7550964355469, "epoch": 0.1050566037735849, "grad_norm": 0.8945037126541138, "kl": 0.052490234375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7303206324577332, "reward_std": 0.22437769174575806, "rewards/accuracy_reward": 0.7405247390270233, "rewards/format_reward": 0.9897959232330322, "step": 1044 }, { "completion_length": 239.11223602294922, "epoch": 0.10515723270440251, "grad_norm": 111.80280303955078, "kl": 1.6646728515625, "learning_rate": 1e-06, "loss": 0.0669, "reward": 1.7024221420288086, "reward_std": 0.3084298297762871, "rewards/accuracy_reward": 0.712626188993454, "rewards/format_reward": 0.9897959232330322, "step": 1045 }, { "completion_length": 265.04080963134766, "epoch": 0.10525786163522012, "grad_norm": 0.6075758934020996, "kl": 0.0523681640625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6057806611061096, "reward_std": 0.20222563296556473, "rewards/accuracy_reward": 0.6057807505130768, "rewards/format_reward": 1.0, "step": 1046 }, { "completion_length": 276.57142639160156, "epoch": 0.10535849056603773, "grad_norm": 1.0841710567474365, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5482993125915527, "reward_std": 0.3242322504520416, "rewards/accuracy_reward": 0.5687074810266495, "rewards/format_reward": 0.9795918166637421, "step": 1047 }, { "completion_length": 230.88775634765625, "epoch": 0.10545911949685535, "grad_norm": 0.8523383736610413, "kl": 0.057861328125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6306808590888977, "reward_std": 0.1871284767985344, "rewards/accuracy_reward": 0.6408849358558655, "rewards/format_reward": 0.9897959232330322, "step": 1048 }, { "completion_length": 193.07142639160156, "epoch": 0.10555974842767296, "grad_norm": 0.49489524960517883, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7293004989624023, "reward_std": 0.12646225094795227, "rewards/accuracy_reward": 0.7293004989624023, "rewards/format_reward": 1.0, "step": 1049 }, { "completion_length": 257.7142791748047, "epoch": 0.10566037735849057, "grad_norm": 1.0688635110855103, "kl": 0.0552978515625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6028575897216797, "reward_std": 0.20063255727291107, "rewards/accuracy_reward": 0.6028576791286469, "rewards/format_reward": 1.0, "step": 1050 }, { "completion_length": 286.8163146972656, "epoch": 0.10576100628930818, "grad_norm": 0.6521627306938171, "kl": 0.0484619140625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6399140357971191, "reward_std": 0.18853459507226944, "rewards/accuracy_reward": 0.6603222191333771, "rewards/format_reward": 0.9795918464660645, "step": 1051 }, { "completion_length": 292.28570556640625, "epoch": 0.10586163522012579, "grad_norm": 0.8718270659446716, "kl": 0.03143310546875, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.528706133365631, "reward_std": 0.19162601977586746, "rewards/accuracy_reward": 0.5287061333656311, "rewards/format_reward": 1.0, "step": 1052 }, { "completion_length": 220.51020050048828, "epoch": 0.1059622641509434, "grad_norm": 0.6499840021133423, "kl": 0.05859375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6755101084709167, "reward_std": 0.13875406235456467, "rewards/accuracy_reward": 0.6755101680755615, "rewards/format_reward": 1.0, "step": 1053 }, { "completion_length": 222.79591369628906, "epoch": 0.106062893081761, "grad_norm": 1.5210041999816895, "kl": 0.0452880859375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6582847833633423, "reward_std": 0.24094925075769424, "rewards/accuracy_reward": 0.6684888005256653, "rewards/format_reward": 0.9897959232330322, "step": 1054 }, { "completion_length": 228.41836547851562, "epoch": 0.10616352201257861, "grad_norm": 1.2137198448181152, "kl": 0.045166015625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.651760458946228, "reward_std": 0.13911127299070358, "rewards/accuracy_reward": 0.6517605185508728, "rewards/format_reward": 1.0, "step": 1055 }, { "completion_length": 237.39794921875, "epoch": 0.10626415094339622, "grad_norm": 1.2867825031280518, "kl": 0.0518798828125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5671970844268799, "reward_std": 0.2756821885704994, "rewards/accuracy_reward": 0.5774011611938477, "rewards/format_reward": 0.9897959232330322, "step": 1056 }, { "completion_length": 220.9693832397461, "epoch": 0.10636477987421383, "grad_norm": 1.6809360980987549, "kl": 0.059326171875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7518667578697205, "reward_std": 0.21673351526260376, "rewards/accuracy_reward": 0.772274911403656, "rewards/format_reward": 0.9795918166637421, "step": 1057 }, { "completion_length": 225.35713958740234, "epoch": 0.10646540880503144, "grad_norm": 1.297412395477295, "kl": 0.0587158203125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6537064909934998, "reward_std": 0.2677619829773903, "rewards/accuracy_reward": 0.653706431388855, "rewards/format_reward": 1.0, "step": 1058 }, { "completion_length": 147.2653045654297, "epoch": 0.10656603773584905, "grad_norm": 0.8043281435966492, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7768706679344177, "reward_std": 0.07358486577868462, "rewards/accuracy_reward": 0.7768707275390625, "rewards/format_reward": 1.0, "step": 1059 }, { "completion_length": 263.60203552246094, "epoch": 0.10666666666666667, "grad_norm": 1.0400100946426392, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6224735379219055, "reward_std": 0.23321224749088287, "rewards/accuracy_reward": 0.6326776146888733, "rewards/format_reward": 0.9897959232330322, "step": 1060 }, { "completion_length": 252.62245178222656, "epoch": 0.10676729559748428, "grad_norm": 1.0263559818267822, "kl": 0.0469970703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.690362274646759, "reward_std": 0.24596790969371796, "rewards/accuracy_reward": 0.7005664110183716, "rewards/format_reward": 0.9897959232330322, "step": 1061 }, { "completion_length": 221.0204086303711, "epoch": 0.10686792452830189, "grad_norm": 1.4131572246551514, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.684725284576416, "reward_std": 0.16330943629145622, "rewards/accuracy_reward": 0.7051335275173187, "rewards/format_reward": 0.9795918464660645, "step": 1062 }, { "completion_length": 249.2448959350586, "epoch": 0.1069685534591195, "grad_norm": 0.935867428779602, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.66200190782547, "reward_std": 0.20514695346355438, "rewards/accuracy_reward": 0.6620019376277924, "rewards/format_reward": 1.0, "step": 1063 }, { "completion_length": 249.58162689208984, "epoch": 0.10706918238993711, "grad_norm": 2.1620798110961914, "kl": 0.06689453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.563176155090332, "reward_std": 0.283614382147789, "rewards/accuracy_reward": 0.563176229596138, "rewards/format_reward": 1.0, "step": 1064 }, { "completion_length": 286.5918273925781, "epoch": 0.10716981132075472, "grad_norm": 0.6979299783706665, "kl": 0.050048828125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.544350266456604, "reward_std": 0.24113713204860687, "rewards/accuracy_reward": 0.5443502962589264, "rewards/format_reward": 1.0, "step": 1065 }, { "completion_length": 199.7346954345703, "epoch": 0.10727044025157233, "grad_norm": 0.6875602006912231, "kl": 0.0396728515625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.7755101919174194, "reward_std": 0.1270286738872528, "rewards/accuracy_reward": 0.7755101919174194, "rewards/format_reward": 1.0, "step": 1066 }, { "completion_length": 224.9693832397461, "epoch": 0.10737106918238994, "grad_norm": 1.2482794523239136, "kl": 0.10498046875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.652772307395935, "reward_std": 0.25140945985913277, "rewards/accuracy_reward": 0.6935886442661285, "rewards/format_reward": 0.9591836631298065, "step": 1067 }, { "completion_length": 244.64286041259766, "epoch": 0.10747169811320754, "grad_norm": 0.9136814475059509, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.582415521144867, "reward_std": 0.2327292636036873, "rewards/accuracy_reward": 0.6028237342834473, "rewards/format_reward": 0.9795918166637421, "step": 1068 }, { "completion_length": 205.64285278320312, "epoch": 0.10757232704402515, "grad_norm": 12.317825317382812, "kl": 0.244384765625, "learning_rate": 1e-06, "loss": 0.0098, "reward": 1.7133138179779053, "reward_std": 0.1764780580997467, "rewards/accuracy_reward": 0.7133138477802277, "rewards/format_reward": 1.0, "step": 1069 }, { "completion_length": 209.0408172607422, "epoch": 0.10767295597484276, "grad_norm": 1.5307607650756836, "kl": 0.0867919921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.637657105922699, "reward_std": 0.24447064101696014, "rewards/accuracy_reward": 0.6478613317012787, "rewards/format_reward": 0.9897959232330322, "step": 1070 }, { "completion_length": 223.64285278320312, "epoch": 0.10777358490566037, "grad_norm": 1.4334629774093628, "kl": 0.0823974609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7710044980049133, "reward_std": 0.1478070691227913, "rewards/accuracy_reward": 0.7710044980049133, "rewards/format_reward": 1.0, "step": 1071 }, { "completion_length": 242.1734619140625, "epoch": 0.10787421383647798, "grad_norm": 1.0025978088378906, "kl": 0.0531005859375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5787657499313354, "reward_std": 0.30303630232810974, "rewards/accuracy_reward": 0.5787657797336578, "rewards/format_reward": 1.0, "step": 1072 }, { "completion_length": 222.62244415283203, "epoch": 0.1079748427672956, "grad_norm": 0.38430455327033997, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6569573283195496, "reward_std": 0.09099483862519264, "rewards/accuracy_reward": 0.6671613454818726, "rewards/format_reward": 0.9897959232330322, "step": 1073 }, { "completion_length": 216.1734619140625, "epoch": 0.10807547169811321, "grad_norm": 0.5832129120826721, "kl": 0.0386962890625, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.8241961002349854, "reward_std": 0.14696454256772995, "rewards/accuracy_reward": 0.8241961598396301, "rewards/format_reward": 1.0, "step": 1074 }, { "completion_length": 220.81632232666016, "epoch": 0.10817610062893082, "grad_norm": 0.8350942730903625, "kl": 0.057373046875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7112511992454529, "reward_std": 0.1399563066661358, "rewards/accuracy_reward": 0.7112512290477753, "rewards/format_reward": 1.0, "step": 1075 }, { "completion_length": 290.58162689208984, "epoch": 0.10827672955974843, "grad_norm": 0.8395107388496399, "kl": 0.05224609375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7855982780456543, "reward_std": 0.1990465149283409, "rewards/accuracy_reward": 0.7855983078479767, "rewards/format_reward": 1.0, "step": 1076 }, { "completion_length": 251.89795684814453, "epoch": 0.10837735849056604, "grad_norm": 0.9434700012207031, "kl": 0.0606689453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.660909354686737, "reward_std": 0.1973254308104515, "rewards/accuracy_reward": 0.6609093546867371, "rewards/format_reward": 1.0, "step": 1077 }, { "completion_length": 227.35713958740234, "epoch": 0.10847798742138365, "grad_norm": 0.509199857711792, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7908349633216858, "reward_std": 0.09282635897397995, "rewards/accuracy_reward": 0.801039069890976, "rewards/format_reward": 0.9897959232330322, "step": 1078 }, { "completion_length": 264.83673095703125, "epoch": 0.10857861635220126, "grad_norm": 0.979247510433197, "kl": 0.103271484375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.5184058547019958, "reward_std": 0.25778108090162277, "rewards/accuracy_reward": 0.5388140976428986, "rewards/format_reward": 0.9795918464660645, "step": 1079 }, { "completion_length": 251.21428680419922, "epoch": 0.10867924528301887, "grad_norm": 0.8450666069984436, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.615003228187561, "reward_std": 0.16254743561148643, "rewards/accuracy_reward": 0.6252073049545288, "rewards/format_reward": 0.9897959232330322, "step": 1080 }, { "completion_length": 239.20407104492188, "epoch": 0.10877987421383647, "grad_norm": 0.6249140501022339, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7420634627342224, "reward_std": 0.18482805788516998, "rewards/accuracy_reward": 0.7522675395011902, "rewards/format_reward": 0.9897959232330322, "step": 1081 }, { "completion_length": 227.82653045654297, "epoch": 0.10888050314465408, "grad_norm": 1.774242639541626, "kl": 0.0548095703125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6740358471870422, "reward_std": 0.24231857806444168, "rewards/accuracy_reward": 0.6842399835586548, "rewards/format_reward": 0.9897959232330322, "step": 1082 }, { "completion_length": 204.81632232666016, "epoch": 0.10898113207547169, "grad_norm": 0.9950591921806335, "kl": 0.054443359375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6148221492767334, "reward_std": 0.12370758689939976, "rewards/accuracy_reward": 0.6352302730083466, "rewards/format_reward": 0.9795918464660645, "step": 1083 }, { "completion_length": 270.9285583496094, "epoch": 0.1090817610062893, "grad_norm": 0.9804590344429016, "kl": 0.0718994140625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.5596784353256226, "reward_std": 0.26448871940374374, "rewards/accuracy_reward": 0.5698824524879456, "rewards/format_reward": 0.9897959232330322, "step": 1084 }, { "completion_length": 210.448974609375, "epoch": 0.10918238993710692, "grad_norm": 1.06504487991333, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.702974259853363, "reward_std": 0.20423667132854462, "rewards/accuracy_reward": 0.7131783366203308, "rewards/format_reward": 0.9897959232330322, "step": 1085 }, { "completion_length": 248.22447967529297, "epoch": 0.10928301886792453, "grad_norm": 0.6994537711143494, "kl": 0.0537109375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7061224579811096, "reward_std": 0.2822757363319397, "rewards/accuracy_reward": 0.7367346882820129, "rewards/format_reward": 0.9693877398967743, "step": 1086 }, { "completion_length": 346.24488830566406, "epoch": 0.10938364779874214, "grad_norm": 0.5868625044822693, "kl": 0.034912109375, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.4827775359153748, "reward_std": 0.2281644120812416, "rewards/accuracy_reward": 0.5031857192516327, "rewards/format_reward": 0.9795918464660645, "step": 1087 }, { "completion_length": 271.28570556640625, "epoch": 0.10948427672955975, "grad_norm": 0.9473839998245239, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7122448682785034, "reward_std": 0.2492925524711609, "rewards/accuracy_reward": 0.7224489450454712, "rewards/format_reward": 0.9897959232330322, "step": 1088 }, { "completion_length": 287.448974609375, "epoch": 0.10958490566037736, "grad_norm": 0.7316133379936218, "kl": 0.041015625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.5924696326255798, "reward_std": 0.2952314466238022, "rewards/accuracy_reward": 0.6230818927288055, "rewards/format_reward": 0.9693877398967743, "step": 1089 }, { "completion_length": 206.07141876220703, "epoch": 0.10968553459119497, "grad_norm": 1.2907711267471313, "kl": 0.05352783203125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7595058679580688, "reward_std": 0.20709911733865738, "rewards/accuracy_reward": 0.7595058977603912, "rewards/format_reward": 1.0, "step": 1090 }, { "completion_length": 246.5204086303711, "epoch": 0.10978616352201258, "grad_norm": 0.6106665730476379, "kl": 0.05859375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7892776131629944, "reward_std": 0.16808728873729706, "rewards/accuracy_reward": 0.7994816601276398, "rewards/format_reward": 0.9897959232330322, "step": 1091 }, { "completion_length": 261.87754821777344, "epoch": 0.10988679245283019, "grad_norm": 1.1926809549331665, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8027211427688599, "reward_std": 0.2123960256576538, "rewards/accuracy_reward": 0.8027210831642151, "rewards/format_reward": 1.0, "step": 1092 }, { "completion_length": 161.83673095703125, "epoch": 0.1099874213836478, "grad_norm": 1.9337142705917358, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7673818469047546, "reward_std": 0.21196402609348297, "rewards/accuracy_reward": 0.7673818469047546, "rewards/format_reward": 1.0, "step": 1093 }, { "completion_length": 182.61223602294922, "epoch": 0.1100880503144654, "grad_norm": 1.6539617776870728, "kl": 0.095703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.5480087995529175, "reward_std": 0.23966094851493835, "rewards/accuracy_reward": 0.5480088442564011, "rewards/format_reward": 1.0, "step": 1094 }, { "completion_length": 305.9897766113281, "epoch": 0.11018867924528301, "grad_norm": 0.7141932249069214, "kl": 0.041015625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6229591965675354, "reward_std": 0.2551824226975441, "rewards/accuracy_reward": 0.622959166765213, "rewards/format_reward": 1.0, "step": 1095 }, { "completion_length": 231.1530532836914, "epoch": 0.11028930817610062, "grad_norm": 0.49578675627708435, "kl": 0.05517578125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8639838695526123, "reward_std": 0.11084319651126862, "rewards/accuracy_reward": 0.8639839291572571, "rewards/format_reward": 1.0, "step": 1096 }, { "completion_length": 205.01020050048828, "epoch": 0.11038993710691825, "grad_norm": 0.8260464072227478, "kl": 0.043212890625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.775753140449524, "reward_std": 0.17596221342682838, "rewards/accuracy_reward": 0.7757531404495239, "rewards/format_reward": 1.0, "step": 1097 }, { "completion_length": 241.33673095703125, "epoch": 0.11049056603773585, "grad_norm": 1.005519151687622, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6620306372642517, "reward_std": 0.2095610871911049, "rewards/accuracy_reward": 0.6824387609958649, "rewards/format_reward": 0.9795918166637421, "step": 1098 }, { "completion_length": 233.4081573486328, "epoch": 0.11059119496855346, "grad_norm": 1.0717819929122925, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7643519639968872, "reward_std": 0.1577918054535985, "rewards/accuracy_reward": 0.7643519639968872, "rewards/format_reward": 1.0, "step": 1099 }, { "completion_length": 230.62244415283203, "epoch": 0.11069182389937107, "grad_norm": 0.5010136961936951, "kl": 0.03564453125, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.645418107509613, "reward_std": 0.1364556960761547, "rewards/accuracy_reward": 0.6454181373119354, "rewards/format_reward": 1.0, "step": 1100 }, { "completion_length": 257.5408172607422, "epoch": 0.11079245283018868, "grad_norm": 0.3568747043609619, "kl": 0.0469970703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6428384184837341, "reward_std": 0.08954771235585213, "rewards/accuracy_reward": 0.6428384482860565, "rewards/format_reward": 1.0, "step": 1101 }, { "completion_length": 261.6428451538086, "epoch": 0.11089308176100629, "grad_norm": 0.7284268736839294, "kl": 0.04833984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.554705262184143, "reward_std": 0.15045347064733505, "rewards/accuracy_reward": 0.5547051727771759, "rewards/format_reward": 1.0, "step": 1102 }, { "completion_length": 210.62244415283203, "epoch": 0.1109937106918239, "grad_norm": 1.9135632514953613, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6717687249183655, "reward_std": 0.23508594930171967, "rewards/accuracy_reward": 0.692176878452301, "rewards/format_reward": 0.9795918166637421, "step": 1103 }, { "completion_length": 286.1836700439453, "epoch": 0.11109433962264151, "grad_norm": 0.9008699655532837, "kl": 0.046142578125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.622448980808258, "reward_std": 0.20006241649389267, "rewards/accuracy_reward": 0.6326530575752258, "rewards/format_reward": 0.9897959232330322, "step": 1104 }, { "completion_length": 260.03060150146484, "epoch": 0.11119496855345912, "grad_norm": 0.8564059138298035, "kl": 0.0567626953125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.525688350200653, "reward_std": 0.22645539045333862, "rewards/accuracy_reward": 0.5358924567699432, "rewards/format_reward": 0.9897959232330322, "step": 1105 }, { "completion_length": 304.37754821777344, "epoch": 0.11129559748427673, "grad_norm": 0.9229421615600586, "kl": 0.05419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5800043940544128, "reward_std": 0.24149832129478455, "rewards/accuracy_reward": 0.6004124581813812, "rewards/format_reward": 0.9795918464660645, "step": 1106 }, { "completion_length": 201.6938705444336, "epoch": 0.11139622641509433, "grad_norm": 1.0343250036239624, "kl": 0.097900390625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.654518961906433, "reward_std": 0.17200881242752075, "rewards/accuracy_reward": 0.6545189172029495, "rewards/format_reward": 1.0, "step": 1107 }, { "completion_length": 254.9081573486328, "epoch": 0.11149685534591194, "grad_norm": 0.8990403413772583, "kl": 0.0604248046875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.795918345451355, "reward_std": 0.20791853964328766, "rewards/accuracy_reward": 0.7959183752536774, "rewards/format_reward": 1.0, "step": 1108 }, { "completion_length": 299.1632614135742, "epoch": 0.11159748427672957, "grad_norm": 1.2220605611801147, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6220073699951172, "reward_std": 0.2993968576192856, "rewards/accuracy_reward": 0.6424155533313751, "rewards/format_reward": 0.9795918166637421, "step": 1109 }, { "completion_length": 194.2653045654297, "epoch": 0.11169811320754718, "grad_norm": 0.9030049443244934, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6665273904800415, "reward_std": 0.012830010149627924, "rewards/accuracy_reward": 0.6665274053812027, "rewards/format_reward": 1.0, "step": 1110 }, { "completion_length": 225.31632232666016, "epoch": 0.11179874213836478, "grad_norm": 0.941274106502533, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7462584376335144, "reward_std": 0.2709125876426697, "rewards/accuracy_reward": 0.7666666805744171, "rewards/format_reward": 0.9795918464660645, "step": 1111 }, { "completion_length": 305.31632232666016, "epoch": 0.1118993710691824, "grad_norm": 0.9697866439819336, "kl": 0.0533447265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.612406849861145, "reward_std": 0.20546992123126984, "rewards/accuracy_reward": 0.6226109266281128, "rewards/format_reward": 0.9897959232330322, "step": 1112 }, { "completion_length": 225.7551040649414, "epoch": 0.112, "grad_norm": 0.5575282573699951, "kl": 0.05029296875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5932944416999817, "reward_std": 0.1470169462263584, "rewards/accuracy_reward": 0.6034985333681107, "rewards/format_reward": 0.9897959232330322, "step": 1113 }, { "completion_length": 237.7346954345703, "epoch": 0.11210062893081761, "grad_norm": 1.8765339851379395, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6646727323532104, "reward_std": 0.3404480591416359, "rewards/accuracy_reward": 0.6748768389225006, "rewards/format_reward": 0.9897959232330322, "step": 1114 }, { "completion_length": 359.4795837402344, "epoch": 0.11220125786163522, "grad_norm": 0.6858241558074951, "kl": 0.03692626953125, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5296177864074707, "reward_std": 0.26921024918556213, "rewards/accuracy_reward": 0.5296178460121155, "rewards/format_reward": 1.0, "step": 1115 }, { "completion_length": 203.55101013183594, "epoch": 0.11230188679245283, "grad_norm": 2.216336250305176, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6654762029647827, "reward_std": 0.17008740454912186, "rewards/accuracy_reward": 0.6756802499294281, "rewards/format_reward": 0.9897959232330322, "step": 1116 }, { "completion_length": 242.7244873046875, "epoch": 0.11240251572327044, "grad_norm": 0.8087358474731445, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.577377200126648, "reward_std": 0.24091245234012604, "rewards/accuracy_reward": 0.5977854132652283, "rewards/format_reward": 0.9795918464660645, "step": 1117 }, { "completion_length": 282.46937561035156, "epoch": 0.11250314465408805, "grad_norm": 0.8340288400650024, "kl": 0.0352783203125, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.643680989742279, "reward_std": 0.2084309458732605, "rewards/accuracy_reward": 0.6436811089515686, "rewards/format_reward": 1.0, "step": 1118 }, { "completion_length": 282.77549743652344, "epoch": 0.11260377358490566, "grad_norm": 0.5863176584243774, "kl": 0.04425048828125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6620616912841797, "reward_std": 0.16821805573999882, "rewards/accuracy_reward": 0.6824699342250824, "rewards/format_reward": 0.9795918166637421, "step": 1119 }, { "completion_length": 265.79591369628906, "epoch": 0.11270440251572326, "grad_norm": 1.2527577877044678, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6892435550689697, "reward_std": 0.31634054332971573, "rewards/accuracy_reward": 0.7096517980098724, "rewards/format_reward": 0.9795918166637421, "step": 1120 }, { "completion_length": 278.3163299560547, "epoch": 0.11280503144654087, "grad_norm": 0.8134411573410034, "kl": 0.042724609375, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7432847619056702, "reward_std": 0.19783061742782593, "rewards/accuracy_reward": 0.7432847917079926, "rewards/format_reward": 1.0, "step": 1121 }, { "completion_length": 298.6122283935547, "epoch": 0.1129056603773585, "grad_norm": 0.6558731198310852, "kl": 0.0465087890625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.781306266784668, "reward_std": 0.1852005459368229, "rewards/accuracy_reward": 0.8221225142478943, "rewards/format_reward": 0.9591836631298065, "step": 1122 }, { "completion_length": 267.32652282714844, "epoch": 0.1130062893081761, "grad_norm": 1.8359577655792236, "kl": 0.0584716796875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.627409279346466, "reward_std": 0.18887345492839813, "rewards/accuracy_reward": 0.6478174328804016, "rewards/format_reward": 0.9795918166637421, "step": 1123 }, { "completion_length": 266.4081497192383, "epoch": 0.11310691823899371, "grad_norm": 2.5683233737945557, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6235610842704773, "reward_std": 0.18760498240590096, "rewards/accuracy_reward": 0.6337651014328003, "rewards/format_reward": 0.9897959232330322, "step": 1124 }, { "completion_length": 338.82652282714844, "epoch": 0.11320754716981132, "grad_norm": 0.7666530609130859, "kl": 0.0396728515625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.5224488973617554, "reward_std": 0.21047765761613846, "rewards/accuracy_reward": 0.5224489867687225, "rewards/format_reward": 1.0, "step": 1125 }, { "completion_length": 236.1938705444336, "epoch": 0.11330817610062893, "grad_norm": 0.5407875180244446, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.859990656375885, "reward_std": 0.12918492779135704, "rewards/accuracy_reward": 0.8599906265735626, "rewards/format_reward": 1.0, "step": 1126 }, { "completion_length": 221.4795913696289, "epoch": 0.11340880503144654, "grad_norm": 1.0297832489013672, "kl": 0.0579833984375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.772343635559082, "reward_std": 0.253783218562603, "rewards/accuracy_reward": 0.79275181889534, "rewards/format_reward": 0.9795918166637421, "step": 1127 }, { "completion_length": 245.52040100097656, "epoch": 0.11350943396226415, "grad_norm": 1.8783388137817383, "kl": 0.045166015625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6109408736228943, "reward_std": 0.2514238879084587, "rewards/accuracy_reward": 0.6415531039237976, "rewards/format_reward": 0.9693877398967743, "step": 1128 }, { "completion_length": 245.71428680419922, "epoch": 0.11361006289308176, "grad_norm": 0.9573481678962708, "kl": 0.0452880859375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6659493446350098, "reward_std": 0.16934265196323395, "rewards/accuracy_reward": 0.6659493744373322, "rewards/format_reward": 1.0, "step": 1129 }, { "completion_length": 227.81632232666016, "epoch": 0.11371069182389937, "grad_norm": 5.96561336517334, "kl": 0.056396484375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8477891087532043, "reward_std": 0.2231357917189598, "rewards/accuracy_reward": 0.8681972622871399, "rewards/format_reward": 0.9795918464660645, "step": 1130 }, { "completion_length": 335.1632537841797, "epoch": 0.11381132075471698, "grad_norm": 1.8731253147125244, "kl": 0.052490234375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.459627628326416, "reward_std": 0.38528239727020264, "rewards/accuracy_reward": 0.5004439502954483, "rewards/format_reward": 0.9591836631298065, "step": 1131 }, { "completion_length": 290.10203552246094, "epoch": 0.11391194968553459, "grad_norm": 1.8256186246871948, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.66641765832901, "reward_std": 0.30130474269390106, "rewards/accuracy_reward": 0.7174381613731384, "rewards/format_reward": 0.9489795565605164, "step": 1132 }, { "completion_length": 258.33673095703125, "epoch": 0.1140125786163522, "grad_norm": 1.1018234491348267, "kl": 0.04052734375, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.7108843922615051, "reward_std": 0.288932129740715, "rewards/accuracy_reward": 0.7312925159931183, "rewards/format_reward": 0.9795918464660645, "step": 1133 }, { "completion_length": 249.2244873046875, "epoch": 0.11411320754716982, "grad_norm": 1.8143095970153809, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6437596678733826, "reward_std": 0.2717771679162979, "rewards/accuracy_reward": 0.6641678214073181, "rewards/format_reward": 0.9795918166637421, "step": 1134 }, { "completion_length": 209.7142791748047, "epoch": 0.11421383647798743, "grad_norm": 0.8923256397247314, "kl": 0.053955078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7755101919174194, "reward_std": 0.24054987728595734, "rewards/accuracy_reward": 0.8061224520206451, "rewards/format_reward": 0.9693877398967743, "step": 1135 }, { "completion_length": 207.448974609375, "epoch": 0.11431446540880504, "grad_norm": 0.9029499292373657, "kl": 0.0677490234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.626019835472107, "reward_std": 0.11473599076271057, "rewards/accuracy_reward": 0.6362240314483643, "rewards/format_reward": 0.9897959232330322, "step": 1136 }, { "completion_length": 326.1020202636719, "epoch": 0.11441509433962264, "grad_norm": 0.9059045910835266, "kl": 0.0391845703125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.647563636302948, "reward_std": 0.2549860253930092, "rewards/accuracy_reward": 0.6781758368015289, "rewards/format_reward": 0.9693877398967743, "step": 1137 }, { "completion_length": 253.31632232666016, "epoch": 0.11451572327044025, "grad_norm": 2.1429800987243652, "kl": 0.15673828125, "learning_rate": 1e-06, "loss": 0.0063, "reward": 1.64830881357193, "reward_std": 0.25469622015953064, "rewards/accuracy_reward": 0.6789210736751556, "rewards/format_reward": 0.9693877398967743, "step": 1138 }, { "completion_length": 191.36734771728516, "epoch": 0.11461635220125786, "grad_norm": 0.970512330532074, "kl": 0.0709228515625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6611234545707703, "reward_std": 0.2214701622724533, "rewards/accuracy_reward": 0.6815316379070282, "rewards/format_reward": 0.9795918166637421, "step": 1139 }, { "completion_length": 238.79591369628906, "epoch": 0.11471698113207547, "grad_norm": 0.8598250150680542, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6537901163101196, "reward_std": 0.20267562568187714, "rewards/accuracy_reward": 0.6741982996463776, "rewards/format_reward": 0.9795918464660645, "step": 1140 }, { "completion_length": 196.28571319580078, "epoch": 0.11481761006289308, "grad_norm": 1.0306676626205444, "kl": 0.0545654296875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7959182858467102, "reward_std": 0.19220631197094917, "rewards/accuracy_reward": 0.8163264989852905, "rewards/format_reward": 0.9795918166637421, "step": 1141 }, { "completion_length": 215.63265228271484, "epoch": 0.11491823899371069, "grad_norm": 1.3659064769744873, "kl": 0.05078125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6044703125953674, "reward_std": 0.2211145982146263, "rewards/accuracy_reward": 0.6146744191646576, "rewards/format_reward": 0.9897959232330322, "step": 1142 }, { "completion_length": 199.2040786743164, "epoch": 0.1150188679245283, "grad_norm": 0.9286252856254578, "kl": 0.05517578125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6103416681289673, "reward_std": 0.15940605476498604, "rewards/accuracy_reward": 0.6103417277336121, "rewards/format_reward": 1.0, "step": 1143 }, { "completion_length": 272.55101776123047, "epoch": 0.1151194968553459, "grad_norm": 1.014819860458374, "kl": 0.04736328125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6673617959022522, "reward_std": 0.2540828660130501, "rewards/accuracy_reward": 0.7081781029701233, "rewards/format_reward": 0.9591836333274841, "step": 1144 }, { "completion_length": 207.7653045654297, "epoch": 0.11522012578616352, "grad_norm": 0.6052362322807312, "kl": 0.0457763671875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7259525656700134, "reward_std": 0.1662285141646862, "rewards/accuracy_reward": 0.7361566722393036, "rewards/format_reward": 0.9897959232330322, "step": 1145 }, { "completion_length": 318.6836700439453, "epoch": 0.11532075471698114, "grad_norm": 0.944871187210083, "kl": 0.0479736328125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5198917984962463, "reward_std": 0.31308935582637787, "rewards/accuracy_reward": 0.5709122121334076, "rewards/format_reward": 0.9489795863628387, "step": 1146 }, { "completion_length": 203.91836547851562, "epoch": 0.11542138364779875, "grad_norm": 0.6568711996078491, "kl": 0.059814453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.639941692352295, "reward_std": 0.14339160174131393, "rewards/accuracy_reward": 0.6501457691192627, "rewards/format_reward": 0.9897959232330322, "step": 1147 }, { "completion_length": 199.66326141357422, "epoch": 0.11552201257861636, "grad_norm": 1.0277704000473022, "kl": 0.0640869140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.603295087814331, "reward_std": 0.14415612630546093, "rewards/accuracy_reward": 0.6134992837905884, "rewards/format_reward": 0.9897959232330322, "step": 1148 }, { "completion_length": 269.1530532836914, "epoch": 0.11562264150943397, "grad_norm": 0.3596694767475128, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6144712567329407, "reward_std": 0.06858932226896286, "rewards/accuracy_reward": 0.6144712269306183, "rewards/format_reward": 1.0, "step": 1149 }, { "completion_length": 198.91836547851562, "epoch": 0.11572327044025157, "grad_norm": 1.4857063293457031, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.579194188117981, "reward_std": 0.3047920912504196, "rewards/accuracy_reward": 0.6200105845928192, "rewards/format_reward": 0.9591836631298065, "step": 1150 }, { "completion_length": 182.45917510986328, "epoch": 0.11582389937106918, "grad_norm": 0.3873833119869232, "kl": 0.0528564453125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7286807298660278, "reward_std": 0.032648297026753426, "rewards/accuracy_reward": 0.7286807298660278, "rewards/format_reward": 1.0, "step": 1151 }, { "completion_length": 313.3163146972656, "epoch": 0.11592452830188679, "grad_norm": 0.8284375071525574, "kl": 0.0533447265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5510203838348389, "reward_std": 0.28744053840637207, "rewards/accuracy_reward": 0.5918367207050323, "rewards/format_reward": 0.9591836631298065, "step": 1152 }, { "completion_length": 291.02040100097656, "epoch": 0.1160251572327044, "grad_norm": 1.192522406578064, "kl": 0.055419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.622448980808258, "reward_std": 0.34769071638584137, "rewards/accuracy_reward": 0.642857164144516, "rewards/format_reward": 0.9795918166637421, "step": 1153 }, { "completion_length": 249.35713958740234, "epoch": 0.11612578616352201, "grad_norm": 0.9342990517616272, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6612328886985779, "reward_std": 0.2626948058605194, "rewards/accuracy_reward": 0.6816409826278687, "rewards/format_reward": 0.9795918166637421, "step": 1154 }, { "completion_length": 217.62244415283203, "epoch": 0.11622641509433962, "grad_norm": 1.7451527118682861, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.5193291902542114, "reward_std": 0.26319775730371475, "rewards/accuracy_reward": 0.5397373735904694, "rewards/format_reward": 0.9795918464660645, "step": 1155 }, { "completion_length": 261.6632614135742, "epoch": 0.11632704402515723, "grad_norm": 0.9949705004692078, "kl": 0.04736328125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.65908282995224, "reward_std": 0.2989822328090668, "rewards/accuracy_reward": 0.6896951496601105, "rewards/format_reward": 0.9693877398967743, "step": 1156 }, { "completion_length": 229.57142639160156, "epoch": 0.11642767295597484, "grad_norm": 1.2668876647949219, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.724302053451538, "reward_std": 0.16252533346414566, "rewards/accuracy_reward": 0.7243019938468933, "rewards/format_reward": 1.0, "step": 1157 }, { "completion_length": 249.90816497802734, "epoch": 0.11652830188679246, "grad_norm": 1.8045364618301392, "kl": 0.0537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5271803140640259, "reward_std": 0.20229241251945496, "rewards/accuracy_reward": 0.5475885570049286, "rewards/format_reward": 0.9795918464660645, "step": 1158 }, { "completion_length": 225.11223602294922, "epoch": 0.11662893081761007, "grad_norm": 1.021760106086731, "kl": 0.051513671875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7245835661888123, "reward_std": 0.25275004655122757, "rewards/accuracy_reward": 0.7449917197227478, "rewards/format_reward": 0.9795918166637421, "step": 1159 }, { "completion_length": 198.16326141357422, "epoch": 0.11672955974842768, "grad_norm": 3.9843835830688477, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.5159456133842468, "reward_std": 0.2041693702340126, "rewards/accuracy_reward": 0.5465579181909561, "rewards/format_reward": 0.9693877398967743, "step": 1160 }, { "completion_length": 197.01020050048828, "epoch": 0.11683018867924529, "grad_norm": 1.355008840560913, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7040815949440002, "reward_std": 0.27302366495132446, "rewards/accuracy_reward": 0.7346938848495483, "rewards/format_reward": 0.9693877398967743, "step": 1161 }, { "completion_length": 258.2040710449219, "epoch": 0.1169308176100629, "grad_norm": 0.7845268845558167, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7822157144546509, "reward_std": 0.18705995380878448, "rewards/accuracy_reward": 0.7822157144546509, "rewards/format_reward": 1.0, "step": 1162 }, { "completion_length": 278.96937561035156, "epoch": 0.1170314465408805, "grad_norm": 0.8940032124519348, "kl": 0.0543212890625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.4870296716690063, "reward_std": 0.3010164946317673, "rewards/accuracy_reward": 0.5074379444122314, "rewards/format_reward": 0.9795918464660645, "step": 1163 }, { "completion_length": 196.71428680419922, "epoch": 0.11713207547169811, "grad_norm": 0.6600427627563477, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.721088469028473, "reward_std": 0.09784764796495438, "rewards/accuracy_reward": 0.7210884392261505, "rewards/format_reward": 1.0, "step": 1164 }, { "completion_length": 245.32652282714844, "epoch": 0.11723270440251572, "grad_norm": 1.207958698272705, "kl": 0.0517578125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7448979616165161, "reward_std": 0.29787128418684006, "rewards/accuracy_reward": 0.7551020383834839, "rewards/format_reward": 0.9897959232330322, "step": 1165 }, { "completion_length": 237.448974609375, "epoch": 0.11733333333333333, "grad_norm": 1.2753219604492188, "kl": 0.0540771484375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5134283304214478, "reward_std": 0.18717999756336212, "rewards/accuracy_reward": 0.5236324220895767, "rewards/format_reward": 0.9897959232330322, "step": 1166 }, { "completion_length": 224.24488830566406, "epoch": 0.11743396226415094, "grad_norm": 2.1782987117767334, "kl": 0.0653076171875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.599251925945282, "reward_std": 0.28963108360767365, "rewards/accuracy_reward": 0.6094560474157333, "rewards/format_reward": 0.9897959232330322, "step": 1167 }, { "completion_length": 250.81632232666016, "epoch": 0.11753459119496855, "grad_norm": 1.8179622888565063, "kl": 0.0814208984375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.5027865767478943, "reward_std": 0.3442173898220062, "rewards/accuracy_reward": 0.5231947600841522, "rewards/format_reward": 0.9795918166637421, "step": 1168 }, { "completion_length": 214.55101013183594, "epoch": 0.11763522012578616, "grad_norm": 0.6808657050132751, "kl": 0.06341552734375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7338159084320068, "reward_std": 0.08811422996222973, "rewards/accuracy_reward": 0.7338158488273621, "rewards/format_reward": 1.0, "step": 1169 }, { "completion_length": 230.09183502197266, "epoch": 0.11773584905660377, "grad_norm": 1.253920316696167, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6654598712921143, "reward_std": 0.29575444757938385, "rewards/accuracy_reward": 0.675664097070694, "rewards/format_reward": 0.9897959232330322, "step": 1170 }, { "completion_length": 288.1734619140625, "epoch": 0.11783647798742139, "grad_norm": 0.7419031262397766, "kl": 0.0587158203125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6248785257339478, "reward_std": 0.21252978965640068, "rewards/accuracy_reward": 0.6350826025009155, "rewards/format_reward": 0.9897959232330322, "step": 1171 }, { "completion_length": 227.33673095703125, "epoch": 0.117937106918239, "grad_norm": 20.0729923248291, "kl": 0.541748046875, "learning_rate": 1e-06, "loss": 0.0218, "reward": 1.8041048049926758, "reward_std": 0.22045034170150757, "rewards/accuracy_reward": 0.8143088817596436, "rewards/format_reward": 0.9897959232330322, "step": 1172 }, { "completion_length": 274.7346954345703, "epoch": 0.11803773584905661, "grad_norm": 0.5142006278038025, "kl": 0.04156494140625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6355834007263184, "reward_std": 0.22990891337394714, "rewards/accuracy_reward": 0.6559916138648987, "rewards/format_reward": 0.9795918166637421, "step": 1173 }, { "completion_length": 186.85713958740234, "epoch": 0.11813836477987422, "grad_norm": 1.4428831338882446, "kl": 0.052490234375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7623125314712524, "reward_std": 0.21648603677749634, "rewards/accuracy_reward": 0.7827206552028656, "rewards/format_reward": 0.9795918166637421, "step": 1174 }, { "completion_length": 182.2142791748047, "epoch": 0.11823899371069183, "grad_norm": 1.1236554384231567, "kl": 0.0626220703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6937816143035889, "reward_std": 0.14610198140144348, "rewards/accuracy_reward": 0.6937816143035889, "rewards/format_reward": 1.0, "step": 1175 }, { "completion_length": 260.80611419677734, "epoch": 0.11833962264150943, "grad_norm": 1.3483678102493286, "kl": 0.0626220703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6938632726669312, "reward_std": 0.27194900810718536, "rewards/accuracy_reward": 0.7040673494338989, "rewards/format_reward": 0.9897959232330322, "step": 1176 }, { "completion_length": 230.26529693603516, "epoch": 0.11844025157232704, "grad_norm": 0.9891273975372314, "kl": 0.0546875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5971492528915405, "reward_std": 0.19277767091989517, "rewards/accuracy_reward": 0.5971493124961853, "rewards/format_reward": 1.0, "step": 1177 }, { "completion_length": 264.8265151977539, "epoch": 0.11854088050314465, "grad_norm": 1.0222694873809814, "kl": 0.054931640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7292557954788208, "reward_std": 0.2748779132962227, "rewards/accuracy_reward": 0.749663919210434, "rewards/format_reward": 0.9795918464660645, "step": 1178 }, { "completion_length": 344.4183654785156, "epoch": 0.11864150943396226, "grad_norm": 1.029754877090454, "kl": 0.05859375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5367900729179382, "reward_std": 0.17821313813328743, "rewards/accuracy_reward": 0.5571983158588409, "rewards/format_reward": 0.9795918464660645, "step": 1179 }, { "completion_length": 263.7142868041992, "epoch": 0.11874213836477987, "grad_norm": 0.8353971242904663, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5335600972175598, "reward_std": 0.19572052359580994, "rewards/accuracy_reward": 0.5335600972175598, "rewards/format_reward": 1.0, "step": 1180 }, { "completion_length": 287.46937561035156, "epoch": 0.11884276729559748, "grad_norm": 0.7206958532333374, "kl": 0.04150390625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5872762203216553, "reward_std": 0.24908644706010818, "rewards/accuracy_reward": 0.6076843738555908, "rewards/format_reward": 0.9795918166637421, "step": 1181 }, { "completion_length": 254.07142639160156, "epoch": 0.11894339622641509, "grad_norm": 1.9745310544967651, "kl": 0.0533447265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.65378999710083, "reward_std": 0.15791042149066925, "rewards/accuracy_reward": 0.6537900567054749, "rewards/format_reward": 1.0, "step": 1182 }, { "completion_length": 165.60203552246094, "epoch": 0.11904402515723271, "grad_norm": 0.3641887903213501, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.9427720308303833, "reward_std": 0.06988748162984848, "rewards/accuracy_reward": 0.9427720904350281, "rewards/format_reward": 1.0, "step": 1183 }, { "completion_length": 263.76529693603516, "epoch": 0.11914465408805032, "grad_norm": 0.9408788084983826, "kl": 0.0457763671875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.68367338180542, "reward_std": 0.23038647323846817, "rewards/accuracy_reward": 0.6938775181770325, "rewards/format_reward": 0.9897959232330322, "step": 1184 }, { "completion_length": 248.10203552246094, "epoch": 0.11924528301886793, "grad_norm": 0.9393928050994873, "kl": 0.056884765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6585261821746826, "reward_std": 0.17451602965593338, "rewards/accuracy_reward": 0.668730229139328, "rewards/format_reward": 0.9897959232330322, "step": 1185 }, { "completion_length": 228.2040786743164, "epoch": 0.11934591194968554, "grad_norm": 1.3935168981552124, "kl": 0.0582275390625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6632652282714844, "reward_std": 0.17639155685901642, "rewards/accuracy_reward": 0.6734693646430969, "rewards/format_reward": 0.9897959232330322, "step": 1186 }, { "completion_length": 225.41836547851562, "epoch": 0.11944654088050315, "grad_norm": 1.118403434753418, "kl": 0.0484619140625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6836734414100647, "reward_std": 0.23704826459288597, "rewards/accuracy_reward": 0.7142857015132904, "rewards/format_reward": 0.9693877398967743, "step": 1187 }, { "completion_length": 203.88774871826172, "epoch": 0.11954716981132076, "grad_norm": 1.132583498954773, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7514466643333435, "reward_std": 0.1999756470322609, "rewards/accuracy_reward": 0.7514466643333435, "rewards/format_reward": 1.0, "step": 1188 }, { "completion_length": 281.89796447753906, "epoch": 0.11964779874213836, "grad_norm": 1.3865314722061157, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5814709663391113, "reward_std": 0.3456796556711197, "rewards/accuracy_reward": 0.5916749835014343, "rewards/format_reward": 0.9897959232330322, "step": 1189 }, { "completion_length": 296.02040100097656, "epoch": 0.11974842767295597, "grad_norm": 0.8706949949264526, "kl": 0.04833984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6190476417541504, "reward_std": 0.2731291130185127, "rewards/accuracy_reward": 0.6394557505846024, "rewards/format_reward": 0.9795918166637421, "step": 1190 }, { "completion_length": 259.57141876220703, "epoch": 0.11984905660377358, "grad_norm": 0.6478813290596008, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6358997821807861, "reward_std": 0.24553176015615463, "rewards/accuracy_reward": 0.6358998417854309, "rewards/format_reward": 1.0, "step": 1191 }, { "completion_length": 224.86734008789062, "epoch": 0.11994968553459119, "grad_norm": 0.49159738421440125, "kl": 0.05517578125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.771490454673767, "reward_std": 0.10779435560107231, "rewards/accuracy_reward": 0.7714904248714447, "rewards/format_reward": 1.0, "step": 1192 }, { "completion_length": 305.5816192626953, "epoch": 0.1200503144654088, "grad_norm": 0.6211878061294556, "kl": 0.0537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.4781340956687927, "reward_std": 0.22715529799461365, "rewards/accuracy_reward": 0.5087463706731796, "rewards/format_reward": 0.9693877398967743, "step": 1193 }, { "completion_length": 250.7551040649414, "epoch": 0.12015094339622641, "grad_norm": 0.9529584646224976, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.658600628376007, "reward_std": 0.19834907352924347, "rewards/accuracy_reward": 0.6586005389690399, "rewards/format_reward": 1.0, "step": 1194 }, { "completion_length": 196.31632232666016, "epoch": 0.12025157232704403, "grad_norm": 0.7571586966514587, "kl": 0.0496826171875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7473934292793274, "reward_std": 0.25225549191236496, "rewards/accuracy_reward": 0.7678016722202301, "rewards/format_reward": 0.9795918166637421, "step": 1195 }, { "completion_length": 222.6326446533203, "epoch": 0.12035220125786164, "grad_norm": 0.7315899133682251, "kl": 0.0455322265625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6559200286865234, "reward_std": 0.2175029218196869, "rewards/accuracy_reward": 0.666124016046524, "rewards/format_reward": 0.9897959232330322, "step": 1196 }, { "completion_length": 264.20408630371094, "epoch": 0.12045283018867925, "grad_norm": 2.2634387016296387, "kl": 0.0540771484375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7173678278923035, "reward_std": 0.14545797556638718, "rewards/accuracy_reward": 0.7173677980899811, "rewards/format_reward": 1.0, "step": 1197 }, { "completion_length": 336.6428527832031, "epoch": 0.12055345911949686, "grad_norm": 0.5968193411827087, "kl": 0.03564453125, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.6352040767669678, "reward_std": 0.23070282489061356, "rewards/accuracy_reward": 0.6454081535339355, "rewards/format_reward": 0.9897959232330322, "step": 1198 }, { "completion_length": 247.78571319580078, "epoch": 0.12065408805031447, "grad_norm": 0.7248167395591736, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.596274733543396, "reward_std": 0.21597082912921906, "rewards/accuracy_reward": 0.6064788103103638, "rewards/format_reward": 0.9897959232330322, "step": 1199 }, { "completion_length": 171.61224365234375, "epoch": 0.12075471698113208, "grad_norm": 0.6676002740859985, "kl": 0.04736328125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7596776485443115, "reward_std": 0.0414077527821064, "rewards/accuracy_reward": 0.7596776187419891, "rewards/format_reward": 1.0, "step": 1200 }, { "completion_length": 251.32653045654297, "epoch": 0.12085534591194969, "grad_norm": 0.907171905040741, "kl": 0.034912109375, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.6088435053825378, "reward_std": 0.20791853219270706, "rewards/accuracy_reward": 0.6088435351848602, "rewards/format_reward": 1.0, "step": 1201 }, { "completion_length": 303.7244873046875, "epoch": 0.1209559748427673, "grad_norm": 0.45355045795440674, "kl": 0.05322265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.626579225063324, "reward_std": 0.165466770529747, "rewards/accuracy_reward": 0.6469874680042267, "rewards/format_reward": 0.9795918166637421, "step": 1202 }, { "completion_length": 248.38774871826172, "epoch": 0.1210566037735849, "grad_norm": 0.7992289066314697, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5816326141357422, "reward_std": 0.20006240904331207, "rewards/accuracy_reward": 0.5918367207050323, "rewards/format_reward": 0.9897959232330322, "step": 1203 }, { "completion_length": 197.9285659790039, "epoch": 0.12115723270440251, "grad_norm": 1.2408528327941895, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7517006993293762, "reward_std": 0.21103612333536148, "rewards/accuracy_reward": 0.7517006695270538, "rewards/format_reward": 1.0, "step": 1204 }, { "completion_length": 235.23468780517578, "epoch": 0.12125786163522012, "grad_norm": 1.342376470565796, "kl": 0.0682373046875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.759183645248413, "reward_std": 0.18749930709600449, "rewards/accuracy_reward": 0.7591836750507355, "rewards/format_reward": 1.0, "step": 1205 }, { "completion_length": 271.82652282714844, "epoch": 0.12135849056603773, "grad_norm": 1.0113123655319214, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.4593780040740967, "reward_std": 0.29075316339731216, "rewards/accuracy_reward": 0.4899902641773224, "rewards/format_reward": 0.9693877398967743, "step": 1206 }, { "completion_length": 259.26529693603516, "epoch": 0.12145911949685535, "grad_norm": 0.884300947189331, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7474600076675415, "reward_std": 0.21008504182100296, "rewards/accuracy_reward": 0.7576640546321869, "rewards/format_reward": 0.9897959232330322, "step": 1207 }, { "completion_length": 312.27549743652344, "epoch": 0.12155974842767296, "grad_norm": 1.336349606513977, "kl": 0.0408935546875, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.5877860188484192, "reward_std": 0.3897055983543396, "rewards/accuracy_reward": 0.6081941723823547, "rewards/format_reward": 0.9795918464660645, "step": 1208 }, { "completion_length": 263.9795913696289, "epoch": 0.12166037735849057, "grad_norm": 0.9201287627220154, "kl": 0.0400390625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6336248517036438, "reward_std": 0.3379747271537781, "rewards/accuracy_reward": 0.6642371118068695, "rewards/format_reward": 0.9693877398967743, "step": 1209 }, { "completion_length": 196.53060913085938, "epoch": 0.12176100628930818, "grad_norm": 0.8775133490562439, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.601204514503479, "reward_std": 0.19631431996822357, "rewards/accuracy_reward": 0.6012044847011566, "rewards/format_reward": 1.0, "step": 1210 }, { "completion_length": 246.9387664794922, "epoch": 0.12186163522012579, "grad_norm": 0.8140299320220947, "kl": 0.0491943359375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.758712649345398, "reward_std": 0.2539932504296303, "rewards/accuracy_reward": 0.7587127089500427, "rewards/format_reward": 1.0, "step": 1211 }, { "completion_length": 268.2040710449219, "epoch": 0.1219622641509434, "grad_norm": 0.8134958148002625, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.482814371585846, "reward_std": 0.29400239139795303, "rewards/accuracy_reward": 0.5032225400209427, "rewards/format_reward": 0.9795918166637421, "step": 1212 }, { "completion_length": 197.4081573486328, "epoch": 0.122062893081761, "grad_norm": 9.935943603515625, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7398444414138794, "reward_std": 0.2096601128578186, "rewards/accuracy_reward": 0.7602526545524597, "rewards/format_reward": 0.9795918464660645, "step": 1213 }, { "completion_length": 177.27550506591797, "epoch": 0.12216352201257862, "grad_norm": 2.262629747390747, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8123052716255188, "reward_std": 0.20472967624664307, "rewards/accuracy_reward": 0.8327134549617767, "rewards/format_reward": 0.9795918464660645, "step": 1214 }, { "completion_length": 202.35713958740234, "epoch": 0.12226415094339622, "grad_norm": 0.9124404191970825, "kl": 0.0531005859375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6998299360275269, "reward_std": 0.19345952570438385, "rewards/accuracy_reward": 0.7100340127944946, "rewards/format_reward": 0.9897959232330322, "step": 1215 }, { "completion_length": 147.75509643554688, "epoch": 0.12236477987421383, "grad_norm": 0.8813827633857727, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7143139839172363, "reward_std": 0.16350890323519707, "rewards/accuracy_reward": 0.7347221374511719, "rewards/format_reward": 0.9795918166637421, "step": 1216 }, { "completion_length": 248.79591369628906, "epoch": 0.12246540880503144, "grad_norm": 4.405376434326172, "kl": 0.0640869140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6273080706596375, "reward_std": 0.3088892474770546, "rewards/accuracy_reward": 0.6477161943912506, "rewards/format_reward": 0.9795918166637421, "step": 1217 }, { "completion_length": 243.83673095703125, "epoch": 0.12256603773584905, "grad_norm": 0.9110222458839417, "kl": 0.0350341796875, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.7736960649490356, "reward_std": 0.21821648627519608, "rewards/accuracy_reward": 0.773696094751358, "rewards/format_reward": 1.0, "step": 1218 }, { "completion_length": 208.01020050048828, "epoch": 0.12266666666666666, "grad_norm": 0.8397361040115356, "kl": 0.0855712890625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.74137282371521, "reward_std": 0.23099816590547562, "rewards/accuracy_reward": 0.7821892499923706, "rewards/format_reward": 0.9591836631298065, "step": 1219 }, { "completion_length": 250.1632537841797, "epoch": 0.12276729559748428, "grad_norm": 0.9961692690849304, "kl": 0.0692138671875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6839826703071594, "reward_std": 0.21833278983831406, "rewards/accuracy_reward": 0.6941867768764496, "rewards/format_reward": 0.9897959232330322, "step": 1220 }, { "completion_length": 240.05101013183594, "epoch": 0.12286792452830189, "grad_norm": 0.7993612885475159, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6863606572151184, "reward_std": 0.205279640853405, "rewards/accuracy_reward": 0.6965646743774414, "rewards/format_reward": 0.9897959232330322, "step": 1221 }, { "completion_length": 222.55101013183594, "epoch": 0.1229685534591195, "grad_norm": 0.7716086506843567, "kl": 0.044677734375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.733560025691986, "reward_std": 0.2297581061720848, "rewards/accuracy_reward": 0.7335600852966309, "rewards/format_reward": 1.0, "step": 1222 }, { "completion_length": 260.1836700439453, "epoch": 0.12306918238993711, "grad_norm": 0.9074140787124634, "kl": 0.0540771484375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5295300483703613, "reward_std": 0.26829203963279724, "rewards/accuracy_reward": 0.5397340655326843, "rewards/format_reward": 0.9897959232330322, "step": 1223 }, { "completion_length": 274.46937561035156, "epoch": 0.12316981132075472, "grad_norm": 1.052491545677185, "kl": 0.0413818359375, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.8049163818359375, "reward_std": 0.27304184436798096, "rewards/accuracy_reward": 0.825324684381485, "rewards/format_reward": 0.9795918464660645, "step": 1224 }, { "completion_length": 199.448974609375, "epoch": 0.12327044025157233, "grad_norm": 1.4086041450500488, "kl": 0.1171875, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.6445504426956177, "reward_std": 0.24658195674419403, "rewards/accuracy_reward": 0.6649586260318756, "rewards/format_reward": 0.9795918464660645, "step": 1225 }, { "completion_length": 188.1938705444336, "epoch": 0.12337106918238994, "grad_norm": 1.8409584760665894, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6508745551109314, "reward_std": 0.3019729107618332, "rewards/accuracy_reward": 0.691690981388092, "rewards/format_reward": 0.9591836333274841, "step": 1226 }, { "completion_length": 178.448974609375, "epoch": 0.12347169811320755, "grad_norm": 1.5913175344467163, "kl": 0.05615234375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7793367505073547, "reward_std": 0.20160306245088577, "rewards/accuracy_reward": 0.7793367207050323, "rewards/format_reward": 1.0, "step": 1227 }, { "completion_length": 268.2449035644531, "epoch": 0.12357232704402515, "grad_norm": 1.0223373174667358, "kl": 0.05859375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.597102165222168, "reward_std": 0.26712165772914886, "rewards/accuracy_reward": 0.6175104081630707, "rewards/format_reward": 0.9795918464660645, "step": 1228 }, { "completion_length": 272.33673095703125, "epoch": 0.12367295597484276, "grad_norm": 3.769275188446045, "kl": 0.0499267578125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6380815505981445, "reward_std": 0.21403922885656357, "rewards/accuracy_reward": 0.6482856869697571, "rewards/format_reward": 0.9897959232330322, "step": 1229 }, { "completion_length": 270.6122360229492, "epoch": 0.12377358490566037, "grad_norm": 0.5957027673721313, "kl": 0.0350341796875, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.5557900667190552, "reward_std": 0.1821887046098709, "rewards/accuracy_reward": 0.5557901263237, "rewards/format_reward": 1.0, "step": 1230 }, { "completion_length": 258.8265151977539, "epoch": 0.12387421383647798, "grad_norm": 1.3592571020126343, "kl": 0.05181884765625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6633774638175964, "reward_std": 0.15436188504099846, "rewards/accuracy_reward": 0.663377434015274, "rewards/format_reward": 1.0, "step": 1231 }, { "completion_length": 204.58162689208984, "epoch": 0.1239748427672956, "grad_norm": 1.1094170808792114, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6835862398147583, "reward_std": 0.20237644016742706, "rewards/accuracy_reward": 0.6937902867794037, "rewards/format_reward": 0.9897959232330322, "step": 1232 }, { "completion_length": 205.09183502197266, "epoch": 0.12407547169811321, "grad_norm": 0.6571090221405029, "kl": 0.043212890625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.697704017162323, "reward_std": 0.06886624917387962, "rewards/accuracy_reward": 0.6977040767669678, "rewards/format_reward": 1.0, "step": 1233 }, { "completion_length": 269.0306091308594, "epoch": 0.12417610062893082, "grad_norm": 1.1719956398010254, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5621172785758972, "reward_std": 0.29942256957292557, "rewards/accuracy_reward": 0.6131377220153809, "rewards/format_reward": 0.9489795863628387, "step": 1234 }, { "completion_length": 286.76529693603516, "epoch": 0.12427672955974843, "grad_norm": 0.8956759572029114, "kl": 0.0633544921875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.570745050907135, "reward_std": 0.2835812568664551, "rewards/accuracy_reward": 0.6115614473819733, "rewards/format_reward": 0.9591836631298065, "step": 1235 }, { "completion_length": 170.77550506591797, "epoch": 0.12437735849056604, "grad_norm": 1.330320119857788, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7118560671806335, "reward_std": 0.23245258629322052, "rewards/accuracy_reward": 0.7118561565876007, "rewards/format_reward": 1.0, "step": 1236 }, { "completion_length": 214.2142791748047, "epoch": 0.12447798742138365, "grad_norm": 0.7353876829147339, "kl": 0.04541015625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7295917868614197, "reward_std": 0.222720667719841, "rewards/accuracy_reward": 0.7295918166637421, "rewards/format_reward": 1.0, "step": 1237 }, { "completion_length": 231.0714340209961, "epoch": 0.12457861635220126, "grad_norm": 0.8043341040611267, "kl": 0.04443359375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7098004221916199, "reward_std": 0.20153655111789703, "rewards/accuracy_reward": 0.7200044691562653, "rewards/format_reward": 0.9897959232330322, "step": 1238 }, { "completion_length": 265.9387664794922, "epoch": 0.12467924528301887, "grad_norm": 1.5039150714874268, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6068115830421448, "reward_std": 0.3011837974190712, "rewards/accuracy_reward": 0.6068115383386612, "rewards/format_reward": 1.0, "step": 1239 }, { "completion_length": 186.448974609375, "epoch": 0.12477987421383648, "grad_norm": 0.6564146280288696, "kl": 0.0614013671875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6591836810112, "reward_std": 0.08475913852453232, "rewards/accuracy_reward": 0.6693877279758453, "rewards/format_reward": 0.9897959232330322, "step": 1240 }, { "completion_length": 317.82653045654297, "epoch": 0.12488050314465408, "grad_norm": 0.657096266746521, "kl": 0.0689697265625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.4489795565605164, "reward_std": 0.18262769654393196, "rewards/accuracy_reward": 0.4693877398967743, "rewards/format_reward": 0.9795918166637421, "step": 1241 }, { "completion_length": 208.9897918701172, "epoch": 0.1249811320754717, "grad_norm": 0.8574203848838806, "kl": 0.0594482421875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7497689127922058, "reward_std": 0.22445645183324814, "rewards/accuracy_reward": 0.7701770961284637, "rewards/format_reward": 0.9795918464660645, "step": 1242 }, { "completion_length": 356.4693908691406, "epoch": 0.12508176100628932, "grad_norm": 0.9698066711425781, "kl": 0.0489501953125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.43718820810318, "reward_std": 0.32173047959804535, "rewards/accuracy_reward": 0.4473922848701477, "rewards/format_reward": 0.9897959232330322, "step": 1243 }, { "completion_length": 186.49999237060547, "epoch": 0.12518238993710692, "grad_norm": 0.7159508466720581, "kl": 0.0631103515625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7857142686843872, "reward_std": 0.1348847895860672, "rewards/accuracy_reward": 0.7857142686843872, "rewards/format_reward": 1.0, "step": 1244 }, { "completion_length": 223.57142639160156, "epoch": 0.12528301886792453, "grad_norm": 1.2267887592315674, "kl": 0.0543212890625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7807739973068237, "reward_std": 0.2591477930545807, "rewards/accuracy_reward": 0.7807739973068237, "rewards/format_reward": 1.0, "step": 1245 }, { "completion_length": 243.90814971923828, "epoch": 0.12538364779874214, "grad_norm": 1.38724684715271, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.3852774500846863, "reward_std": 0.2576577067375183, "rewards/accuracy_reward": 0.39548157155513763, "rewards/format_reward": 0.9897959232330322, "step": 1246 }, { "completion_length": 194.64285278320312, "epoch": 0.12548427672955975, "grad_norm": 0.5643887519836426, "kl": 0.0487060546875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7706214785575867, "reward_std": 0.10813490301370621, "rewards/accuracy_reward": 0.7706214487552643, "rewards/format_reward": 1.0, "step": 1247 }, { "completion_length": 175.4183578491211, "epoch": 0.12558490566037736, "grad_norm": 1.4490357637405396, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7244897484779358, "reward_std": 0.27440160512924194, "rewards/accuracy_reward": 0.734693855047226, "rewards/format_reward": 0.9897959232330322, "step": 1248 }, { "completion_length": 231.28570556640625, "epoch": 0.12568553459119497, "grad_norm": 0.9688140749931335, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.709232211112976, "reward_std": 0.19636069983243942, "rewards/accuracy_reward": 0.7194363176822662, "rewards/format_reward": 0.9897959232330322, "step": 1249 }, { "completion_length": 198.80612182617188, "epoch": 0.12578616352201258, "grad_norm": 1.1090152263641357, "kl": 0.103759765625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.5627706050872803, "reward_std": 0.18921231850981712, "rewards/accuracy_reward": 0.5627705454826355, "rewards/format_reward": 1.0, "step": 1250 }, { "completion_length": 266.87754821777344, "epoch": 0.1258867924528302, "grad_norm": 0.75941002368927, "kl": 0.0572509765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6086199879646301, "reward_std": 0.23410965502262115, "rewards/accuracy_reward": 0.6086199581623077, "rewards/format_reward": 1.0, "step": 1251 }, { "completion_length": 230.11224365234375, "epoch": 0.1259874213836478, "grad_norm": 1.007838487625122, "kl": 0.0711669921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6131813526153564, "reward_std": 0.2603701502084732, "rewards/accuracy_reward": 0.6233854293823242, "rewards/format_reward": 0.9897959232330322, "step": 1252 }, { "completion_length": 254.71428680419922, "epoch": 0.1260880503144654, "grad_norm": 0.47146084904670715, "kl": 0.0540771484375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6938775181770325, "reward_std": 0.14656289666891098, "rewards/accuracy_reward": 0.7142857015132904, "rewards/format_reward": 0.9795918464660645, "step": 1253 }, { "completion_length": 228.80611419677734, "epoch": 0.12618867924528301, "grad_norm": 0.6422000527381897, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6552113890647888, "reward_std": 0.12808983772993088, "rewards/accuracy_reward": 0.6552113890647888, "rewards/format_reward": 1.0, "step": 1254 }, { "completion_length": 211.9897918701172, "epoch": 0.12628930817610062, "grad_norm": 0.7623651027679443, "kl": 0.0711669921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7142967581748962, "reward_std": 0.19753101468086243, "rewards/accuracy_reward": 0.724500834941864, "rewards/format_reward": 0.9897959232330322, "step": 1255 }, { "completion_length": 260.8367233276367, "epoch": 0.12638993710691823, "grad_norm": 0.7735055088996887, "kl": 0.052978515625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.670869767665863, "reward_std": 0.20105722919106483, "rewards/accuracy_reward": 0.6810738742351532, "rewards/format_reward": 0.9897959232330322, "step": 1256 }, { "completion_length": 323.0612258911133, "epoch": 0.12649056603773584, "grad_norm": 0.8949117660522461, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.4958873987197876, "reward_std": 0.13952890038490295, "rewards/accuracy_reward": 0.5060915052890778, "rewards/format_reward": 0.9897959232330322, "step": 1257 }, { "completion_length": 226.1938705444336, "epoch": 0.12659119496855345, "grad_norm": 0.6004989743232727, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7658530473709106, "reward_std": 0.1448923796415329, "rewards/accuracy_reward": 0.7760571539402008, "rewards/format_reward": 0.9897959232330322, "step": 1258 }, { "completion_length": 202.53060913085938, "epoch": 0.12669182389937106, "grad_norm": 1.068177580833435, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6629561185836792, "reward_std": 0.25391174852848053, "rewards/accuracy_reward": 0.673160195350647, "rewards/format_reward": 0.9897959232330322, "step": 1259 }, { "completion_length": 189.83673095703125, "epoch": 0.12679245283018867, "grad_norm": 1.134725570678711, "kl": 0.053955078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.835374116897583, "reward_std": 0.1337655708193779, "rewards/accuracy_reward": 0.8353741466999054, "rewards/format_reward": 1.0, "step": 1260 }, { "completion_length": 237.88774871826172, "epoch": 0.12689308176100628, "grad_norm": 0.9212029576301575, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8041677474975586, "reward_std": 0.24964533746242523, "rewards/accuracy_reward": 0.8143718242645264, "rewards/format_reward": 0.9897959232330322, "step": 1261 }, { "completion_length": 268.96937561035156, "epoch": 0.12699371069182389, "grad_norm": 0.952870786190033, "kl": 0.051513671875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6455106735229492, "reward_std": 0.2556961327791214, "rewards/accuracy_reward": 0.655714750289917, "rewards/format_reward": 0.9897959232330322, "step": 1262 }, { "completion_length": 229.5204086303711, "epoch": 0.12709433962264152, "grad_norm": 1.470183253288269, "kl": 0.0699462890625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7557204365730286, "reward_std": 0.13387205451726913, "rewards/accuracy_reward": 0.755720466375351, "rewards/format_reward": 1.0, "step": 1263 }, { "completion_length": 240.38775634765625, "epoch": 0.12719496855345913, "grad_norm": 1.4088586568832397, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.442828893661499, "reward_std": 0.19234386086463928, "rewards/accuracy_reward": 0.46323709189891815, "rewards/format_reward": 0.9795918464660645, "step": 1264 }, { "completion_length": 273.05101776123047, "epoch": 0.12729559748427674, "grad_norm": 0.796719491481781, "kl": 0.053955078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6795575022697449, "reward_std": 0.11222775280475616, "rewards/accuracy_reward": 0.6897615492343903, "rewards/format_reward": 0.9897959232330322, "step": 1265 }, { "completion_length": 214.88774871826172, "epoch": 0.12739622641509435, "grad_norm": 1.0102649927139282, "kl": 0.056884765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.4886179566383362, "reward_std": 0.3437570333480835, "rewards/accuracy_reward": 0.5294342637062073, "rewards/format_reward": 0.9591836333274841, "step": 1266 }, { "completion_length": 223.25509643554688, "epoch": 0.12749685534591196, "grad_norm": 0.7919471263885498, "kl": 0.0538330078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7628567218780518, "reward_std": 0.15526748448610306, "rewards/accuracy_reward": 0.7832649052143097, "rewards/format_reward": 0.9795918464660645, "step": 1267 }, { "completion_length": 173.12244415283203, "epoch": 0.12759748427672957, "grad_norm": 0.6113795042037964, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7843181490898132, "reward_std": 0.12882325053215027, "rewards/accuracy_reward": 0.794522225856781, "rewards/format_reward": 0.9897959232330322, "step": 1268 }, { "completion_length": 268.6836700439453, "epoch": 0.12769811320754718, "grad_norm": 0.6152809262275696, "kl": 0.0546875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.643197238445282, "reward_std": 0.248765729367733, "rewards/accuracy_reward": 0.6534013450145721, "rewards/format_reward": 0.9897959232330322, "step": 1269 }, { "completion_length": 213.5204086303711, "epoch": 0.12779874213836478, "grad_norm": 0.7815935015678406, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8151553273200989, "reward_std": 0.21153511852025986, "rewards/accuracy_reward": 0.8559716939926147, "rewards/format_reward": 0.9591836631298065, "step": 1270 }, { "completion_length": 249.8265151977539, "epoch": 0.1278993710691824, "grad_norm": 1.6985256671905518, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6372448205947876, "reward_std": 0.19012394547462463, "rewards/accuracy_reward": 0.6474489569664001, "rewards/format_reward": 0.9897959232330322, "step": 1271 }, { "completion_length": 187.10203552246094, "epoch": 0.128, "grad_norm": 0.8721681237220764, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.714739203453064, "reward_std": 0.1084303930401802, "rewards/accuracy_reward": 0.714739203453064, "rewards/format_reward": 1.0, "step": 1272 }, { "completion_length": 193.02040100097656, "epoch": 0.1281006289308176, "grad_norm": 0.7310689687728882, "kl": 0.0595703125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7360997796058655, "reward_std": 0.11771610751748085, "rewards/accuracy_reward": 0.7360997498035431, "rewards/format_reward": 1.0, "step": 1273 }, { "completion_length": 262.2448959350586, "epoch": 0.12820125786163522, "grad_norm": 0.7746087312698364, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7346938252449036, "reward_std": 0.21067723631858826, "rewards/accuracy_reward": 0.734693855047226, "rewards/format_reward": 1.0, "step": 1274 }, { "completion_length": 179.85713958740234, "epoch": 0.12830188679245283, "grad_norm": 2.405526876449585, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.822060227394104, "reward_std": 0.2610759884119034, "rewards/accuracy_reward": 0.8322643637657166, "rewards/format_reward": 0.9897959232330322, "step": 1275 }, { "completion_length": 239.9387664794922, "epoch": 0.12840251572327044, "grad_norm": 1.0637695789337158, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6174603700637817, "reward_std": 0.2471308559179306, "rewards/accuracy_reward": 0.6378684937953949, "rewards/format_reward": 0.9795918464660645, "step": 1276 }, { "completion_length": 317.07142639160156, "epoch": 0.12850314465408805, "grad_norm": 1.3834600448608398, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.4915726780891418, "reward_std": 0.2943621426820755, "rewards/accuracy_reward": 0.5221849530935287, "rewards/format_reward": 0.9693877398967743, "step": 1277 }, { "completion_length": 253.8775405883789, "epoch": 0.12860377358490566, "grad_norm": 0.8618311882019043, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6122448444366455, "reward_std": 0.17639156430959702, "rewards/accuracy_reward": 0.6224489510059357, "rewards/format_reward": 0.9897959232330322, "step": 1278 }, { "completion_length": 257.6734619140625, "epoch": 0.12870440251572327, "grad_norm": 1.0459452867507935, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6480280756950378, "reward_std": 0.2734718471765518, "rewards/accuracy_reward": 0.6582320928573608, "rewards/format_reward": 0.9897959232330322, "step": 1279 }, { "completion_length": 174.30612182617188, "epoch": 0.12880503144654087, "grad_norm": 1.151479721069336, "kl": 0.0546875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7047051787376404, "reward_std": 0.21104905754327774, "rewards/accuracy_reward": 0.7047052085399628, "rewards/format_reward": 1.0, "step": 1280 }, { "completion_length": 203.84693145751953, "epoch": 0.12890566037735848, "grad_norm": 0.47798284888267517, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6109461784362793, "reward_std": 0.07987084984779358, "rewards/accuracy_reward": 0.6109461784362793, "rewards/format_reward": 1.0, "step": 1281 }, { "completion_length": 225.36734008789062, "epoch": 0.1290062893081761, "grad_norm": 1.6154084205627441, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.686026394367218, "reward_std": 0.2871240973472595, "rewards/accuracy_reward": 0.7064344584941864, "rewards/format_reward": 0.9795918166637421, "step": 1282 }, { "completion_length": 252.88774871826172, "epoch": 0.1291069182389937, "grad_norm": 0.7531734704971313, "kl": 0.057861328125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6574620604515076, "reward_std": 0.26638617366552353, "rewards/accuracy_reward": 0.6880743503570557, "rewards/format_reward": 0.9693877398967743, "step": 1283 }, { "completion_length": 210.84693908691406, "epoch": 0.1292075471698113, "grad_norm": 0.46557316184043884, "kl": 0.0531005859375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7837706208229065, "reward_std": 0.08717088960111141, "rewards/accuracy_reward": 0.7939747273921967, "rewards/format_reward": 0.9897959232330322, "step": 1284 }, { "completion_length": 213.11223602294922, "epoch": 0.12930817610062892, "grad_norm": 0.5805326104164124, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.738722026348114, "reward_std": 0.16211120411753654, "rewards/accuracy_reward": 0.738722026348114, "rewards/format_reward": 1.0, "step": 1285 }, { "completion_length": 178.6836700439453, "epoch": 0.12940880503144653, "grad_norm": 1.1716870069503784, "kl": 0.052978515625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7757227420806885, "reward_std": 0.22257845103740692, "rewards/accuracy_reward": 0.7859268486499786, "rewards/format_reward": 0.9897959232330322, "step": 1286 }, { "completion_length": 282.89794921875, "epoch": 0.12950943396226416, "grad_norm": 0.9125722050666809, "kl": 0.05419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7448979020118713, "reward_std": 0.2985696643590927, "rewards/accuracy_reward": 0.775510162115097, "rewards/format_reward": 0.9693877398967743, "step": 1287 }, { "completion_length": 238.72447967529297, "epoch": 0.12961006289308177, "grad_norm": 0.9986587762832642, "kl": 0.05712890625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5360543727874756, "reward_std": 0.2500624358654022, "rewards/accuracy_reward": 0.5870748460292816, "rewards/format_reward": 0.9489795565605164, "step": 1288 }, { "completion_length": 250.79590606689453, "epoch": 0.12971069182389938, "grad_norm": 0.9221508502960205, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7448979020118713, "reward_std": 0.23938562721014023, "rewards/accuracy_reward": 0.7551020085811615, "rewards/format_reward": 0.9897959232330322, "step": 1289 }, { "completion_length": 249.0816192626953, "epoch": 0.129811320754717, "grad_norm": 1.5634914636611938, "kl": 0.0494384765625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6090654134750366, "reward_std": 0.4052543044090271, "rewards/accuracy_reward": 0.6600857675075531, "rewards/format_reward": 0.9489795565605164, "step": 1290 }, { "completion_length": 146.12244415283203, "epoch": 0.1299119496855346, "grad_norm": 1.2630548477172852, "kl": 0.0880126953125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7804575562477112, "reward_std": 0.12808483839035034, "rewards/accuracy_reward": 0.7906616926193237, "rewards/format_reward": 0.9897959232330322, "step": 1291 }, { "completion_length": 271.4591751098633, "epoch": 0.1300125786163522, "grad_norm": 1.980387806892395, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7481723427772522, "reward_std": 0.2731269523501396, "rewards/accuracy_reward": 0.7685804665088654, "rewards/format_reward": 0.9795918166637421, "step": 1292 }, { "completion_length": 292.3877487182617, "epoch": 0.13011320754716982, "grad_norm": 1.4231818914413452, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5151907205581665, "reward_std": 0.32378867268562317, "rewards/accuracy_reward": 0.5253948867321014, "rewards/format_reward": 0.9897959232330322, "step": 1293 }, { "completion_length": 284.89795684814453, "epoch": 0.13021383647798743, "grad_norm": 0.9976562857627869, "kl": 0.0550537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6936345100402832, "reward_std": 0.15519605576992035, "rewards/accuracy_reward": 0.6936345398426056, "rewards/format_reward": 1.0, "step": 1294 }, { "completion_length": 298.29590606689453, "epoch": 0.13031446540880504, "grad_norm": 10.260540962219238, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5482771396636963, "reward_std": 0.28232719004154205, "rewards/accuracy_reward": 0.5584813058376312, "rewards/format_reward": 0.9897959232330322, "step": 1295 }, { "completion_length": 236.34693908691406, "epoch": 0.13041509433962264, "grad_norm": 0.6031968593597412, "kl": 0.0640869140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7190475463867188, "reward_std": 0.15171435847878456, "rewards/accuracy_reward": 0.7394557893276215, "rewards/format_reward": 0.9795918166637421, "step": 1296 }, { "completion_length": 212.9387664794922, "epoch": 0.13051572327044025, "grad_norm": 0.9716517329216003, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6064658164978027, "reward_std": 0.29327670484781265, "rewards/accuracy_reward": 0.6370781064033508, "rewards/format_reward": 0.9693877398967743, "step": 1297 }, { "completion_length": 246.61223602294922, "epoch": 0.13061635220125786, "grad_norm": 1.18129563331604, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.668285846710205, "reward_std": 0.17532473802566528, "rewards/accuracy_reward": 0.6784899532794952, "rewards/format_reward": 0.9897959232330322, "step": 1298 }, { "completion_length": 234.39795684814453, "epoch": 0.13071698113207547, "grad_norm": 3.0647313594818115, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7060251832008362, "reward_std": 0.19697970896959305, "rewards/accuracy_reward": 0.7264333963394165, "rewards/format_reward": 0.9795918464660645, "step": 1299 }, { "completion_length": 283.32652282714844, "epoch": 0.13081761006289308, "grad_norm": 0.7100619077682495, "kl": 0.045654296875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7089407444000244, "reward_std": 0.21449201926589012, "rewards/accuracy_reward": 0.7293488383293152, "rewards/format_reward": 0.9795918166637421, "step": 1300 }, { "completion_length": 238.36734008789062, "epoch": 0.1309182389937107, "grad_norm": 1.9894623756408691, "kl": 0.0723876953125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6473716497421265, "reward_std": 0.17655545473098755, "rewards/accuracy_reward": 0.6473716497421265, "rewards/format_reward": 1.0, "step": 1301 }, { "completion_length": 237.95917510986328, "epoch": 0.1310188679245283, "grad_norm": 1.0688790082931519, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6618682742118835, "reward_std": 0.1588471159338951, "rewards/accuracy_reward": 0.6618683040142059, "rewards/format_reward": 1.0, "step": 1302 }, { "completion_length": 222.4897918701172, "epoch": 0.1311194968553459, "grad_norm": 0.8840660452842712, "kl": 0.0748291015625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8287981152534485, "reward_std": 0.15669895708560944, "rewards/accuracy_reward": 0.8390022218227386, "rewards/format_reward": 0.9897959232330322, "step": 1303 }, { "completion_length": 284.8061065673828, "epoch": 0.13122012578616352, "grad_norm": 0.672796368598938, "kl": 0.04150390625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.611716091632843, "reward_std": 0.2385607808828354, "rewards/accuracy_reward": 0.6219202280044556, "rewards/format_reward": 0.9897959232330322, "step": 1304 }, { "completion_length": 233.9795913696289, "epoch": 0.13132075471698113, "grad_norm": 0.649412214756012, "kl": 0.0587158203125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.682823121547699, "reward_std": 0.17175141721963882, "rewards/accuracy_reward": 0.682823121547699, "rewards/format_reward": 1.0, "step": 1305 }, { "completion_length": 226.4183578491211, "epoch": 0.13142138364779873, "grad_norm": 0.6882494688034058, "kl": 0.0709228515625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7996288537979126, "reward_std": 0.21159129217267036, "rewards/accuracy_reward": 0.8098329901695251, "rewards/format_reward": 0.9897959232330322, "step": 1306 }, { "completion_length": 274.8673400878906, "epoch": 0.13152201257861634, "grad_norm": 0.8322039246559143, "kl": 0.05517578125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6181021928787231, "reward_std": 0.11812161281704903, "rewards/accuracy_reward": 0.6283063292503357, "rewards/format_reward": 0.9897959232330322, "step": 1307 }, { "completion_length": 284.10203552246094, "epoch": 0.13162264150943395, "grad_norm": 1.1032679080963135, "kl": 0.0416259765625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7219732403755188, "reward_std": 0.23957448452711105, "rewards/accuracy_reward": 0.721973329782486, "rewards/format_reward": 1.0, "step": 1308 }, { "completion_length": 302.06121826171875, "epoch": 0.13172327044025156, "grad_norm": 0.5461410880088806, "kl": 0.0570068359375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.654795229434967, "reward_std": 0.18496603518724442, "rewards/accuracy_reward": 0.6649993062019348, "rewards/format_reward": 0.9897959232330322, "step": 1309 }, { "completion_length": 173.39795684814453, "epoch": 0.13182389937106917, "grad_norm": 1.177485466003418, "kl": 0.0672607421875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8483721613883972, "reward_std": 0.1401918325573206, "rewards/accuracy_reward": 0.8483722805976868, "rewards/format_reward": 1.0, "step": 1310 }, { "completion_length": 260.30611419677734, "epoch": 0.13192452830188678, "grad_norm": 0.6591253876686096, "kl": 0.05517578125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6103016138076782, "reward_std": 0.16491889208555222, "rewards/accuracy_reward": 0.610301673412323, "rewards/format_reward": 1.0, "step": 1311 }, { "completion_length": 235.31632232666016, "epoch": 0.13202515723270442, "grad_norm": 0.9253599047660828, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.4716553688049316, "reward_std": 0.13661131262779236, "rewards/accuracy_reward": 0.47165530920028687, "rewards/format_reward": 1.0, "step": 1312 }, { "completion_length": 248.948974609375, "epoch": 0.13212578616352202, "grad_norm": 1.0934492349624634, "kl": 0.050537109375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.664621114730835, "reward_std": 0.14244595170021057, "rewards/accuracy_reward": 0.6646210551261902, "rewards/format_reward": 1.0, "step": 1313 }, { "completion_length": 304.37754821777344, "epoch": 0.13222641509433963, "grad_norm": 1.0545190572738647, "kl": 0.057373046875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.4423623085021973, "reward_std": 0.24105154722929, "rewards/accuracy_reward": 0.4627705365419388, "rewards/format_reward": 0.9795918166637421, "step": 1314 }, { "completion_length": 182.40816497802734, "epoch": 0.13232704402515724, "grad_norm": 0.5608872175216675, "kl": 0.0426025390625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7402129173278809, "reward_std": 0.07283179648220539, "rewards/accuracy_reward": 0.750417023897171, "rewards/format_reward": 0.9897959232330322, "step": 1315 }, { "completion_length": 222.95917510986328, "epoch": 0.13242767295597485, "grad_norm": 0.9955965876579285, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7148407697677612, "reward_std": 0.16200219094753265, "rewards/accuracy_reward": 0.7148407995700836, "rewards/format_reward": 1.0, "step": 1316 }, { "completion_length": 208.31632232666016, "epoch": 0.13252830188679246, "grad_norm": 13.91958999633789, "kl": 0.0545654296875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.599462628364563, "reward_std": 0.1631702035665512, "rewards/accuracy_reward": 0.5994627326726913, "rewards/format_reward": 1.0, "step": 1317 }, { "completion_length": 223.4081573486328, "epoch": 0.13262893081761007, "grad_norm": 0.7805994153022766, "kl": 0.0458984375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.8163264989852905, "reward_std": 0.21789833158254623, "rewards/accuracy_reward": 0.8265306055545807, "rewards/format_reward": 0.9897959232330322, "step": 1318 }, { "completion_length": 275.051025390625, "epoch": 0.13272955974842768, "grad_norm": 0.6112779378890991, "kl": 0.0550537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7189238667488098, "reward_std": 0.26854611933231354, "rewards/accuracy_reward": 0.7291279733181, "rewards/format_reward": 0.9897959232330322, "step": 1319 }, { "completion_length": 222.61223602294922, "epoch": 0.1328301886792453, "grad_norm": 1.4257644414901733, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6501360535621643, "reward_std": 0.25246938318014145, "rewards/accuracy_reward": 0.6603401601314545, "rewards/format_reward": 0.9897959232330322, "step": 1320 }, { "completion_length": 279.7244873046875, "epoch": 0.1329308176100629, "grad_norm": 2.033696413040161, "kl": 0.0823974609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.646873652935028, "reward_std": 0.2276536524295807, "rewards/accuracy_reward": 0.657077819108963, "rewards/format_reward": 0.9897959232330322, "step": 1321 }, { "completion_length": 320.72447967529297, "epoch": 0.1330314465408805, "grad_norm": 0.6803203821182251, "kl": 0.04638671875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.711901068687439, "reward_std": 0.2198989912867546, "rewards/accuracy_reward": 0.7425133585929871, "rewards/format_reward": 0.9693877398967743, "step": 1322 }, { "completion_length": 272.2653045654297, "epoch": 0.1331320754716981, "grad_norm": 1.9213615655899048, "kl": 0.0517578125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.622448980808258, "reward_std": 0.2620159685611725, "rewards/accuracy_reward": 0.6326530277729034, "rewards/format_reward": 0.9897959232330322, "step": 1323 }, { "completion_length": 250.55101013183594, "epoch": 0.13323270440251572, "grad_norm": 2.0927698612213135, "kl": 0.054931640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6059369444847107, "reward_std": 0.22456326335668564, "rewards/accuracy_reward": 0.6059369146823883, "rewards/format_reward": 1.0, "step": 1324 }, { "completion_length": 231.79591369628906, "epoch": 0.13333333333333333, "grad_norm": 1.2484511137008667, "kl": 0.0694580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7366374731063843, "reward_std": 0.2002703920006752, "rewards/accuracy_reward": 0.7366374731063843, "rewards/format_reward": 1.0, "step": 1325 }, { "completion_length": 273.35713958740234, "epoch": 0.13343396226415094, "grad_norm": 0.7623526453971863, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6643990874290466, "reward_std": 0.1510264202952385, "rewards/accuracy_reward": 0.6643990576267242, "rewards/format_reward": 1.0, "step": 1326 }, { "completion_length": 296.95916748046875, "epoch": 0.13353459119496855, "grad_norm": 0.8193209171295166, "kl": 0.0406494140625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.544379711151123, "reward_std": 0.2519368901848793, "rewards/accuracy_reward": 0.554583728313446, "rewards/format_reward": 0.9897959232330322, "step": 1327 }, { "completion_length": 228.60203552246094, "epoch": 0.13363522012578616, "grad_norm": 0.565025806427002, "kl": 0.0506591796875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.8857142925262451, "reward_std": 0.1332312934100628, "rewards/accuracy_reward": 0.8857142627239227, "rewards/format_reward": 1.0, "step": 1328 }, { "completion_length": 233.91836547851562, "epoch": 0.13373584905660377, "grad_norm": 0.9322335720062256, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7187317609786987, "reward_std": 0.18293076753616333, "rewards/accuracy_reward": 0.7289358377456665, "rewards/format_reward": 0.9897959232330322, "step": 1329 }, { "completion_length": 281.37755584716797, "epoch": 0.13383647798742138, "grad_norm": 0.6684494018554688, "kl": 0.04248046875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.524819552898407, "reward_std": 0.1610134020447731, "rewards/accuracy_reward": 0.5350236892700195, "rewards/format_reward": 0.9897959232330322, "step": 1330 }, { "completion_length": 302.56121826171875, "epoch": 0.13393710691823899, "grad_norm": 0.6658238172531128, "kl": 0.0635986328125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6889008283615112, "reward_std": 0.2703295275568962, "rewards/accuracy_reward": 0.7195129990577698, "rewards/format_reward": 0.9693877398967743, "step": 1331 }, { "completion_length": 279.4897918701172, "epoch": 0.1340377358490566, "grad_norm": 0.9554463028907776, "kl": 0.0517578125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7250472903251648, "reward_std": 0.24125637114048004, "rewards/accuracy_reward": 0.7352513670921326, "rewards/format_reward": 0.9897959232330322, "step": 1332 }, { "completion_length": 282.22447967529297, "epoch": 0.1341383647798742, "grad_norm": 0.5749581456184387, "kl": 0.03302001953125, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.694942593574524, "reward_std": 0.09669839451089501, "rewards/accuracy_reward": 0.6949427127838135, "rewards/format_reward": 1.0, "step": 1333 }, { "completion_length": 280.0, "epoch": 0.1342389937106918, "grad_norm": 0.9212165474891663, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6326530575752258, "reward_std": 0.17969852685928345, "rewards/accuracy_reward": 0.6836734414100647, "rewards/format_reward": 0.9489795565605164, "step": 1334 }, { "completion_length": 214.4897918701172, "epoch": 0.13433962264150942, "grad_norm": 1.4673612117767334, "kl": 0.056640625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6573038697242737, "reward_std": 0.21305564790964127, "rewards/accuracy_reward": 0.6675080358982086, "rewards/format_reward": 0.9897959232330322, "step": 1335 }, { "completion_length": 309.7142791748047, "epoch": 0.13444025157232706, "grad_norm": 0.7633962035179138, "kl": 0.04296875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5816910862922668, "reward_std": 0.2812094986438751, "rewards/accuracy_reward": 0.602099284529686, "rewards/format_reward": 0.9795918166637421, "step": 1336 }, { "completion_length": 248.4897918701172, "epoch": 0.13454088050314467, "grad_norm": 1.293318510055542, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.532653033733368, "reward_std": 0.19476545602083206, "rewards/accuracy_reward": 0.5428571403026581, "rewards/format_reward": 0.9897959232330322, "step": 1337 }, { "completion_length": 217.2346954345703, "epoch": 0.13464150943396228, "grad_norm": 1.1661478281021118, "kl": 0.04833984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6794649362564087, "reward_std": 0.14438899606466293, "rewards/accuracy_reward": 0.6794649660587311, "rewards/format_reward": 1.0, "step": 1338 }, { "completion_length": 224.7142791748047, "epoch": 0.13474213836477988, "grad_norm": 0.3627180755138397, "kl": 0.050048828125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.467532455921173, "reward_std": 0.05399492383003235, "rewards/accuracy_reward": 0.4675324410200119, "rewards/format_reward": 1.0, "step": 1339 }, { "completion_length": 293.2653045654297, "epoch": 0.1348427672955975, "grad_norm": 0.7797254920005798, "kl": 0.059326171875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7172335982322693, "reward_std": 0.202999085187912, "rewards/accuracy_reward": 0.7274376451969147, "rewards/format_reward": 0.9897959232330322, "step": 1340 }, { "completion_length": 155.84693145751953, "epoch": 0.1349433962264151, "grad_norm": 0.990785539150238, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.721088469028473, "reward_std": 0.12145347520709038, "rewards/accuracy_reward": 0.7210884094238281, "rewards/format_reward": 1.0, "step": 1341 }, { "completion_length": 226.03060913085938, "epoch": 0.1350440251572327, "grad_norm": 1.161887764930725, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6050047874450684, "reward_std": 0.23058540374040604, "rewards/accuracy_reward": 0.6254130005836487, "rewards/format_reward": 0.9795918166637421, "step": 1342 }, { "completion_length": 248.78571319580078, "epoch": 0.13514465408805032, "grad_norm": 0.6092151999473572, "kl": 0.0587158203125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6217686533927917, "reward_std": 0.21834171563386917, "rewards/accuracy_reward": 0.6319728046655655, "rewards/format_reward": 0.9897959232330322, "step": 1343 }, { "completion_length": 338.39794921875, "epoch": 0.13524528301886793, "grad_norm": 0.6245805025100708, "kl": 0.048583984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7650061249732971, "reward_std": 0.13864022865891457, "rewards/accuracy_reward": 0.7650061249732971, "rewards/format_reward": 1.0, "step": 1344 }, { "completion_length": 236.9591827392578, "epoch": 0.13534591194968554, "grad_norm": 0.723678708076477, "kl": 0.0419921875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6427415013313293, "reward_std": 0.11956577748060226, "rewards/accuracy_reward": 0.6529455780982971, "rewards/format_reward": 0.9897959232330322, "step": 1345 }, { "completion_length": 237.52040100097656, "epoch": 0.13544654088050315, "grad_norm": 1.0454683303833008, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.658402442932129, "reward_std": 0.2654537856578827, "rewards/accuracy_reward": 0.6584024727344513, "rewards/format_reward": 1.0, "step": 1346 }, { "completion_length": 213.82653045654297, "epoch": 0.13554716981132076, "grad_norm": 1.0452563762664795, "kl": 0.0567626953125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6666666269302368, "reward_std": 0.23535112291574478, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "step": 1347 }, { "completion_length": 218.4081573486328, "epoch": 0.13564779874213836, "grad_norm": 0.8833997845649719, "kl": 0.049560546875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7137420773506165, "reward_std": 0.1788826510310173, "rewards/accuracy_reward": 0.7137420177459717, "rewards/format_reward": 1.0, "step": 1348 }, { "completion_length": 307.29591369628906, "epoch": 0.13574842767295597, "grad_norm": 0.9598141312599182, "kl": 0.054931640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5964903831481934, "reward_std": 0.26796701550483704, "rewards/accuracy_reward": 0.5964904129505157, "rewards/format_reward": 1.0, "step": 1349 }, { "completion_length": 215.86734771728516, "epoch": 0.13584905660377358, "grad_norm": 4.911957263946533, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7211657762527466, "reward_std": 0.24774304777383804, "rewards/accuracy_reward": 0.7415738999843597, "rewards/format_reward": 0.9795918464660645, "step": 1350 }, { "completion_length": 289.1122360229492, "epoch": 0.1359496855345912, "grad_norm": 3.0822243690490723, "kl": 0.0474853515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.506336271762848, "reward_std": 0.24246850609779358, "rewards/accuracy_reward": 0.5063362866640091, "rewards/format_reward": 1.0, "step": 1351 }, { "completion_length": 267.6836700439453, "epoch": 0.1360503144654088, "grad_norm": 1.4356486797332764, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.617349922657013, "reward_std": 0.26899974048137665, "rewards/accuracy_reward": 0.6173499226570129, "rewards/format_reward": 1.0, "step": 1352 }, { "completion_length": 182.75509643554688, "epoch": 0.1361509433962264, "grad_norm": 0.6148504018783569, "kl": 0.052001953125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.873311161994934, "reward_std": 0.1397193968296051, "rewards/accuracy_reward": 0.8835152983665466, "rewards/format_reward": 0.9897959232330322, "step": 1353 }, { "completion_length": 247.80611419677734, "epoch": 0.13625157232704402, "grad_norm": 2.301060914993286, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6157081127166748, "reward_std": 0.17287123762071133, "rewards/accuracy_reward": 0.625912219285965, "rewards/format_reward": 0.9897959232330322, "step": 1354 }, { "completion_length": 251.0, "epoch": 0.13635220125786163, "grad_norm": 1.0989879369735718, "kl": 0.053466796875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6195334792137146, "reward_std": 0.20773016661405563, "rewards/accuracy_reward": 0.6195335388183594, "rewards/format_reward": 1.0, "step": 1355 }, { "completion_length": 219.07141876220703, "epoch": 0.13645283018867924, "grad_norm": 0.7266227006912231, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7568026781082153, "reward_std": 0.19454354047775269, "rewards/accuracy_reward": 0.7670067846775055, "rewards/format_reward": 0.9897959232330322, "step": 1356 }, { "completion_length": 383.39794921875, "epoch": 0.13655345911949685, "grad_norm": 0.7033111453056335, "kl": 0.03424072265625, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.6020408272743225, "reward_std": 0.35076195001602173, "rewards/accuracy_reward": 0.6224489808082581, "rewards/format_reward": 0.9795918166637421, "step": 1357 }, { "completion_length": 306.2142791748047, "epoch": 0.13665408805031445, "grad_norm": 0.6804506182670593, "kl": 0.0504150390625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7556063532829285, "reward_std": 0.25738997012376785, "rewards/accuracy_reward": 0.7658104300498962, "rewards/format_reward": 0.9897959232330322, "step": 1358 }, { "completion_length": 240.54080963134766, "epoch": 0.13675471698113206, "grad_norm": 0.3034042716026306, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8149173855781555, "reward_std": 0.09296064078807831, "rewards/accuracy_reward": 0.8149174153804779, "rewards/format_reward": 1.0, "step": 1359 }, { "completion_length": 293.83673095703125, "epoch": 0.13685534591194967, "grad_norm": 0.5313437581062317, "kl": 0.0404052734375, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.8129251599311829, "reward_std": 0.21691767871379852, "rewards/accuracy_reward": 0.8129251301288605, "rewards/format_reward": 1.0, "step": 1360 }, { "completion_length": 245.4897918701172, "epoch": 0.1369559748427673, "grad_norm": 0.5887289643287659, "kl": 0.0589599609375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7356215119361877, "reward_std": 0.12100109085440636, "rewards/accuracy_reward": 0.7356215119361877, "rewards/format_reward": 1.0, "step": 1361 }, { "completion_length": 234.65306091308594, "epoch": 0.13705660377358492, "grad_norm": 0.5680229067802429, "kl": 0.04638671875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.750566840171814, "reward_std": 0.12792087346315384, "rewards/accuracy_reward": 0.750566840171814, "rewards/format_reward": 1.0, "step": 1362 }, { "completion_length": 277.9183578491211, "epoch": 0.13715723270440253, "grad_norm": 1.2121809720993042, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.5598639845848083, "reward_std": 0.21324753016233444, "rewards/accuracy_reward": 0.5700680166482925, "rewards/format_reward": 0.9897959232330322, "step": 1363 }, { "completion_length": 331.8163299560547, "epoch": 0.13725786163522014, "grad_norm": 0.5831614136695862, "kl": 0.0472412109375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6290982961654663, "reward_std": 0.21131908148527145, "rewards/accuracy_reward": 0.6495064496994019, "rewards/format_reward": 0.9795918166637421, "step": 1364 }, { "completion_length": 210.81632232666016, "epoch": 0.13735849056603774, "grad_norm": 0.8053040504455566, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8048434853553772, "reward_std": 0.1418289840221405, "rewards/accuracy_reward": 0.8048435151576996, "rewards/format_reward": 1.0, "step": 1365 }, { "completion_length": 241.6632537841797, "epoch": 0.13745911949685535, "grad_norm": 1.0756152868270874, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6573129296302795, "reward_std": 0.10342403501272202, "rewards/accuracy_reward": 0.6573128998279572, "rewards/format_reward": 1.0, "step": 1366 }, { "completion_length": 271.7653045654297, "epoch": 0.13755974842767296, "grad_norm": 1.0134012699127197, "kl": 0.0750732421875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6088098883628845, "reward_std": 0.19518554210662842, "rewards/accuracy_reward": 0.6088100075721741, "rewards/format_reward": 1.0, "step": 1367 }, { "completion_length": 284.11224365234375, "epoch": 0.13766037735849057, "grad_norm": 1.1089810132980347, "kl": 0.0576171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.427750825881958, "reward_std": 0.1860407516360283, "rewards/accuracy_reward": 0.4277508705854416, "rewards/format_reward": 1.0, "step": 1368 }, { "completion_length": 198.6530532836914, "epoch": 0.13776100628930818, "grad_norm": 17.750349044799805, "kl": 0.5955810546875, "learning_rate": 1e-06, "loss": 0.0238, "reward": 1.7429043054580688, "reward_std": 0.21648748964071274, "rewards/accuracy_reward": 0.7633124887943268, "rewards/format_reward": 0.9795918166637421, "step": 1369 }, { "completion_length": 302.0816345214844, "epoch": 0.1378616352201258, "grad_norm": 0.789045512676239, "kl": 0.037109375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.6302546858787537, "reward_std": 0.24606168270111084, "rewards/accuracy_reward": 0.6506628394126892, "rewards/format_reward": 0.9795918464660645, "step": 1370 }, { "completion_length": 179.15306091308594, "epoch": 0.1379622641509434, "grad_norm": 0.38055506348609924, "kl": 0.06103515625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.9501800537109375, "reward_std": 0.06417360156774521, "rewards/accuracy_reward": 0.9501800537109375, "rewards/format_reward": 1.0, "step": 1371 }, { "completion_length": 306.49998474121094, "epoch": 0.138062893081761, "grad_norm": 0.46030542254447937, "kl": 0.04541015625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.5714285373687744, "reward_std": 0.1730649657547474, "rewards/accuracy_reward": 0.5714285671710968, "rewards/format_reward": 1.0, "step": 1372 }, { "completion_length": 220.77550506591797, "epoch": 0.13816352201257862, "grad_norm": 1.8481624126434326, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6939481496810913, "reward_std": 0.2386472336947918, "rewards/accuracy_reward": 0.6939482092857361, "rewards/format_reward": 1.0, "step": 1373 }, { "completion_length": 182.05101776123047, "epoch": 0.13826415094339622, "grad_norm": 1.408851981163025, "kl": 0.059814453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7874149680137634, "reward_std": 0.17562639713287354, "rewards/accuracy_reward": 0.7874149680137634, "rewards/format_reward": 1.0, "step": 1374 }, { "completion_length": 255.27550506591797, "epoch": 0.13836477987421383, "grad_norm": 0.7309021353721619, "kl": 0.0491943359375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.656098186969757, "reward_std": 0.16372204944491386, "rewards/accuracy_reward": 0.6663022339344025, "rewards/format_reward": 0.9897959232330322, "step": 1375 }, { "completion_length": 240.1326446533203, "epoch": 0.13846540880503144, "grad_norm": 1.1500657796859741, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6968675255775452, "reward_std": 0.2253112494945526, "rewards/accuracy_reward": 0.6968676149845123, "rewards/format_reward": 1.0, "step": 1376 }, { "completion_length": 239.6632537841797, "epoch": 0.13856603773584905, "grad_norm": 0.900080680847168, "kl": 0.046630859375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.779614269733429, "reward_std": 0.2400095909833908, "rewards/accuracy_reward": 0.789818286895752, "rewards/format_reward": 0.9897959232330322, "step": 1377 }, { "completion_length": 236.61223602294922, "epoch": 0.13866666666666666, "grad_norm": 14.283405303955078, "kl": 0.0472412109375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6899659633636475, "reward_std": 0.2337135151028633, "rewards/accuracy_reward": 0.6899659931659698, "rewards/format_reward": 1.0, "step": 1378 }, { "completion_length": 271.4693908691406, "epoch": 0.13876729559748427, "grad_norm": 1.0137579441070557, "kl": 0.0550537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7678328156471252, "reward_std": 0.2589483931660652, "rewards/accuracy_reward": 0.7780369222164154, "rewards/format_reward": 0.9897959232330322, "step": 1379 }, { "completion_length": 309.5918273925781, "epoch": 0.13886792452830188, "grad_norm": 0.9885379672050476, "kl": 0.0528564453125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5998526215553284, "reward_std": 0.2903997302055359, "rewards/accuracy_reward": 0.6202608346939087, "rewards/format_reward": 0.9795918464660645, "step": 1380 }, { "completion_length": 260.32652282714844, "epoch": 0.1389685534591195, "grad_norm": 0.6113532781600952, "kl": 0.065185546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.683717429637909, "reward_std": 0.06185410916805267, "rewards/accuracy_reward": 0.7143296599388123, "rewards/format_reward": 0.9693877398967743, "step": 1381 }, { "completion_length": 278.6428527832031, "epoch": 0.1390691823899371, "grad_norm": 0.9861185550689697, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6675066947937012, "reward_std": 0.21808961778879166, "rewards/accuracy_reward": 0.6675066947937012, "rewards/format_reward": 1.0, "step": 1382 }, { "completion_length": 207.58162689208984, "epoch": 0.1391698113207547, "grad_norm": 0.9560715556144714, "kl": 0.04638671875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7100598216056824, "reward_std": 0.2110573723912239, "rewards/accuracy_reward": 0.71005979180336, "rewards/format_reward": 1.0, "step": 1383 }, { "completion_length": 237.80611419677734, "epoch": 0.13927044025157231, "grad_norm": 0.8263121247291565, "kl": 0.0750732421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7342737913131714, "reward_std": 0.19349191337823868, "rewards/accuracy_reward": 0.7444778978824615, "rewards/format_reward": 0.9897959232330322, "step": 1384 }, { "completion_length": 159.14285278320312, "epoch": 0.13937106918238995, "grad_norm": 1.369434118270874, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.9274376034736633, "reward_std": 0.12125836312770844, "rewards/accuracy_reward": 0.9274376630783081, "rewards/format_reward": 1.0, "step": 1385 }, { "completion_length": 157.78571319580078, "epoch": 0.13947169811320756, "grad_norm": 1.3892778158187866, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7519292831420898, "reward_std": 0.2557704374194145, "rewards/accuracy_reward": 0.7723374664783478, "rewards/format_reward": 0.9795918166637421, "step": 1386 }, { "completion_length": 260.17346954345703, "epoch": 0.13957232704402517, "grad_norm": 1.2239108085632324, "kl": 0.0606689453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6200128197669983, "reward_std": 0.28514227271080017, "rewards/accuracy_reward": 0.640421062707901, "rewards/format_reward": 0.9795918464660645, "step": 1387 }, { "completion_length": 202.72447967529297, "epoch": 0.13967295597484278, "grad_norm": 1.3169604539871216, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7904238104820251, "reward_std": 0.193658247590065, "rewards/accuracy_reward": 0.7904238700866699, "rewards/format_reward": 1.0, "step": 1388 }, { "completion_length": 217.83673095703125, "epoch": 0.1397735849056604, "grad_norm": 0.7401008009910583, "kl": 0.062744140625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6832199692726135, "reward_std": 0.21467530727386475, "rewards/accuracy_reward": 0.6934240162372589, "rewards/format_reward": 0.9897959232330322, "step": 1389 }, { "completion_length": 308.5306091308594, "epoch": 0.139874213836478, "grad_norm": 1.3385610580444336, "kl": 0.044677734375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.4943619966506958, "reward_std": 0.3063310384750366, "rewards/accuracy_reward": 0.5249742865562439, "rewards/format_reward": 0.9693877398967743, "step": 1390 }, { "completion_length": 248.25508880615234, "epoch": 0.1399748427672956, "grad_norm": 0.6707879900932312, "kl": 0.039306640625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.7692956328392029, "reward_std": 0.23380382359027863, "rewards/accuracy_reward": 0.7897038161754608, "rewards/format_reward": 0.9795918464660645, "step": 1391 }, { "completion_length": 170.33673095703125, "epoch": 0.1400754716981132, "grad_norm": 1.3078945875167847, "kl": 0.0506591796875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7677695155143738, "reward_std": 0.21280376613140106, "rewards/accuracy_reward": 0.7677695453166962, "rewards/format_reward": 1.0, "step": 1392 }, { "completion_length": 260.6938781738281, "epoch": 0.14017610062893082, "grad_norm": 4.958102703094482, "kl": 0.0604248046875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6494582295417786, "reward_std": 0.16100051999092102, "rewards/accuracy_reward": 0.6494583040475845, "rewards/format_reward": 1.0, "step": 1393 }, { "completion_length": 249.9693832397461, "epoch": 0.14027672955974843, "grad_norm": 1.7878516912460327, "kl": 0.0443115234375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6344985365867615, "reward_std": 0.3438549339771271, "rewards/accuracy_reward": 0.6549066603183746, "rewards/format_reward": 0.9795918464660645, "step": 1394 }, { "completion_length": 286.8775405883789, "epoch": 0.14037735849056604, "grad_norm": 0.9170917868614197, "kl": 0.04345703125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7803692817687988, "reward_std": 0.19900153577327728, "rewards/accuracy_reward": 0.7905733585357666, "rewards/format_reward": 0.9897959232330322, "step": 1395 }, { "completion_length": 294.29591369628906, "epoch": 0.14047798742138365, "grad_norm": 1.1825778484344482, "kl": 0.037353515625, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5136588215827942, "reward_std": 0.2814435139298439, "rewards/accuracy_reward": 0.5136589258909225, "rewards/format_reward": 1.0, "step": 1396 }, { "completion_length": 279.1428527832031, "epoch": 0.14057861635220126, "grad_norm": 0.783035159111023, "kl": 0.0419921875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6682539582252502, "reward_std": 0.2811116576194763, "rewards/accuracy_reward": 0.6682539582252502, "rewards/format_reward": 1.0, "step": 1397 }, { "completion_length": 352.96937561035156, "epoch": 0.14067924528301887, "grad_norm": 0.7730596661567688, "kl": 0.039306640625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.5871585011482239, "reward_std": 0.25736576318740845, "rewards/accuracy_reward": 0.6279748678207397, "rewards/format_reward": 0.9591836631298065, "step": 1398 }, { "completion_length": 252.28571319580078, "epoch": 0.14077987421383648, "grad_norm": 0.8524631857872009, "kl": 0.05322265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.620402216911316, "reward_std": 0.1854339838027954, "rewards/accuracy_reward": 0.6204021871089935, "rewards/format_reward": 1.0, "step": 1399 }, { "completion_length": 292.29591369628906, "epoch": 0.14088050314465408, "grad_norm": 1.4698421955108643, "kl": 0.0404052734375, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.7057543992996216, "reward_std": 0.15587684512138367, "rewards/accuracy_reward": 0.705754429101944, "rewards/format_reward": 1.0, "step": 1400 }, { "completion_length": 293.82652282714844, "epoch": 0.1409811320754717, "grad_norm": 0.5422400236129761, "kl": 0.0489501953125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.693388819694519, "reward_std": 0.19138260185718536, "rewards/accuracy_reward": 0.7035928666591644, "rewards/format_reward": 0.9897959232330322, "step": 1401 }, { "completion_length": 247.9795913696289, "epoch": 0.1410817610062893, "grad_norm": 0.5377397537231445, "kl": 0.0411376953125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.8543047308921814, "reward_std": 0.1553323920816183, "rewards/accuracy_reward": 0.864508867263794, "rewards/format_reward": 0.9897959232330322, "step": 1402 }, { "completion_length": 305.7040710449219, "epoch": 0.1411823899371069, "grad_norm": 1.1278091669082642, "kl": 0.0499267578125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6814184188842773, "reward_std": 0.2249266654253006, "rewards/accuracy_reward": 0.6916224658489227, "rewards/format_reward": 0.9897959232330322, "step": 1403 }, { "completion_length": 209.7448959350586, "epoch": 0.14128301886792452, "grad_norm": 1.9386510848999023, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7210884094238281, "reward_std": 0.2536221891641617, "rewards/accuracy_reward": 0.7210884392261505, "rewards/format_reward": 1.0, "step": 1404 }, { "completion_length": 397.4795837402344, "epoch": 0.14138364779874213, "grad_norm": 0.6979507803916931, "kl": 0.03375244140625, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.4319562315940857, "reward_std": 0.2351449877023697, "rewards/accuracy_reward": 0.4523644894361496, "rewards/format_reward": 0.9795918464660645, "step": 1405 }, { "completion_length": 333.4387664794922, "epoch": 0.14148427672955974, "grad_norm": 2.872699737548828, "kl": 0.0499267578125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5686944723129272, "reward_std": 0.3473801165819168, "rewards/accuracy_reward": 0.5993067622184753, "rewards/format_reward": 0.9693877398967743, "step": 1406 }, { "completion_length": 280.94896697998047, "epoch": 0.14158490566037735, "grad_norm": 2.4438705444335938, "kl": 0.0626220703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5876570343971252, "reward_std": 0.3026790916919708, "rewards/accuracy_reward": 0.6080652475357056, "rewards/format_reward": 0.9795918166637421, "step": 1407 }, { "completion_length": 266.9387664794922, "epoch": 0.14168553459119496, "grad_norm": 1.0556423664093018, "kl": 0.04833984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6069815158843994, "reward_std": 0.2223409339785576, "rewards/accuracy_reward": 0.6069815754890442, "rewards/format_reward": 1.0, "step": 1408 }, { "completion_length": 265.6530456542969, "epoch": 0.14178616352201256, "grad_norm": 0.8055673241615295, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.555349349975586, "reward_std": 0.2032548487186432, "rewards/accuracy_reward": 0.5553494095802307, "rewards/format_reward": 1.0, "step": 1409 }, { "completion_length": 328.9285583496094, "epoch": 0.1418867924528302, "grad_norm": 1.4471724033355713, "kl": 0.04541015625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6079749464988708, "reward_std": 0.3415771424770355, "rewards/accuracy_reward": 0.6283831596374512, "rewards/format_reward": 0.9795918464660645, "step": 1410 }, { "completion_length": 194.23468399047852, "epoch": 0.1419874213836478, "grad_norm": 0.6974369287490845, "kl": 0.0390625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.7691918015480042, "reward_std": 0.15707625448703766, "rewards/accuracy_reward": 0.7691918313503265, "rewards/format_reward": 1.0, "step": 1411 }, { "completion_length": 355.32652282714844, "epoch": 0.14208805031446542, "grad_norm": 1.462653636932373, "kl": 0.0506591796875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6431244015693665, "reward_std": 0.17519766837358475, "rewards/accuracy_reward": 0.6431244015693665, "rewards/format_reward": 1.0, "step": 1412 }, { "completion_length": 224.93877410888672, "epoch": 0.14218867924528303, "grad_norm": 1.5714598894119263, "kl": 0.05029296875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.78652423620224, "reward_std": 0.1984177976846695, "rewards/accuracy_reward": 0.7865240573883057, "rewards/format_reward": 1.0, "step": 1413 }, { "completion_length": 270.4591751098633, "epoch": 0.14228930817610064, "grad_norm": 0.7548567652702332, "kl": 0.0545654296875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.740674614906311, "reward_std": 0.1323816142976284, "rewards/accuracy_reward": 0.7406745553016663, "rewards/format_reward": 1.0, "step": 1414 }, { "completion_length": 242.89794921875, "epoch": 0.14238993710691825, "grad_norm": 1.0408748388290405, "kl": 0.0487060546875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7707431316375732, "reward_std": 0.20159263908863068, "rewards/accuracy_reward": 0.7809472382068634, "rewards/format_reward": 0.9897959232330322, "step": 1415 }, { "completion_length": 342.448974609375, "epoch": 0.14249056603773586, "grad_norm": 0.8540840744972229, "kl": 0.0709228515625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6847872734069824, "reward_std": 0.2334316447377205, "rewards/accuracy_reward": 0.6847872734069824, "rewards/format_reward": 1.0, "step": 1416 }, { "completion_length": 244.61224365234375, "epoch": 0.14259119496855346, "grad_norm": 4.271435260772705, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7317784428596497, "reward_std": 0.1291225180029869, "rewards/accuracy_reward": 0.7317784130573273, "rewards/format_reward": 1.0, "step": 1417 }, { "completion_length": 249.06121826171875, "epoch": 0.14269182389937107, "grad_norm": 1.3464617729187012, "kl": 0.0450439453125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.5667129755020142, "reward_std": 0.25654348731040955, "rewards/accuracy_reward": 0.5769171118736267, "rewards/format_reward": 0.9897959232330322, "step": 1418 }, { "completion_length": 243.06122589111328, "epoch": 0.14279245283018868, "grad_norm": 2.293198585510254, "kl": 0.0435791015625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7449805736541748, "reward_std": 0.16842974722385406, "rewards/accuracy_reward": 0.7449805736541748, "rewards/format_reward": 1.0, "step": 1419 }, { "completion_length": 264.7142791748047, "epoch": 0.1428930817610063, "grad_norm": 2.9437479972839355, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6230906248092651, "reward_std": 0.2465354949235916, "rewards/accuracy_reward": 0.6230905950069427, "rewards/format_reward": 1.0, "step": 1420 }, { "completion_length": 373.51019287109375, "epoch": 0.1429937106918239, "grad_norm": 0.5594094395637512, "kl": 0.055908203125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6971187591552734, "reward_std": 0.15416496247053146, "rewards/accuracy_reward": 0.7277310788631439, "rewards/format_reward": 0.9693877398967743, "step": 1421 }, { "completion_length": 269.7755126953125, "epoch": 0.1430943396226415, "grad_norm": 0.6567454934120178, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7346938848495483, "reward_std": 0.16641174256801605, "rewards/accuracy_reward": 0.7346938848495483, "rewards/format_reward": 1.0, "step": 1422 }, { "completion_length": 123.83673095703125, "epoch": 0.14319496855345912, "grad_norm": 1.4007459878921509, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7267585396766663, "reward_std": 0.11583689600229263, "rewards/accuracy_reward": 0.7267584800720215, "rewards/format_reward": 1.0, "step": 1423 }, { "completion_length": 279.67346954345703, "epoch": 0.14329559748427673, "grad_norm": 1.0822898149490356, "kl": 0.0482177734375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6437055468559265, "reward_std": 0.2053556963801384, "rewards/accuracy_reward": 0.6539096236228943, "rewards/format_reward": 0.9897959232330322, "step": 1424 }, { "completion_length": 261.3163299560547, "epoch": 0.14339622641509434, "grad_norm": 0.6981168985366821, "kl": 0.053955078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5730922222137451, "reward_std": 0.15266500413417816, "rewards/accuracy_reward": 0.5832962393760681, "rewards/format_reward": 0.9897959232330322, "step": 1425 }, { "completion_length": 241.0, "epoch": 0.14349685534591194, "grad_norm": 0.5450517535209656, "kl": 0.056884765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6163750886917114, "reward_std": 0.1863888055086136, "rewards/accuracy_reward": 0.6367832720279694, "rewards/format_reward": 0.9795918166637421, "step": 1426 }, { "completion_length": 277.4183578491211, "epoch": 0.14359748427672955, "grad_norm": 0.7718424797058105, "kl": 0.052978515625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.66303551197052, "reward_std": 0.20449178665876389, "rewards/accuracy_reward": 0.6732396483421326, "rewards/format_reward": 0.9897959232330322, "step": 1427 }, { "completion_length": 249.56121063232422, "epoch": 0.14369811320754716, "grad_norm": 0.9958207011222839, "kl": 0.046142578125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7725479006767273, "reward_std": 0.17483408749103546, "rewards/accuracy_reward": 0.7827520966529846, "rewards/format_reward": 0.9897959232330322, "step": 1428 }, { "completion_length": 223.22447967529297, "epoch": 0.14379874213836477, "grad_norm": 3.7280728816986084, "kl": 0.0687255859375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5621528029441833, "reward_std": 0.16456462442874908, "rewards/accuracy_reward": 0.5621528923511505, "rewards/format_reward": 1.0, "step": 1429 }, { "completion_length": 230.551025390625, "epoch": 0.14389937106918238, "grad_norm": 4.7150559425354, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7460585832595825, "reward_std": 0.17012498527765274, "rewards/accuracy_reward": 0.7562627494335175, "rewards/format_reward": 0.9897959232330322, "step": 1430 }, { "completion_length": 231.07142639160156, "epoch": 0.144, "grad_norm": 0.6961696147918701, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7691933512687683, "reward_std": 0.17680369317531586, "rewards/accuracy_reward": 0.7691933810710907, "rewards/format_reward": 1.0, "step": 1431 }, { "completion_length": 201.09182739257812, "epoch": 0.1441006289308176, "grad_norm": 0.6521838307380676, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8803206086158752, "reward_std": 0.08505797758698463, "rewards/accuracy_reward": 0.8803206086158752, "rewards/format_reward": 1.0, "step": 1432 }, { "completion_length": 209.7551040649414, "epoch": 0.1442012578616352, "grad_norm": 1.1451953649520874, "kl": 0.0548095703125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7388077974319458, "reward_std": 0.2762366980314255, "rewards/accuracy_reward": 0.7490118443965912, "rewards/format_reward": 0.9897959232330322, "step": 1433 }, { "completion_length": 244.6224365234375, "epoch": 0.14430188679245284, "grad_norm": 0.7353343963623047, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7188001871109009, "reward_std": 0.19731555879116058, "rewards/accuracy_reward": 0.7188002467155457, "rewards/format_reward": 1.0, "step": 1434 }, { "completion_length": 198.38774871826172, "epoch": 0.14440251572327045, "grad_norm": 0.45191970467567444, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.806122362613678, "reward_std": 0.13821138441562653, "rewards/accuracy_reward": 0.8061224222183228, "rewards/format_reward": 1.0, "step": 1435 }, { "completion_length": 256.62245178222656, "epoch": 0.14450314465408806, "grad_norm": 0.5990857481956482, "kl": 0.05810546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7930272221565247, "reward_std": 0.1372383087873459, "rewards/accuracy_reward": 0.8032312989234924, "rewards/format_reward": 0.9897959232330322, "step": 1436 }, { "completion_length": 267.05101013183594, "epoch": 0.14460377358490567, "grad_norm": 2.221444845199585, "kl": 0.062255859375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.57677161693573, "reward_std": 0.1874518170952797, "rewards/accuracy_reward": 0.57677161693573, "rewards/format_reward": 1.0, "step": 1437 }, { "completion_length": 301.8367385864258, "epoch": 0.14470440251572328, "grad_norm": 0.5232788920402527, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.716553270816803, "reward_std": 0.22841063141822815, "rewards/accuracy_reward": 0.7267573475837708, "rewards/format_reward": 0.9897959232330322, "step": 1438 }, { "completion_length": 191.7551040649414, "epoch": 0.1448050314465409, "grad_norm": 0.76179039478302, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7472052574157715, "reward_std": 0.11538834497332573, "rewards/accuracy_reward": 0.7574093639850616, "rewards/format_reward": 0.9897959232330322, "step": 1439 }, { "completion_length": 218.9897918701172, "epoch": 0.1449056603773585, "grad_norm": 0.6098412275314331, "kl": 0.0517578125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7118151783943176, "reward_std": 0.16794031858444214, "rewards/accuracy_reward": 0.7118151187896729, "rewards/format_reward": 1.0, "step": 1440 }, { "completion_length": 256.4183578491211, "epoch": 0.1450062893081761, "grad_norm": 0.996232807636261, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.680101215839386, "reward_std": 0.1264503300189972, "rewards/accuracy_reward": 0.6801012754440308, "rewards/format_reward": 1.0, "step": 1441 }, { "completion_length": 241.948974609375, "epoch": 0.14510691823899372, "grad_norm": 0.7859245538711548, "kl": 0.0589599609375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6147066950798035, "reward_std": 0.20271414518356323, "rewards/accuracy_reward": 0.6249108910560608, "rewards/format_reward": 0.9897959232330322, "step": 1442 }, { "completion_length": 209.7040786743164, "epoch": 0.14520754716981132, "grad_norm": 0.36777904629707336, "kl": 0.050537109375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7505668997764587, "reward_std": 0.07303375005722046, "rewards/accuracy_reward": 0.7505668699741364, "rewards/format_reward": 1.0, "step": 1443 }, { "completion_length": 248.36734771728516, "epoch": 0.14530817610062893, "grad_norm": 0.582177460193634, "kl": 0.0390625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6789420247077942, "reward_std": 0.1784384548664093, "rewards/accuracy_reward": 0.689146101474762, "rewards/format_reward": 0.9897959232330322, "step": 1444 }, { "completion_length": 197.81632232666016, "epoch": 0.14540880503144654, "grad_norm": 0.3701864182949066, "kl": 0.0487060546875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.9008895754814148, "reward_std": 0.06687349081039429, "rewards/accuracy_reward": 0.9008895754814148, "rewards/format_reward": 1.0, "step": 1445 }, { "completion_length": 249.6530532836914, "epoch": 0.14550943396226415, "grad_norm": 0.8579333424568176, "kl": 0.0484619140625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7765268683433533, "reward_std": 0.20504195615649223, "rewards/accuracy_reward": 0.7765268981456757, "rewards/format_reward": 1.0, "step": 1446 }, { "completion_length": 226.02040100097656, "epoch": 0.14561006289308176, "grad_norm": 0.5202239751815796, "kl": 0.0614013671875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.640913486480713, "reward_std": 0.14077293686568737, "rewards/accuracy_reward": 0.6613216698169708, "rewards/format_reward": 0.9795918464660645, "step": 1447 }, { "completion_length": 237.29591369628906, "epoch": 0.14571069182389937, "grad_norm": 0.8981155157089233, "kl": 0.0479736328125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.739909291267395, "reward_std": 0.23836785554885864, "rewards/accuracy_reward": 0.7501133680343628, "rewards/format_reward": 0.9897959232330322, "step": 1448 }, { "completion_length": 252.6326446533203, "epoch": 0.14581132075471698, "grad_norm": 48.00413131713867, "kl": 1.044921875, "learning_rate": 1e-06, "loss": 0.0414, "reward": 1.8380951881408691, "reward_std": 0.10668579675257206, "rewards/accuracy_reward": 0.8380951881408691, "rewards/format_reward": 1.0, "step": 1449 }, { "completion_length": 251.948974609375, "epoch": 0.1459119496855346, "grad_norm": 2.387528419494629, "kl": 0.0506591796875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6190301179885864, "reward_std": 0.2305639088153839, "rewards/accuracy_reward": 0.6292342394590378, "rewards/format_reward": 0.9897959232330322, "step": 1450 }, { "completion_length": 298.2142791748047, "epoch": 0.1460125786163522, "grad_norm": 0.9633277058601379, "kl": 0.0596923828125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.63180273771286, "reward_std": 0.2907438650727272, "rewards/accuracy_reward": 0.6726190447807312, "rewards/format_reward": 0.9591836333274841, "step": 1451 }, { "completion_length": 322.7653045654297, "epoch": 0.1461132075471698, "grad_norm": 0.7795525789260864, "kl": 0.043212890625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.4997084140777588, "reward_std": 0.2840871959924698, "rewards/accuracy_reward": 0.5201166123151779, "rewards/format_reward": 0.9795918464660645, "step": 1452 }, { "completion_length": 238.93877410888672, "epoch": 0.1462138364779874, "grad_norm": 0.8905573487281799, "kl": 0.0570068359375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.774538278579712, "reward_std": 0.20360726863145828, "rewards/accuracy_reward": 0.7847424447536469, "rewards/format_reward": 0.9897959232330322, "step": 1453 }, { "completion_length": 274.4081573486328, "epoch": 0.14631446540880502, "grad_norm": 1.0005484819412231, "kl": 0.0653076171875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6747495532035828, "reward_std": 0.2568644881248474, "rewards/accuracy_reward": 0.6951577365398407, "rewards/format_reward": 0.9795918464660645, "step": 1454 }, { "completion_length": 273.51019287109375, "epoch": 0.14641509433962263, "grad_norm": 0.879173994064331, "kl": 0.05615234375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5740801095962524, "reward_std": 0.19182614982128143, "rewards/accuracy_reward": 0.5944883525371552, "rewards/format_reward": 0.9795918166637421, "step": 1455 }, { "completion_length": 281.85713958740234, "epoch": 0.14651572327044024, "grad_norm": 0.7304270267486572, "kl": 0.04833984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7350339889526367, "reward_std": 0.19986968487501144, "rewards/accuracy_reward": 0.7452380359172821, "rewards/format_reward": 0.9897959232330322, "step": 1456 }, { "completion_length": 243.79591369628906, "epoch": 0.14661635220125785, "grad_norm": 1.192914605140686, "kl": 0.0858154296875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7237785458564758, "reward_std": 0.24899625033140182, "rewards/accuracy_reward": 0.7339826822280884, "rewards/format_reward": 0.9897959232330322, "step": 1457 }, { "completion_length": 308.5, "epoch": 0.14671698113207546, "grad_norm": 0.6749281883239746, "kl": 0.0474853515625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6077518463134766, "reward_std": 0.1630755066871643, "rewards/accuracy_reward": 0.6179559528827667, "rewards/format_reward": 0.9897959232330322, "step": 1458 }, { "completion_length": 228.4897918701172, "epoch": 0.1468176100628931, "grad_norm": 0.985819935798645, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.4530611634254456, "reward_std": 0.13323024660348892, "rewards/accuracy_reward": 0.4632652848958969, "rewards/format_reward": 0.9897959232330322, "step": 1459 }, { "completion_length": 235.9183578491211, "epoch": 0.1469182389937107, "grad_norm": 0.5250838398933411, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6947337985038757, "reward_std": 0.057000573724508286, "rewards/accuracy_reward": 0.6947337687015533, "rewards/format_reward": 1.0, "step": 1460 }, { "completion_length": 209.82652282714844, "epoch": 0.1470188679245283, "grad_norm": 0.46432366967201233, "kl": 0.0537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8829056024551392, "reward_std": 0.05343334563076496, "rewards/accuracy_reward": 0.8829055428504944, "rewards/format_reward": 1.0, "step": 1461 }, { "completion_length": 209.06121826171875, "epoch": 0.14711949685534592, "grad_norm": 1.7488764524459839, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5854918360710144, "reward_std": 0.21713418513536453, "rewards/accuracy_reward": 0.5956959426403046, "rewards/format_reward": 0.9897959232330322, "step": 1462 }, { "completion_length": 222.56121063232422, "epoch": 0.14722012578616353, "grad_norm": 0.5706118941307068, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8265305757522583, "reward_std": 0.10788732394576073, "rewards/accuracy_reward": 0.8265305757522583, "rewards/format_reward": 1.0, "step": 1463 }, { "completion_length": 251.01019287109375, "epoch": 0.14732075471698114, "grad_norm": 1.0776405334472656, "kl": 0.0518798828125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6774979829788208, "reward_std": 0.2033555991947651, "rewards/accuracy_reward": 0.6979061663150787, "rewards/format_reward": 0.9795918464660645, "step": 1464 }, { "completion_length": 252.31632232666016, "epoch": 0.14742138364779875, "grad_norm": 0.6811195611953735, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7405556440353394, "reward_std": 0.1165088452398777, "rewards/accuracy_reward": 0.7507598102092743, "rewards/format_reward": 0.9897959232330322, "step": 1465 }, { "completion_length": 226.32652282714844, "epoch": 0.14752201257861636, "grad_norm": 1.1748861074447632, "kl": 0.0517578125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6718334555625916, "reward_std": 0.17517035640776157, "rewards/accuracy_reward": 0.6820375621318817, "rewards/format_reward": 0.9897959232330322, "step": 1466 }, { "completion_length": 258.69386291503906, "epoch": 0.14762264150943397, "grad_norm": 0.8077232837677002, "kl": 0.053955078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7645666599273682, "reward_std": 0.12831689044833183, "rewards/accuracy_reward": 0.7645667195320129, "rewards/format_reward": 1.0, "step": 1467 }, { "completion_length": 172.39795684814453, "epoch": 0.14772327044025158, "grad_norm": 0.876899003982544, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7539455890655518, "reward_std": 0.11210310459136963, "rewards/accuracy_reward": 0.7539455890655518, "rewards/format_reward": 1.0, "step": 1468 }, { "completion_length": 220.948974609375, "epoch": 0.14782389937106918, "grad_norm": 1.3661776781082153, "kl": 0.050537109375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5936633944511414, "reward_std": 0.2664463520050049, "rewards/accuracy_reward": 0.6140716373920441, "rewards/format_reward": 0.9795918464660645, "step": 1469 }, { "completion_length": 317.0714111328125, "epoch": 0.1479245283018868, "grad_norm": 0.8180890679359436, "kl": 0.05487060546875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5527211427688599, "reward_std": 0.1899796798825264, "rewards/accuracy_reward": 0.5527210831642151, "rewards/format_reward": 1.0, "step": 1470 }, { "completion_length": 256.40816497802734, "epoch": 0.1480251572327044, "grad_norm": 0.9891393184661865, "kl": 0.0452880859375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6211856603622437, "reward_std": 0.26507681608200073, "rewards/accuracy_reward": 0.6211855560541153, "rewards/format_reward": 1.0, "step": 1471 }, { "completion_length": 189.84693145751953, "epoch": 0.148125786163522, "grad_norm": 1.0648456811904907, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7772619128227234, "reward_std": 0.15553349629044533, "rewards/accuracy_reward": 0.7874660193920135, "rewards/format_reward": 0.9897959232330322, "step": 1472 }, { "completion_length": 260.7550964355469, "epoch": 0.14822641509433962, "grad_norm": 0.8665770292282104, "kl": 0.050537109375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6113246083259583, "reward_std": 0.2368415966629982, "rewards/accuracy_reward": 0.6113246083259583, "rewards/format_reward": 1.0, "step": 1473 }, { "completion_length": 264.9795837402344, "epoch": 0.14832704402515723, "grad_norm": 1.0825777053833008, "kl": 0.062255859375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.641026496887207, "reward_std": 0.24477670341730118, "rewards/accuracy_reward": 0.6614346206188202, "rewards/format_reward": 0.9795918464660645, "step": 1474 }, { "completion_length": 262.6428451538086, "epoch": 0.14842767295597484, "grad_norm": 0.4566240608692169, "kl": 0.0546875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7123658657073975, "reward_std": 0.15862563997507095, "rewards/accuracy_reward": 0.7123658657073975, "rewards/format_reward": 1.0, "step": 1475 }, { "completion_length": 207.7244873046875, "epoch": 0.14852830188679245, "grad_norm": 0.6263942122459412, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6855393648147583, "reward_std": 0.07961785048246384, "rewards/accuracy_reward": 0.6855393648147583, "rewards/format_reward": 1.0, "step": 1476 }, { "completion_length": 251.11223602294922, "epoch": 0.14862893081761006, "grad_norm": 0.8303694725036621, "kl": 0.052490234375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7978118062019348, "reward_std": 0.14041224867105484, "rewards/accuracy_reward": 0.7978118360042572, "rewards/format_reward": 1.0, "step": 1477 }, { "completion_length": 290.6428527832031, "epoch": 0.14872955974842766, "grad_norm": 0.7700362205505371, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8416666388511658, "reward_std": 0.20004651695489883, "rewards/accuracy_reward": 0.8416666090488434, "rewards/format_reward": 1.0, "step": 1478 }, { "completion_length": 269.44896697998047, "epoch": 0.14883018867924527, "grad_norm": 0.8214307427406311, "kl": 0.0552978515625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5807378888130188, "reward_std": 0.19677319005131721, "rewards/accuracy_reward": 0.6011460721492767, "rewards/format_reward": 0.9795918464660645, "step": 1479 }, { "completion_length": 230.38775634765625, "epoch": 0.14893081761006288, "grad_norm": 1.435976505279541, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.599125325679779, "reward_std": 0.14898406714200974, "rewards/accuracy_reward": 0.6093294322490692, "rewards/format_reward": 0.9897959232330322, "step": 1480 }, { "completion_length": 176.57142639160156, "epoch": 0.1490314465408805, "grad_norm": 0.5079855918884277, "kl": 0.0589599609375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.888315737247467, "reward_std": 0.11774809658527374, "rewards/accuracy_reward": 0.8985198438167572, "rewards/format_reward": 0.9897959232330322, "step": 1481 }, { "completion_length": 260.12245178222656, "epoch": 0.1491320754716981, "grad_norm": 0.8061746954917908, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8120747208595276, "reward_std": 0.11448680609464645, "rewards/accuracy_reward": 0.8120747804641724, "rewards/format_reward": 1.0, "step": 1482 }, { "completion_length": 235.78571319580078, "epoch": 0.14923270440251574, "grad_norm": 0.7835246324539185, "kl": 0.0667724609375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7201607823371887, "reward_std": 0.17236119136214256, "rewards/accuracy_reward": 0.7405689656734467, "rewards/format_reward": 0.9795918166637421, "step": 1483 }, { "completion_length": 224.7142791748047, "epoch": 0.14933333333333335, "grad_norm": 0.5621627569198608, "kl": 0.0615234375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8719452023506165, "reward_std": 0.10903580486774445, "rewards/accuracy_reward": 0.8821494281291962, "rewards/format_reward": 0.9897959232330322, "step": 1484 }, { "completion_length": 285.7449035644531, "epoch": 0.14943396226415095, "grad_norm": 0.8912245631217957, "kl": 0.048095703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5665047764778137, "reward_std": 0.24918793141841888, "rewards/accuracy_reward": 0.5665048062801361, "rewards/format_reward": 1.0, "step": 1485 }, { "completion_length": 277.34693908691406, "epoch": 0.14953459119496856, "grad_norm": 0.6739470958709717, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6856988072395325, "reward_std": 0.16637380421161652, "rewards/accuracy_reward": 0.6959030032157898, "rewards/format_reward": 0.9897959232330322, "step": 1486 }, { "completion_length": 204.2346954345703, "epoch": 0.14963522012578617, "grad_norm": 0.9501432180404663, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.9081631898880005, "reward_std": 0.16188223659992218, "rewards/accuracy_reward": 0.9081632494926453, "rewards/format_reward": 1.0, "step": 1487 }, { "completion_length": 196.63265228271484, "epoch": 0.14973584905660378, "grad_norm": 1.2850173711776733, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7740362286567688, "reward_std": 0.23164308816194534, "rewards/accuracy_reward": 0.7740362882614136, "rewards/format_reward": 1.0, "step": 1488 }, { "completion_length": 234.31632232666016, "epoch": 0.1498364779874214, "grad_norm": 0.8827464580535889, "kl": 0.0604248046875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6222050786018372, "reward_std": 0.207326278090477, "rewards/accuracy_reward": 0.6324091255664825, "rewards/format_reward": 0.9897959232330322, "step": 1489 }, { "completion_length": 218.01019287109375, "epoch": 0.149937106918239, "grad_norm": 0.6835793256759644, "kl": 0.0689697265625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8246111869812012, "reward_std": 0.14783382415771484, "rewards/accuracy_reward": 0.8348152935504913, "rewards/format_reward": 0.9897959232330322, "step": 1490 }, { "completion_length": 274.2346878051758, "epoch": 0.1500377358490566, "grad_norm": 1.637394666671753, "kl": 0.049560546875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.743792474269867, "reward_std": 0.13823436200618744, "rewards/accuracy_reward": 0.7539965808391571, "rewards/format_reward": 0.9897959232330322, "step": 1491 }, { "completion_length": 243.14285278320312, "epoch": 0.15013836477987422, "grad_norm": 1.1054328680038452, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.719915509223938, "reward_std": 0.20131664723157883, "rewards/accuracy_reward": 0.7199155986309052, "rewards/format_reward": 1.0, "step": 1492 }, { "completion_length": 219.5, "epoch": 0.15023899371069183, "grad_norm": 0.8534876704216003, "kl": 0.057861328125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6751938462257385, "reward_std": 0.22990407794713974, "rewards/accuracy_reward": 0.6956019997596741, "rewards/format_reward": 0.9795918166637421, "step": 1493 }, { "completion_length": 287.8775329589844, "epoch": 0.15033962264150943, "grad_norm": 0.7385272979736328, "kl": 0.041259765625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.4991852045059204, "reward_std": 0.1515440121293068, "rewards/accuracy_reward": 0.499185249209404, "rewards/format_reward": 1.0, "step": 1494 }, { "completion_length": 254.56122589111328, "epoch": 0.15044025157232704, "grad_norm": 2.738072395324707, "kl": 0.053466796875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7670340538024902, "reward_std": 0.3152560144662857, "rewards/accuracy_reward": 0.7772381603717804, "rewards/format_reward": 0.9897959232330322, "step": 1495 }, { "completion_length": 279.7040710449219, "epoch": 0.15054088050314465, "grad_norm": 1.5384362936019897, "kl": 0.0545654296875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5220222473144531, "reward_std": 0.14391738176345825, "rewards/accuracy_reward": 0.5220222622156143, "rewards/format_reward": 1.0, "step": 1496 }, { "completion_length": 258.24488830566406, "epoch": 0.15064150943396226, "grad_norm": 0.84495609998703, "kl": 0.048828125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6338083148002625, "reward_std": 0.2802673503756523, "rewards/accuracy_reward": 0.6338082849979401, "rewards/format_reward": 1.0, "step": 1497 }, { "completion_length": 255.81632232666016, "epoch": 0.15074213836477987, "grad_norm": 1.1452633142471313, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6782312989234924, "reward_std": 0.2291155382990837, "rewards/accuracy_reward": 0.6884353756904602, "rewards/format_reward": 0.9897959232330322, "step": 1498 }, { "completion_length": 188.1938705444336, "epoch": 0.15084276729559748, "grad_norm": 1.98012113571167, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.839983582496643, "reward_std": 0.19445949792861938, "rewards/accuracy_reward": 0.8399836421012878, "rewards/format_reward": 1.0, "step": 1499 }, { "completion_length": 217.83672332763672, "epoch": 0.1509433962264151, "grad_norm": 1.0115610361099243, "kl": 0.0523681640625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7769678831100464, "reward_std": 0.20424910634756088, "rewards/accuracy_reward": 0.7871719896793365, "rewards/format_reward": 0.9897959232330322, "step": 1500 }, { "completion_length": 284.38775634765625, "epoch": 0.1510440251572327, "grad_norm": 0.8514829874038696, "kl": 0.0592041015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6312604546546936, "reward_std": 0.256749264895916, "rewards/accuracy_reward": 0.6414645612239838, "rewards/format_reward": 0.9897959232330322, "step": 1501 }, { "completion_length": 318.88775634765625, "epoch": 0.1511446540880503, "grad_norm": 0.5443131327629089, "kl": 0.0396728515625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.7114081382751465, "reward_std": 0.2182627171278, "rewards/accuracy_reward": 0.7114081680774689, "rewards/format_reward": 1.0, "step": 1502 }, { "completion_length": 268.55101776123047, "epoch": 0.15124528301886792, "grad_norm": 1.2660635709762573, "kl": 0.0418701171875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5629315972328186, "reward_std": 0.20219817012548447, "rewards/accuracy_reward": 0.5833398103713989, "rewards/format_reward": 0.9795918464660645, "step": 1503 }, { "completion_length": 260.1428527832031, "epoch": 0.15134591194968552, "grad_norm": 1.081409215927124, "kl": 0.0531005859375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6139941215515137, "reward_std": 0.2066599279642105, "rewards/accuracy_reward": 0.624198243021965, "rewards/format_reward": 0.9897959232330322, "step": 1504 }, { "completion_length": 215.40816497802734, "epoch": 0.15144654088050313, "grad_norm": 0.9324997067451477, "kl": 0.06103515625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7122676372528076, "reward_std": 0.17456937581300735, "rewards/accuracy_reward": 0.7326757907867432, "rewards/format_reward": 0.9795918166637421, "step": 1505 }, { "completion_length": 308.9897918701172, "epoch": 0.15154716981132074, "grad_norm": 0.691681981086731, "kl": 0.0828857421875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.5769944190979004, "reward_std": 0.1270286664366722, "rewards/accuracy_reward": 0.5871985256671906, "rewards/format_reward": 0.9897959232330322, "step": 1506 }, { "completion_length": 245.95917510986328, "epoch": 0.15164779874213835, "grad_norm": 0.6967453360557556, "kl": 0.05615234375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.665816307067871, "reward_std": 0.15637732297182083, "rewards/accuracy_reward": 0.6658163368701935, "rewards/format_reward": 1.0, "step": 1507 }, { "completion_length": 257.0612106323242, "epoch": 0.151748427672956, "grad_norm": 0.501274585723877, "kl": 0.056396484375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.763265311717987, "reward_std": 0.15115052089095116, "rewards/accuracy_reward": 0.7836734652519226, "rewards/format_reward": 0.9795918166637421, "step": 1508 }, { "completion_length": 189.84693908691406, "epoch": 0.1518490566037736, "grad_norm": 0.7179768681526184, "kl": 0.0592041015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6533527374267578, "reward_std": 0.17050284147262573, "rewards/accuracy_reward": 0.663556843996048, "rewards/format_reward": 0.9897959232330322, "step": 1509 }, { "completion_length": 224.7653045654297, "epoch": 0.1519496855345912, "grad_norm": 0.4863085150718689, "kl": 0.0635986328125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6713787913322449, "reward_std": 0.07098289579153061, "rewards/accuracy_reward": 0.6713787317276001, "rewards/format_reward": 1.0, "step": 1510 }, { "completion_length": 315.9897918701172, "epoch": 0.15205031446540881, "grad_norm": 0.46099337935447693, "kl": 0.0472412109375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.650631606578827, "reward_std": 0.16877559944987297, "rewards/accuracy_reward": 0.6506316661834717, "rewards/format_reward": 1.0, "step": 1511 }, { "completion_length": 268.8367385864258, "epoch": 0.15215094339622642, "grad_norm": 1.0305603742599487, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5622026920318604, "reward_std": 0.15949058532714844, "rewards/accuracy_reward": 0.5724068284034729, "rewards/format_reward": 0.9897959232330322, "step": 1512 }, { "completion_length": 222.03060913085938, "epoch": 0.15225157232704403, "grad_norm": 0.7218721508979797, "kl": 0.044189453125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.8379494547843933, "reward_std": 0.1575615108013153, "rewards/accuracy_reward": 0.8379494547843933, "rewards/format_reward": 1.0, "step": 1513 }, { "completion_length": 218.93877410888672, "epoch": 0.15235220125786164, "grad_norm": 0.38314181566238403, "kl": 0.0465087890625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.714285671710968, "reward_std": 0.10490182414650917, "rewards/accuracy_reward": 0.7142857015132904, "rewards/format_reward": 1.0, "step": 1514 }, { "completion_length": 289.1326446533203, "epoch": 0.15245283018867925, "grad_norm": 0.7402562499046326, "kl": 0.05859375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6800534129142761, "reward_std": 0.13188636675477028, "rewards/accuracy_reward": 0.6902575194835663, "rewards/format_reward": 0.9897959232330322, "step": 1515 }, { "completion_length": 225.88774871826172, "epoch": 0.15255345911949686, "grad_norm": 5.511465549468994, "kl": 0.05078125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.802818238735199, "reward_std": 0.22510237246751785, "rewards/accuracy_reward": 0.8232263922691345, "rewards/format_reward": 0.9795918464660645, "step": 1516 }, { "completion_length": 195.52040100097656, "epoch": 0.15265408805031447, "grad_norm": 0.9184983968734741, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.687305510044098, "reward_std": 0.15019835159182549, "rewards/accuracy_reward": 0.6873054802417755, "rewards/format_reward": 1.0, "step": 1517 }, { "completion_length": 173.9081573486328, "epoch": 0.15275471698113208, "grad_norm": 1.8059519529342651, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8141399025917053, "reward_std": 0.13649053871631622, "rewards/accuracy_reward": 0.8141399323940277, "rewards/format_reward": 1.0, "step": 1518 }, { "completion_length": 249.05101013183594, "epoch": 0.15285534591194969, "grad_norm": 0.953424334526062, "kl": 0.0418701171875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6927207112312317, "reward_std": 0.16381924971938133, "rewards/accuracy_reward": 0.7029247283935547, "rewards/format_reward": 0.9897959232330322, "step": 1519 }, { "completion_length": 253.51020050048828, "epoch": 0.1529559748427673, "grad_norm": 0.6782001852989197, "kl": 0.05517578125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6007808446884155, "reward_std": 0.17223267257213593, "rewards/accuracy_reward": 0.6109849214553833, "rewards/format_reward": 0.9897959232330322, "step": 1520 }, { "completion_length": 192.7244873046875, "epoch": 0.1530566037735849, "grad_norm": 0.5175065994262695, "kl": 0.0430908203125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.8611692190170288, "reward_std": 0.08725006878376007, "rewards/accuracy_reward": 0.8713733553886414, "rewards/format_reward": 0.9897959232330322, "step": 1521 }, { "completion_length": 230.35713958740234, "epoch": 0.1531572327044025, "grad_norm": 0.49469515681266785, "kl": 0.0321044921875, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.7398937940597534, "reward_std": 0.08249315433204174, "rewards/accuracy_reward": 0.7500978708267212, "rewards/format_reward": 0.9897959232330322, "step": 1522 }, { "completion_length": 268.7755126953125, "epoch": 0.15325786163522012, "grad_norm": 0.893646776676178, "kl": 0.0523681640625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7043002843856812, "reward_std": 0.2336096614599228, "rewards/accuracy_reward": 0.7043002843856812, "rewards/format_reward": 1.0, "step": 1523 }, { "completion_length": 199.27550506591797, "epoch": 0.15335849056603773, "grad_norm": 1.2600326538085938, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6488884687423706, "reward_std": 0.0914120301604271, "rewards/accuracy_reward": 0.6488884091377258, "rewards/format_reward": 1.0, "step": 1524 }, { "completion_length": 235.38774871826172, "epoch": 0.15345911949685534, "grad_norm": 0.7733063101768494, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8681318163871765, "reward_std": 0.09425182640552521, "rewards/accuracy_reward": 0.8783359527587891, "rewards/format_reward": 0.9897959232330322, "step": 1525 }, { "completion_length": 334.0918273925781, "epoch": 0.15355974842767295, "grad_norm": 0.6507148742675781, "kl": 0.045166015625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.5471144318580627, "reward_std": 0.27782540768384933, "rewards/accuracy_reward": 0.5471144318580627, "rewards/format_reward": 1.0, "step": 1526 }, { "completion_length": 269.6836624145508, "epoch": 0.15366037735849056, "grad_norm": 0.8015576004981995, "kl": 0.048828125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7127890586853027, "reward_std": 0.1642220839858055, "rewards/accuracy_reward": 0.7127890884876251, "rewards/format_reward": 1.0, "step": 1527 }, { "completion_length": 263.3775405883789, "epoch": 0.15376100628930817, "grad_norm": 0.7746962308883667, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7519437074661255, "reward_std": 0.2126564234495163, "rewards/accuracy_reward": 0.7519436776638031, "rewards/format_reward": 1.0, "step": 1528 }, { "completion_length": 246.94898223876953, "epoch": 0.15386163522012578, "grad_norm": 1.01882004737854, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5747990012168884, "reward_std": 0.23072315007448196, "rewards/accuracy_reward": 0.6258194148540497, "rewards/format_reward": 0.9489795565605164, "step": 1529 }, { "completion_length": 273.12244415283203, "epoch": 0.15396226415094338, "grad_norm": 0.8598759174346924, "kl": 0.0665283203125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7398267984390259, "reward_std": 0.16898705810308456, "rewards/accuracy_reward": 0.7398267984390259, "rewards/format_reward": 1.0, "step": 1530 }, { "completion_length": 206.12245178222656, "epoch": 0.154062893081761, "grad_norm": 9.389598846435547, "kl": 0.0535888671875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.737123429775238, "reward_std": 0.2967195063829422, "rewards/accuracy_reward": 0.7371233701705933, "rewards/format_reward": 1.0, "step": 1531 }, { "completion_length": 247.11223602294922, "epoch": 0.15416352201257863, "grad_norm": 0.8658119440078735, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5551385283470154, "reward_std": 0.1942441463470459, "rewards/accuracy_reward": 0.5653426051139832, "rewards/format_reward": 0.9897959232330322, "step": 1532 }, { "completion_length": 277.4693832397461, "epoch": 0.15426415094339624, "grad_norm": 0.7461984157562256, "kl": 0.040283203125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.7216472029685974, "reward_std": 0.1255587637424469, "rewards/accuracy_reward": 0.7216472625732422, "rewards/format_reward": 1.0, "step": 1533 }, { "completion_length": 234.74488830566406, "epoch": 0.15436477987421385, "grad_norm": 0.8663583993911743, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6568439602851868, "reward_std": 0.1572677046060562, "rewards/accuracy_reward": 0.6670480370521545, "rewards/format_reward": 0.9897959232330322, "step": 1534 }, { "completion_length": 285.4285583496094, "epoch": 0.15446540880503146, "grad_norm": 0.70012366771698, "kl": 0.056884765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5864511728286743, "reward_std": 0.1412210762500763, "rewards/accuracy_reward": 0.5966552942991257, "rewards/format_reward": 0.9897959232330322, "step": 1535 }, { "completion_length": 237.03060913085938, "epoch": 0.15456603773584907, "grad_norm": 0.8058167099952698, "kl": 0.052490234375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6993196606636047, "reward_std": 0.1491188071668148, "rewards/accuracy_reward": 0.7095237970352173, "rewards/format_reward": 0.9897959232330322, "step": 1536 }, { "completion_length": 290.3061218261719, "epoch": 0.15466666666666667, "grad_norm": 1.6826671361923218, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.582118570804596, "reward_std": 0.18772416561841965, "rewards/accuracy_reward": 0.6127307862043381, "rewards/format_reward": 0.9693877398967743, "step": 1537 }, { "completion_length": 210.31631469726562, "epoch": 0.15476729559748428, "grad_norm": 1.8441851139068604, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6753756403923035, "reward_std": 0.20389004796743393, "rewards/accuracy_reward": 0.6753756403923035, "rewards/format_reward": 1.0, "step": 1538 }, { "completion_length": 228.4081573486328, "epoch": 0.1548679245283019, "grad_norm": 0.5881368517875671, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6864296793937683, "reward_std": 0.12214988190680742, "rewards/accuracy_reward": 0.6864297091960907, "rewards/format_reward": 1.0, "step": 1539 }, { "completion_length": 285.05101776123047, "epoch": 0.1549685534591195, "grad_norm": 1.0240812301635742, "kl": 0.0565185546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6678571105003357, "reward_std": 0.2566675543785095, "rewards/accuracy_reward": 0.6882652938365936, "rewards/format_reward": 0.9795918166637421, "step": 1540 }, { "completion_length": 192.97958374023438, "epoch": 0.1550691823899371, "grad_norm": 0.8466376662254333, "kl": 0.0494384765625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.8038443326950073, "reward_std": 0.20956122875213623, "rewards/accuracy_reward": 0.8242523968219757, "rewards/format_reward": 0.9795918464660645, "step": 1541 }, { "completion_length": 216.07141876220703, "epoch": 0.15516981132075472, "grad_norm": 0.7414425611495972, "kl": 0.0430908203125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.695906400680542, "reward_std": 0.12467104941606522, "rewards/accuracy_reward": 0.6959064304828644, "rewards/format_reward": 1.0, "step": 1542 }, { "completion_length": 207.7244873046875, "epoch": 0.15527044025157233, "grad_norm": 0.7109915614128113, "kl": 0.052734375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7842565774917603, "reward_std": 0.1071123443543911, "rewards/accuracy_reward": 0.7944606244564056, "rewards/format_reward": 0.9897959232330322, "step": 1543 }, { "completion_length": 297.4795837402344, "epoch": 0.15537106918238994, "grad_norm": 2.2559471130371094, "kl": 0.0501708984375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5495156049728394, "reward_std": 0.3414582759141922, "rewards/accuracy_reward": 0.5597196072340012, "rewards/format_reward": 0.9897959232330322, "step": 1544 }, { "completion_length": 265.1224365234375, "epoch": 0.15547169811320755, "grad_norm": 9.776201248168945, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6573283672332764, "reward_std": 0.11987041682004929, "rewards/accuracy_reward": 0.6573283672332764, "rewards/format_reward": 1.0, "step": 1545 }, { "completion_length": 235.9591827392578, "epoch": 0.15557232704402515, "grad_norm": 0.8874644041061401, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.4182539582252502, "reward_std": 0.13374977558851242, "rewards/accuracy_reward": 0.428458034992218, "rewards/format_reward": 0.9897959232330322, "step": 1546 }, { "completion_length": 265.6938705444336, "epoch": 0.15567295597484276, "grad_norm": 0.5014833211898804, "kl": 0.0596923828125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6632652282714844, "reward_std": 0.16188224777579308, "rewards/accuracy_reward": 0.6632652878761292, "rewards/format_reward": 1.0, "step": 1547 }, { "completion_length": 216.69387817382812, "epoch": 0.15577358490566037, "grad_norm": 1.8787020444869995, "kl": 0.09130859375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.676084816455841, "reward_std": 0.15238407254219055, "rewards/accuracy_reward": 0.6964929699897766, "rewards/format_reward": 0.9795918166637421, "step": 1548 }, { "completion_length": 275.71427154541016, "epoch": 0.15587421383647798, "grad_norm": 0.7788113951683044, "kl": 0.044921875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7414966225624084, "reward_std": 0.13884975761175156, "rewards/accuracy_reward": 0.7517006397247314, "rewards/format_reward": 0.9897959232330322, "step": 1549 }, { "completion_length": 225.83672332763672, "epoch": 0.1559748427672956, "grad_norm": 0.824125349521637, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6418765187263489, "reward_std": 0.19216953590512276, "rewards/accuracy_reward": 0.6418764591217041, "rewards/format_reward": 1.0, "step": 1550 }, { "completion_length": 245.4285659790039, "epoch": 0.1560754716981132, "grad_norm": 1.0163202285766602, "kl": 0.055419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6603841185569763, "reward_std": 0.21251417696475983, "rewards/accuracy_reward": 0.6603841483592987, "rewards/format_reward": 1.0, "step": 1551 }, { "completion_length": 178.78571319580078, "epoch": 0.1561761006289308, "grad_norm": 1.3293193578720093, "kl": 0.0552978515625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7751457691192627, "reward_std": 0.1874811127781868, "rewards/accuracy_reward": 0.7751457393169403, "rewards/format_reward": 1.0, "step": 1552 }, { "completion_length": 234.23468780517578, "epoch": 0.15627672955974842, "grad_norm": 0.5933054089546204, "kl": 0.0550537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7293405532836914, "reward_std": 0.11855865642428398, "rewards/accuracy_reward": 0.729340523481369, "rewards/format_reward": 1.0, "step": 1553 }, { "completion_length": 195.03060913085938, "epoch": 0.15637735849056603, "grad_norm": 0.8991057276725769, "kl": 0.0633544921875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7448979020118713, "reward_std": 0.18102359771728516, "rewards/accuracy_reward": 0.7653061151504517, "rewards/format_reward": 0.9795918464660645, "step": 1554 }, { "completion_length": 280.9285659790039, "epoch": 0.15647798742138364, "grad_norm": 0.9022203087806702, "kl": 0.0504150390625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5461734533309937, "reward_std": 0.17152639478445053, "rewards/accuracy_reward": 0.5461734384298325, "rewards/format_reward": 1.0, "step": 1555 }, { "completion_length": 233.92857360839844, "epoch": 0.15657861635220124, "grad_norm": 1.0980831384658813, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5143641829490662, "reward_std": 0.21341554075479507, "rewards/accuracy_reward": 0.5347723960876465, "rewards/format_reward": 0.9795918464660645, "step": 1556 }, { "completion_length": 162.5, "epoch": 0.15667924528301888, "grad_norm": 1.3295754194259644, "kl": 0.0516357421875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7676328420639038, "reward_std": 0.1774606555700302, "rewards/accuracy_reward": 0.7676328718662262, "rewards/format_reward": 1.0, "step": 1557 }, { "completion_length": 326.4183654785156, "epoch": 0.1567798742138365, "grad_norm": 0.5686647295951843, "kl": 0.0538330078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.4704216718673706, "reward_std": 0.15846817940473557, "rewards/accuracy_reward": 0.4704216420650482, "rewards/format_reward": 1.0, "step": 1558 }, { "completion_length": 276.03060150146484, "epoch": 0.1568805031446541, "grad_norm": 0.697521984577179, "kl": 0.0592041015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6312307119369507, "reward_std": 0.19137472659349442, "rewards/accuracy_reward": 0.6414346992969513, "rewards/format_reward": 0.9897959232330322, "step": 1559 }, { "completion_length": 246.33673095703125, "epoch": 0.1569811320754717, "grad_norm": 1.9584225416183472, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6039432287216187, "reward_std": 0.14558405429124832, "rewards/accuracy_reward": 0.614147275686264, "rewards/format_reward": 0.9897959232330322, "step": 1560 }, { "completion_length": 196.77550506591797, "epoch": 0.15708176100628932, "grad_norm": 0.8441194891929626, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8979591727256775, "reward_std": 0.08099238574504852, "rewards/accuracy_reward": 0.9081632494926453, "rewards/format_reward": 0.9897959232330322, "step": 1561 }, { "completion_length": 275.82652282714844, "epoch": 0.15718238993710693, "grad_norm": 0.53277587890625, "kl": 0.0382080078125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.7948670387268066, "reward_std": 0.1707724705338478, "rewards/accuracy_reward": 0.794867068529129, "rewards/format_reward": 1.0, "step": 1562 }, { "completion_length": 276.57141876220703, "epoch": 0.15728301886792453, "grad_norm": 0.7988396883010864, "kl": 0.058837890625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.539598524570465, "reward_std": 0.22306670993566513, "rewards/accuracy_reward": 0.5702108144760132, "rewards/format_reward": 0.9693877398967743, "step": 1563 }, { "completion_length": 257.9183654785156, "epoch": 0.15738364779874214, "grad_norm": 0.5904820561408997, "kl": 0.0523681640625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6883830428123474, "reward_std": 0.16315989196300507, "rewards/accuracy_reward": 0.6985870897769928, "rewards/format_reward": 0.9897959232330322, "step": 1564 }, { "completion_length": 283.9183654785156, "epoch": 0.15748427672955975, "grad_norm": 0.9219325184822083, "kl": 0.052001953125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.554055392742157, "reward_std": 0.12286342680454254, "rewards/accuracy_reward": 0.5540554225444794, "rewards/format_reward": 1.0, "step": 1565 }, { "completion_length": 275.1632537841797, "epoch": 0.15758490566037736, "grad_norm": 0.872722864151001, "kl": 0.0604248046875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.779220700263977, "reward_std": 0.20503061264753342, "rewards/accuracy_reward": 0.7996289134025574, "rewards/format_reward": 0.9795918166637421, "step": 1566 }, { "completion_length": 225.28571319580078, "epoch": 0.15768553459119497, "grad_norm": 1.3042128086090088, "kl": 0.0614013671875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6396644711494446, "reward_std": 0.11272705718874931, "rewards/accuracy_reward": 0.639664500951767, "rewards/format_reward": 1.0, "step": 1567 }, { "completion_length": 263.51019287109375, "epoch": 0.15778616352201258, "grad_norm": 0.8346633911132812, "kl": 0.0450439453125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6985631585121155, "reward_std": 0.19796225428581238, "rewards/accuracy_reward": 0.7291753888130188, "rewards/format_reward": 0.9693877398967743, "step": 1568 }, { "completion_length": 208.29591369628906, "epoch": 0.1578867924528302, "grad_norm": 0.5628671646118164, "kl": 0.03955078125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.625008463859558, "reward_std": 0.05440052971243858, "rewards/accuracy_reward": 0.6250084638595581, "rewards/format_reward": 1.0, "step": 1569 }, { "completion_length": 249.96937561035156, "epoch": 0.1579874213836478, "grad_norm": 0.6984946727752686, "kl": 0.0411376953125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.7566326260566711, "reward_std": 0.17046213895082474, "rewards/accuracy_reward": 0.7770408093929291, "rewards/format_reward": 0.9795918166637421, "step": 1570 }, { "completion_length": 206.4081573486328, "epoch": 0.1580880503144654, "grad_norm": 0.8845674991607666, "kl": 0.05322265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6965985298156738, "reward_std": 0.1771508827805519, "rewards/accuracy_reward": 0.7068027257919312, "rewards/format_reward": 0.9897959232330322, "step": 1571 }, { "completion_length": 244.53060913085938, "epoch": 0.15818867924528301, "grad_norm": 0.5192722678184509, "kl": 0.0406494140625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6958771347999573, "reward_std": 0.10312153026461601, "rewards/accuracy_reward": 0.706081211566925, "rewards/format_reward": 0.9897959232330322, "step": 1572 }, { "completion_length": 193.6836700439453, "epoch": 0.15828930817610062, "grad_norm": 0.4644604027271271, "kl": 0.04736328125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.8674009442329407, "reward_std": 0.07941615581512451, "rewards/accuracy_reward": 0.8776049613952637, "rewards/format_reward": 0.9897959232330322, "step": 1573 }, { "completion_length": 281.56121826171875, "epoch": 0.15838993710691823, "grad_norm": 0.7305585741996765, "kl": 0.057861328125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6420797109603882, "reward_std": 0.1461181342601776, "rewards/accuracy_reward": 0.6624878346920013, "rewards/format_reward": 0.9795918464660645, "step": 1574 }, { "completion_length": 178.0204086303711, "epoch": 0.15849056603773584, "grad_norm": 2.2634246349334717, "kl": 0.0567626953125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7904518246650696, "reward_std": 0.14183050021529198, "rewards/accuracy_reward": 0.8006559610366821, "rewards/format_reward": 0.9897959232330322, "step": 1575 }, { "completion_length": 311.1530456542969, "epoch": 0.15859119496855345, "grad_norm": 1.043898344039917, "kl": 0.0321044921875, "learning_rate": 1e-06, "loss": 0.0013, "reward": 1.5665577054023743, "reward_std": 0.3036890849471092, "rewards/accuracy_reward": 0.6073739975690842, "rewards/format_reward": 0.9591836631298065, "step": 1576 }, { "completion_length": 226.62244415283203, "epoch": 0.15869182389937106, "grad_norm": 1.067088007926941, "kl": 0.0509033203125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7857142090797424, "reward_std": 0.1763915717601776, "rewards/accuracy_reward": 0.7857142686843872, "rewards/format_reward": 1.0, "step": 1577 }, { "completion_length": 195.79591369628906, "epoch": 0.15879245283018867, "grad_norm": 1.2614772319793701, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6554805636405945, "reward_std": 0.24214565008878708, "rewards/accuracy_reward": 0.67588871717453, "rewards/format_reward": 0.9795918166637421, "step": 1578 }, { "completion_length": 258.4591827392578, "epoch": 0.15889308176100628, "grad_norm": 0.9167017340660095, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6503400802612305, "reward_std": 0.19247454404830933, "rewards/accuracy_reward": 0.6503400802612305, "rewards/format_reward": 1.0, "step": 1579 }, { "completion_length": 213.34693145751953, "epoch": 0.1589937106918239, "grad_norm": 0.5412007570266724, "kl": 0.0498046875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7947846055030823, "reward_std": 0.14557798951864243, "rewards/accuracy_reward": 0.8049886226654053, "rewards/format_reward": 0.9897959232330322, "step": 1580 }, { "completion_length": 274.6938705444336, "epoch": 0.15909433962264152, "grad_norm": 1.0972957611083984, "kl": 0.0535888671875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5204080939292908, "reward_std": 0.22373328357934952, "rewards/accuracy_reward": 0.5204081535339355, "rewards/format_reward": 1.0, "step": 1581 }, { "completion_length": 220.7142791748047, "epoch": 0.15919496855345913, "grad_norm": 0.32218611240386963, "kl": 0.0533447265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.887755036354065, "reward_std": 0.09670460596680641, "rewards/accuracy_reward": 0.8877550959587097, "rewards/format_reward": 1.0, "step": 1582 }, { "completion_length": 215.97958374023438, "epoch": 0.15929559748427674, "grad_norm": 0.46120280027389526, "kl": 0.04833984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.4999999403953552, "reward_std": 0.13821138441562653, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 1583 }, { "completion_length": 265.3673400878906, "epoch": 0.15939622641509435, "grad_norm": 1.2167836427688599, "kl": 0.0535888671875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6736655831336975, "reward_std": 0.26120223850011826, "rewards/accuracy_reward": 0.6736656129360199, "rewards/format_reward": 1.0, "step": 1584 }, { "completion_length": 239.84693908691406, "epoch": 0.15949685534591196, "grad_norm": 1.1897209882736206, "kl": 0.0615234375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6952380537986755, "reward_std": 0.13821277394890785, "rewards/accuracy_reward": 0.7054421305656433, "rewards/format_reward": 0.9897959232330322, "step": 1585 }, { "completion_length": 245.60203552246094, "epoch": 0.15959748427672957, "grad_norm": 0.6596580147743225, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.458244264125824, "reward_std": 0.18640119582414627, "rewards/accuracy_reward": 0.46844834089279175, "rewards/format_reward": 0.9897959232330322, "step": 1586 }, { "completion_length": 220.78570556640625, "epoch": 0.15969811320754718, "grad_norm": 1.247212290763855, "kl": 0.11962890625, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.8333990573883057, "reward_std": 0.18005841225385666, "rewards/accuracy_reward": 0.8333989679813385, "rewards/format_reward": 1.0, "step": 1587 }, { "completion_length": 263.3061218261719, "epoch": 0.15979874213836479, "grad_norm": 0.8610192537307739, "kl": 0.0787353515625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6930272579193115, "reward_std": 0.2082236334681511, "rewards/accuracy_reward": 0.7032312750816345, "rewards/format_reward": 0.9897959232330322, "step": 1588 }, { "completion_length": 251.25508880615234, "epoch": 0.1598993710691824, "grad_norm": 0.8656578660011292, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6550822257995605, "reward_std": 0.2099689617753029, "rewards/accuracy_reward": 0.6754904389381409, "rewards/format_reward": 0.9795918464660645, "step": 1589 }, { "completion_length": 223.52040100097656, "epoch": 0.16, "grad_norm": 0.6547346711158752, "kl": 0.06103515625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.649097502231598, "reward_std": 0.11812891066074371, "rewards/accuracy_reward": 0.6490975916385651, "rewards/format_reward": 1.0, "step": 1590 }, { "completion_length": 308.74488830566406, "epoch": 0.1601006289308176, "grad_norm": 0.8548858165740967, "kl": 0.0555419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5714285373687744, "reward_std": 0.14599500596523285, "rewards/accuracy_reward": 0.5816326439380646, "rewards/format_reward": 0.9897959232330322, "step": 1591 }, { "completion_length": 208.9285659790039, "epoch": 0.16020125786163522, "grad_norm": 0.4546777606010437, "kl": 0.0472412109375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6761175394058228, "reward_std": 0.061830610036849976, "rewards/accuracy_reward": 0.6761175692081451, "rewards/format_reward": 1.0, "step": 1592 }, { "completion_length": 206.32652282714844, "epoch": 0.16030188679245283, "grad_norm": 1.6503245830535889, "kl": 0.05029296875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.61690753698349, "reward_std": 0.12642884626984596, "rewards/accuracy_reward": 0.6271116137504578, "rewards/format_reward": 0.9897959232330322, "step": 1593 }, { "completion_length": 240.64285278320312, "epoch": 0.16040251572327044, "grad_norm": 1.4093588590621948, "kl": 0.0511474609375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6130840182304382, "reward_std": 0.3443077430129051, "rewards/accuracy_reward": 0.6232881546020508, "rewards/format_reward": 0.9897959232330322, "step": 1594 }, { "completion_length": 197.29591369628906, "epoch": 0.16050314465408805, "grad_norm": 2.240715503692627, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8265305757522583, "reward_std": 0.23038647323846817, "rewards/accuracy_reward": 0.8265306055545807, "rewards/format_reward": 1.0, "step": 1595 }, { "completion_length": 191.09183502197266, "epoch": 0.16060377358490566, "grad_norm": 1.8867158889770508, "kl": 0.140380859375, "learning_rate": 1e-06, "loss": 0.0056, "reward": 1.670393943786621, "reward_std": 0.19290194660425186, "rewards/accuracy_reward": 0.7112102210521698, "rewards/format_reward": 0.9591836631298065, "step": 1596 }, { "completion_length": 253.80611419677734, "epoch": 0.16070440251572327, "grad_norm": 0.7889423370361328, "kl": 0.0421142578125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5810250043869019, "reward_std": 0.2149200215935707, "rewards/accuracy_reward": 0.5912290960550308, "rewards/format_reward": 0.9897959232330322, "step": 1597 }, { "completion_length": 175.31632232666016, "epoch": 0.16080503144654087, "grad_norm": 0.47865092754364014, "kl": 0.0521240234375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8673468828201294, "reward_std": 0.10335781052708626, "rewards/accuracy_reward": 0.8673469126224518, "rewards/format_reward": 1.0, "step": 1598 }, { "completion_length": 231.14286041259766, "epoch": 0.16090566037735848, "grad_norm": 0.631560742855072, "kl": 0.0462646484375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.8027285933494568, "reward_std": 0.19768790155649185, "rewards/accuracy_reward": 0.8231367468833923, "rewards/format_reward": 0.9795918464660645, "step": 1599 }, { "completion_length": 247.4897918701172, "epoch": 0.1610062893081761, "grad_norm": 0.6096665859222412, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6525360345840454, "reward_std": 0.17259355634450912, "rewards/accuracy_reward": 0.672944188117981, "rewards/format_reward": 0.9795918464660645, "step": 1600 }, { "completion_length": 213.9387664794922, "epoch": 0.1611069182389937, "grad_norm": 0.6248530745506287, "kl": 0.054443359375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8307902812957764, "reward_std": 0.17213763296604156, "rewards/accuracy_reward": 0.8511984944343567, "rewards/format_reward": 0.9795918166637421, "step": 1601 }, { "completion_length": 190.66326141357422, "epoch": 0.1612075471698113, "grad_norm": 1.1811261177062988, "kl": 0.0523681640625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6572041511535645, "reward_std": 0.23099564760923386, "rewards/accuracy_reward": 0.6776122748851776, "rewards/format_reward": 0.9795918464660645, "step": 1602 }, { "completion_length": 230.62244415283203, "epoch": 0.16130817610062892, "grad_norm": 0.6447097063064575, "kl": 0.0516357421875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.634920597076416, "reward_std": 0.2548770383000374, "rewards/accuracy_reward": 0.6859410405158997, "rewards/format_reward": 0.9489795863628387, "step": 1603 }, { "completion_length": 288.8061218261719, "epoch": 0.16140880503144653, "grad_norm": 0.6219069361686707, "kl": 0.0528564453125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.600942611694336, "reward_std": 0.1901441588997841, "rewards/accuracy_reward": 0.6315549314022064, "rewards/format_reward": 0.9693877398967743, "step": 1604 }, { "completion_length": 276.75508880615234, "epoch": 0.16150943396226414, "grad_norm": 0.9522189497947693, "kl": 0.0594482421875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5961905121803284, "reward_std": 0.3179376572370529, "rewards/accuracy_reward": 0.6574150025844574, "rewards/format_reward": 0.9387754797935486, "step": 1605 }, { "completion_length": 239.7244873046875, "epoch": 0.16161006289308177, "grad_norm": 0.9867069721221924, "kl": 0.056640625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5505344867706299, "reward_std": 0.1996181160211563, "rewards/accuracy_reward": 0.5913508236408234, "rewards/format_reward": 0.9591836631298065, "step": 1606 }, { "completion_length": 240.87754821777344, "epoch": 0.16171069182389938, "grad_norm": 0.8545274138450623, "kl": 0.0565185546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5882498025894165, "reward_std": 0.1446635089814663, "rewards/accuracy_reward": 0.5984539538621902, "rewards/format_reward": 0.9897959232330322, "step": 1607 }, { "completion_length": 269.4897918701172, "epoch": 0.161811320754717, "grad_norm": 0.5680910348892212, "kl": 0.0513916015625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5369439721107483, "reward_std": 0.17409944534301758, "rewards/accuracy_reward": 0.5675562620162964, "rewards/format_reward": 0.9693877398967743, "step": 1608 }, { "completion_length": 218.40816497802734, "epoch": 0.1619119496855346, "grad_norm": 0.5788276791572571, "kl": 0.058837890625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6336734294891357, "reward_std": 0.17552262917160988, "rewards/accuracy_reward": 0.6336734592914581, "rewards/format_reward": 1.0, "step": 1609 }, { "completion_length": 246.14285278320312, "epoch": 0.1620125786163522, "grad_norm": 1.3174571990966797, "kl": 0.050537109375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5615281462669373, "reward_std": 0.237567238509655, "rewards/accuracy_reward": 0.6023444831371307, "rewards/format_reward": 0.9591836631298065, "step": 1610 }, { "completion_length": 237.2653045654297, "epoch": 0.16211320754716982, "grad_norm": 1.5032747983932495, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7458308339118958, "reward_std": 0.19082164019346237, "rewards/accuracy_reward": 0.7560349702835083, "rewards/format_reward": 0.9897959232330322, "step": 1611 }, { "completion_length": 252.24488830566406, "epoch": 0.16221383647798743, "grad_norm": 1.049376130104065, "kl": 0.048828125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6509581804275513, "reward_std": 0.28873907029628754, "rewards/accuracy_reward": 0.6611622273921967, "rewards/format_reward": 0.9897959232330322, "step": 1612 }, { "completion_length": 252.03060913085938, "epoch": 0.16231446540880504, "grad_norm": 1.2978500127792358, "kl": 0.0528564453125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6252383589744568, "reward_std": 0.2570420131087303, "rewards/accuracy_reward": 0.6558505296707153, "rewards/format_reward": 0.9693877398967743, "step": 1613 }, { "completion_length": 223.03060913085938, "epoch": 0.16241509433962265, "grad_norm": 0.5968935489654541, "kl": 0.0740966796875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5709643959999084, "reward_std": 0.20468583703041077, "rewards/accuracy_reward": 0.5709644109010696, "rewards/format_reward": 1.0, "step": 1614 }, { "completion_length": 274.1530456542969, "epoch": 0.16251572327044025, "grad_norm": 0.9996453523635864, "kl": 0.0650634765625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.529609739780426, "reward_std": 0.22742090374231339, "rewards/accuracy_reward": 0.5704260468482971, "rewards/format_reward": 0.9591836631298065, "step": 1615 }, { "completion_length": 281.6224365234375, "epoch": 0.16261635220125786, "grad_norm": 0.577475905418396, "kl": 0.0484619140625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.59183669090271, "reward_std": 0.1684294268488884, "rewards/accuracy_reward": 0.6122448742389679, "rewards/format_reward": 0.9795918464660645, "step": 1616 }, { "completion_length": 299.5816345214844, "epoch": 0.16271698113207547, "grad_norm": 0.7826470732688904, "kl": 0.03692626953125, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5467787384986877, "reward_std": 0.2755483165383339, "rewards/accuracy_reward": 0.5875950455665588, "rewards/format_reward": 0.9591836631298065, "step": 1617 }, { "completion_length": 253.12244415283203, "epoch": 0.16281761006289308, "grad_norm": 0.7915647029876709, "kl": 0.0478515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6938775181770325, "reward_std": 0.18887970596551895, "rewards/accuracy_reward": 0.7040816247463226, "rewards/format_reward": 0.9897959232330322, "step": 1618 }, { "completion_length": 192.4081573486328, "epoch": 0.1629182389937107, "grad_norm": 2.3372714519500732, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8158406019210815, "reward_std": 0.12312436336651444, "rewards/accuracy_reward": 0.8260447084903717, "rewards/format_reward": 0.9897959232330322, "step": 1619 }, { "completion_length": 213.82653045654297, "epoch": 0.1630188679245283, "grad_norm": 0.6670907139778137, "kl": 0.0655517578125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7284182906150818, "reward_std": 0.17179275304079056, "rewards/accuracy_reward": 0.7488264441490173, "rewards/format_reward": 0.9795918166637421, "step": 1620 }, { "completion_length": 248.05101013183594, "epoch": 0.1631194968553459, "grad_norm": 1.4983187913894653, "kl": 0.055419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7202020287513733, "reward_std": 0.2526474595069885, "rewards/accuracy_reward": 0.7304061949253082, "rewards/format_reward": 0.9897959232330322, "step": 1621 }, { "completion_length": 302.57142639160156, "epoch": 0.16322012578616352, "grad_norm": 0.5894157886505127, "kl": 0.0452880859375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.5363099575042725, "reward_std": 0.25816410779953003, "rewards/accuracy_reward": 0.5567181408405304, "rewards/format_reward": 0.9795918464660645, "step": 1622 }, { "completion_length": 264.29590606689453, "epoch": 0.16332075471698113, "grad_norm": 1.2441530227661133, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5898300409317017, "reward_std": 0.22911908477544785, "rewards/accuracy_reward": 0.6000342667102814, "rewards/format_reward": 0.9897959232330322, "step": 1623 }, { "completion_length": 208.6836700439453, "epoch": 0.16342138364779873, "grad_norm": 0.6274334788322449, "kl": 0.057373046875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6934803128242493, "reward_std": 0.15008091367781162, "rewards/accuracy_reward": 0.7036845088005066, "rewards/format_reward": 0.9897959232330322, "step": 1624 }, { "completion_length": 285.9285659790039, "epoch": 0.16352201257861634, "grad_norm": 0.9146431684494019, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5662128329277039, "reward_std": 0.3253556042909622, "rewards/accuracy_reward": 0.6172333508729935, "rewards/format_reward": 0.9489795565605164, "step": 1625 }, { "completion_length": 258.78570556640625, "epoch": 0.16362264150943395, "grad_norm": 0.8230249285697937, "kl": 0.045654296875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6638680696487427, "reward_std": 0.22736267745494843, "rewards/accuracy_reward": 0.6944803893566132, "rewards/format_reward": 0.9693877398967743, "step": 1626 }, { "completion_length": 208.74488830566406, "epoch": 0.16372327044025156, "grad_norm": 0.6454881429672241, "kl": 0.051025390625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7607904076576233, "reward_std": 0.12698395177721977, "rewards/accuracy_reward": 0.7811985611915588, "rewards/format_reward": 0.9795918166637421, "step": 1627 }, { "completion_length": 253.14285278320312, "epoch": 0.16382389937106917, "grad_norm": 1.0372064113616943, "kl": 0.0626220703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7539725303649902, "reward_std": 0.17001528292894363, "rewards/accuracy_reward": 0.7539725303649902, "rewards/format_reward": 1.0, "step": 1628 }, { "completion_length": 265.82653045654297, "epoch": 0.16392452830188678, "grad_norm": 1.0576006174087524, "kl": 0.0528564453125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5861621499061584, "reward_std": 0.23265496641397476, "rewards/accuracy_reward": 0.5963661372661591, "rewards/format_reward": 0.9897959232330322, "step": 1629 }, { "completion_length": 246.2142791748047, "epoch": 0.16402515723270442, "grad_norm": 0.9082156419754028, "kl": 0.0677490234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6581433415412903, "reward_std": 0.15986574813723564, "rewards/accuracy_reward": 0.6581433713436127, "rewards/format_reward": 1.0, "step": 1630 }, { "completion_length": 198.2142791748047, "epoch": 0.16412578616352202, "grad_norm": 0.9841831922531128, "kl": 0.05126953125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.718367338180542, "reward_std": 0.11397746577858925, "rewards/accuracy_reward": 0.7183673679828644, "rewards/format_reward": 1.0, "step": 1631 }, { "completion_length": 224.7142791748047, "epoch": 0.16422641509433963, "grad_norm": 1.3403152227401733, "kl": 0.0601806640625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.670741617679596, "reward_std": 0.1472666598856449, "rewards/accuracy_reward": 0.6707416474819183, "rewards/format_reward": 1.0, "step": 1632 }, { "completion_length": 230.55101776123047, "epoch": 0.16432704402515724, "grad_norm": 1.7584774494171143, "kl": 0.0521240234375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6795703768730164, "reward_std": 0.22483893483877182, "rewards/accuracy_reward": 0.6897745132446289, "rewards/format_reward": 0.9897959232330322, "step": 1633 }, { "completion_length": 291.1020202636719, "epoch": 0.16442767295597485, "grad_norm": 0.8525959849357605, "kl": 0.041015625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.7353741526603699, "reward_std": 0.241074800491333, "rewards/accuracy_reward": 0.7455782294273376, "rewards/format_reward": 0.9897959232330322, "step": 1634 }, { "completion_length": 245.59182739257812, "epoch": 0.16452830188679246, "grad_norm": 1.3033818006515503, "kl": 0.0743408203125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6279864311218262, "reward_std": 0.2772831320762634, "rewards/accuracy_reward": 0.6381904780864716, "rewards/format_reward": 0.9897959232330322, "step": 1635 }, { "completion_length": 280.65306091308594, "epoch": 0.16462893081761007, "grad_norm": 0.5104833841323853, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7611626982688904, "reward_std": 0.131737120449543, "rewards/accuracy_reward": 0.7611626088619232, "rewards/format_reward": 1.0, "step": 1636 }, { "completion_length": 238.9285659790039, "epoch": 0.16472955974842768, "grad_norm": 1.755653738975525, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5098159313201904, "reward_std": 0.3306489884853363, "rewards/accuracy_reward": 0.5404281616210938, "rewards/format_reward": 0.9693877398967743, "step": 1637 }, { "completion_length": 194.7142791748047, "epoch": 0.1648301886792453, "grad_norm": 6.010156154632568, "kl": 0.0528564453125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6711722016334534, "reward_std": 0.1132836639881134, "rewards/accuracy_reward": 0.6711722314357758, "rewards/format_reward": 1.0, "step": 1638 }, { "completion_length": 293.5714111328125, "epoch": 0.1649308176100629, "grad_norm": 1.5477348566055298, "kl": 0.0517578125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5415971875190735, "reward_std": 0.15134429931640625, "rewards/accuracy_reward": 0.5518012493848801, "rewards/format_reward": 0.9897959232330322, "step": 1639 }, { "completion_length": 248.7244873046875, "epoch": 0.1650314465408805, "grad_norm": 0.8856039643287659, "kl": 0.039794921875, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.7106227278709412, "reward_std": 0.17721626162528992, "rewards/accuracy_reward": 0.7106227278709412, "rewards/format_reward": 1.0, "step": 1640 }, { "completion_length": 253.7040786743164, "epoch": 0.16513207547169811, "grad_norm": 1.0963647365570068, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.636686086654663, "reward_std": 0.22232987731695175, "rewards/accuracy_reward": 0.6570942401885986, "rewards/format_reward": 0.9795918166637421, "step": 1641 }, { "completion_length": 278.54080963134766, "epoch": 0.16523270440251572, "grad_norm": 0.7883203029632568, "kl": 0.04345703125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5972983837127686, "reward_std": 0.16931676864624023, "rewards/accuracy_reward": 0.5972983092069626, "rewards/format_reward": 1.0, "step": 1642 }, { "completion_length": 192.4183578491211, "epoch": 0.16533333333333333, "grad_norm": 1.9590280055999756, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.681201159954071, "reward_std": 0.1675250306725502, "rewards/accuracy_reward": 0.6914052665233612, "rewards/format_reward": 0.9897959232330322, "step": 1643 }, { "completion_length": 300.0918273925781, "epoch": 0.16543396226415094, "grad_norm": 0.39491620659828186, "kl": 0.0460205078125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.673469364643097, "reward_std": 0.11454052850604057, "rewards/accuracy_reward": 0.6734693646430969, "rewards/format_reward": 1.0, "step": 1644 }, { "completion_length": 217.14285278320312, "epoch": 0.16553459119496855, "grad_norm": 1.3142681121826172, "kl": 0.058837890625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.74319726228714, "reward_std": 0.2113775908946991, "rewards/accuracy_reward": 0.7636054158210754, "rewards/format_reward": 0.9795918464660645, "step": 1645 }, { "completion_length": 214.75509643554688, "epoch": 0.16563522012578616, "grad_norm": 0.681379497051239, "kl": 0.0670166015625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.9003887176513672, "reward_std": 0.21996711194515228, "rewards/accuracy_reward": 0.9207969009876251, "rewards/format_reward": 0.9795918166637421, "step": 1646 }, { "completion_length": 181.49999237060547, "epoch": 0.16573584905660377, "grad_norm": 0.7962600588798523, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.62362402677536, "reward_std": 0.10497242584824562, "rewards/accuracy_reward": 0.6236239969730377, "rewards/format_reward": 1.0, "step": 1647 }, { "completion_length": 250.6734619140625, "epoch": 0.16583647798742138, "grad_norm": 0.48414912819862366, "kl": 0.051513671875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.714285671710968, "reward_std": 0.14939410239458084, "rewards/accuracy_reward": 0.7142857015132904, "rewards/format_reward": 1.0, "step": 1648 }, { "completion_length": 220.13265228271484, "epoch": 0.16593710691823899, "grad_norm": 0.7484638690948486, "kl": 0.0499267578125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.795918345451355, "reward_std": 0.22470130771398544, "rewards/accuracy_reward": 0.8061224222183228, "rewards/format_reward": 0.9897959232330322, "step": 1649 }, { "completion_length": 307.4795837402344, "epoch": 0.1660377358490566, "grad_norm": 0.48668649792671204, "kl": 0.0418701171875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6935859322547913, "reward_std": 0.21061502397060394, "rewards/accuracy_reward": 0.7139941453933716, "rewards/format_reward": 0.9795918464660645, "step": 1650 }, { "completion_length": 292.1122283935547, "epoch": 0.1661383647798742, "grad_norm": 0.7786377668380737, "kl": 0.0372314453125, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5331839323043823, "reward_std": 0.25157327950000763, "rewards/accuracy_reward": 0.5331839770078659, "rewards/format_reward": 1.0, "step": 1651 }, { "completion_length": 217.51020050048828, "epoch": 0.1662389937106918, "grad_norm": 0.6396987438201904, "kl": 0.0614013671875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7108843922615051, "reward_std": 0.16793004423379898, "rewards/accuracy_reward": 0.7312925159931183, "rewards/format_reward": 0.9795918166637421, "step": 1652 }, { "completion_length": 211.88775634765625, "epoch": 0.16633962264150942, "grad_norm": 1.3108288049697876, "kl": 0.0533447265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.665306031703949, "reward_std": 0.24696869403123856, "rewards/accuracy_reward": 0.6653061211109161, "rewards/format_reward": 1.0, "step": 1653 }, { "completion_length": 278.448974609375, "epoch": 0.16644025157232703, "grad_norm": 0.8133676052093506, "kl": 0.056884765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5537739396095276, "reward_std": 0.25023504346609116, "rewards/accuracy_reward": 0.5741820633411407, "rewards/format_reward": 0.9795918166637421, "step": 1654 }, { "completion_length": 235.45917510986328, "epoch": 0.16654088050314467, "grad_norm": 1.109897255897522, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8094928860664368, "reward_std": 0.1765056662261486, "rewards/accuracy_reward": 0.8094928562641144, "rewards/format_reward": 1.0, "step": 1655 }, { "completion_length": 296.2040786743164, "epoch": 0.16664150943396228, "grad_norm": 1.4387022256851196, "kl": 0.0423583984375, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6407313346862793, "reward_std": 0.2174595445394516, "rewards/accuracy_reward": 0.6407312750816345, "rewards/format_reward": 1.0, "step": 1656 }, { "completion_length": 330.1224365234375, "epoch": 0.16674213836477988, "grad_norm": 3.3913516998291016, "kl": 0.0421142578125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5714285373687744, "reward_std": 0.33715197443962097, "rewards/accuracy_reward": 0.5816326439380646, "rewards/format_reward": 0.9897959232330322, "step": 1657 }, { "completion_length": 265.4795837402344, "epoch": 0.1668427672955975, "grad_norm": 0.6179856657981873, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7514434456825256, "reward_std": 0.19363588839769363, "rewards/accuracy_reward": 0.7514434158802032, "rewards/format_reward": 1.0, "step": 1658 }, { "completion_length": 186.75509643554688, "epoch": 0.1669433962264151, "grad_norm": 0.9182776212692261, "kl": 0.0714111328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7229591012001038, "reward_std": 0.23513022810220718, "rewards/accuracy_reward": 0.7433673143386841, "rewards/format_reward": 0.9795918166637421, "step": 1659 }, { "completion_length": 288.5612106323242, "epoch": 0.1670440251572327, "grad_norm": 0.36121731996536255, "kl": 0.0482177734375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.694749653339386, "reward_std": 0.12443257868289948, "rewards/accuracy_reward": 0.6947496831417084, "rewards/format_reward": 1.0, "step": 1660 }, { "completion_length": 291.26529693603516, "epoch": 0.16714465408805032, "grad_norm": 0.9703314900398254, "kl": 0.0447998046875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.4762959480285645, "reward_std": 0.2904280349612236, "rewards/accuracy_reward": 0.5069081485271454, "rewards/format_reward": 0.9693877398967743, "step": 1661 }, { "completion_length": 255.75509643554688, "epoch": 0.16724528301886793, "grad_norm": 0.8947461247444153, "kl": 0.0670166015625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.596513569355011, "reward_std": 0.15080371499061584, "rewards/accuracy_reward": 0.5965135842561722, "rewards/format_reward": 1.0, "step": 1662 }, { "completion_length": 318.2652893066406, "epoch": 0.16734591194968554, "grad_norm": 0.7568218111991882, "kl": 0.052001953125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5457416772842407, "reward_std": 0.25632480531930923, "rewards/accuracy_reward": 0.5559457391500473, "rewards/format_reward": 0.9897959232330322, "step": 1663 }, { "completion_length": 196.5, "epoch": 0.16744654088050315, "grad_norm": 0.7697597742080688, "kl": 0.0540771484375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8775509595870972, "reward_std": 0.08884849771857262, "rewards/accuracy_reward": 0.8775510191917419, "rewards/format_reward": 1.0, "step": 1664 }, { "completion_length": 249.79591369628906, "epoch": 0.16754716981132076, "grad_norm": 0.7546987533569336, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5986968278884888, "reward_std": 0.16863054782152176, "rewards/accuracy_reward": 0.5986968278884888, "rewards/format_reward": 1.0, "step": 1665 }, { "completion_length": 192.95918655395508, "epoch": 0.16764779874213837, "grad_norm": 0.8494704365730286, "kl": 0.059326171875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.874458909034729, "reward_std": 0.10942823998630047, "rewards/accuracy_reward": 0.8744588494300842, "rewards/format_reward": 1.0, "step": 1666 }, { "completion_length": 250.32653045654297, "epoch": 0.16774842767295597, "grad_norm": 1.2386140823364258, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7796613574028015, "reward_std": 0.17130310833454132, "rewards/accuracy_reward": 0.8000695705413818, "rewards/format_reward": 0.9795918166637421, "step": 1667 }, { "completion_length": 326.9387664794922, "epoch": 0.16784905660377358, "grad_norm": 0.7982906103134155, "kl": 0.043212890625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5936889052391052, "reward_std": 0.27890921384096146, "rewards/accuracy_reward": 0.6243011653423309, "rewards/format_reward": 0.9693877398967743, "step": 1668 }, { "completion_length": 291.87754821777344, "epoch": 0.1679496855345912, "grad_norm": 2.0602400302886963, "kl": 0.0465087890625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.4753805994987488, "reward_std": 0.3243432715535164, "rewards/accuracy_reward": 0.48558469116687775, "rewards/format_reward": 0.9897959232330322, "step": 1669 }, { "completion_length": 265.42857360839844, "epoch": 0.1680503144654088, "grad_norm": 0.8418524265289307, "kl": 0.0670166015625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6437684297561646, "reward_std": 0.27805982530117035, "rewards/accuracy_reward": 0.6539724469184875, "rewards/format_reward": 0.9897959232330322, "step": 1670 }, { "completion_length": 225.54080963134766, "epoch": 0.1681509433962264, "grad_norm": 1.3740328550338745, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8299319744110107, "reward_std": 0.2721581608057022, "rewards/accuracy_reward": 0.8707482814788818, "rewards/format_reward": 0.9591836333274841, "step": 1671 }, { "completion_length": 281.84693145751953, "epoch": 0.16825157232704402, "grad_norm": 0.9807401895523071, "kl": 0.0528564453125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7448979020118713, "reward_std": 0.1334144026041031, "rewards/accuracy_reward": 0.7551020383834839, "rewards/format_reward": 0.9897959232330322, "step": 1672 }, { "completion_length": 260.4591827392578, "epoch": 0.16835220125786163, "grad_norm": 1.4983576536178589, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6925376057624817, "reward_std": 0.19517182558774948, "rewards/accuracy_reward": 0.7027417123317719, "rewards/format_reward": 0.9897959232330322, "step": 1673 }, { "completion_length": 197.11224365234375, "epoch": 0.16845283018867924, "grad_norm": 1.2324013710021973, "kl": 0.054931640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8733320832252502, "reward_std": 0.14451085776090622, "rewards/accuracy_reward": 0.8733320534229279, "rewards/format_reward": 1.0, "step": 1674 }, { "completion_length": 219.34693145751953, "epoch": 0.16855345911949685, "grad_norm": 0.9687536358833313, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6658260822296143, "reward_std": 0.2788088992238045, "rewards/accuracy_reward": 0.6760302484035492, "rewards/format_reward": 0.9897959232330322, "step": 1675 }, { "completion_length": 259.0918273925781, "epoch": 0.16865408805031445, "grad_norm": 2.115583896636963, "kl": 0.0504150390625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5858503580093384, "reward_std": 0.3002689629793167, "rewards/accuracy_reward": 0.6062585115432739, "rewards/format_reward": 0.9795918464660645, "step": 1676 }, { "completion_length": 222.05101776123047, "epoch": 0.16875471698113206, "grad_norm": 1.431826114654541, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.910134196281433, "reward_std": 0.14379625394940376, "rewards/accuracy_reward": 0.9407463073730469, "rewards/format_reward": 0.9693877398967743, "step": 1677 }, { "completion_length": 327.81632232666016, "epoch": 0.16885534591194967, "grad_norm": 1.1053812503814697, "kl": 0.04052734375, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.6604036092758179, "reward_std": 0.2708279490470886, "rewards/accuracy_reward": 0.7012199759483337, "rewards/format_reward": 0.9591836631298065, "step": 1678 }, { "completion_length": 285.0, "epoch": 0.1689559748427673, "grad_norm": 0.6840335130691528, "kl": 0.03485107421875, "learning_rate": 1e-06, "loss": 0.0014, "reward": 1.7026239037513733, "reward_std": 0.25799496471881866, "rewards/accuracy_reward": 0.7026238441467285, "rewards/format_reward": 1.0, "step": 1679 }, { "completion_length": 226.28570556640625, "epoch": 0.16905660377358492, "grad_norm": 0.5450852513313293, "kl": 0.0677490234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7756924033164978, "reward_std": 0.09253505058586597, "rewards/accuracy_reward": 0.7756924033164978, "rewards/format_reward": 1.0, "step": 1680 }, { "completion_length": 271.9591827392578, "epoch": 0.16915723270440253, "grad_norm": 1.655770182609558, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.46723473072052, "reward_std": 0.1785905733704567, "rewards/accuracy_reward": 0.4774388521909714, "rewards/format_reward": 0.9897959232330322, "step": 1681 }, { "completion_length": 303.7755126953125, "epoch": 0.16925786163522014, "grad_norm": 0.7370525598526001, "kl": 0.052490234375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.652332365512848, "reward_std": 0.23646093904972076, "rewards/accuracy_reward": 0.6523323059082031, "rewards/format_reward": 1.0, "step": 1682 }, { "completion_length": 177.88774871826172, "epoch": 0.16935849056603774, "grad_norm": 1.0877392292022705, "kl": 0.103515625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.5853559970855713, "reward_std": 0.21794115006923676, "rewards/accuracy_reward": 0.5955599844455719, "rewards/format_reward": 0.9897959232330322, "step": 1683 }, { "completion_length": 338.7857131958008, "epoch": 0.16945911949685535, "grad_norm": 1.0527219772338867, "kl": 0.06390380859375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6475774049758911, "reward_std": 0.2480437085032463, "rewards/accuracy_reward": 0.6577815115451813, "rewards/format_reward": 0.9897959232330322, "step": 1684 }, { "completion_length": 329.6428527832031, "epoch": 0.16955974842767296, "grad_norm": 0.594872772693634, "kl": 0.0517578125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5573521256446838, "reward_std": 0.14913729578256607, "rewards/accuracy_reward": 0.5573521554470062, "rewards/format_reward": 1.0, "step": 1685 }, { "completion_length": 210.58162689208984, "epoch": 0.16966037735849057, "grad_norm": 0.9354603290557861, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7798303365707397, "reward_std": 0.2173420414328575, "rewards/accuracy_reward": 0.7798303067684174, "rewards/format_reward": 1.0, "step": 1686 }, { "completion_length": 229.57142639160156, "epoch": 0.16976100628930818, "grad_norm": 0.5648786425590515, "kl": 0.048828125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6530612111091614, "reward_std": 0.13794391602277756, "rewards/accuracy_reward": 0.6632652878761292, "rewards/format_reward": 0.9897959232330322, "step": 1687 }, { "completion_length": 278.7755126953125, "epoch": 0.1698616352201258, "grad_norm": 0.7639481425285339, "kl": 0.0548095703125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7518043518066406, "reward_std": 0.20181258022785187, "rewards/accuracy_reward": 0.7518044412136078, "rewards/format_reward": 1.0, "step": 1688 }, { "completion_length": 253.6326446533203, "epoch": 0.1699622641509434, "grad_norm": 1.7311558723449707, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7468562722206116, "reward_std": 0.15565450862050056, "rewards/accuracy_reward": 0.7468563318252563, "rewards/format_reward": 1.0, "step": 1689 }, { "completion_length": 181.29591369628906, "epoch": 0.170062893081761, "grad_norm": 1.6883983612060547, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7474614381790161, "reward_std": 0.21823318302631378, "rewards/accuracy_reward": 0.7576655745506287, "rewards/format_reward": 0.9897959232330322, "step": 1690 }, { "completion_length": 202.51020050048828, "epoch": 0.17016352201257862, "grad_norm": 1.1215728521347046, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.858827292919159, "reward_std": 0.142486110329628, "rewards/accuracy_reward": 0.8588273525238037, "rewards/format_reward": 1.0, "step": 1691 }, { "completion_length": 281.42857360839844, "epoch": 0.17026415094339623, "grad_norm": 0.80568528175354, "kl": 0.0589599609375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.640913486480713, "reward_std": 0.2273254096508026, "rewards/accuracy_reward": 0.6511175334453583, "rewards/format_reward": 0.9897959232330322, "step": 1692 }, { "completion_length": 273.3571319580078, "epoch": 0.17036477987421383, "grad_norm": 0.9183602333068848, "kl": 0.0478515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7703771591186523, "reward_std": 0.27466151118278503, "rewards/accuracy_reward": 0.7907853722572327, "rewards/format_reward": 0.9795918464660645, "step": 1693 }, { "completion_length": 330.34693145751953, "epoch": 0.17046540880503144, "grad_norm": 1.009120225906372, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5787355303764343, "reward_std": 0.295486256480217, "rewards/accuracy_reward": 0.6093478798866272, "rewards/format_reward": 0.9693877398967743, "step": 1694 }, { "completion_length": 237.7142791748047, "epoch": 0.17056603773584905, "grad_norm": 0.6064314842224121, "kl": 0.0545654296875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5675736665725708, "reward_std": 0.13322192430496216, "rewards/accuracy_reward": 0.577777773141861, "rewards/format_reward": 0.9897959232330322, "step": 1695 }, { "completion_length": 316.7449035644531, "epoch": 0.17066666666666666, "grad_norm": 0.5377804636955261, "kl": 0.0546875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6353707909584045, "reward_std": 0.17250347137451172, "rewards/accuracy_reward": 0.6455748975276947, "rewards/format_reward": 0.9897959232330322, "step": 1696 }, { "completion_length": 225.71428680419922, "epoch": 0.17076729559748427, "grad_norm": 0.6878013014793396, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7904762029647827, "reward_std": 0.10271825268864632, "rewards/accuracy_reward": 0.7904761731624603, "rewards/format_reward": 1.0, "step": 1697 }, { "completion_length": 245.89795684814453, "epoch": 0.17086792452830188, "grad_norm": 1.1094192266464233, "kl": 0.0491943359375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.714285671710968, "reward_std": 0.1968383491039276, "rewards/accuracy_reward": 0.7244897782802582, "rewards/format_reward": 0.9897959232330322, "step": 1698 }, { "completion_length": 221.7142791748047, "epoch": 0.1709685534591195, "grad_norm": 1.8210580348968506, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7040815949440002, "reward_std": 0.24462831765413284, "rewards/accuracy_reward": 0.734693855047226, "rewards/format_reward": 0.9693877398967743, "step": 1699 }, { "completion_length": 236.6428451538086, "epoch": 0.1710691823899371, "grad_norm": 1.2992199659347534, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7186625599861145, "reward_std": 0.0886751338839531, "rewards/accuracy_reward": 0.7288666665554047, "rewards/format_reward": 0.9897959232330322, "step": 1700 }, { "completion_length": 251.3775405883789, "epoch": 0.1711698113207547, "grad_norm": 0.7006261348724365, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7355415225028992, "reward_std": 0.18116428330540657, "rewards/accuracy_reward": 0.7457456588745117, "rewards/format_reward": 0.9897959232330322, "step": 1701 }, { "completion_length": 236.1938705444336, "epoch": 0.17127044025157231, "grad_norm": 0.8297969102859497, "kl": 0.04443359375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.652397096157074, "reward_std": 0.23842224851250648, "rewards/accuracy_reward": 0.6523971110582352, "rewards/format_reward": 1.0, "step": 1702 }, { "completion_length": 268.2653045654297, "epoch": 0.17137106918238992, "grad_norm": 0.4975054860115051, "kl": 0.0562744140625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7298603057861328, "reward_std": 0.18132372200489044, "rewards/accuracy_reward": 0.7298603057861328, "rewards/format_reward": 1.0, "step": 1703 }, { "completion_length": 303.65306091308594, "epoch": 0.17147169811320756, "grad_norm": 1.0825461149215698, "kl": 0.048583984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.616618037223816, "reward_std": 0.2971402257680893, "rewards/accuracy_reward": 0.6166180372238159, "rewards/format_reward": 1.0, "step": 1704 }, { "completion_length": 249.58163452148438, "epoch": 0.17157232704402517, "grad_norm": 0.5334105491638184, "kl": 0.0438232421875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.8029639720916748, "reward_std": 0.2366335690021515, "rewards/accuracy_reward": 0.8131681084632874, "rewards/format_reward": 0.9897959232330322, "step": 1705 }, { "completion_length": 240.01019287109375, "epoch": 0.17167295597484278, "grad_norm": 0.9545140862464905, "kl": 0.0465087890625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.619543731212616, "reward_std": 0.23943521082401276, "rewards/accuracy_reward": 0.6603600084781647, "rewards/format_reward": 0.9591836631298065, "step": 1706 }, { "completion_length": 214.01019287109375, "epoch": 0.1717735849056604, "grad_norm": 0.6232447624206543, "kl": 0.0380859375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.6706348657608032, "reward_std": 0.09842048585414886, "rewards/accuracy_reward": 0.6706348955631256, "rewards/format_reward": 1.0, "step": 1707 }, { "completion_length": 232.88775634765625, "epoch": 0.171874213836478, "grad_norm": 1.2755415439605713, "kl": 0.045654296875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6681243181228638, "reward_std": 0.23037491738796234, "rewards/accuracy_reward": 0.6783284544944763, "rewards/format_reward": 0.9897959232330322, "step": 1708 }, { "completion_length": 180.85713958740234, "epoch": 0.1719748427672956, "grad_norm": 0.9704936146736145, "kl": 0.0631103515625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7147229313850403, "reward_std": 0.2220214530825615, "rewards/accuracy_reward": 0.7249270677566528, "rewards/format_reward": 0.9897959232330322, "step": 1709 }, { "completion_length": 274.2346954345703, "epoch": 0.1720754716981132, "grad_norm": 0.9372650980949402, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.414618194103241, "reward_std": 0.2003237009048462, "rewards/accuracy_reward": 0.41461819410324097, "rewards/format_reward": 1.0, "step": 1710 }, { "completion_length": 269.31632232666016, "epoch": 0.17217610062893082, "grad_norm": 0.6176806688308716, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6425479054450989, "reward_std": 0.2497139573097229, "rewards/accuracy_reward": 0.6527520418167114, "rewards/format_reward": 0.9897959232330322, "step": 1711 }, { "completion_length": 239.10204315185547, "epoch": 0.17227672955974843, "grad_norm": 1.3012012243270874, "kl": 0.0562744140625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6112730503082275, "reward_std": 0.20250388979911804, "rewards/accuracy_reward": 0.6112730652093887, "rewards/format_reward": 1.0, "step": 1712 }, { "completion_length": 174.80611419677734, "epoch": 0.17237735849056604, "grad_norm": 2.2134969234466553, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.690338134765625, "reward_std": 0.1382206417620182, "rewards/accuracy_reward": 0.7005422711372375, "rewards/format_reward": 0.9897959232330322, "step": 1713 }, { "completion_length": 301.8775405883789, "epoch": 0.17247798742138365, "grad_norm": 0.7563797235488892, "kl": 0.048095703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7628989219665527, "reward_std": 0.17942500859498978, "rewards/accuracy_reward": 0.7833071053028107, "rewards/format_reward": 0.9795918166637421, "step": 1714 }, { "completion_length": 254.99999237060547, "epoch": 0.17257861635220126, "grad_norm": 1.3312721252441406, "kl": 0.04638671875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6092711091041565, "reward_std": 0.2581142634153366, "rewards/accuracy_reward": 0.6092711389064789, "rewards/format_reward": 1.0, "step": 1715 }, { "completion_length": 350.96937561035156, "epoch": 0.17267924528301887, "grad_norm": 0.8873476386070251, "kl": 0.0445556640625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.579336404800415, "reward_std": 0.27409808337688446, "rewards/accuracy_reward": 0.5997446477413177, "rewards/format_reward": 0.9795918464660645, "step": 1716 }, { "completion_length": 181.80611419677734, "epoch": 0.17277987421383648, "grad_norm": 0.9112927913665771, "kl": 0.048828125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7857142686843872, "reward_std": 0.1880747713148594, "rewards/accuracy_reward": 0.7857142686843872, "rewards/format_reward": 1.0, "step": 1717 }, { "completion_length": 266.051025390625, "epoch": 0.17288050314465409, "grad_norm": 1.171514630317688, "kl": 0.0570068359375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6346039175987244, "reward_std": 0.28557712584733963, "rewards/accuracy_reward": 0.6652161478996277, "rewards/format_reward": 0.9693877398967743, "step": 1718 }, { "completion_length": 265.4591827392578, "epoch": 0.1729811320754717, "grad_norm": 1.0134047269821167, "kl": 0.04443359375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.625381588935852, "reward_std": 0.23658058792352676, "rewards/accuracy_reward": 0.6355857253074646, "rewards/format_reward": 0.9897959232330322, "step": 1719 }, { "completion_length": 255.77550506591797, "epoch": 0.1730817610062893, "grad_norm": 0.5845552086830139, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7539398670196533, "reward_std": 0.17880086228251457, "rewards/accuracy_reward": 0.7641439437866211, "rewards/format_reward": 0.9897959232330322, "step": 1720 }, { "completion_length": 226.4693832397461, "epoch": 0.1731823899371069, "grad_norm": 0.6244237422943115, "kl": 0.0548095703125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7560526728630066, "reward_std": 0.18228847533464432, "rewards/accuracy_reward": 0.7560527324676514, "rewards/format_reward": 1.0, "step": 1721 }, { "completion_length": 341.32652282714844, "epoch": 0.17328301886792452, "grad_norm": 1.0095540285110474, "kl": 0.056640625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.415522575378418, "reward_std": 0.2141466923058033, "rewards/accuracy_reward": 0.4359307140111923, "rewards/format_reward": 0.9795918166637421, "step": 1722 }, { "completion_length": 272.36734771728516, "epoch": 0.17338364779874213, "grad_norm": 0.7487077116966248, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6525017619132996, "reward_std": 0.1537383310496807, "rewards/accuracy_reward": 0.6831140220165253, "rewards/format_reward": 0.9693877398967743, "step": 1723 }, { "completion_length": 220.88774871826172, "epoch": 0.17348427672955974, "grad_norm": 1.254697322845459, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.78481924533844, "reward_std": 0.14956040680408478, "rewards/accuracy_reward": 0.7848192453384399, "rewards/format_reward": 1.0, "step": 1724 }, { "completion_length": 230.17346954345703, "epoch": 0.17358490566037735, "grad_norm": 1.1797960996627808, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5144829750061035, "reward_std": 0.24455660581588745, "rewards/accuracy_reward": 0.5552993416786194, "rewards/format_reward": 0.9591836631298065, "step": 1725 }, { "completion_length": 262.4693908691406, "epoch": 0.17368553459119496, "grad_norm": 0.8158984780311584, "kl": 0.054443359375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.739552915096283, "reward_std": 0.3002719581127167, "rewards/accuracy_reward": 0.7599610984325409, "rewards/format_reward": 0.9795918464660645, "step": 1726 }, { "completion_length": 273.89795684814453, "epoch": 0.17378616352201257, "grad_norm": 0.8047425150871277, "kl": 0.04443359375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.8061224222183228, "reward_std": 0.2483847737312317, "rewards/accuracy_reward": 0.8061224222183228, "rewards/format_reward": 1.0, "step": 1727 }, { "completion_length": 247.2244873046875, "epoch": 0.1738867924528302, "grad_norm": 0.5143508315086365, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8515769243240356, "reward_std": 0.1330622062087059, "rewards/accuracy_reward": 0.8719851672649384, "rewards/format_reward": 0.9795918166637421, "step": 1728 }, { "completion_length": 268.9693832397461, "epoch": 0.1739874213836478, "grad_norm": 1.3262962102890015, "kl": 0.0723876953125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.5374760031700134, "reward_std": 0.22323472052812576, "rewards/accuracy_reward": 0.547680139541626, "rewards/format_reward": 0.9897959232330322, "step": 1729 }, { "completion_length": 263.46937561035156, "epoch": 0.17408805031446542, "grad_norm": 0.8948343992233276, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6470480561256409, "reward_std": 0.15962110459804535, "rewards/accuracy_reward": 0.6470480859279633, "rewards/format_reward": 1.0, "step": 1730 }, { "completion_length": 260.6836624145508, "epoch": 0.17418867924528303, "grad_norm": 0.9901966452598572, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7206761837005615, "reward_std": 0.26737868785858154, "rewards/accuracy_reward": 0.7206761240959167, "rewards/format_reward": 1.0, "step": 1731 }, { "completion_length": 240.6938705444336, "epoch": 0.17428930817610064, "grad_norm": 1.4877495765686035, "kl": 0.0565185546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.721088469028473, "reward_std": 0.2097463384270668, "rewards/accuracy_reward": 0.7414965927600861, "rewards/format_reward": 0.9795918464660645, "step": 1732 }, { "completion_length": 152.05101776123047, "epoch": 0.17438993710691825, "grad_norm": 1.020583987236023, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7980712056159973, "reward_std": 0.13829437270760536, "rewards/accuracy_reward": 0.7980713248252869, "rewards/format_reward": 1.0, "step": 1733 }, { "completion_length": 207.25509643554688, "epoch": 0.17449056603773586, "grad_norm": 0.7402899861335754, "kl": 0.093017578125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6282670497894287, "reward_std": 0.1608808897435665, "rewards/accuracy_reward": 0.6282671391963959, "rewards/format_reward": 1.0, "step": 1734 }, { "completion_length": 242.88774871826172, "epoch": 0.17459119496855346, "grad_norm": 1.256009578704834, "kl": 0.0511474609375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.714285671710968, "reward_std": 0.17873751372098923, "rewards/accuracy_reward": 0.7142857015132904, "rewards/format_reward": 1.0, "step": 1735 }, { "completion_length": 278.27549743652344, "epoch": 0.17469182389937107, "grad_norm": 0.806717038154602, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.527502417564392, "reward_std": 0.2344938963651657, "rewards/accuracy_reward": 0.54791060090065, "rewards/format_reward": 0.9795918464660645, "step": 1736 }, { "completion_length": 189.0408172607422, "epoch": 0.17479245283018868, "grad_norm": 0.9171810746192932, "kl": 0.0584716796875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6865888833999634, "reward_std": 0.19013237208127975, "rewards/accuracy_reward": 0.7069970667362213, "rewards/format_reward": 0.9795918464660645, "step": 1737 }, { "completion_length": 275.2346954345703, "epoch": 0.1748930817610063, "grad_norm": 0.8250877261161804, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6884353160858154, "reward_std": 0.2571137398481369, "rewards/accuracy_reward": 0.7190476059913635, "rewards/format_reward": 0.9693877398967743, "step": 1738 }, { "completion_length": 213.89794921875, "epoch": 0.1749937106918239, "grad_norm": 0.48023632168769836, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7216655611991882, "reward_std": 0.08957872167229652, "rewards/accuracy_reward": 0.7318696975708008, "rewards/format_reward": 0.9897959232330322, "step": 1739 }, { "completion_length": 257.0306167602539, "epoch": 0.1750943396226415, "grad_norm": 1.6050392389297485, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6378798484802246, "reward_std": 0.22213709354400635, "rewards/accuracy_reward": 0.6684921681880951, "rewards/format_reward": 0.9693877398967743, "step": 1740 }, { "completion_length": 300.6938705444336, "epoch": 0.17519496855345912, "grad_norm": 1.890552043914795, "kl": 0.05517578125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7053773403167725, "reward_std": 0.2987530380487442, "rewards/accuracy_reward": 0.7461937367916107, "rewards/format_reward": 0.9591836333274841, "step": 1741 }, { "completion_length": 278.02040100097656, "epoch": 0.17529559748427673, "grad_norm": 0.8797164559364319, "kl": 0.0552978515625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.571016252040863, "reward_std": 0.1968899518251419, "rewards/accuracy_reward": 0.5710163414478302, "rewards/format_reward": 1.0, "step": 1742 }, { "completion_length": 177.25509643554688, "epoch": 0.17539622641509434, "grad_norm": 0.5914991497993469, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.719387710094452, "reward_std": 0.1281326748430729, "rewards/accuracy_reward": 0.7397958934307098, "rewards/format_reward": 0.9795918464660645, "step": 1743 }, { "completion_length": 222.91836547851562, "epoch": 0.17549685534591195, "grad_norm": 0.6810727119445801, "kl": 0.0550537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8469387292861938, "reward_std": 0.15402613580226898, "rewards/accuracy_reward": 0.8571428656578064, "rewards/format_reward": 0.9897959232330322, "step": 1744 }, { "completion_length": 243.24488830566406, "epoch": 0.17559748427672955, "grad_norm": 0.8342616558074951, "kl": 0.047119140625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.605898141860962, "reward_std": 0.23655898869037628, "rewards/accuracy_reward": 0.6365103721618652, "rewards/format_reward": 0.9693877398967743, "step": 1745 }, { "completion_length": 283.3163146972656, "epoch": 0.17569811320754716, "grad_norm": 1.230041265487671, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6319909691810608, "reward_std": 0.32381145656108856, "rewards/accuracy_reward": 0.6523991525173187, "rewards/format_reward": 0.9795918166637421, "step": 1746 }, { "completion_length": 222.58162689208984, "epoch": 0.17579874213836477, "grad_norm": 2.4788196086883545, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6589115858078003, "reward_std": 0.19879207015037537, "rewards/accuracy_reward": 0.6895238757133484, "rewards/format_reward": 0.9693877398967743, "step": 1747 }, { "completion_length": 229.17346954345703, "epoch": 0.17589937106918238, "grad_norm": 2.5070934295654297, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7744710445404053, "reward_std": 0.2597522661089897, "rewards/accuracy_reward": 0.7948792576789856, "rewards/format_reward": 0.9795918464660645, "step": 1748 }, { "completion_length": 231.58162689208984, "epoch": 0.176, "grad_norm": 1.2080512046813965, "kl": 0.052490234375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6408636569976807, "reward_std": 0.24806824326515198, "rewards/accuracy_reward": 0.6612718105316162, "rewards/format_reward": 0.9795918464660645, "step": 1749 }, { "completion_length": 182.31632232666016, "epoch": 0.1761006289308176, "grad_norm": 0.7026248574256897, "kl": 0.065185546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.766506552696228, "reward_std": 0.15427026897668839, "rewards/accuracy_reward": 0.786914736032486, "rewards/format_reward": 0.9795918464660645, "step": 1750 }, { "completion_length": 281.44896697998047, "epoch": 0.1762012578616352, "grad_norm": 0.5380944609642029, "kl": 0.04150390625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7341107726097107, "reward_std": 0.13182337954640388, "rewards/accuracy_reward": 0.7341107428073883, "rewards/format_reward": 1.0, "step": 1751 }, { "completion_length": 235.2653045654297, "epoch": 0.17630188679245282, "grad_norm": 1.4333828687667847, "kl": 0.0562744140625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.820699691772461, "reward_std": 0.1784294843673706, "rewards/accuracy_reward": 0.8309037387371063, "rewards/format_reward": 0.9897959232330322, "step": 1752 }, { "completion_length": 268.6428527832031, "epoch": 0.17640251572327045, "grad_norm": 1.0987945795059204, "kl": 0.0587158203125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5702385306358337, "reward_std": 0.16192177683115005, "rewards/accuracy_reward": 0.5804425477981567, "rewards/format_reward": 0.9897959232330322, "step": 1753 }, { "completion_length": 305.2653045654297, "epoch": 0.17650314465408806, "grad_norm": 0.568007230758667, "kl": 0.0537109375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7346938252449036, "reward_std": 0.21117264032363892, "rewards/accuracy_reward": 0.7551020383834839, "rewards/format_reward": 0.9795918464660645, "step": 1754 }, { "completion_length": 226.77549743652344, "epoch": 0.17660377358490567, "grad_norm": 1.0842372179031372, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8469387292861938, "reward_std": 0.19673582166433334, "rewards/accuracy_reward": 0.857142835855484, "rewards/format_reward": 0.9897959232330322, "step": 1755 }, { "completion_length": 265.39796447753906, "epoch": 0.17670440251572328, "grad_norm": 0.7460964918136597, "kl": 0.036865234375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.7457318902015686, "reward_std": 0.2888840511441231, "rewards/accuracy_reward": 0.7763441503047943, "rewards/format_reward": 0.9693877398967743, "step": 1756 }, { "completion_length": 302.17345428466797, "epoch": 0.1768050314465409, "grad_norm": 0.5995692014694214, "kl": 0.0458984375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7089998722076416, "reward_std": 0.2572983652353287, "rewards/accuracy_reward": 0.7498162388801575, "rewards/format_reward": 0.9591836333274841, "step": 1757 }, { "completion_length": 228.32653045654297, "epoch": 0.1769056603773585, "grad_norm": 1.0216978788375854, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7122448682785034, "reward_std": 0.13522307947278023, "rewards/accuracy_reward": 0.7224489748477936, "rewards/format_reward": 0.9897959232330322, "step": 1758 }, { "completion_length": 201.1938705444336, "epoch": 0.1770062893081761, "grad_norm": 1.3017328977584839, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8698412775993347, "reward_std": 0.10924657434225082, "rewards/accuracy_reward": 0.8698412775993347, "rewards/format_reward": 1.0, "step": 1759 }, { "completion_length": 277.6632537841797, "epoch": 0.17710691823899372, "grad_norm": 0.5929752588272095, "kl": 0.056640625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5386464595794678, "reward_std": 0.23865807056427002, "rewards/accuracy_reward": 0.5692588090896606, "rewards/format_reward": 0.9693877398967743, "step": 1760 }, { "completion_length": 296.60203552246094, "epoch": 0.17720754716981132, "grad_norm": 0.9283341765403748, "kl": 0.0576171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.471961498260498, "reward_std": 0.2469741404056549, "rewards/accuracy_reward": 0.49236954748630524, "rewards/format_reward": 0.9795918464660645, "step": 1761 }, { "completion_length": 247.01019287109375, "epoch": 0.17730817610062893, "grad_norm": 2.5508944988250732, "kl": 0.0701904296875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7222104668617249, "reward_std": 0.22179245948791504, "rewards/accuracy_reward": 0.742618590593338, "rewards/format_reward": 0.9795918464660645, "step": 1762 }, { "completion_length": 180.23468780517578, "epoch": 0.17740880503144654, "grad_norm": 0.5694707632064819, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8469387292861938, "reward_std": 0.15855563431978226, "rewards/accuracy_reward": 0.857142835855484, "rewards/format_reward": 0.9897959232330322, "step": 1763 }, { "completion_length": 231.55101776123047, "epoch": 0.17750943396226415, "grad_norm": 2.7236011028289795, "kl": 0.0570068359375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5396053194999695, "reward_std": 0.2601131796836853, "rewards/accuracy_reward": 0.5600134134292603, "rewards/format_reward": 0.9795918464660645, "step": 1764 }, { "completion_length": 188.02040481567383, "epoch": 0.17761006289308176, "grad_norm": 1.1220979690551758, "kl": 0.0574951171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8469387292861938, "reward_std": 0.18102359026670456, "rewards/accuracy_reward": 0.857142835855484, "rewards/format_reward": 0.9897959232330322, "step": 1765 }, { "completion_length": 205.27550506591797, "epoch": 0.17771069182389937, "grad_norm": 1.1703745126724243, "kl": 0.055419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8061224222183228, "reward_std": 0.17793556302785873, "rewards/accuracy_reward": 0.8265306353569031, "rewards/format_reward": 0.9795918166637421, "step": 1766 }, { "completion_length": 209.82652282714844, "epoch": 0.17781132075471698, "grad_norm": 0.5184397101402283, "kl": 0.0601806640625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6224489212036133, "reward_std": 0.07303375005722046, "rewards/accuracy_reward": 0.6224489659070969, "rewards/format_reward": 1.0, "step": 1767 }, { "completion_length": 140.61224365234375, "epoch": 0.1779119496855346, "grad_norm": 1.526243805885315, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.741496503353119, "reward_std": 0.10903036221861839, "rewards/accuracy_reward": 0.7414966225624084, "rewards/format_reward": 1.0, "step": 1768 }, { "completion_length": 186.82653045654297, "epoch": 0.1780125786163522, "grad_norm": 2.0603249073028564, "kl": 0.123779296875, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.6916099190711975, "reward_std": 0.16882393509149551, "rewards/accuracy_reward": 0.6916099786758423, "rewards/format_reward": 1.0, "step": 1769 }, { "completion_length": 267.3367385864258, "epoch": 0.1781132075471698, "grad_norm": 0.6484588384628296, "kl": 0.0489501953125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6982992887496948, "reward_std": 0.16055608913302422, "rewards/accuracy_reward": 0.708503395318985, "rewards/format_reward": 0.9897959232330322, "step": 1770 }, { "completion_length": 225.4183578491211, "epoch": 0.17821383647798741, "grad_norm": 1.1902271509170532, "kl": 0.056396484375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6810501217842102, "reward_std": 0.2154759243130684, "rewards/accuracy_reward": 0.6810500919818878, "rewards/format_reward": 1.0, "step": 1771 }, { "completion_length": 226.77550506591797, "epoch": 0.17831446540880502, "grad_norm": 0.7479236125946045, "kl": 0.056884765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7355684638023376, "reward_std": 0.21635400503873825, "rewards/accuracy_reward": 0.755976676940918, "rewards/format_reward": 0.9795918166637421, "step": 1772 }, { "completion_length": 204.87754821777344, "epoch": 0.17841509433962263, "grad_norm": 1.425686001777649, "kl": 0.052978515625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7319965362548828, "reward_std": 0.22382721304893494, "rewards/accuracy_reward": 0.7319965660572052, "rewards/format_reward": 1.0, "step": 1773 }, { "completion_length": 243.40816497802734, "epoch": 0.17851572327044024, "grad_norm": 1.8002421855926514, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6868910193443298, "reward_std": 0.19678393751382828, "rewards/accuracy_reward": 0.7072992026805878, "rewards/format_reward": 0.9795918464660645, "step": 1774 }, { "completion_length": 171.35713958740234, "epoch": 0.17861635220125785, "grad_norm": 0.7707467079162598, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6571428179740906, "reward_std": 0.11622969061136246, "rewards/accuracy_reward": 0.6571428179740906, "rewards/format_reward": 1.0, "step": 1775 }, { "completion_length": 228.01020050048828, "epoch": 0.17871698113207546, "grad_norm": 0.8217542171478271, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7543367743492126, "reward_std": 0.14678387343883514, "rewards/accuracy_reward": 0.7543366849422455, "rewards/format_reward": 1.0, "step": 1776 }, { "completion_length": 233.73468780517578, "epoch": 0.1788176100628931, "grad_norm": 0.8950762748718262, "kl": 0.0640869140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.678455412387848, "reward_std": 0.26000674068927765, "rewards/accuracy_reward": 0.6988635957241058, "rewards/format_reward": 0.9795918166637421, "step": 1777 }, { "completion_length": 169.2142791748047, "epoch": 0.1789182389937107, "grad_norm": 0.9370786547660828, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7329930663108826, "reward_std": 0.12741263210773468, "rewards/accuracy_reward": 0.7329932153224945, "rewards/format_reward": 1.0, "step": 1778 }, { "completion_length": 243.78571319580078, "epoch": 0.1790188679245283, "grad_norm": 0.8259791135787964, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6539592742919922, "reward_std": 0.1508084386587143, "rewards/accuracy_reward": 0.6641634106636047, "rewards/format_reward": 0.9897959232330322, "step": 1779 }, { "completion_length": 200.81632232666016, "epoch": 0.17911949685534592, "grad_norm": 5.83355712890625, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6206915974617004, "reward_std": 0.2913898155093193, "rewards/accuracy_reward": 0.6206915974617004, "rewards/format_reward": 1.0, "step": 1780 }, { "completion_length": 198.56122589111328, "epoch": 0.17922012578616353, "grad_norm": 0.9522623419761658, "kl": 0.058837890625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.666962444782257, "reward_std": 0.1479930728673935, "rewards/accuracy_reward": 0.6669624149799347, "rewards/format_reward": 1.0, "step": 1781 }, { "completion_length": 201.23468780517578, "epoch": 0.17932075471698114, "grad_norm": 1.5254677534103394, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.5843051075935364, "reward_std": 0.18914610520005226, "rewards/accuracy_reward": 0.5945091843605042, "rewards/format_reward": 0.9897959232330322, "step": 1782 }, { "completion_length": 205.09182739257812, "epoch": 0.17942138364779875, "grad_norm": 1.1254750490188599, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.751862645149231, "reward_std": 0.26488345116376877, "rewards/accuracy_reward": 0.762066662311554, "rewards/format_reward": 0.9897959232330322, "step": 1783 }, { "completion_length": 167.43877410888672, "epoch": 0.17952201257861636, "grad_norm": 0.9552911520004272, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7850340008735657, "reward_std": 0.12064821645617485, "rewards/accuracy_reward": 0.7952381074428558, "rewards/format_reward": 0.9897959232330322, "step": 1784 }, { "completion_length": 187.80612182617188, "epoch": 0.17962264150943397, "grad_norm": 0.88420170545578, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7287569046020508, "reward_std": 0.2298482358455658, "rewards/accuracy_reward": 0.7287569344043732, "rewards/format_reward": 1.0, "step": 1785 }, { "completion_length": 232.1224365234375, "epoch": 0.17972327044025158, "grad_norm": 0.712856650352478, "kl": 0.107666015625, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7614200115203857, "reward_std": 0.20402730256319046, "rewards/accuracy_reward": 0.7614199817180634, "rewards/format_reward": 1.0, "step": 1786 }, { "completion_length": 189.80611419677734, "epoch": 0.17982389937106918, "grad_norm": 1.0586167573928833, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7653060555458069, "reward_std": 0.23712077736854553, "rewards/accuracy_reward": 0.7755101919174194, "rewards/format_reward": 0.9897959232330322, "step": 1787 }, { "completion_length": 165.1530532836914, "epoch": 0.1799245283018868, "grad_norm": 1.0232378244400024, "kl": 0.096923828125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6776834726333618, "reward_std": 0.12827062979340553, "rewards/accuracy_reward": 0.6980916857719421, "rewards/format_reward": 0.9795918464660645, "step": 1788 }, { "completion_length": 199.11223602294922, "epoch": 0.1800251572327044, "grad_norm": 1.0341299772262573, "kl": 0.05859375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6325658559799194, "reward_std": 0.22236160933971405, "rewards/accuracy_reward": 0.652974009513855, "rewards/format_reward": 0.9795918166637421, "step": 1789 }, { "completion_length": 161.62245178222656, "epoch": 0.180125786163522, "grad_norm": 0.44319987297058105, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6632652878761292, "reward_std": 0.03485357388854027, "rewards/accuracy_reward": 0.6632652878761292, "rewards/format_reward": 1.0, "step": 1790 }, { "completion_length": 189.7040786743164, "epoch": 0.18022641509433962, "grad_norm": 0.9319689273834229, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6861494183540344, "reward_std": 0.127944678068161, "rewards/accuracy_reward": 0.6861494779586792, "rewards/format_reward": 1.0, "step": 1791 }, { "completion_length": 216.15306091308594, "epoch": 0.18032704402515723, "grad_norm": 1.0735208988189697, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6836004853248596, "reward_std": 0.20107129961252213, "rewards/accuracy_reward": 0.6836005449295044, "rewards/format_reward": 1.0, "step": 1792 }, { "completion_length": 198.58162689208984, "epoch": 0.18042767295597484, "grad_norm": 0.9564060568809509, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8274582624435425, "reward_std": 0.2178301140666008, "rewards/accuracy_reward": 0.8274582326412201, "rewards/format_reward": 1.0, "step": 1793 }, { "completion_length": 227.448974609375, "epoch": 0.18052830188679245, "grad_norm": 0.7023254036903381, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7482993006706238, "reward_std": 0.11275793239474297, "rewards/accuracy_reward": 0.7482993304729462, "rewards/format_reward": 1.0, "step": 1794 }, { "completion_length": 254.1530532836914, "epoch": 0.18062893081761006, "grad_norm": 0.9384056329727173, "kl": 0.05078125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6633774042129517, "reward_std": 0.2492564618587494, "rewards/accuracy_reward": 0.6735815107822418, "rewards/format_reward": 0.9897959232330322, "step": 1795 }, { "completion_length": 183.59182739257812, "epoch": 0.18072955974842767, "grad_norm": 6.505752086639404, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.521503210067749, "reward_std": 0.13691871985793114, "rewards/accuracy_reward": 0.5317072570323944, "rewards/format_reward": 0.9897959232330322, "step": 1796 }, { "completion_length": 258.83673095703125, "epoch": 0.18083018867924527, "grad_norm": 0.7656378149986267, "kl": 0.0723876953125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6617279052734375, "reward_std": 0.240365169942379, "rewards/accuracy_reward": 0.6617280840873718, "rewards/format_reward": 1.0, "step": 1797 }, { "completion_length": 272.87754821777344, "epoch": 0.18093081761006288, "grad_norm": 2.215785026550293, "kl": 0.14404296875, "learning_rate": 1e-06, "loss": 0.0058, "reward": 1.7064161896705627, "reward_std": 0.2221349999308586, "rewards/accuracy_reward": 0.7166202664375305, "rewards/format_reward": 0.9897959232330322, "step": 1798 }, { "completion_length": 191.17346954345703, "epoch": 0.1810314465408805, "grad_norm": 0.7302164435386658, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.566569447517395, "reward_std": 0.15436596795916557, "rewards/accuracy_reward": 0.5767735838890076, "rewards/format_reward": 0.9897959232330322, "step": 1799 }, { "completion_length": 209.2244873046875, "epoch": 0.1811320754716981, "grad_norm": 0.8898962736129761, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7489795684814453, "reward_std": 0.10934438183903694, "rewards/accuracy_reward": 0.7591836750507355, "rewards/format_reward": 0.9897959232330322, "step": 1800 }, { "completion_length": 207.64285278320312, "epoch": 0.1812327044025157, "grad_norm": 2.117086887359619, "kl": 0.106689453125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.795076847076416, "reward_std": 0.19986699521541595, "rewards/accuracy_reward": 0.8154850006103516, "rewards/format_reward": 0.9795918464660645, "step": 1801 }, { "completion_length": 204.17346954345703, "epoch": 0.18133333333333335, "grad_norm": 0.6936955451965332, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7594309449195862, "reward_std": 0.18178751319646835, "rewards/accuracy_reward": 0.7798391878604889, "rewards/format_reward": 0.9795918464660645, "step": 1802 }, { "completion_length": 162.2755126953125, "epoch": 0.18143396226415096, "grad_norm": 0.7561336159706116, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7857142686843872, "reward_std": 0.13821138441562653, "rewards/accuracy_reward": 0.7959183752536774, "rewards/format_reward": 0.9897959232330322, "step": 1803 }, { "completion_length": 163.4081573486328, "epoch": 0.18153459119496856, "grad_norm": 0.8114879727363586, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.680369257926941, "reward_std": 0.1071251891553402, "rewards/accuracy_reward": 0.6803692579269409, "rewards/format_reward": 1.0, "step": 1804 }, { "completion_length": 177.10203552246094, "epoch": 0.18163522012578617, "grad_norm": 1.4131520986557007, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7428570985794067, "reward_std": 0.24404320865869522, "rewards/accuracy_reward": 0.7428571283817291, "rewards/format_reward": 1.0, "step": 1805 }, { "completion_length": 277.3163299560547, "epoch": 0.18173584905660378, "grad_norm": 0.5942214727401733, "kl": 0.0416259765625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7460871934890747, "reward_std": 0.1895165517926216, "rewards/accuracy_reward": 0.7460871636867523, "rewards/format_reward": 1.0, "step": 1806 }, { "completion_length": 182.1836700439453, "epoch": 0.1818364779874214, "grad_norm": 0.9815289378166199, "kl": 0.0592041015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.777287244796753, "reward_std": 0.2070872113108635, "rewards/accuracy_reward": 0.7976954579353333, "rewards/format_reward": 0.9795918166637421, "step": 1807 }, { "completion_length": 165.4693832397461, "epoch": 0.181937106918239, "grad_norm": 1.4441852569580078, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.828571379184723, "reward_std": 0.24536766111850739, "rewards/accuracy_reward": 0.8489795625209808, "rewards/format_reward": 0.9795918464660645, "step": 1808 }, { "completion_length": 238.61224365234375, "epoch": 0.1820377358490566, "grad_norm": 0.9459424614906311, "kl": 0.049560546875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6226192712783813, "reward_std": 0.3001397028565407, "rewards/accuracy_reward": 0.6532315313816071, "rewards/format_reward": 0.9693877398967743, "step": 1809 }, { "completion_length": 168.62245178222656, "epoch": 0.18213836477987422, "grad_norm": 1.7107497453689575, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6293488144874573, "reward_std": 0.28143545985221863, "rewards/accuracy_reward": 0.659961074590683, "rewards/format_reward": 0.9693877398967743, "step": 1810 }, { "completion_length": 184.84693145751953, "epoch": 0.18223899371069183, "grad_norm": 1.0523273944854736, "kl": 0.05712890625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7730623483657837, "reward_std": 0.2029121294617653, "rewards/accuracy_reward": 0.7934704720973969, "rewards/format_reward": 0.9795918166637421, "step": 1811 }, { "completion_length": 272.89796447753906, "epoch": 0.18233962264150944, "grad_norm": 0.8236611485481262, "kl": 0.0592041015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7199546694755554, "reward_std": 0.24401766806840897, "rewards/accuracy_reward": 0.7505668997764587, "rewards/format_reward": 0.9693877398967743, "step": 1812 }, { "completion_length": 264.35713958740234, "epoch": 0.18244025157232704, "grad_norm": 0.5838273167610168, "kl": 0.049560546875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.569171667098999, "reward_std": 0.11404548585414886, "rewards/accuracy_reward": 0.5793757438659668, "rewards/format_reward": 0.9897959232330322, "step": 1813 }, { "completion_length": 220.4591827392578, "epoch": 0.18254088050314465, "grad_norm": 0.6290989518165588, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.696371853351593, "reward_std": 0.23432984948158264, "rewards/accuracy_reward": 0.7269841134548187, "rewards/format_reward": 0.9693877398967743, "step": 1814 }, { "completion_length": 239.41836547851562, "epoch": 0.18264150943396226, "grad_norm": 0.6363841891288757, "kl": 0.05908203125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7129936218261719, "reward_std": 0.2000269815325737, "rewards/accuracy_reward": 0.7334017753601074, "rewards/format_reward": 0.9795918464660645, "step": 1815 }, { "completion_length": 271.02040100097656, "epoch": 0.18274213836477987, "grad_norm": 1.117447853088379, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6224489212036133, "reward_std": 0.2619134485721588, "rewards/accuracy_reward": 0.6224489510059357, "rewards/format_reward": 1.0, "step": 1816 }, { "completion_length": 299.8571472167969, "epoch": 0.18284276729559748, "grad_norm": 0.7086206078529358, "kl": 0.0506591796875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5816326141357422, "reward_std": 0.30019615590572357, "rewards/accuracy_reward": 0.6122449040412903, "rewards/format_reward": 0.9693877398967743, "step": 1817 }, { "completion_length": 193.62244415283203, "epoch": 0.1829433962264151, "grad_norm": 1.3316065073013306, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.675218641757965, "reward_std": 0.26802655309438705, "rewards/accuracy_reward": 0.6956267952919006, "rewards/format_reward": 0.9795918464660645, "step": 1818 }, { "completion_length": 221.9693832397461, "epoch": 0.1830440251572327, "grad_norm": 1.0374215841293335, "kl": 0.071533203125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.684280812740326, "reward_std": 0.2717996910214424, "rewards/accuracy_reward": 0.7046889662742615, "rewards/format_reward": 0.9795918464660645, "step": 1819 }, { "completion_length": 195.21428680419922, "epoch": 0.1831446540880503, "grad_norm": 1.4329243898391724, "kl": 0.107666015625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.718552827835083, "reward_std": 0.15106475353240967, "rewards/accuracy_reward": 0.7287569046020508, "rewards/format_reward": 0.9897959232330322, "step": 1820 }, { "completion_length": 260.2857131958008, "epoch": 0.18324528301886792, "grad_norm": 1.0184966325759888, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.4936831593513489, "reward_std": 0.15782583877444267, "rewards/accuracy_reward": 0.49368317425251007, "rewards/format_reward": 1.0, "step": 1821 }, { "completion_length": 245.5, "epoch": 0.18334591194968552, "grad_norm": 1.1699345111846924, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7834384441375732, "reward_std": 0.22697683423757553, "rewards/accuracy_reward": 0.8038466274738312, "rewards/format_reward": 0.9795918166637421, "step": 1822 }, { "completion_length": 222.60203552246094, "epoch": 0.18344654088050313, "grad_norm": 0.7108540534973145, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5431784391403198, "reward_std": 0.19219355285167694, "rewards/accuracy_reward": 0.5533825755119324, "rewards/format_reward": 0.9897959232330322, "step": 1823 }, { "completion_length": 199.46938705444336, "epoch": 0.18354716981132074, "grad_norm": 0.9743837118148804, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7551019787788391, "reward_std": 0.18435019254684448, "rewards/accuracy_reward": 0.7653061151504517, "rewards/format_reward": 0.9897959232330322, "step": 1824 }, { "completion_length": 212.43877410888672, "epoch": 0.18364779874213835, "grad_norm": 0.42476367950439453, "kl": 0.0384521484375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.7975881695747375, "reward_std": 0.054439106956124306, "rewards/accuracy_reward": 0.7975882291793823, "rewards/format_reward": 1.0, "step": 1825 }, { "completion_length": 239.1938705444336, "epoch": 0.183748427672956, "grad_norm": 1.1975775957107544, "kl": 0.0697021484375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5598101019859314, "reward_std": 0.26250527054071426, "rewards/accuracy_reward": 0.5802182704210281, "rewards/format_reward": 0.9795918166637421, "step": 1826 }, { "completion_length": 193.4591827392578, "epoch": 0.1838490566037736, "grad_norm": 0.9867011904716492, "kl": 0.0655517578125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5229591727256775, "reward_std": 0.20673929899930954, "rewards/accuracy_reward": 0.5331632643938065, "rewards/format_reward": 0.9897959232330322, "step": 1827 }, { "completion_length": 165.7448959350586, "epoch": 0.1839496855345912, "grad_norm": 0.9333814978599548, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7925170063972473, "reward_std": 0.14761149883270264, "rewards/accuracy_reward": 0.7925169765949249, "rewards/format_reward": 1.0, "step": 1828 }, { "completion_length": 206.4897918701172, "epoch": 0.18405031446540882, "grad_norm": 0.9619346857070923, "kl": 0.0699462890625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7117199301719666, "reward_std": 0.07634665304794908, "rewards/accuracy_reward": 0.7117199301719666, "rewards/format_reward": 1.0, "step": 1829 }, { "completion_length": 241.73468780517578, "epoch": 0.18415094339622642, "grad_norm": 0.5024710297584534, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6244897246360779, "reward_std": 0.11986752226948738, "rewards/accuracy_reward": 0.6346938610076904, "rewards/format_reward": 0.9897959232330322, "step": 1830 }, { "completion_length": 175.02040100097656, "epoch": 0.18425157232704403, "grad_norm": 3.8885223865509033, "kl": 0.0521240234375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7346938252449036, "reward_std": 0.1652088463306427, "rewards/accuracy_reward": 0.734693855047226, "rewards/format_reward": 1.0, "step": 1831 }, { "completion_length": 228.02040100097656, "epoch": 0.18435220125786164, "grad_norm": 0.5495272278785706, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6794183254241943, "reward_std": 0.12020628899335861, "rewards/accuracy_reward": 0.6998265385627747, "rewards/format_reward": 0.9795918464660645, "step": 1832 }, { "completion_length": 177.7653045654297, "epoch": 0.18445283018867925, "grad_norm": 1.0166813135147095, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7032997608184814, "reward_std": 0.12384052574634552, "rewards/accuracy_reward": 0.7032997012138367, "rewards/format_reward": 1.0, "step": 1833 }, { "completion_length": 175.36734008789062, "epoch": 0.18455345911949686, "grad_norm": 1.674094557762146, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8424551486968994, "reward_std": 0.09760072641074657, "rewards/accuracy_reward": 0.8424551486968994, "rewards/format_reward": 1.0, "step": 1834 }, { "completion_length": 184.8061180114746, "epoch": 0.18465408805031447, "grad_norm": 1.0743438005447388, "kl": 0.0555419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.806122362613678, "reward_std": 0.11584595590829849, "rewards/accuracy_reward": 0.8061224222183228, "rewards/format_reward": 1.0, "step": 1835 }, { "completion_length": 250.79591369628906, "epoch": 0.18475471698113208, "grad_norm": 1.5744925737380981, "kl": 0.0579833984375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7545634508132935, "reward_std": 0.17567259445786476, "rewards/accuracy_reward": 0.7545634508132935, "rewards/format_reward": 1.0, "step": 1836 }, { "completion_length": 177.77550506591797, "epoch": 0.1848553459119497, "grad_norm": 6.120277404785156, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8265305757522583, "reward_std": 0.16984088718891144, "rewards/accuracy_reward": 0.8367346823215485, "rewards/format_reward": 0.9897959232330322, "step": 1837 }, { "completion_length": 234.85713958740234, "epoch": 0.1849559748427673, "grad_norm": 1.3016304969787598, "kl": 0.065185546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7420889735221863, "reward_std": 0.2213917002081871, "rewards/accuracy_reward": 0.7522930800914764, "rewards/format_reward": 0.9897959232330322, "step": 1838 }, { "completion_length": 186.86734771728516, "epoch": 0.1850566037735849, "grad_norm": 1.2039737701416016, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6729648113250732, "reward_std": 0.14898373186588287, "rewards/accuracy_reward": 0.672964796423912, "rewards/format_reward": 1.0, "step": 1839 }, { "completion_length": 214.1836700439453, "epoch": 0.1851572327044025, "grad_norm": 0.6917938590049744, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.639455795288086, "reward_std": 0.18092107772827148, "rewards/accuracy_reward": 0.6598639488220215, "rewards/format_reward": 0.9795918464660645, "step": 1840 }, { "completion_length": 158.55101776123047, "epoch": 0.18525786163522012, "grad_norm": 0.5085185766220093, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8921157717704773, "reward_std": 0.10190672799944878, "rewards/accuracy_reward": 0.8921158313751221, "rewards/format_reward": 1.0, "step": 1841 }, { "completion_length": 271.9285659790039, "epoch": 0.18535849056603773, "grad_norm": 1.4431064128875732, "kl": 0.0582275390625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6206348538398743, "reward_std": 0.2693079933524132, "rewards/accuracy_reward": 0.6308389902114868, "rewards/format_reward": 0.9897959232330322, "step": 1842 }, { "completion_length": 247.14285278320312, "epoch": 0.18545911949685534, "grad_norm": 1.044700026512146, "kl": 0.049072265625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.554470419883728, "reward_std": 0.20849395543336868, "rewards/accuracy_reward": 0.5544703602790833, "rewards/format_reward": 1.0, "step": 1843 }, { "completion_length": 195.88774871826172, "epoch": 0.18555974842767295, "grad_norm": 1.1192115545272827, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7983037233352661, "reward_std": 0.14492326974868774, "rewards/accuracy_reward": 0.8085077702999115, "rewards/format_reward": 0.9897959232330322, "step": 1844 }, { "completion_length": 243.28570556640625, "epoch": 0.18566037735849056, "grad_norm": 0.6420137882232666, "kl": 0.0496826171875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.80498868227005, "reward_std": 0.14967603981494904, "rewards/accuracy_reward": 0.8049886524677277, "rewards/format_reward": 1.0, "step": 1845 }, { "completion_length": 282.7142791748047, "epoch": 0.18576100628930817, "grad_norm": 1.0386263132095337, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.714285671710968, "reward_std": 0.2664429470896721, "rewards/accuracy_reward": 0.7244897782802582, "rewards/format_reward": 0.9897959232330322, "step": 1846 }, { "completion_length": 225.39795684814453, "epoch": 0.18586163522012578, "grad_norm": 1.2473664283752441, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.705442190170288, "reward_std": 0.24797671288251877, "rewards/accuracy_reward": 0.736054390668869, "rewards/format_reward": 0.9693877398967743, "step": 1847 }, { "completion_length": 241.59183502197266, "epoch": 0.18596226415094338, "grad_norm": 0.716018557548523, "kl": 0.0478515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7817097902297974, "reward_std": 0.19478332996368408, "rewards/accuracy_reward": 0.7817097902297974, "rewards/format_reward": 1.0, "step": 1848 }, { "completion_length": 212.9591827392578, "epoch": 0.186062893081761, "grad_norm": 0.7685845494270325, "kl": 0.0533447265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6646556854248047, "reward_std": 0.1602369174361229, "rewards/accuracy_reward": 0.6850639134645462, "rewards/format_reward": 0.9795918464660645, "step": 1849 }, { "completion_length": 201.7040786743164, "epoch": 0.1861635220125786, "grad_norm": 0.8367860913276672, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7982832789421082, "reward_std": 0.17707980424165726, "rewards/accuracy_reward": 0.8084873855113983, "rewards/format_reward": 0.9897959232330322, "step": 1850 }, { "completion_length": 253.2244873046875, "epoch": 0.18626415094339624, "grad_norm": 1.1587927341461182, "kl": 0.0545654296875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6306190490722656, "reward_std": 0.26591479033231735, "rewards/accuracy_reward": 0.6612313091754913, "rewards/format_reward": 0.9693877398967743, "step": 1851 }, { "completion_length": 320.6632614135742, "epoch": 0.18636477987421385, "grad_norm": 0.6627776622772217, "kl": 0.0711669921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6238327622413635, "reward_std": 0.14196093007922173, "rewards/accuracy_reward": 0.6544450521469116, "rewards/format_reward": 0.9693877398967743, "step": 1852 }, { "completion_length": 317.448974609375, "epoch": 0.18646540880503146, "grad_norm": 1.2177494764328003, "kl": 0.09765625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.4974821209907532, "reward_std": 0.25782378017902374, "rewards/accuracy_reward": 0.5076861679553986, "rewards/format_reward": 0.9897959232330322, "step": 1853 }, { "completion_length": 143.2040786743164, "epoch": 0.18656603773584907, "grad_norm": 0.8268029689788818, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.832921028137207, "reward_std": 0.0815240740776062, "rewards/accuracy_reward": 0.8329210877418518, "rewards/format_reward": 1.0, "step": 1854 }, { "completion_length": 184.87754440307617, "epoch": 0.18666666666666668, "grad_norm": 0.7466661334037781, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6777187585830688, "reward_std": 0.10714149102568626, "rewards/accuracy_reward": 0.6777188777923584, "rewards/format_reward": 1.0, "step": 1855 }, { "completion_length": 219.61224365234375, "epoch": 0.18676729559748428, "grad_norm": 0.6474363207817078, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7907058596611023, "reward_std": 0.07468035072088242, "rewards/accuracy_reward": 0.7907058596611023, "rewards/format_reward": 1.0, "step": 1856 }, { "completion_length": 194.08162689208984, "epoch": 0.1868679245283019, "grad_norm": 0.4103492796421051, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.9217687249183655, "reward_std": 0.0656127817928791, "rewards/accuracy_reward": 0.9217686951160431, "rewards/format_reward": 1.0, "step": 1857 }, { "completion_length": 264.948974609375, "epoch": 0.1869685534591195, "grad_norm": 1.5117669105529785, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5556122660636902, "reward_std": 0.27825959771871567, "rewards/accuracy_reward": 0.5760204046964645, "rewards/format_reward": 0.9795918464660645, "step": 1858 }, { "completion_length": 172.49999237060547, "epoch": 0.1870691823899371, "grad_norm": 1.4054137468338013, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8649173974990845, "reward_std": 0.1657535694539547, "rewards/accuracy_reward": 0.8751214146614075, "rewards/format_reward": 0.9897959232330322, "step": 1859 }, { "completion_length": 235.33673095703125, "epoch": 0.18716981132075472, "grad_norm": 0.6257138252258301, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6922857761383057, "reward_std": 0.18342887610197067, "rewards/accuracy_reward": 0.6922858357429504, "rewards/format_reward": 1.0, "step": 1860 }, { "completion_length": 273.1734619140625, "epoch": 0.18727044025157233, "grad_norm": 0.7330483198165894, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7523986101150513, "reward_std": 0.17188023403286934, "rewards/accuracy_reward": 0.762602686882019, "rewards/format_reward": 0.9897959232330322, "step": 1861 }, { "completion_length": 300.2550964355469, "epoch": 0.18737106918238994, "grad_norm": 1.0411155223846436, "kl": 0.0765380859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7109977006912231, "reward_std": 0.18909604102373123, "rewards/accuracy_reward": 0.7212018072605133, "rewards/format_reward": 0.9897959232330322, "step": 1862 }, { "completion_length": 209.04080963134766, "epoch": 0.18747169811320755, "grad_norm": 0.587833821773529, "kl": 0.0484619140625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6829618215560913, "reward_std": 0.13736249320209026, "rewards/accuracy_reward": 0.6829618513584137, "rewards/format_reward": 1.0, "step": 1863 }, { "completion_length": 305.8061218261719, "epoch": 0.18757232704402516, "grad_norm": 0.815432071685791, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.4301174879074097, "reward_std": 0.21322428435087204, "rewards/accuracy_reward": 0.4301174730062485, "rewards/format_reward": 1.0, "step": 1864 }, { "completion_length": 204.57142639160156, "epoch": 0.18767295597484276, "grad_norm": 0.9397623538970947, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7086755633354187, "reward_std": 0.16546685248613358, "rewards/accuracy_reward": 0.7290838360786438, "rewards/format_reward": 0.9795918464660645, "step": 1865 }, { "completion_length": 214.01020050048828, "epoch": 0.18777358490566037, "grad_norm": 1.5074079036712646, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6164847016334534, "reward_std": 0.1481752246618271, "rewards/accuracy_reward": 0.6266888082027435, "rewards/format_reward": 0.9897959232330322, "step": 1866 }, { "completion_length": 232.63265228271484, "epoch": 0.18787421383647798, "grad_norm": 0.7782421708106995, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6411564350128174, "reward_std": 0.16806386411190033, "rewards/accuracy_reward": 0.6411564648151398, "rewards/format_reward": 1.0, "step": 1867 }, { "completion_length": 301.9183654785156, "epoch": 0.1879748427672956, "grad_norm": 1.023547887802124, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5618935823440552, "reward_std": 0.20549732446670532, "rewards/accuracy_reward": 0.5720977485179901, "rewards/format_reward": 0.9897959232330322, "step": 1868 }, { "completion_length": 319.5, "epoch": 0.1880754716981132, "grad_norm": 1.189794898033142, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.4898298978805542, "reward_std": 0.3252359926700592, "rewards/accuracy_reward": 0.5306462943553925, "rewards/format_reward": 0.9591836631298065, "step": 1869 }, { "completion_length": 245.49999237060547, "epoch": 0.1881761006289308, "grad_norm": 0.5961570143699646, "kl": 0.0423583984375, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7695116996765137, "reward_std": 0.18833667039871216, "rewards/accuracy_reward": 0.7797158062458038, "rewards/format_reward": 0.9897959232330322, "step": 1870 }, { "completion_length": 216.88774871826172, "epoch": 0.18827672955974842, "grad_norm": 1.0572192668914795, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6933209300041199, "reward_std": 0.15773647651076317, "rewards/accuracy_reward": 0.6933209598064423, "rewards/format_reward": 1.0, "step": 1871 }, { "completion_length": 243.62244415283203, "epoch": 0.18837735849056603, "grad_norm": 2.803226947784424, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6661286354064941, "reward_std": 0.18544132262468338, "rewards/accuracy_reward": 0.6763326823711395, "rewards/format_reward": 0.9897959232330322, "step": 1872 }, { "completion_length": 230.98978424072266, "epoch": 0.18847798742138364, "grad_norm": 0.7200779914855957, "kl": 0.0799560546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.5967970490455627, "reward_std": 0.22430811077356339, "rewards/accuracy_reward": 0.6172052174806595, "rewards/format_reward": 0.9795918166637421, "step": 1873 }, { "completion_length": 181.1938705444336, "epoch": 0.18857861635220124, "grad_norm": 0.4477297067642212, "kl": 0.0655517578125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.884075105190277, "reward_std": 0.06425597332417965, "rewards/accuracy_reward": 0.8840751051902771, "rewards/format_reward": 1.0, "step": 1874 }, { "completion_length": 230.74488830566406, "epoch": 0.18867924528301888, "grad_norm": 1.5044441223144531, "kl": 0.127197265625, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.7431663870811462, "reward_std": 0.21982048451900482, "rewards/accuracy_reward": 0.7533704340457916, "rewards/format_reward": 0.9897959232330322, "step": 1875 }, { "completion_length": 283.6326599121094, "epoch": 0.1887798742138365, "grad_norm": 0.8590790629386902, "kl": 0.0728759765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6049562096595764, "reward_std": 0.30577316880226135, "rewards/accuracy_reward": 0.6355684697628021, "rewards/format_reward": 0.9693877398967743, "step": 1876 }, { "completion_length": 241.39795684814453, "epoch": 0.1888805031446541, "grad_norm": 0.6707947254180908, "kl": 0.03662109375, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.6794593930244446, "reward_std": 0.11733349040150642, "rewards/accuracy_reward": 0.6794594526290894, "rewards/format_reward": 1.0, "step": 1877 }, { "completion_length": 307.7857208251953, "epoch": 0.1889811320754717, "grad_norm": 0.7969806790351868, "kl": 0.0416259765625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6439951062202454, "reward_std": 0.21580829471349716, "rewards/accuracy_reward": 0.6541992723941803, "rewards/format_reward": 0.9897959232330322, "step": 1878 }, { "completion_length": 219.0204086303711, "epoch": 0.18908176100628932, "grad_norm": 1.0255215167999268, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7294136881828308, "reward_std": 0.20549291372299194, "rewards/accuracy_reward": 0.7294136881828308, "rewards/format_reward": 1.0, "step": 1879 }, { "completion_length": 189.38774871826172, "epoch": 0.18918238993710693, "grad_norm": 0.8337689638137817, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8715986609458923, "reward_std": 0.08695212192833424, "rewards/accuracy_reward": 0.87159863114357, "rewards/format_reward": 1.0, "step": 1880 }, { "completion_length": 210.11224365234375, "epoch": 0.18928301886792454, "grad_norm": 0.862312376499176, "kl": 0.0595703125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6942437291145325, "reward_std": 0.12467120960354805, "rewards/accuracy_reward": 0.6942438185214996, "rewards/format_reward": 1.0, "step": 1881 }, { "completion_length": 213.11224365234375, "epoch": 0.18938364779874214, "grad_norm": 2.6344921588897705, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8202356696128845, "reward_std": 0.1291266530752182, "rewards/accuracy_reward": 0.8202356100082397, "rewards/format_reward": 1.0, "step": 1882 }, { "completion_length": 265.3775405883789, "epoch": 0.18948427672955975, "grad_norm": 0.9074133038520813, "kl": 0.0469970703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6777868866920471, "reward_std": 0.22949715703725815, "rewards/accuracy_reward": 0.6777868270874023, "rewards/format_reward": 1.0, "step": 1883 }, { "completion_length": 297.2550964355469, "epoch": 0.18958490566037736, "grad_norm": 196.9308624267578, "kl": 8.656494140625, "learning_rate": 1e-06, "loss": 0.3472, "reward": 1.699306070804596, "reward_std": 0.2539086639881134, "rewards/accuracy_reward": 0.7197141945362091, "rewards/format_reward": 0.9795918166637421, "step": 1884 }, { "completion_length": 254.38775634765625, "epoch": 0.18968553459119497, "grad_norm": 0.6909894347190857, "kl": 0.05517578125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.598453938961029, "reward_std": 0.2797270938754082, "rewards/accuracy_reward": 0.6392702162265778, "rewards/format_reward": 0.9591836333274841, "step": 1885 }, { "completion_length": 307.13265228271484, "epoch": 0.18978616352201258, "grad_norm": 0.8969265818595886, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7449788451194763, "reward_std": 0.2621697708964348, "rewards/accuracy_reward": 0.7551830112934113, "rewards/format_reward": 0.9897959232330322, "step": 1886 }, { "completion_length": 278.7652893066406, "epoch": 0.1898867924528302, "grad_norm": 0.3411753177642822, "kl": 0.0555419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6918367147445679, "reward_std": 0.14854149892926216, "rewards/accuracy_reward": 0.7122448682785034, "rewards/format_reward": 0.9795918166637421, "step": 1887 }, { "completion_length": 233.01020050048828, "epoch": 0.1899874213836478, "grad_norm": 1.1626533269882202, "kl": 0.0672607421875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6054054498672485, "reward_std": 0.22195270657539368, "rewards/accuracy_reward": 0.6156094968318939, "rewards/format_reward": 0.9897959232330322, "step": 1888 }, { "completion_length": 276.9897918701172, "epoch": 0.1900880503144654, "grad_norm": 1.2925325632095337, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5658385753631592, "reward_std": 0.22843965888023376, "rewards/accuracy_reward": 0.5658385157585144, "rewards/format_reward": 1.0, "step": 1889 }, { "completion_length": 244.59183502197266, "epoch": 0.19018867924528302, "grad_norm": 0.8084824681282043, "kl": 0.0513916015625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6896257996559143, "reward_std": 0.189032144844532, "rewards/accuracy_reward": 0.6998299062252045, "rewards/format_reward": 0.9897959232330322, "step": 1890 }, { "completion_length": 259.76529693603516, "epoch": 0.19028930817610062, "grad_norm": 0.6875227689743042, "kl": 0.0499267578125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.8414456248283386, "reward_std": 0.21668777614831924, "rewards/accuracy_reward": 0.8516497611999512, "rewards/format_reward": 0.9897959232330322, "step": 1891 }, { "completion_length": 368.32652282714844, "epoch": 0.19038993710691823, "grad_norm": 0.7989954948425293, "kl": 0.052490234375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6835379600524902, "reward_std": 0.3096299022436142, "rewards/accuracy_reward": 0.7243543267250061, "rewards/format_reward": 0.9591836631298065, "step": 1892 }, { "completion_length": 186.62245178222656, "epoch": 0.19049056603773584, "grad_norm": 2.6103837490081787, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7925170063972473, "reward_std": 0.12168079242110252, "rewards/accuracy_reward": 0.8027210831642151, "rewards/format_reward": 0.9897959232330322, "step": 1893 }, { "completion_length": 344.7142791748047, "epoch": 0.19059119496855345, "grad_norm": 0.8726804256439209, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5886102318763733, "reward_std": 0.31964805722236633, "rewards/accuracy_reward": 0.6192225515842438, "rewards/format_reward": 0.9693877398967743, "step": 1894 }, { "completion_length": 249.69387817382812, "epoch": 0.19069182389937106, "grad_norm": 0.7798527479171753, "kl": 0.0643310546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.766233742237091, "reward_std": 0.17992983758449554, "rewards/accuracy_reward": 0.7866418957710266, "rewards/format_reward": 0.9795918464660645, "step": 1895 }, { "completion_length": 225.4285659790039, "epoch": 0.19079245283018867, "grad_norm": 0.5776962041854858, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7050116062164307, "reward_std": 0.14257048070430756, "rewards/accuracy_reward": 0.7152157425880432, "rewards/format_reward": 0.9897959232330322, "step": 1896 }, { "completion_length": 258.2550964355469, "epoch": 0.19089308176100628, "grad_norm": 0.49416637420654297, "kl": 0.04345703125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7857142686843872, "reward_std": 0.14599499851465225, "rewards/accuracy_reward": 0.7959183752536774, "rewards/format_reward": 0.9897959232330322, "step": 1897 }, { "completion_length": 202.9387664794922, "epoch": 0.1909937106918239, "grad_norm": 0.6450547575950623, "kl": 0.05126953125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6345083713531494, "reward_std": 0.12699662148952484, "rewards/accuracy_reward": 0.6651205718517303, "rewards/format_reward": 0.9693877398967743, "step": 1898 }, { "completion_length": 291.1938781738281, "epoch": 0.1910943396226415, "grad_norm": 0.821898877620697, "kl": 0.048828125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6785714030265808, "reward_std": 0.22951554507017136, "rewards/accuracy_reward": 0.688775509595871, "rewards/format_reward": 0.9897959232330322, "step": 1899 }, { "completion_length": 282.61224365234375, "epoch": 0.19119496855345913, "grad_norm": 0.8236446380615234, "kl": 0.0687255859375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6820011138916016, "reward_std": 0.20263849943876266, "rewards/accuracy_reward": 0.6820011734962463, "rewards/format_reward": 1.0, "step": 1900 }, { "completion_length": 311.5408172607422, "epoch": 0.19129559748427674, "grad_norm": 1.0250678062438965, "kl": 0.05615234375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5155032873153687, "reward_std": 0.3687545210123062, "rewards/accuracy_reward": 0.5767278075218201, "rewards/format_reward": 0.9387754797935486, "step": 1901 }, { "completion_length": 288.0408020019531, "epoch": 0.19139622641509435, "grad_norm": 0.4637211859226227, "kl": 0.062255859375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6459723114967346, "reward_std": 0.17858701199293137, "rewards/accuracy_reward": 0.6765846014022827, "rewards/format_reward": 0.9693877398967743, "step": 1902 }, { "completion_length": 219.1836700439453, "epoch": 0.19149685534591196, "grad_norm": 0.7975533604621887, "kl": 0.0595703125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7827975153923035, "reward_std": 0.18042335659265518, "rewards/accuracy_reward": 0.7930015325546265, "rewards/format_reward": 0.9897959232330322, "step": 1903 }, { "completion_length": 199.7040786743164, "epoch": 0.19159748427672957, "grad_norm": 11.431389808654785, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6887938380241394, "reward_std": 0.3234550952911377, "rewards/accuracy_reward": 0.7194061279296875, "rewards/format_reward": 0.9693877398967743, "step": 1904 }, { "completion_length": 211.33673095703125, "epoch": 0.19169811320754718, "grad_norm": 0.6580115556716919, "kl": 0.0640869140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.754970669746399, "reward_std": 0.1303359642624855, "rewards/accuracy_reward": 0.7549707591533661, "rewards/format_reward": 1.0, "step": 1905 }, { "completion_length": 180.32653045654297, "epoch": 0.1917987421383648, "grad_norm": 1.516585350036621, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7333616614341736, "reward_std": 0.15782105922698975, "rewards/accuracy_reward": 0.733361691236496, "rewards/format_reward": 1.0, "step": 1906 }, { "completion_length": 223.33673095703125, "epoch": 0.1918993710691824, "grad_norm": 2.2493371963500977, "kl": 0.057861328125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.538042426109314, "reward_std": 0.1761426106095314, "rewards/accuracy_reward": 0.5482465028762817, "rewards/format_reward": 0.9897959232330322, "step": 1907 }, { "completion_length": 243.71428680419922, "epoch": 0.192, "grad_norm": 5.139043807983398, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6836734414100647, "reward_std": 0.26681824028491974, "rewards/accuracy_reward": 0.7040816247463226, "rewards/format_reward": 0.9795918464660645, "step": 1908 }, { "completion_length": 243.26529693603516, "epoch": 0.1921006289308176, "grad_norm": 1.35785973072052, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6207482814788818, "reward_std": 0.2549324333667755, "rewards/accuracy_reward": 0.6309523582458496, "rewards/format_reward": 0.9897959232330322, "step": 1909 }, { "completion_length": 205.36734008789062, "epoch": 0.19220125786163522, "grad_norm": 0.643616259098053, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8287363052368164, "reward_std": 0.1433895118534565, "rewards/accuracy_reward": 0.8287363052368164, "rewards/format_reward": 1.0, "step": 1910 }, { "completion_length": 287.4387664794922, "epoch": 0.19230188679245283, "grad_norm": 0.890548586845398, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5610544085502625, "reward_std": 0.16896734200417995, "rewards/accuracy_reward": 0.5610544085502625, "rewards/format_reward": 1.0, "step": 1911 }, { "completion_length": 218.2755126953125, "epoch": 0.19240251572327044, "grad_norm": 1.0530669689178467, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6316267251968384, "reward_std": 0.20689667761325836, "rewards/accuracy_reward": 0.6418308317661285, "rewards/format_reward": 0.9897959232330322, "step": 1912 }, { "completion_length": 212.5, "epoch": 0.19250314465408805, "grad_norm": 1.9984641075134277, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7216463685035706, "reward_std": 0.21435287594795227, "rewards/accuracy_reward": 0.7318505346775055, "rewards/format_reward": 0.9897959232330322, "step": 1913 }, { "completion_length": 243.8673324584961, "epoch": 0.19260377358490566, "grad_norm": 1.0786375999450684, "kl": 0.0650634765625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5702947974205017, "reward_std": 0.1889830306172371, "rewards/accuracy_reward": 0.5702947676181793, "rewards/format_reward": 1.0, "step": 1914 }, { "completion_length": 245.89794921875, "epoch": 0.19270440251572327, "grad_norm": 0.5741130709648132, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7522616982460022, "reward_std": 0.138689786195755, "rewards/accuracy_reward": 0.752261608839035, "rewards/format_reward": 1.0, "step": 1915 }, { "completion_length": 258.5306091308594, "epoch": 0.19280503144654088, "grad_norm": 1.2363824844360352, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.4844031929969788, "reward_std": 0.23427043110132217, "rewards/accuracy_reward": 0.4844031184911728, "rewards/format_reward": 1.0, "step": 1916 }, { "completion_length": 176.0204086303711, "epoch": 0.19290566037735848, "grad_norm": 1.0383391380310059, "kl": 0.0521240234375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8503401279449463, "reward_std": 0.11017340235412121, "rewards/accuracy_reward": 0.8503401577472687, "rewards/format_reward": 1.0, "step": 1917 }, { "completion_length": 195.41836166381836, "epoch": 0.1930062893081761, "grad_norm": 0.776311457157135, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7108843326568604, "reward_std": 0.06299407407641411, "rewards/accuracy_reward": 0.7108843475580215, "rewards/format_reward": 1.0, "step": 1918 }, { "completion_length": 208.05101776123047, "epoch": 0.1931069182389937, "grad_norm": 0.5985573530197144, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8015992045402527, "reward_std": 0.12193852663040161, "rewards/accuracy_reward": 0.8118033111095428, "rewards/format_reward": 0.9897959232330322, "step": 1919 }, { "completion_length": 199.78571319580078, "epoch": 0.1932075471698113, "grad_norm": 0.962162435054779, "kl": 0.0531005859375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6594387888908386, "reward_std": 0.24326171725988388, "rewards/accuracy_reward": 0.6696428656578064, "rewards/format_reward": 0.9897959232330322, "step": 1920 }, { "completion_length": 228.30612182617188, "epoch": 0.19330817610062892, "grad_norm": 1.0936930179595947, "kl": 0.0615234375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7089367508888245, "reward_std": 0.2529304698109627, "rewards/accuracy_reward": 0.7293449640274048, "rewards/format_reward": 0.9795918464660645, "step": 1921 }, { "completion_length": 254.1428451538086, "epoch": 0.19340880503144653, "grad_norm": 0.7270193696022034, "kl": 0.05029296875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6890814900398254, "reward_std": 0.07292085513472557, "rewards/accuracy_reward": 0.6890815198421478, "rewards/format_reward": 1.0, "step": 1922 }, { "completion_length": 296.7142791748047, "epoch": 0.19350943396226414, "grad_norm": 0.6818730235099792, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5631680488586426, "reward_std": 0.1191653460264206, "rewards/accuracy_reward": 0.5733721852302551, "rewards/format_reward": 0.9897959232330322, "step": 1923 }, { "completion_length": 277.3571319580078, "epoch": 0.19361006289308177, "grad_norm": 0.8392757773399353, "kl": 0.0546875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.527506411075592, "reward_std": 0.18217406421899796, "rewards/accuracy_reward": 0.537710502743721, "rewards/format_reward": 0.9897959232330322, "step": 1924 }, { "completion_length": 266.5816345214844, "epoch": 0.19371069182389938, "grad_norm": 0.5903117060661316, "kl": 0.04931640625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5682215094566345, "reward_std": 0.19921643286943436, "rewards/accuracy_reward": 0.5988337993621826, "rewards/format_reward": 0.9693877398967743, "step": 1925 }, { "completion_length": 219.7653045654297, "epoch": 0.193811320754717, "grad_norm": 0.9999762177467346, "kl": 0.0615234375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7015637159347534, "reward_std": 0.1808339743874967, "rewards/accuracy_reward": 0.7015637159347534, "rewards/format_reward": 1.0, "step": 1926 }, { "completion_length": 233.12245178222656, "epoch": 0.1939119496855346, "grad_norm": 0.7905865907669067, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6373575329780579, "reward_std": 0.1530509740114212, "rewards/accuracy_reward": 0.6577657014131546, "rewards/format_reward": 0.9795918464660645, "step": 1927 }, { "completion_length": 231.87754821777344, "epoch": 0.1940125786163522, "grad_norm": 2.8994550704956055, "kl": 0.0655517578125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5224143862724304, "reward_std": 0.257743239402771, "rewards/accuracy_reward": 0.5326184928417206, "rewards/format_reward": 0.9897959232330322, "step": 1928 }, { "completion_length": 139.7448959350586, "epoch": 0.19411320754716982, "grad_norm": 0.8096266388893127, "kl": 0.1044921875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.674064576625824, "reward_std": 0.07435411866754293, "rewards/accuracy_reward": 0.6740646362304688, "rewards/format_reward": 1.0, "step": 1929 }, { "completion_length": 237.37754821777344, "epoch": 0.19421383647798743, "grad_norm": 0.675593912601471, "kl": 0.0592041015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5865095257759094, "reward_std": 0.22040100395679474, "rewards/accuracy_reward": 0.5967136323451996, "rewards/format_reward": 0.9897959232330322, "step": 1930 }, { "completion_length": 222.2040786743164, "epoch": 0.19431446540880504, "grad_norm": 0.7206714749336243, "kl": 0.0533447265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7482109069824219, "reward_std": 0.12478571757674217, "rewards/accuracy_reward": 0.7482109367847443, "rewards/format_reward": 1.0, "step": 1931 }, { "completion_length": 224.1326446533203, "epoch": 0.19441509433962265, "grad_norm": 6.140426158905029, "kl": 0.0482177734375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6045351028442383, "reward_std": 0.11711684986948967, "rewards/accuracy_reward": 0.6045351326465607, "rewards/format_reward": 1.0, "step": 1932 }, { "completion_length": 281.6122360229492, "epoch": 0.19451572327044026, "grad_norm": 0.7092191576957703, "kl": 0.046875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7823128700256348, "reward_std": 0.2915784567594528, "rewards/accuracy_reward": 0.7925170063972473, "rewards/format_reward": 0.9897959232330322, "step": 1933 }, { "completion_length": 177.51020050048828, "epoch": 0.19461635220125786, "grad_norm": 1.0984498262405396, "kl": 0.1162109375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.6905784010887146, "reward_std": 0.23607034236192703, "rewards/accuracy_reward": 0.7007825374603271, "rewards/format_reward": 0.9897959232330322, "step": 1934 }, { "completion_length": 175.14285278320312, "epoch": 0.19471698113207547, "grad_norm": 2.3243794441223145, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6520304083824158, "reward_std": 0.14029264077544212, "rewards/accuracy_reward": 0.6520304679870605, "rewards/format_reward": 1.0, "step": 1935 }, { "completion_length": 211.08163452148438, "epoch": 0.19481761006289308, "grad_norm": 0.5516336560249329, "kl": 0.0687255859375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8073979020118713, "reward_std": 0.18217843025922775, "rewards/accuracy_reward": 0.8176020085811615, "rewards/format_reward": 0.9897959232330322, "step": 1936 }, { "completion_length": 217.57142639160156, "epoch": 0.1949182389937107, "grad_norm": 0.8260658979415894, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7244897484779358, "reward_std": 0.2334745079278946, "rewards/accuracy_reward": 0.7448979616165161, "rewards/format_reward": 0.9795918166637421, "step": 1937 }, { "completion_length": 275.1122360229492, "epoch": 0.1950188679245283, "grad_norm": 0.9777312874794006, "kl": 0.0457763671875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7448979020118713, "reward_std": 0.2033890038728714, "rewards/accuracy_reward": 0.7448979318141937, "rewards/format_reward": 1.0, "step": 1938 }, { "completion_length": 229.77550506591797, "epoch": 0.1951194968553459, "grad_norm": 0.8716727495193481, "kl": 0.0699462890625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5540884137153625, "reward_std": 0.17502304166555405, "rewards/accuracy_reward": 0.5540883392095566, "rewards/format_reward": 1.0, "step": 1939 }, { "completion_length": 168.23468780517578, "epoch": 0.19522012578616352, "grad_norm": 0.7768348455429077, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7130061984062195, "reward_std": 0.14247651398181915, "rewards/accuracy_reward": 0.7130062282085419, "rewards/format_reward": 1.0, "step": 1940 }, { "completion_length": 189.12244415283203, "epoch": 0.19532075471698113, "grad_norm": 0.6734170317649841, "kl": 0.0487060546875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6753246188163757, "reward_std": 0.11612255498766899, "rewards/accuracy_reward": 0.6753246188163757, "rewards/format_reward": 1.0, "step": 1941 }, { "completion_length": 211.27550506591797, "epoch": 0.19542138364779874, "grad_norm": 102.27176666259766, "kl": 5.368896484375, "learning_rate": 1e-06, "loss": 0.2148, "reward": 1.710592806339264, "reward_std": 0.13321801647543907, "rewards/accuracy_reward": 0.7310009598731995, "rewards/format_reward": 0.9795918166637421, "step": 1942 }, { "completion_length": 243.08163452148438, "epoch": 0.19552201257861634, "grad_norm": 0.8038986325263977, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.745869755744934, "reward_std": 0.24605268239974976, "rewards/accuracy_reward": 0.766277939081192, "rewards/format_reward": 0.9795918464660645, "step": 1943 }, { "completion_length": 222.15306091308594, "epoch": 0.19562264150943395, "grad_norm": 1.2095855474472046, "kl": 0.0640869140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5732260942459106, "reward_std": 0.19435857981443405, "rewards/accuracy_reward": 0.5834302604198456, "rewards/format_reward": 0.9897959232330322, "step": 1944 }, { "completion_length": 252.448974609375, "epoch": 0.19572327044025156, "grad_norm": 0.6588822603225708, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6839929223060608, "reward_std": 0.21217148005962372, "rewards/accuracy_reward": 0.6941970884799957, "rewards/format_reward": 0.9897959232330322, "step": 1945 }, { "completion_length": 250.69387817382812, "epoch": 0.19582389937106917, "grad_norm": 0.590347945690155, "kl": 0.0543212890625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6616455912590027, "reward_std": 0.2189301922917366, "rewards/accuracy_reward": 0.6616455912590027, "rewards/format_reward": 1.0, "step": 1946 }, { "completion_length": 213.02040100097656, "epoch": 0.19592452830188678, "grad_norm": 1.110591173171997, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7342079281806946, "reward_std": 0.10873100720345974, "rewards/accuracy_reward": 0.7342078983783722, "rewards/format_reward": 1.0, "step": 1947 }, { "completion_length": 211.31632232666016, "epoch": 0.1960251572327044, "grad_norm": 0.726730227470398, "kl": 0.0699462890625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7275510430335999, "reward_std": 0.17855770885944366, "rewards/accuracy_reward": 0.7377550899982452, "rewards/format_reward": 0.9897959232330322, "step": 1948 }, { "completion_length": 225.73468780517578, "epoch": 0.19612578616352203, "grad_norm": 0.850817084312439, "kl": 0.0494384765625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.68367338180542, "reward_std": 0.12223168462514877, "rewards/accuracy_reward": 0.7040815949440002, "rewards/format_reward": 0.9795918464660645, "step": 1949 }, { "completion_length": 203.88774871826172, "epoch": 0.19622641509433963, "grad_norm": 1.0667366981506348, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.70998615026474, "reward_std": 0.14552296325564384, "rewards/accuracy_reward": 0.70998615026474, "rewards/format_reward": 1.0, "step": 1950 }, { "completion_length": 208.448974609375, "epoch": 0.19632704402515724, "grad_norm": 0.9657596945762634, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8003152012825012, "reward_std": 0.15535226836800575, "rewards/accuracy_reward": 0.8105193078517914, "rewards/format_reward": 0.9897959232330322, "step": 1951 }, { "completion_length": 184.2142791748047, "epoch": 0.19642767295597485, "grad_norm": 0.5082454085350037, "kl": 0.0826416015625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8684446215629578, "reward_std": 0.07824789732694626, "rewards/accuracy_reward": 0.8786486983299255, "rewards/format_reward": 0.9897959232330322, "step": 1952 }, { "completion_length": 247.33673095703125, "epoch": 0.19652830188679246, "grad_norm": 0.684371292591095, "kl": 0.0738525390625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6100594997406006, "reward_std": 0.18363669514656067, "rewards/accuracy_reward": 0.6100594699382782, "rewards/format_reward": 1.0, "step": 1953 }, { "completion_length": 250.9591827392578, "epoch": 0.19662893081761007, "grad_norm": 0.8017944097518921, "kl": 0.0526123046875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5796101093292236, "reward_std": 0.15121835097670555, "rewards/accuracy_reward": 0.5796100497245789, "rewards/format_reward": 1.0, "step": 1954 }, { "completion_length": 201.10203552246094, "epoch": 0.19672955974842768, "grad_norm": 0.6251445412635803, "kl": 0.0697021484375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.777953326702118, "reward_std": 0.1882735937833786, "rewards/accuracy_reward": 0.7779532670974731, "rewards/format_reward": 1.0, "step": 1955 }, { "completion_length": 186.4897918701172, "epoch": 0.1968301886792453, "grad_norm": 0.5127649307250977, "kl": 0.05029296875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.8525267243385315, "reward_std": 0.1329033225774765, "rewards/accuracy_reward": 0.8627307713031769, "rewards/format_reward": 0.9897959232330322, "step": 1956 }, { "completion_length": 216.61223602294922, "epoch": 0.1969308176100629, "grad_norm": 0.5702341794967651, "kl": 0.0677490234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7346938252449036, "reward_std": 0.14284342527389526, "rewards/accuracy_reward": 0.7551020085811615, "rewards/format_reward": 0.9795918464660645, "step": 1957 }, { "completion_length": 208.9387664794922, "epoch": 0.1970314465408805, "grad_norm": 0.70222008228302, "kl": 0.062255859375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6437803506851196, "reward_std": 0.20319464802742004, "rewards/accuracy_reward": 0.6437803506851196, "rewards/format_reward": 1.0, "step": 1958 }, { "completion_length": 270.4387741088867, "epoch": 0.19713207547169811, "grad_norm": 1.0324430465698242, "kl": 0.0498046875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.517492651939392, "reward_std": 0.24651308357715607, "rewards/accuracy_reward": 0.5276967585086823, "rewards/format_reward": 0.9897959232330322, "step": 1959 }, { "completion_length": 241.38774871826172, "epoch": 0.19723270440251572, "grad_norm": 0.6444383859634399, "kl": 0.057861328125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8305886387825012, "reward_std": 0.19190069288015366, "rewards/accuracy_reward": 0.8305886685848236, "rewards/format_reward": 1.0, "step": 1960 }, { "completion_length": 185.78570556640625, "epoch": 0.19733333333333333, "grad_norm": 0.7032864689826965, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.675736904144287, "reward_std": 0.14034833386540413, "rewards/accuracy_reward": 0.6757369339466095, "rewards/format_reward": 1.0, "step": 1961 }, { "completion_length": 238.04080963134766, "epoch": 0.19743396226415094, "grad_norm": 0.9235148429870605, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.68483966588974, "reward_std": 0.25623802095651627, "rewards/accuracy_reward": 0.7154518663883209, "rewards/format_reward": 0.9693877398967743, "step": 1962 }, { "completion_length": 203.93877410888672, "epoch": 0.19753459119496855, "grad_norm": 1.338651180267334, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7462098598480225, "reward_std": 0.2802850604057312, "rewards/accuracy_reward": 0.7564139664173126, "rewards/format_reward": 0.9897959232330322, "step": 1963 }, { "completion_length": 245.36734008789062, "epoch": 0.19763522012578616, "grad_norm": 0.6892933249473572, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5714285969734192, "reward_std": 0.13864652067422867, "rewards/accuracy_reward": 0.5816326588392258, "rewards/format_reward": 0.9897959232330322, "step": 1964 }, { "completion_length": 201.61224365234375, "epoch": 0.19773584905660377, "grad_norm": 1.213762640953064, "kl": 0.062744140625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7448007464408875, "reward_std": 0.19618020951747894, "rewards/accuracy_reward": 0.7550048828125, "rewards/format_reward": 0.9897959232330322, "step": 1965 }, { "completion_length": 250.39794921875, "epoch": 0.19783647798742138, "grad_norm": 0.6695626378059387, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5575871467590332, "reward_std": 0.2157311886548996, "rewards/accuracy_reward": 0.567791298031807, "rewards/format_reward": 0.9897959232330322, "step": 1966 }, { "completion_length": 187.7448959350586, "epoch": 0.197937106918239, "grad_norm": 0.8876319527626038, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7336071729660034, "reward_std": 0.17439544945955276, "rewards/accuracy_reward": 0.7438112795352936, "rewards/format_reward": 0.9897959232330322, "step": 1967 }, { "completion_length": 252.32653045654297, "epoch": 0.1980377358490566, "grad_norm": 0.7603585720062256, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.63150292634964, "reward_std": 0.20057158544659615, "rewards/accuracy_reward": 0.6519111096858978, "rewards/format_reward": 0.9795918166637421, "step": 1968 }, { "completion_length": 146.85713958740234, "epoch": 0.1981383647798742, "grad_norm": 1.4664517641067505, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6937891840934753, "reward_std": 0.23679044097661972, "rewards/accuracy_reward": 0.7039932608604431, "rewards/format_reward": 0.9897959232330322, "step": 1969 }, { "completion_length": 226.27550506591797, "epoch": 0.1982389937106918, "grad_norm": 0.895024836063385, "kl": 0.0787353515625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7959182858467102, "reward_std": 0.2125505730509758, "rewards/accuracy_reward": 0.8163264989852905, "rewards/format_reward": 0.9795918464660645, "step": 1970 }, { "completion_length": 270.8061218261719, "epoch": 0.19833962264150942, "grad_norm": 1.021531581878662, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.5351474285125732, "reward_std": 0.21323642134666443, "rewards/accuracy_reward": 0.5453514754772186, "rewards/format_reward": 0.9897959232330322, "step": 1971 }, { "completion_length": 247.4285659790039, "epoch": 0.19844025157232703, "grad_norm": 0.9005959033966064, "kl": 0.0699462890625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.569978654384613, "reward_std": 0.2678995952010155, "rewards/accuracy_reward": 0.5903867930173874, "rewards/format_reward": 0.9795918464660645, "step": 1972 }, { "completion_length": 292.8061218261719, "epoch": 0.19854088050314467, "grad_norm": 0.6835687160491943, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5345560908317566, "reward_std": 0.21031438559293747, "rewards/accuracy_reward": 0.534556195139885, "rewards/format_reward": 1.0, "step": 1973 }, { "completion_length": 245.87754821777344, "epoch": 0.19864150943396228, "grad_norm": 1.7283164262771606, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6235091090202332, "reward_std": 0.23310843855142593, "rewards/accuracy_reward": 0.6337131857872009, "rewards/format_reward": 0.9897959232330322, "step": 1974 }, { "completion_length": 210.28570556640625, "epoch": 0.19874213836477989, "grad_norm": 1.1474870443344116, "kl": 0.0457763671875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6647564768791199, "reward_std": 0.14903127029538155, "rewards/accuracy_reward": 0.6749605536460876, "rewards/format_reward": 0.9897959232330322, "step": 1975 }, { "completion_length": 220.95917510986328, "epoch": 0.1988427672955975, "grad_norm": 0.578010618686676, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5738337635993958, "reward_std": 0.1166091114282608, "rewards/accuracy_reward": 0.5840378701686859, "rewards/format_reward": 0.9897959232330322, "step": 1976 }, { "completion_length": 204.39795684814453, "epoch": 0.1989433962264151, "grad_norm": 0.763152003288269, "kl": 0.0643310546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7972648739814758, "reward_std": 0.16538570076227188, "rewards/accuracy_reward": 0.7972648739814758, "rewards/format_reward": 1.0, "step": 1977 }, { "completion_length": 186.23468780517578, "epoch": 0.1990440251572327, "grad_norm": 0.7032949924468994, "kl": 0.093994140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8501943945884705, "reward_std": 0.16736947000026703, "rewards/accuracy_reward": 0.870602548122406, "rewards/format_reward": 0.9795918166637421, "step": 1978 }, { "completion_length": 189.43877410888672, "epoch": 0.19914465408805032, "grad_norm": 4.092311859130859, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6250735521316528, "reward_std": 0.2036946788430214, "rewards/accuracy_reward": 0.6556858718395233, "rewards/format_reward": 0.9693877398967743, "step": 1979 }, { "completion_length": 225.21428680419922, "epoch": 0.19924528301886793, "grad_norm": 1.3651899099349976, "kl": 0.053466796875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6548919677734375, "reward_std": 0.2831740751862526, "rewards/accuracy_reward": 0.6855041980743408, "rewards/format_reward": 0.9693877398967743, "step": 1980 }, { "completion_length": 149.1326446533203, "epoch": 0.19934591194968554, "grad_norm": 0.8046257495880127, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8772542476654053, "reward_std": 0.13957568630576134, "rewards/accuracy_reward": 0.8976625204086304, "rewards/format_reward": 0.9795918166637421, "step": 1981 }, { "completion_length": 163.71428680419922, "epoch": 0.19944654088050315, "grad_norm": 1.4012411832809448, "kl": 0.056640625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6771541237831116, "reward_std": 0.19794238358736038, "rewards/accuracy_reward": 0.6771541833877563, "rewards/format_reward": 1.0, "step": 1982 }, { "completion_length": 239.57142639160156, "epoch": 0.19954716981132076, "grad_norm": 3.423964023590088, "kl": 0.052490234375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.650238573551178, "reward_std": 0.3287178725004196, "rewards/accuracy_reward": 0.6604427397251129, "rewards/format_reward": 0.9897959232330322, "step": 1983 }, { "completion_length": 279.76529693603516, "epoch": 0.19964779874213837, "grad_norm": 0.8809433579444885, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.524092972278595, "reward_std": 0.21395275741815567, "rewards/accuracy_reward": 0.5342970192432404, "rewards/format_reward": 0.9897959232330322, "step": 1984 }, { "completion_length": 172.87754821777344, "epoch": 0.19974842767295597, "grad_norm": 1.1913689374923706, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6853742003440857, "reward_std": 0.169432757422328, "rewards/accuracy_reward": 0.6955782473087311, "rewards/format_reward": 0.9897959232330322, "step": 1985 }, { "completion_length": 149.22449111938477, "epoch": 0.19984905660377358, "grad_norm": 1.1245949268341064, "kl": 0.0787353515625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7524700164794922, "reward_std": 0.2218088135123253, "rewards/accuracy_reward": 0.7728781998157501, "rewards/format_reward": 0.9795918166637421, "step": 1986 }, { "completion_length": 207.55101776123047, "epoch": 0.1999496855345912, "grad_norm": 1.5129984617233276, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8544482588768005, "reward_std": 0.20859306305646896, "rewards/accuracy_reward": 0.8748564422130585, "rewards/format_reward": 0.9795918166637421, "step": 1987 }, { "completion_length": 263.59183502197266, "epoch": 0.2000503144654088, "grad_norm": 0.7617941498756409, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5672121047973633, "reward_std": 0.26242997497320175, "rewards/accuracy_reward": 0.5978243947029114, "rewards/format_reward": 0.9693877398967743, "step": 1988 }, { "completion_length": 208.80611419677734, "epoch": 0.2001509433962264, "grad_norm": 1.2522469758987427, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6585034132003784, "reward_std": 0.2376963496208191, "rewards/accuracy_reward": 0.658503383398056, "rewards/format_reward": 1.0, "step": 1989 }, { "completion_length": 225.35714721679688, "epoch": 0.20025157232704402, "grad_norm": 1.6980664730072021, "kl": 0.05126953125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6032415628433228, "reward_std": 0.27549727261066437, "rewards/accuracy_reward": 0.6134456843137741, "rewards/format_reward": 0.9897959232330322, "step": 1990 }, { "completion_length": 163.55101776123047, "epoch": 0.20035220125786163, "grad_norm": 0.7660464644432068, "kl": 0.05859375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.722513735294342, "reward_std": 0.1328626258764416, "rewards/accuracy_reward": 0.7327178120613098, "rewards/format_reward": 0.9897959232330322, "step": 1991 }, { "completion_length": 210.62244415283203, "epoch": 0.20045283018867924, "grad_norm": 1.2582827806472778, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5448226928710938, "reward_std": 0.27996229380369186, "rewards/accuracy_reward": 0.5652309209108353, "rewards/format_reward": 0.9795918464660645, "step": 1992 }, { "completion_length": 198.4081573486328, "epoch": 0.20055345911949685, "grad_norm": 1.1620838642120361, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6095955967903137, "reward_std": 0.17453057318925858, "rewards/accuracy_reward": 0.6197996735572815, "rewards/format_reward": 0.9897959232330322, "step": 1993 }, { "completion_length": 202.56122589111328, "epoch": 0.20065408805031446, "grad_norm": 0.9173434972763062, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7206546068191528, "reward_std": 0.10320940986275673, "rewards/accuracy_reward": 0.7206545770168304, "rewards/format_reward": 1.0, "step": 1994 }, { "completion_length": 290.7142791748047, "epoch": 0.20075471698113206, "grad_norm": 0.6462668180465698, "kl": 0.05615234375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7170793414115906, "reward_std": 0.17031803727149963, "rewards/accuracy_reward": 0.7374874651432037, "rewards/format_reward": 0.9795918166637421, "step": 1995 }, { "completion_length": 167.7244873046875, "epoch": 0.20085534591194967, "grad_norm": 0.6267403960227966, "kl": 0.0797119140625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7474313974380493, "reward_std": 0.18090800940990448, "rewards/accuracy_reward": 0.7576354146003723, "rewards/format_reward": 0.9897959232330322, "step": 1996 }, { "completion_length": 208.82652282714844, "epoch": 0.20095597484276728, "grad_norm": 0.9522850513458252, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7046794891357422, "reward_std": 0.16062700003385544, "rewards/accuracy_reward": 0.7046794295310974, "rewards/format_reward": 1.0, "step": 1997 }, { "completion_length": 220.2142791748047, "epoch": 0.20105660377358492, "grad_norm": 0.45440274477005005, "kl": 0.06689453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6688642501831055, "reward_std": 0.19883402436971664, "rewards/accuracy_reward": 0.6994765102863312, "rewards/format_reward": 0.9693877398967743, "step": 1998 }, { "completion_length": 205.5, "epoch": 0.20115723270440253, "grad_norm": 0.8990285396575928, "kl": 0.099853515625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7393320798873901, "reward_std": 0.18679871410131454, "rewards/accuracy_reward": 0.7393321096897125, "rewards/format_reward": 1.0, "step": 1999 }, { "completion_length": 188.75509643554688, "epoch": 0.20125786163522014, "grad_norm": 1.5002684593200684, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7576530575752258, "reward_std": 0.176590234041214, "rewards/accuracy_reward": 0.7678571343421936, "rewards/format_reward": 0.9897959232330322, "step": 2000 }, { "completion_length": 187.84693145751953, "epoch": 0.20135849056603775, "grad_norm": 1.101613163948059, "kl": 0.0531005859375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7718998193740845, "reward_std": 0.18540896475315094, "rewards/accuracy_reward": 0.7821038663387299, "rewards/format_reward": 0.9897959232330322, "step": 2001 }, { "completion_length": 172.9387664794922, "epoch": 0.20145911949685535, "grad_norm": 0.8849332928657532, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7964966893196106, "reward_std": 0.13864728063344955, "rewards/accuracy_reward": 0.8169049322605133, "rewards/format_reward": 0.9795918166637421, "step": 2002 }, { "completion_length": 226.04080200195312, "epoch": 0.20155974842767296, "grad_norm": 0.6857432126998901, "kl": 0.0574951171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6230158805847168, "reward_std": 0.19013812392950058, "rewards/accuracy_reward": 0.6230158805847168, "rewards/format_reward": 1.0, "step": 2003 }, { "completion_length": 197.4285659790039, "epoch": 0.20166037735849057, "grad_norm": 1.1630775928497314, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6368196606636047, "reward_std": 0.21069709956645966, "rewards/accuracy_reward": 0.6470237970352173, "rewards/format_reward": 0.9897959232330322, "step": 2004 }, { "completion_length": 176.33673095703125, "epoch": 0.20176100628930818, "grad_norm": 1.0300060510635376, "kl": 0.0509033203125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.8918368220329285, "reward_std": 0.13670486956834793, "rewards/accuracy_reward": 0.8918367028236389, "rewards/format_reward": 1.0, "step": 2005 }, { "completion_length": 233.53060913085938, "epoch": 0.2018616352201258, "grad_norm": 1.5382726192474365, "kl": 0.05517578125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5408725142478943, "reward_std": 0.2214798703789711, "rewards/accuracy_reward": 0.5408724844455719, "rewards/format_reward": 1.0, "step": 2006 }, { "completion_length": 156.9897918701172, "epoch": 0.2019622641509434, "grad_norm": 3.649685859680176, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.71679025888443, "reward_std": 0.25454793870449066, "rewards/accuracy_reward": 0.7371984720230103, "rewards/format_reward": 0.9795918166637421, "step": 2007 }, { "completion_length": 170.2448959350586, "epoch": 0.202062893081761, "grad_norm": 0.41912707686424255, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8979591727256775, "reward_std": 0.06185103580355644, "rewards/accuracy_reward": 0.8979591727256775, "rewards/format_reward": 1.0, "step": 2008 }, { "completion_length": 211.93877410888672, "epoch": 0.20216352201257862, "grad_norm": 0.6077131032943726, "kl": 0.066162109375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6108276844024658, "reward_std": 0.1445881947875023, "rewards/accuracy_reward": 0.610827624797821, "rewards/format_reward": 1.0, "step": 2009 }, { "completion_length": 223.17346954345703, "epoch": 0.20226415094339623, "grad_norm": 0.7694345116615295, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7585033774375916, "reward_std": 0.22259024530649185, "rewards/accuracy_reward": 0.7585033774375916, "rewards/format_reward": 1.0, "step": 2010 }, { "completion_length": 205.9897918701172, "epoch": 0.20236477987421383, "grad_norm": 1.1382352113723755, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8305392861366272, "reward_std": 0.12479893863201141, "rewards/accuracy_reward": 0.830539345741272, "rewards/format_reward": 1.0, "step": 2011 }, { "completion_length": 239.07141876220703, "epoch": 0.20246540880503144, "grad_norm": 2.8246190547943115, "kl": 0.052978515625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6362637877464294, "reward_std": 0.3176138550043106, "rewards/accuracy_reward": 0.6464678049087524, "rewards/format_reward": 0.9897959232330322, "step": 2012 }, { "completion_length": 208.51020050048828, "epoch": 0.20256603773584905, "grad_norm": 1.7745733261108398, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7033792734146118, "reward_std": 0.21961301565170288, "rewards/accuracy_reward": 0.7033793330192566, "rewards/format_reward": 1.0, "step": 2013 }, { "completion_length": 207.9897918701172, "epoch": 0.20266666666666666, "grad_norm": 0.7093980312347412, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5447391271591187, "reward_std": 0.19131877273321152, "rewards/accuracy_reward": 0.544739156961441, "rewards/format_reward": 1.0, "step": 2014 }, { "completion_length": 214.448974609375, "epoch": 0.20276729559748427, "grad_norm": 4.013785362243652, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6411821246147156, "reward_std": 0.21811746805906296, "rewards/accuracy_reward": 0.6411822140216827, "rewards/format_reward": 1.0, "step": 2015 }, { "completion_length": 213.64285278320312, "epoch": 0.20286792452830188, "grad_norm": 0.8510108590126038, "kl": 0.0653076171875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.690052568912506, "reward_std": 0.15502991899847984, "rewards/accuracy_reward": 0.7002566754817963, "rewards/format_reward": 0.9897959232330322, "step": 2016 }, { "completion_length": 213.84693908691406, "epoch": 0.2029685534591195, "grad_norm": 0.9275171160697937, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7595657706260681, "reward_std": 0.23892273753881454, "rewards/accuracy_reward": 0.7697699069976807, "rewards/format_reward": 0.9897959232330322, "step": 2017 }, { "completion_length": 224.89794921875, "epoch": 0.2030691823899371, "grad_norm": 1.2371503114700317, "kl": 0.0477294921875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7297397255897522, "reward_std": 0.23966248333454132, "rewards/accuracy_reward": 0.7297396957874298, "rewards/format_reward": 1.0, "step": 2018 }, { "completion_length": 225.75509643554688, "epoch": 0.2031698113207547, "grad_norm": 3.0016775131225586, "kl": 0.0738525390625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6423413157463074, "reward_std": 0.28945429623126984, "rewards/accuracy_reward": 0.6525454819202423, "rewards/format_reward": 0.9897959232330322, "step": 2019 }, { "completion_length": 240.96937561035156, "epoch": 0.20327044025157232, "grad_norm": 1.0581856966018677, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.673469364643097, "reward_std": 0.20633212476968765, "rewards/accuracy_reward": 0.6938775479793549, "rewards/format_reward": 0.9795918166637421, "step": 2020 }, { "completion_length": 226.15306091308594, "epoch": 0.20337106918238992, "grad_norm": 1.011322259902954, "kl": 0.052734375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6464831233024597, "reward_std": 0.23637767136096954, "rewards/accuracy_reward": 0.6566871106624603, "rewards/format_reward": 0.9897959232330322, "step": 2021 }, { "completion_length": 173.7244873046875, "epoch": 0.20347169811320756, "grad_norm": 0.5290164947509766, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8675736784934998, "reward_std": 0.13497066125273705, "rewards/accuracy_reward": 0.8675736784934998, "rewards/format_reward": 1.0, "step": 2022 }, { "completion_length": 281.3877487182617, "epoch": 0.20357232704402517, "grad_norm": 3.804020643234253, "kl": 0.0506591796875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.589833915233612, "reward_std": 0.23842238634824753, "rewards/accuracy_reward": 0.5898339152336121, "rewards/format_reward": 1.0, "step": 2023 }, { "completion_length": 280.8877410888672, "epoch": 0.20367295597484278, "grad_norm": 0.7032535076141357, "kl": 0.05810546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5287874341011047, "reward_std": 0.1930253729224205, "rewards/accuracy_reward": 0.5287875384092331, "rewards/format_reward": 1.0, "step": 2024 }, { "completion_length": 264.6224365234375, "epoch": 0.2037735849056604, "grad_norm": 0.8195409774780273, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7457005381584167, "reward_std": 0.21440213173627853, "rewards/accuracy_reward": 0.7457005381584167, "rewards/format_reward": 1.0, "step": 2025 }, { "completion_length": 244.10203552246094, "epoch": 0.203874213836478, "grad_norm": 1.5503623485565186, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.726783037185669, "reward_std": 0.05348761007189751, "rewards/accuracy_reward": 0.7267830967903137, "rewards/format_reward": 1.0, "step": 2026 }, { "completion_length": 334.5408172607422, "epoch": 0.2039748427672956, "grad_norm": 0.9333059191703796, "kl": 0.053466796875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6793089509010315, "reward_std": 0.22281626611948013, "rewards/accuracy_reward": 0.6997170746326447, "rewards/format_reward": 0.9795918166637421, "step": 2027 }, { "completion_length": 252.2755126953125, "epoch": 0.20407547169811321, "grad_norm": 0.41841429471969604, "kl": 0.0701904296875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7159863710403442, "reward_std": 0.10860937088727951, "rewards/accuracy_reward": 0.7261904180049896, "rewards/format_reward": 0.9897959232330322, "step": 2028 }, { "completion_length": 272.4591751098633, "epoch": 0.20417610062893082, "grad_norm": 0.8584169149398804, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6095480918884277, "reward_std": 0.2002510353922844, "rewards/accuracy_reward": 0.6503644287586212, "rewards/format_reward": 0.9591836333274841, "step": 2029 }, { "completion_length": 191.38775634765625, "epoch": 0.20427672955974843, "grad_norm": 0.8788625001907349, "kl": 0.0625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7751457691192627, "reward_std": 0.1612342670559883, "rewards/accuracy_reward": 0.7853498160839081, "rewards/format_reward": 0.9897959232330322, "step": 2030 }, { "completion_length": 163.61223602294922, "epoch": 0.20437735849056604, "grad_norm": 0.3474474847316742, "kl": 0.09765625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.804906964302063, "reward_std": 0.06661146134138107, "rewards/accuracy_reward": 0.8049070239067078, "rewards/format_reward": 1.0, "step": 2031 }, { "completion_length": 189.05101776123047, "epoch": 0.20447798742138365, "grad_norm": 1.1963975429534912, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7891156673431396, "reward_std": 0.23738063126802444, "rewards/accuracy_reward": 0.7891156673431396, "rewards/format_reward": 1.0, "step": 2032 }, { "completion_length": 275.82652282714844, "epoch": 0.20457861635220126, "grad_norm": 0.7442343235015869, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.5486497282981873, "reward_std": 0.1748827062547207, "rewards/accuracy_reward": 0.548649787902832, "rewards/format_reward": 1.0, "step": 2033 }, { "completion_length": 142.75509643554688, "epoch": 0.20467924528301887, "grad_norm": 0.6223271489143372, "kl": 0.095703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7823129296302795, "reward_std": 0.17586569488048553, "rewards/accuracy_reward": 0.7823128700256348, "rewards/format_reward": 1.0, "step": 2034 }, { "completion_length": 281.4591751098633, "epoch": 0.20477987421383648, "grad_norm": 0.6815460920333862, "kl": 0.050537109375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.687416911125183, "reward_std": 0.18317560106515884, "rewards/accuracy_reward": 0.7078250646591187, "rewards/format_reward": 0.9795918166637421, "step": 2035 }, { "completion_length": 235.4183578491211, "epoch": 0.20488050314465409, "grad_norm": 0.4530925750732422, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.665684700012207, "reward_std": 0.08772055804729462, "rewards/accuracy_reward": 0.6656846702098846, "rewards/format_reward": 1.0, "step": 2036 }, { "completion_length": 211.38775634765625, "epoch": 0.2049811320754717, "grad_norm": 1.2878471612930298, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6771798729896545, "reward_std": 0.1330261565744877, "rewards/accuracy_reward": 0.6873840093612671, "rewards/format_reward": 0.9897959232330322, "step": 2037 }, { "completion_length": 238.72447967529297, "epoch": 0.2050817610062893, "grad_norm": 1.6317451000213623, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6286370158195496, "reward_std": 0.2484012320637703, "rewards/accuracy_reward": 0.6490452587604523, "rewards/format_reward": 0.9795918464660645, "step": 2038 }, { "completion_length": 246.47958374023438, "epoch": 0.2051823899371069, "grad_norm": 1.0989477634429932, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7140913009643555, "reward_std": 0.30317919701337814, "rewards/accuracy_reward": 0.7344995141029358, "rewards/format_reward": 0.9795918166637421, "step": 2039 }, { "completion_length": 223.42857360839844, "epoch": 0.20528301886792452, "grad_norm": 0.9169691801071167, "kl": 0.0625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.789914071559906, "reward_std": 0.2579396814107895, "rewards/accuracy_reward": 0.7899141311645508, "rewards/format_reward": 1.0, "step": 2040 }, { "completion_length": 332.3163146972656, "epoch": 0.20538364779874213, "grad_norm": 0.9383313059806824, "kl": 0.0372314453125, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.4823230504989624, "reward_std": 0.31246116757392883, "rewards/accuracy_reward": 0.492527112364769, "rewards/format_reward": 0.9897959232330322, "step": 2041 }, { "completion_length": 286.60203552246094, "epoch": 0.20548427672955974, "grad_norm": 1.3651119470596313, "kl": 0.0499267578125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5499897003173828, "reward_std": 0.17834923043847084, "rewards/accuracy_reward": 0.5601938217878342, "rewards/format_reward": 0.9897959232330322, "step": 2042 }, { "completion_length": 204.85713958740234, "epoch": 0.20558490566037735, "grad_norm": 1.0448408126831055, "kl": 0.049072265625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7857142686843872, "reward_std": 0.1652088463306427, "rewards/accuracy_reward": 0.795918345451355, "rewards/format_reward": 0.9897959232330322, "step": 2043 }, { "completion_length": 197.06122589111328, "epoch": 0.20568553459119496, "grad_norm": 0.6055852174758911, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8265305757522583, "reward_std": 0.07272815518081188, "rewards/accuracy_reward": 0.8265305757522583, "rewards/format_reward": 1.0, "step": 2044 }, { "completion_length": 240.49999237060547, "epoch": 0.20578616352201257, "grad_norm": 0.7970975041389465, "kl": 0.062255859375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7015921473503113, "reward_std": 0.14884864538908005, "rewards/accuracy_reward": 0.7117962837219238, "rewards/format_reward": 0.9897959232330322, "step": 2045 }, { "completion_length": 169.89795684814453, "epoch": 0.20588679245283018, "grad_norm": 1.0991337299346924, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6324347853660583, "reward_std": 0.11534995585680008, "rewards/accuracy_reward": 0.6324348151683807, "rewards/format_reward": 1.0, "step": 2046 }, { "completion_length": 252.38775634765625, "epoch": 0.2059874213836478, "grad_norm": 1.0472607612609863, "kl": 0.0545654296875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6679393649101257, "reward_std": 0.19689466804265976, "rewards/accuracy_reward": 0.6781434416770935, "rewards/format_reward": 0.9897959232330322, "step": 2047 }, { "completion_length": 181.78571319580078, "epoch": 0.20608805031446542, "grad_norm": 0.8408018350601196, "kl": 0.099609375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7565310597419739, "reward_std": 0.13702915608882904, "rewards/accuracy_reward": 0.7565310597419739, "rewards/format_reward": 1.0, "step": 2048 }, { "completion_length": 194.59183502197266, "epoch": 0.20618867924528303, "grad_norm": 0.9482337832450867, "kl": 0.0635986328125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8096475005149841, "reward_std": 0.16334763914346695, "rewards/accuracy_reward": 0.8198515772819519, "rewards/format_reward": 0.9897959232330322, "step": 2049 }, { "completion_length": 189.79591369628906, "epoch": 0.20628930817610064, "grad_norm": 0.343438982963562, "kl": 0.06689453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.9359925389289856, "reward_std": 0.07452723942697048, "rewards/accuracy_reward": 0.9461966454982758, "rewards/format_reward": 0.9897959232330322, "step": 2050 }, { "completion_length": 295.6632537841797, "epoch": 0.20638993710691825, "grad_norm": 0.8216707110404968, "kl": 0.0552978515625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6239697337150574, "reward_std": 0.32007017731666565, "rewards/accuracy_reward": 0.6341738998889923, "rewards/format_reward": 0.9897959232330322, "step": 2051 }, { "completion_length": 245.47958374023438, "epoch": 0.20649056603773586, "grad_norm": 0.948695719242096, "kl": 0.0684814453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.624880075454712, "reward_std": 0.19281885027885437, "rewards/accuracy_reward": 0.6452882289886475, "rewards/format_reward": 0.9795918464660645, "step": 2052 }, { "completion_length": 209.38775634765625, "epoch": 0.20659119496855347, "grad_norm": 1.12405264377594, "kl": 0.0595703125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8215986490249634, "reward_std": 0.16664747148752213, "rewards/accuracy_reward": 0.8318026959896088, "rewards/format_reward": 0.9897959232330322, "step": 2053 }, { "completion_length": 185.57142639160156, "epoch": 0.20669182389937107, "grad_norm": 0.9570887088775635, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8369614481925964, "reward_std": 0.13801952823996544, "rewards/accuracy_reward": 0.8369614779949188, "rewards/format_reward": 1.0, "step": 2054 }, { "completion_length": 268.6122283935547, "epoch": 0.20679245283018868, "grad_norm": 0.538236141204834, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.59183669090271, "reward_std": 0.17612408846616745, "rewards/accuracy_reward": 0.6122448742389679, "rewards/format_reward": 0.9795918166637421, "step": 2055 }, { "completion_length": 195.99999237060547, "epoch": 0.2068930817610063, "grad_norm": 0.8133295178413391, "kl": 0.0501708984375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6014577746391296, "reward_std": 0.18123332411050797, "rewards/accuracy_reward": 0.6014577150344849, "rewards/format_reward": 1.0, "step": 2056 }, { "completion_length": 241.29591369628906, "epoch": 0.2069937106918239, "grad_norm": 1.4308934211730957, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7052411437034607, "reward_std": 0.2096170112490654, "rewards/accuracy_reward": 0.7052411735057831, "rewards/format_reward": 1.0, "step": 2057 }, { "completion_length": 187.73468780517578, "epoch": 0.2070943396226415, "grad_norm": 0.6451849937438965, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6443148255348206, "reward_std": 0.12621691823005676, "rewards/accuracy_reward": 0.6443148553371429, "rewards/format_reward": 1.0, "step": 2058 }, { "completion_length": 165.06122207641602, "epoch": 0.20719496855345912, "grad_norm": 0.5136286020278931, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7324101328849792, "reward_std": 0.08939722925424576, "rewards/accuracy_reward": 0.7324100732803345, "rewards/format_reward": 1.0, "step": 2059 }, { "completion_length": 213.03060913085938, "epoch": 0.20729559748427673, "grad_norm": 0.9310405850410461, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.666963517665863, "reward_std": 0.23774901032447815, "rewards/accuracy_reward": 0.6873716711997986, "rewards/format_reward": 0.9795918166637421, "step": 2060 }, { "completion_length": 209.89794921875, "epoch": 0.20739622641509434, "grad_norm": 0.8878008127212524, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.5591042041778564, "reward_std": 0.18608873523771763, "rewards/accuracy_reward": 0.5693082362413406, "rewards/format_reward": 0.9897959232330322, "step": 2061 }, { "completion_length": 222.60203552246094, "epoch": 0.20749685534591195, "grad_norm": 0.7327316999435425, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7575072050094604, "reward_std": 0.14824530482292175, "rewards/accuracy_reward": 0.7779153883457184, "rewards/format_reward": 0.9795918166637421, "step": 2062 }, { "completion_length": 253.98978424072266, "epoch": 0.20759748427672955, "grad_norm": 2.6690807342529297, "kl": 0.1021728515625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.4829931855201721, "reward_std": 0.24200434237718582, "rewards/accuracy_reward": 0.50340136885643, "rewards/format_reward": 0.9795918464660645, "step": 2063 }, { "completion_length": 282.9285659790039, "epoch": 0.20769811320754716, "grad_norm": 0.7423524260520935, "kl": 0.0709228515625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6351934671401978, "reward_std": 0.13987358659505844, "rewards/accuracy_reward": 0.6351934969425201, "rewards/format_reward": 1.0, "step": 2064 }, { "completion_length": 247.8571319580078, "epoch": 0.20779874213836477, "grad_norm": 2.44376802444458, "kl": 0.0921630859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6518029570579529, "reward_std": 0.22060264647006989, "rewards/accuracy_reward": 0.6620070040225983, "rewards/format_reward": 0.9897959232330322, "step": 2065 }, { "completion_length": 268.29591369628906, "epoch": 0.20789937106918238, "grad_norm": 1.47622811794281, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7354269623756409, "reward_std": 0.28724828362464905, "rewards/accuracy_reward": 0.7354269921779633, "rewards/format_reward": 1.0, "step": 2066 }, { "completion_length": 231.43877410888672, "epoch": 0.208, "grad_norm": 1.0382930040359497, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.5897884368896484, "reward_std": 0.20218327641487122, "rewards/accuracy_reward": 0.6101966947317123, "rewards/format_reward": 0.9795918166637421, "step": 2067 }, { "completion_length": 158.36734008789062, "epoch": 0.2081006289308176, "grad_norm": 1.3535692691802979, "kl": 0.0662841796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7728121876716614, "reward_std": 0.15002794563770294, "rewards/accuracy_reward": 0.7830162644386292, "rewards/format_reward": 0.9897959232330322, "step": 2068 }, { "completion_length": 222.51020050048828, "epoch": 0.2082012578616352, "grad_norm": 0.9317672252655029, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7693722248077393, "reward_std": 0.23034558445215225, "rewards/accuracy_reward": 0.7693722546100616, "rewards/format_reward": 1.0, "step": 2069 }, { "completion_length": 221.34693145751953, "epoch": 0.20830188679245282, "grad_norm": 0.9489959478378296, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7323290705680847, "reward_std": 0.16600101441144943, "rewards/accuracy_reward": 0.7323291003704071, "rewards/format_reward": 1.0, "step": 2070 }, { "completion_length": 220.86734771728516, "epoch": 0.20840251572327045, "grad_norm": 1.2755622863769531, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6658935546875, "reward_std": 0.12475726008415222, "rewards/accuracy_reward": 0.6863017678260803, "rewards/format_reward": 0.9795918464660645, "step": 2071 }, { "completion_length": 267.5612106323242, "epoch": 0.20850314465408806, "grad_norm": 0.5872578620910645, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6530611515045166, "reward_std": 0.23834511637687683, "rewards/accuracy_reward": 0.6938775330781937, "rewards/format_reward": 0.9591836333274841, "step": 2072 }, { "completion_length": 226.2653045654297, "epoch": 0.20860377358490567, "grad_norm": 0.5813249349594116, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7337220907211304, "reward_std": 0.10082288458943367, "rewards/accuracy_reward": 0.743926078081131, "rewards/format_reward": 0.9897959232330322, "step": 2073 }, { "completion_length": 255.13265228271484, "epoch": 0.20870440251572328, "grad_norm": 0.5102896094322205, "kl": 0.0687255859375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5963718891143799, "reward_std": 0.10717368870973587, "rewards/accuracy_reward": 0.5963719040155411, "rewards/format_reward": 1.0, "step": 2074 }, { "completion_length": 250.56121826171875, "epoch": 0.2088050314465409, "grad_norm": 0.6330438852310181, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8076530694961548, "reward_std": 0.13654190674424171, "rewards/accuracy_reward": 0.8076530694961548, "rewards/format_reward": 1.0, "step": 2075 }, { "completion_length": 262.7346954345703, "epoch": 0.2089056603773585, "grad_norm": 0.5939304232597351, "kl": 0.0537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.562584936618805, "reward_std": 0.17306633666157722, "rewards/accuracy_reward": 0.5625850260257721, "rewards/format_reward": 1.0, "step": 2076 }, { "completion_length": 262.31632232666016, "epoch": 0.2090062893081761, "grad_norm": 0.8915203809738159, "kl": 0.0567626953125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7270779609680176, "reward_std": 0.1420835480093956, "rewards/accuracy_reward": 0.7372820675373077, "rewards/format_reward": 0.9897959232330322, "step": 2077 }, { "completion_length": 231.78570556640625, "epoch": 0.20910691823899372, "grad_norm": 0.8570170998573303, "kl": 0.0740966796875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.647902250289917, "reward_std": 0.27262843400239944, "rewards/accuracy_reward": 0.6581063568592072, "rewards/format_reward": 0.9897959232330322, "step": 2078 }, { "completion_length": 228.52040100097656, "epoch": 0.20920754716981133, "grad_norm": 0.8248209357261658, "kl": 0.03863525390625, "learning_rate": 1e-06, "loss": 0.0015, "reward": 1.5728879570960999, "reward_std": 0.2258886694908142, "rewards/accuracy_reward": 0.5932960510253906, "rewards/format_reward": 0.9795918166637421, "step": 2079 }, { "completion_length": 266.37755584716797, "epoch": 0.20930817610062893, "grad_norm": 0.9105157852172852, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6085520386695862, "reward_std": 0.3097119480371475, "rewards/accuracy_reward": 0.6391642689704895, "rewards/format_reward": 0.9693877398967743, "step": 2080 }, { "completion_length": 214.83673095703125, "epoch": 0.20940880503144654, "grad_norm": 0.9743185639381409, "kl": 0.0682373046875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.765063762664795, "reward_std": 0.1473691537976265, "rewards/accuracy_reward": 0.7650638818740845, "rewards/format_reward": 1.0, "step": 2081 }, { "completion_length": 323.4897918701172, "epoch": 0.20950943396226415, "grad_norm": 1.2249113321304321, "kl": 0.0587158203125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6156463623046875, "reward_std": 0.29450061172246933, "rewards/accuracy_reward": 0.6156462728977203, "rewards/format_reward": 1.0, "step": 2082 }, { "completion_length": 268.99998474121094, "epoch": 0.20961006289308176, "grad_norm": 2.0742485523223877, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6071478724479675, "reward_std": 0.19167619943618774, "rewards/accuracy_reward": 0.6275561451911926, "rewards/format_reward": 0.9795918166637421, "step": 2083 }, { "completion_length": 218.78571319580078, "epoch": 0.20971069182389937, "grad_norm": 1.4334772825241089, "kl": 0.104736328125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.6130576729774475, "reward_std": 0.1604442596435547, "rewards/accuracy_reward": 0.6334658414125443, "rewards/format_reward": 0.9795918464660645, "step": 2084 }, { "completion_length": 228.75509643554688, "epoch": 0.20981132075471698, "grad_norm": 0.855291485786438, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5304266810417175, "reward_std": 0.2572201266884804, "rewards/accuracy_reward": 0.5610389411449432, "rewards/format_reward": 0.9693877398967743, "step": 2085 }, { "completion_length": 278.8775405883789, "epoch": 0.2099119496855346, "grad_norm": 0.6686556935310364, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5445955395698547, "reward_std": 0.18161991238594055, "rewards/accuracy_reward": 0.5547996014356613, "rewards/format_reward": 0.9897959232330322, "step": 2086 }, { "completion_length": 267.05101013183594, "epoch": 0.2100125786163522, "grad_norm": 0.7478333115577698, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8299319744110107, "reward_std": 0.25865958631038666, "rewards/accuracy_reward": 0.8299319446086884, "rewards/format_reward": 1.0, "step": 2087 }, { "completion_length": 191.33673095703125, "epoch": 0.2101132075471698, "grad_norm": 0.7381629347801208, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8484776020050049, "reward_std": 0.12288982421159744, "rewards/accuracy_reward": 0.8484776318073273, "rewards/format_reward": 1.0, "step": 2088 }, { "completion_length": 184.10203552246094, "epoch": 0.21021383647798741, "grad_norm": 1.0535001754760742, "kl": 0.0625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7523809671401978, "reward_std": 0.21987295150756836, "rewards/accuracy_reward": 0.7625850737094879, "rewards/format_reward": 0.9897959232330322, "step": 2089 }, { "completion_length": 211.45917510986328, "epoch": 0.21031446540880502, "grad_norm": 0.8972283601760864, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7095701694488525, "reward_std": 0.19638396054506302, "rewards/accuracy_reward": 0.7095701694488525, "rewards/format_reward": 1.0, "step": 2090 }, { "completion_length": 217.29591369628906, "epoch": 0.21041509433962263, "grad_norm": 0.579233705997467, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.899611234664917, "reward_std": 0.1226576454937458, "rewards/accuracy_reward": 0.8996112644672394, "rewards/format_reward": 1.0, "step": 2091 }, { "completion_length": 219.92855834960938, "epoch": 0.21051572327044024, "grad_norm": 0.9873914122581482, "kl": 0.052978515625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6890347003936768, "reward_std": 0.19543161988258362, "rewards/accuracy_reward": 0.6992387175559998, "rewards/format_reward": 0.9897959232330322, "step": 2092 }, { "completion_length": 251.55101776123047, "epoch": 0.21061635220125785, "grad_norm": 0.9167847633361816, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.511825442314148, "reward_std": 0.27485281229019165, "rewards/accuracy_reward": 0.542437732219696, "rewards/format_reward": 0.9693877398967743, "step": 2093 }, { "completion_length": 321.9285583496094, "epoch": 0.21071698113207546, "grad_norm": 0.6813616752624512, "kl": 0.0494384765625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6649901866912842, "reward_std": 0.30104486644268036, "rewards/accuracy_reward": 0.6853983998298645, "rewards/format_reward": 0.9795918464660645, "step": 2094 }, { "completion_length": 282.9387664794922, "epoch": 0.21081761006289307, "grad_norm": 0.8381240367889404, "kl": 0.0540771484375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6063936948776245, "reward_std": 0.20880667865276337, "rewards/accuracy_reward": 0.6165978014469147, "rewards/format_reward": 0.9897959232330322, "step": 2095 }, { "completion_length": 231.84693908691406, "epoch": 0.2109182389937107, "grad_norm": 0.834161102771759, "kl": 0.056396484375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6938775181770325, "reward_std": 0.22140854224562645, "rewards/accuracy_reward": 0.7040816247463226, "rewards/format_reward": 0.9897959232330322, "step": 2096 }, { "completion_length": 235.80611419677734, "epoch": 0.2110188679245283, "grad_norm": 1.8335585594177246, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.614614725112915, "reward_std": 0.18870000541210175, "rewards/accuracy_reward": 0.6248188316822052, "rewards/format_reward": 0.9897959232330322, "step": 2097 }, { "completion_length": 201.4285659790039, "epoch": 0.21111949685534592, "grad_norm": 0.9116761088371277, "kl": 0.09423828125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6855795979499817, "reward_std": 0.13802292570471764, "rewards/accuracy_reward": 0.6957837641239166, "rewards/format_reward": 0.9897959232330322, "step": 2098 }, { "completion_length": 240.11224365234375, "epoch": 0.21122012578616353, "grad_norm": 0.45385614037513733, "kl": 0.0572509765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7098965048789978, "reward_std": 0.12913289666175842, "rewards/accuracy_reward": 0.720100611448288, "rewards/format_reward": 0.9897959232330322, "step": 2099 }, { "completion_length": 212.77550506591797, "epoch": 0.21132075471698114, "grad_norm": 0.7628639936447144, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.628401279449463, "reward_std": 0.14305870607495308, "rewards/accuracy_reward": 0.62840136885643, "rewards/format_reward": 1.0, "step": 2100 }, { "completion_length": 241.11223602294922, "epoch": 0.21142138364779875, "grad_norm": 0.8990694284439087, "kl": 0.0653076171875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7174248695373535, "reward_std": 0.1657574474811554, "rewards/accuracy_reward": 0.7174249291419983, "rewards/format_reward": 1.0, "step": 2101 }, { "completion_length": 239.0918426513672, "epoch": 0.21152201257861636, "grad_norm": 1.3301010131835938, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.569532036781311, "reward_std": 0.25027996301651, "rewards/accuracy_reward": 0.569532036781311, "rewards/format_reward": 1.0, "step": 2102 }, { "completion_length": 246.79590606689453, "epoch": 0.21162264150943397, "grad_norm": 0.7482179403305054, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7081838846206665, "reward_std": 0.18471812456846237, "rewards/accuracy_reward": 0.7183879613876343, "rewards/format_reward": 0.9897959232330322, "step": 2103 }, { "completion_length": 268.9795837402344, "epoch": 0.21172327044025158, "grad_norm": 0.6889788508415222, "kl": 0.0548095703125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5646376013755798, "reward_std": 0.19765929877758026, "rewards/accuracy_reward": 0.5850457847118378, "rewards/format_reward": 0.9795918464660645, "step": 2104 }, { "completion_length": 238.29590606689453, "epoch": 0.21182389937106919, "grad_norm": 0.9835090041160583, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8293997645378113, "reward_std": 0.2193431556224823, "rewards/accuracy_reward": 0.8293997943401337, "rewards/format_reward": 1.0, "step": 2105 }, { "completion_length": 287.6938781738281, "epoch": 0.2119245283018868, "grad_norm": 0.686322808265686, "kl": 0.0640869140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7484848499298096, "reward_std": 0.16602466255426407, "rewards/accuracy_reward": 0.7484848499298096, "rewards/format_reward": 1.0, "step": 2106 }, { "completion_length": 255.6326446533203, "epoch": 0.2120251572327044, "grad_norm": 0.8117220401763916, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6729024052619934, "reward_std": 0.18511810898780823, "rewards/accuracy_reward": 0.6729024648666382, "rewards/format_reward": 1.0, "step": 2107 }, { "completion_length": 249.09183502197266, "epoch": 0.212125786163522, "grad_norm": 0.7757408618927002, "kl": 0.0736083984375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7153547406196594, "reward_std": 0.20770156383514404, "rewards/accuracy_reward": 0.7255587875843048, "rewards/format_reward": 0.9897959232330322, "step": 2108 }, { "completion_length": 176.40816497802734, "epoch": 0.21222641509433962, "grad_norm": 1.3070573806762695, "kl": 0.0760498046875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7653060555458069, "reward_std": 0.15519911795854568, "rewards/accuracy_reward": 0.7755101919174194, "rewards/format_reward": 0.9897959232330322, "step": 2109 }, { "completion_length": 186.89795684814453, "epoch": 0.21232704402515723, "grad_norm": 29.512765884399414, "kl": 0.107421875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.6972754001617432, "reward_std": 0.1435178741812706, "rewards/accuracy_reward": 0.6972754299640656, "rewards/format_reward": 1.0, "step": 2110 }, { "completion_length": 241.29590606689453, "epoch": 0.21242767295597484, "grad_norm": 0.6821797490119934, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.626360535621643, "reward_std": 0.09198711812496185, "rewards/accuracy_reward": 0.6263605356216431, "rewards/format_reward": 1.0, "step": 2111 }, { "completion_length": 287.5306091308594, "epoch": 0.21252830188679245, "grad_norm": 0.7989060282707214, "kl": 0.0576171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7513914108276367, "reward_std": 0.2494870200753212, "rewards/accuracy_reward": 0.7615955471992493, "rewards/format_reward": 0.9897959232330322, "step": 2112 }, { "completion_length": 246.84693145751953, "epoch": 0.21262893081761006, "grad_norm": 0.7721518874168396, "kl": 0.05224609375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.571266531944275, "reward_std": 0.11335516348481178, "rewards/accuracy_reward": 0.5712665319442749, "rewards/format_reward": 1.0, "step": 2113 }, { "completion_length": 270.62245178222656, "epoch": 0.21272955974842767, "grad_norm": 0.6712179183959961, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7602527141571045, "reward_std": 0.13303785026073456, "rewards/accuracy_reward": 0.7602526843547821, "rewards/format_reward": 1.0, "step": 2114 }, { "completion_length": 202.38775634765625, "epoch": 0.21283018867924527, "grad_norm": 1.000144600868225, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7901301980018616, "reward_std": 0.13437636196613312, "rewards/accuracy_reward": 0.7901301681995392, "rewards/format_reward": 1.0, "step": 2115 }, { "completion_length": 262.67346954345703, "epoch": 0.21293081761006288, "grad_norm": 1.1304781436920166, "kl": 0.0465087890625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.667800486087799, "reward_std": 0.1879250556230545, "rewards/accuracy_reward": 0.6678004264831543, "rewards/format_reward": 1.0, "step": 2116 }, { "completion_length": 302.05101776123047, "epoch": 0.2130314465408805, "grad_norm": 1.0227798223495483, "kl": 0.0509033203125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5654840469360352, "reward_std": 0.29240185022354126, "rewards/accuracy_reward": 0.565484032034874, "rewards/format_reward": 1.0, "step": 2117 }, { "completion_length": 299.6734619140625, "epoch": 0.2131320754716981, "grad_norm": 0.8394201993942261, "kl": 0.0400390625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.4919029474258423, "reward_std": 0.3141987845301628, "rewards/accuracy_reward": 0.5021069794893265, "rewards/format_reward": 0.9897959232330322, "step": 2118 }, { "completion_length": 152.83673095703125, "epoch": 0.2132327044025157, "grad_norm": 0.6419543623924255, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8979591727256775, "reward_std": 0.08099238574504852, "rewards/accuracy_reward": 0.9081632494926453, "rewards/format_reward": 0.9897959232330322, "step": 2119 }, { "completion_length": 223.82653045654297, "epoch": 0.21333333333333335, "grad_norm": 1.5205763578414917, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.566083550453186, "reward_std": 0.2026933655142784, "rewards/accuracy_reward": 0.566083550453186, "rewards/format_reward": 1.0, "step": 2120 }, { "completion_length": 238.6938705444336, "epoch": 0.21343396226415096, "grad_norm": 1.0047534704208374, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7940746545791626, "reward_std": 0.15389341488480568, "rewards/accuracy_reward": 0.7940746545791626, "rewards/format_reward": 1.0, "step": 2121 }, { "completion_length": 180.5204086303711, "epoch": 0.21353459119496856, "grad_norm": 1.1594364643096924, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8775509595870972, "reward_std": 0.06517763808369637, "rewards/accuracy_reward": 0.8775509893894196, "rewards/format_reward": 1.0, "step": 2122 }, { "completion_length": 201.28570556640625, "epoch": 0.21363522012578617, "grad_norm": 0.7829911112785339, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7877241969108582, "reward_std": 0.12840765714645386, "rewards/accuracy_reward": 0.7877241373062134, "rewards/format_reward": 1.0, "step": 2123 }, { "completion_length": 262.1938705444336, "epoch": 0.21373584905660378, "grad_norm": 0.776257336139679, "kl": 0.054931640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.738946557044983, "reward_std": 0.23376132547855377, "rewards/accuracy_reward": 0.7491507232189178, "rewards/format_reward": 0.9897959232330322, "step": 2124 }, { "completion_length": 253.03060150146484, "epoch": 0.2138364779874214, "grad_norm": 0.8171751499176025, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6150278449058533, "reward_std": 0.15596122294664383, "rewards/accuracy_reward": 0.6150278151035309, "rewards/format_reward": 1.0, "step": 2125 }, { "completion_length": 203.4591827392578, "epoch": 0.213937106918239, "grad_norm": 5.2446699142456055, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.838192343711853, "reward_std": 0.12916803359985352, "rewards/accuracy_reward": 0.8381923735141754, "rewards/format_reward": 1.0, "step": 2126 }, { "completion_length": 232.92855834960938, "epoch": 0.2140377358490566, "grad_norm": 0.7949644923210144, "kl": 0.0640869140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7468345761299133, "reward_std": 0.22266142815351486, "rewards/accuracy_reward": 0.7468345165252686, "rewards/format_reward": 1.0, "step": 2127 }, { "completion_length": 191.7653045654297, "epoch": 0.21413836477987422, "grad_norm": 1.9557554721832275, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7221184968948364, "reward_std": 0.2520606443285942, "rewards/accuracy_reward": 0.7323226034641266, "rewards/format_reward": 0.9897959232330322, "step": 2128 }, { "completion_length": 160.54080963134766, "epoch": 0.21423899371069183, "grad_norm": 0.5462207794189453, "kl": 0.098388671875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.861451268196106, "reward_std": 0.05817686580121517, "rewards/accuracy_reward": 0.861451268196106, "rewards/format_reward": 1.0, "step": 2129 }, { "completion_length": 289.1428451538086, "epoch": 0.21433962264150944, "grad_norm": 0.6314989924430847, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5581632256507874, "reward_std": 0.16148275136947632, "rewards/accuracy_reward": 0.5683673471212387, "rewards/format_reward": 0.9897959232330322, "step": 2130 }, { "completion_length": 161.66326141357422, "epoch": 0.21444025157232705, "grad_norm": 0.9962368607521057, "kl": 0.09912109375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7177293300628662, "reward_std": 0.20094099640846252, "rewards/accuracy_reward": 0.7381374835968018, "rewards/format_reward": 0.9795918166637421, "step": 2131 }, { "completion_length": 267.27549743652344, "epoch": 0.21454088050314465, "grad_norm": 0.9190019965171814, "kl": 0.0469970703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6485146284103394, "reward_std": 0.21256192028522491, "rewards/accuracy_reward": 0.6587187349796295, "rewards/format_reward": 0.9897959232330322, "step": 2132 }, { "completion_length": 286.8571472167969, "epoch": 0.21464150943396226, "grad_norm": 0.736324667930603, "kl": 0.0555419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6983045935630798, "reward_std": 0.26990240067243576, "rewards/accuracy_reward": 0.7187126874923706, "rewards/format_reward": 0.9795918464660645, "step": 2133 }, { "completion_length": 261.10203552246094, "epoch": 0.21474213836477987, "grad_norm": 1.2569515705108643, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5473275184631348, "reward_std": 0.2300027832388878, "rewards/accuracy_reward": 0.5677356719970703, "rewards/format_reward": 0.9795918464660645, "step": 2134 }, { "completion_length": 289.87754821777344, "epoch": 0.21484276729559748, "grad_norm": 0.44533076882362366, "kl": 0.0714111328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6734886765480042, "reward_std": 0.12699046358466148, "rewards/accuracy_reward": 0.6836927235126495, "rewards/format_reward": 0.9897959232330322, "step": 2135 }, { "completion_length": 200.31632232666016, "epoch": 0.2149433962264151, "grad_norm": 0.8806126713752747, "kl": 0.057373046875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.678425669670105, "reward_std": 0.26298798620700836, "rewards/accuracy_reward": 0.6886297166347504, "rewards/format_reward": 0.9897959232330322, "step": 2136 }, { "completion_length": 241.86734008789062, "epoch": 0.2150440251572327, "grad_norm": 0.6004747748374939, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.836734652519226, "reward_std": 0.148374792188406, "rewards/accuracy_reward": 0.8469387590885162, "rewards/format_reward": 0.9897959232330322, "step": 2137 }, { "completion_length": 211.28571319580078, "epoch": 0.2151446540880503, "grad_norm": 1.3318920135498047, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.663581132888794, "reward_std": 0.2025488018989563, "rewards/accuracy_reward": 0.6737852692604065, "rewards/format_reward": 0.9897959232330322, "step": 2138 }, { "completion_length": 249.9693832397461, "epoch": 0.21524528301886792, "grad_norm": 1.0654314756393433, "kl": 0.05419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6339285373687744, "reward_std": 0.2951236963272095, "rewards/accuracy_reward": 0.6543367207050323, "rewards/format_reward": 0.9795918464660645, "step": 2139 }, { "completion_length": 235.6326446533203, "epoch": 0.21534591194968553, "grad_norm": 1.352937936782837, "kl": 0.1005859375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6732043623924255, "reward_std": 0.24258951842784882, "rewards/accuracy_reward": 0.7038165330886841, "rewards/format_reward": 0.9693877398967743, "step": 2140 }, { "completion_length": 279.24488830566406, "epoch": 0.21544654088050313, "grad_norm": 0.517087996006012, "kl": 0.0574951171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7275398969650269, "reward_std": 0.15451006591320038, "rewards/accuracy_reward": 0.7275399267673492, "rewards/format_reward": 1.0, "step": 2141 }, { "completion_length": 267.62244415283203, "epoch": 0.21554716981132074, "grad_norm": 0.9193854331970215, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.698906660079956, "reward_std": 0.15134931728243828, "rewards/accuracy_reward": 0.6989066898822784, "rewards/format_reward": 1.0, "step": 2142 }, { "completion_length": 229.33672332763672, "epoch": 0.21564779874213835, "grad_norm": 1.0974618196487427, "kl": 0.099853515625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7661564350128174, "reward_std": 0.24221639335155487, "rewards/accuracy_reward": 0.8069727718830109, "rewards/format_reward": 0.9591836631298065, "step": 2143 }, { "completion_length": 213.09183502197266, "epoch": 0.21574842767295596, "grad_norm": 1.9097785949707031, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7187536358833313, "reward_std": 0.3327390104532242, "rewards/accuracy_reward": 0.749365895986557, "rewards/format_reward": 0.9693877398967743, "step": 2144 }, { "completion_length": 241.4387664794922, "epoch": 0.2158490566037736, "grad_norm": 0.48642778396606445, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.5895928740501404, "reward_std": 0.11493733897805214, "rewards/accuracy_reward": 0.5997970104217529, "rewards/format_reward": 0.9897959232330322, "step": 2145 }, { "completion_length": 321.83673095703125, "epoch": 0.2159496855345912, "grad_norm": 0.7298604846000671, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6028019785881042, "reward_std": 0.19533353298902512, "rewards/accuracy_reward": 0.6130061447620392, "rewards/format_reward": 0.9897959232330322, "step": 2146 }, { "completion_length": 225.83673858642578, "epoch": 0.21605031446540882, "grad_norm": 0.7311446070671082, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7190476059913635, "reward_std": 0.14814196527004242, "rewards/accuracy_reward": 0.7394557595252991, "rewards/format_reward": 0.9795918464660645, "step": 2147 }, { "completion_length": 262.4081573486328, "epoch": 0.21615094339622642, "grad_norm": 1.2555851936340332, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.721900463104248, "reward_std": 0.09193644300103188, "rewards/accuracy_reward": 0.7219004333019257, "rewards/format_reward": 1.0, "step": 2148 }, { "completion_length": 273.7449035644531, "epoch": 0.21625157232704403, "grad_norm": 0.3866826593875885, "kl": 0.0546875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5591674447059631, "reward_std": 0.0514392489567399, "rewards/accuracy_reward": 0.5693715512752533, "rewards/format_reward": 0.9897959232330322, "step": 2149 }, { "completion_length": 231.06121826171875, "epoch": 0.21635220125786164, "grad_norm": 1.5774915218353271, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7347174286842346, "reward_std": 0.26742444187402725, "rewards/accuracy_reward": 0.7551256120204926, "rewards/format_reward": 0.9795918464660645, "step": 2150 }, { "completion_length": 281.39795684814453, "epoch": 0.21645283018867925, "grad_norm": 0.7648463845252991, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7719387412071228, "reward_std": 0.2553233653306961, "rewards/accuracy_reward": 0.8025510013103485, "rewards/format_reward": 0.9693877398967743, "step": 2151 }, { "completion_length": 251.99999237060547, "epoch": 0.21655345911949686, "grad_norm": 0.985178530216217, "kl": 0.0689697265625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.718517005443573, "reward_std": 0.2524239420890808, "rewards/accuracy_reward": 0.7185169458389282, "rewards/format_reward": 1.0, "step": 2152 }, { "completion_length": 272.4081573486328, "epoch": 0.21665408805031447, "grad_norm": 0.6628477573394775, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6290916800498962, "reward_std": 0.11702987551689148, "rewards/accuracy_reward": 0.6290916204452515, "rewards/format_reward": 1.0, "step": 2153 }, { "completion_length": 276.01019287109375, "epoch": 0.21675471698113208, "grad_norm": 0.5482441186904907, "kl": 0.0850830078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.580276906490326, "reward_std": 0.1934966892004013, "rewards/accuracy_reward": 0.6006850898265839, "rewards/format_reward": 0.9795918464660645, "step": 2154 }, { "completion_length": 208.77550506591797, "epoch": 0.2168553459119497, "grad_norm": 4.467949867248535, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7124391794204712, "reward_std": 0.23000027984380722, "rewards/accuracy_reward": 0.7226433157920837, "rewards/format_reward": 0.9897959232330322, "step": 2155 }, { "completion_length": 241.08163452148438, "epoch": 0.2169559748427673, "grad_norm": 0.735197126865387, "kl": 0.108154296875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7653311491012573, "reward_std": 0.152755219489336, "rewards/accuracy_reward": 0.7755351662635803, "rewards/format_reward": 0.9897959232330322, "step": 2156 }, { "completion_length": 369.3061218261719, "epoch": 0.2170566037735849, "grad_norm": 0.7334287762641907, "kl": 0.0662841796875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6454081535339355, "reward_std": 0.21889179944992065, "rewards/accuracy_reward": 0.6658163070678711, "rewards/format_reward": 0.9795918166637421, "step": 2157 }, { "completion_length": 231.79591369628906, "epoch": 0.21715723270440251, "grad_norm": 0.7745801210403442, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.811776578426361, "reward_std": 0.18071584403514862, "rewards/accuracy_reward": 0.8117766380310059, "rewards/format_reward": 1.0, "step": 2158 }, { "completion_length": 171.83673095703125, "epoch": 0.21725786163522012, "grad_norm": 0.8802618980407715, "kl": 0.0633544921875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8068026304244995, "reward_std": 0.1555267721414566, "rewards/accuracy_reward": 0.8170067667961121, "rewards/format_reward": 0.9897959232330322, "step": 2159 }, { "completion_length": 261.53060150146484, "epoch": 0.21735849056603773, "grad_norm": 0.7972529530525208, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6644657254219055, "reward_std": 0.3098858743906021, "rewards/accuracy_reward": 0.6644657552242279, "rewards/format_reward": 1.0, "step": 2160 }, { "completion_length": 184.77550506591797, "epoch": 0.21745911949685534, "grad_norm": 0.7954519391059875, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.9387754201889038, "reward_std": 0.10003120824694633, "rewards/accuracy_reward": 0.9387754797935486, "rewards/format_reward": 1.0, "step": 2161 }, { "completion_length": 269.9795913696289, "epoch": 0.21755974842767295, "grad_norm": 0.5454850196838379, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7431983351707458, "reward_std": 0.1716700829565525, "rewards/accuracy_reward": 0.743198424577713, "rewards/format_reward": 1.0, "step": 2162 }, { "completion_length": 317.52040100097656, "epoch": 0.21766037735849056, "grad_norm": 8.460241317749023, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6651700735092163, "reward_std": 0.2171832099556923, "rewards/accuracy_reward": 0.6855781972408295, "rewards/format_reward": 0.9795918166637421, "step": 2163 }, { "completion_length": 304.3571319580078, "epoch": 0.21776100628930817, "grad_norm": 0.6235284805297852, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8061224222183228, "reward_std": 0.2810548171401024, "rewards/accuracy_reward": 0.8265306055545807, "rewards/format_reward": 0.9795918166637421, "step": 2164 }, { "completion_length": 266.4897918701172, "epoch": 0.21786163522012578, "grad_norm": 0.8307931423187256, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6887754797935486, "reward_std": 0.21948976814746857, "rewards/accuracy_reward": 0.7091836631298065, "rewards/format_reward": 0.9795918464660645, "step": 2165 }, { "completion_length": 255.1224365234375, "epoch": 0.21796226415094339, "grad_norm": 1.0237135887145996, "kl": 0.0758056640625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6535398960113525, "reward_std": 0.1494942419230938, "rewards/accuracy_reward": 0.6637440025806427, "rewards/format_reward": 0.9897959232330322, "step": 2166 }, { "completion_length": 320.2550811767578, "epoch": 0.218062893081761, "grad_norm": 0.6701176762580872, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.4828017354011536, "reward_std": 0.14666436612606049, "rewards/accuracy_reward": 0.4828017055988312, "rewards/format_reward": 1.0, "step": 2167 }, { "completion_length": 240.63265228271484, "epoch": 0.2181635220125786, "grad_norm": 0.6380500793457031, "kl": 0.0821533203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7039071321487427, "reward_std": 0.17542962729930878, "rewards/accuracy_reward": 0.724315345287323, "rewards/format_reward": 0.9795918166637421, "step": 2168 }, { "completion_length": 185.59183502197266, "epoch": 0.21826415094339624, "grad_norm": 0.4434320032596588, "kl": 0.0615234375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7390176653862, "reward_std": 0.07601487636566162, "rewards/accuracy_reward": 0.7390176951885223, "rewards/format_reward": 1.0, "step": 2169 }, { "completion_length": 215.12244415283203, "epoch": 0.21836477987421385, "grad_norm": 1.0174349546432495, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5636540055274963, "reward_std": 0.24124057590961456, "rewards/accuracy_reward": 0.6044703423976898, "rewards/format_reward": 0.9591836631298065, "step": 2170 }, { "completion_length": 252.14285278320312, "epoch": 0.21846540880503146, "grad_norm": 0.6240662336349487, "kl": 0.0552978515625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6316369771957397, "reward_std": 0.21348270773887634, "rewards/accuracy_reward": 0.6520452499389648, "rewards/format_reward": 0.9795918464660645, "step": 2171 }, { "completion_length": 194.93877410888672, "epoch": 0.21856603773584907, "grad_norm": 0.6962969303131104, "kl": 0.0953369140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7522108554840088, "reward_std": 0.1676025092601776, "rewards/accuracy_reward": 0.7522108852863312, "rewards/format_reward": 1.0, "step": 2172 }, { "completion_length": 285.2550964355469, "epoch": 0.21866666666666668, "grad_norm": 0.5428879261016846, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6416640281677246, "reward_std": 0.09955765306949615, "rewards/accuracy_reward": 0.6416640728712082, "rewards/format_reward": 1.0, "step": 2173 }, { "completion_length": 221.57142639160156, "epoch": 0.21876729559748428, "grad_norm": 1.238301396369934, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6524052619934082, "reward_std": 0.22439494729042053, "rewards/accuracy_reward": 0.6524052321910858, "rewards/format_reward": 1.0, "step": 2174 }, { "completion_length": 166.0408172607422, "epoch": 0.2188679245283019, "grad_norm": 1.0489107370376587, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7899659872055054, "reward_std": 0.12962392531335354, "rewards/accuracy_reward": 0.7899659276008606, "rewards/format_reward": 1.0, "step": 2175 }, { "completion_length": 256.72449493408203, "epoch": 0.2189685534591195, "grad_norm": 0.5857729315757751, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6089624762535095, "reward_std": 0.14904502034187317, "rewards/accuracy_reward": 0.6089624613523483, "rewards/format_reward": 1.0, "step": 2176 }, { "completion_length": 267.06121826171875, "epoch": 0.2190691823899371, "grad_norm": 0.5166831612586975, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.813410997390747, "reward_std": 0.1108638159930706, "rewards/accuracy_reward": 0.8134110569953918, "rewards/format_reward": 1.0, "step": 2177 }, { "completion_length": 276.8877487182617, "epoch": 0.21916981132075472, "grad_norm": 0.5697761178016663, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.71683669090271, "reward_std": 0.19794508814811707, "rewards/accuracy_reward": 0.7270407974720001, "rewards/format_reward": 0.9897959232330322, "step": 2178 }, { "completion_length": 209.91836547851562, "epoch": 0.21927044025157233, "grad_norm": 0.6647440791130066, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7960566878318787, "reward_std": 0.08012100635096431, "rewards/accuracy_reward": 0.8062607645988464, "rewards/format_reward": 0.9897959232330322, "step": 2179 }, { "completion_length": 199.39795684814453, "epoch": 0.21937106918238994, "grad_norm": 1.0201555490493774, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6165608167648315, "reward_std": 0.0777960829436779, "rewards/accuracy_reward": 0.6267650127410889, "rewards/format_reward": 0.9897959232330322, "step": 2180 }, { "completion_length": 192.80612182617188, "epoch": 0.21947169811320755, "grad_norm": 0.8863093852996826, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.695918321609497, "reward_std": 0.13276729360222816, "rewards/accuracy_reward": 0.6959183216094971, "rewards/format_reward": 1.0, "step": 2181 }, { "completion_length": 195.55101776123047, "epoch": 0.21957232704402516, "grad_norm": 0.8892148733139038, "kl": 0.0833740234375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7751020789146423, "reward_std": 0.09004896506667137, "rewards/accuracy_reward": 0.7751019895076752, "rewards/format_reward": 1.0, "step": 2182 }, { "completion_length": 296.39794921875, "epoch": 0.21967295597484277, "grad_norm": 1.0405105352401733, "kl": 0.047119140625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6064139008522034, "reward_std": 0.2222912423312664, "rewards/accuracy_reward": 0.6166180372238159, "rewards/format_reward": 0.9897959232330322, "step": 2183 }, { "completion_length": 296.2142791748047, "epoch": 0.21977358490566037, "grad_norm": 0.6762236952781677, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6491957306861877, "reward_std": 0.22697406634688377, "rewards/accuracy_reward": 0.6593998074531555, "rewards/format_reward": 0.9897959232330322, "step": 2184 }, { "completion_length": 239.89795684814453, "epoch": 0.21987421383647798, "grad_norm": 0.7761473059654236, "kl": 0.0579833984375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5746443271636963, "reward_std": 0.1470312848687172, "rewards/accuracy_reward": 0.6052566468715668, "rewards/format_reward": 0.9693877398967743, "step": 2185 }, { "completion_length": 244.89795684814453, "epoch": 0.2199748427672956, "grad_norm": 0.6390475630760193, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7495456337928772, "reward_std": 0.18441136181354523, "rewards/accuracy_reward": 0.7699537873268127, "rewards/format_reward": 0.9795918166637421, "step": 2186 }, { "completion_length": 266.7346954345703, "epoch": 0.2200754716981132, "grad_norm": 0.8135467767715454, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6312252283096313, "reward_std": 0.17058308981359005, "rewards/accuracy_reward": 0.6618374586105347, "rewards/format_reward": 0.9693877398967743, "step": 2187 }, { "completion_length": 266.96937561035156, "epoch": 0.2201761006289308, "grad_norm": 0.7550514340400696, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.5459399819374084, "reward_std": 0.16800778359174728, "rewards/accuracy_reward": 0.5561440587043762, "rewards/format_reward": 0.9897959232330322, "step": 2188 }, { "completion_length": 236.48979949951172, "epoch": 0.22027672955974842, "grad_norm": 0.9047356843948364, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7477917671203613, "reward_std": 0.2143690139055252, "rewards/accuracy_reward": 0.7579957842826843, "rewards/format_reward": 0.9897959232330322, "step": 2189 }, { "completion_length": 232.9387664794922, "epoch": 0.22037735849056603, "grad_norm": 0.5411642789840698, "kl": 0.0595703125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.711734652519226, "reward_std": 0.1961982101202011, "rewards/accuracy_reward": 0.7321428656578064, "rewards/format_reward": 0.9795918464660645, "step": 2190 }, { "completion_length": 220.0816192626953, "epoch": 0.22047798742138364, "grad_norm": 1.1129688024520874, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6638484001159668, "reward_std": 0.23378697037696838, "rewards/accuracy_reward": 0.6638484001159668, "rewards/format_reward": 1.0, "step": 2191 }, { "completion_length": 210.0408172607422, "epoch": 0.22057861635220125, "grad_norm": 0.48659732937812805, "kl": 0.0584716796875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8481053113937378, "reward_std": 0.09363136440515518, "rewards/accuracy_reward": 0.8481052219867706, "rewards/format_reward": 1.0, "step": 2192 }, { "completion_length": 204.12244415283203, "epoch": 0.22067924528301885, "grad_norm": 2.8428337574005127, "kl": 0.166259765625, "learning_rate": 1e-06, "loss": 0.0066, "reward": 1.8359248042106628, "reward_std": 0.1521681621670723, "rewards/accuracy_reward": 0.8563329577445984, "rewards/format_reward": 0.9795918166637421, "step": 2193 }, { "completion_length": 168.9591827392578, "epoch": 0.2207798742138365, "grad_norm": 0.9083370566368103, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.871985137462616, "reward_std": 0.12328040972352028, "rewards/accuracy_reward": 0.8821892142295837, "rewards/format_reward": 0.9897959232330322, "step": 2194 }, { "completion_length": 259.1836624145508, "epoch": 0.2208805031446541, "grad_norm": 0.7000588178634644, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6632652878761292, "reward_std": 0.11517903953790665, "rewards/accuracy_reward": 0.6734693646430969, "rewards/format_reward": 0.9897959232330322, "step": 2195 }, { "completion_length": 295.7857131958008, "epoch": 0.2209811320754717, "grad_norm": 1.726554036140442, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7938775420188904, "reward_std": 0.19209562987089157, "rewards/accuracy_reward": 0.8142857253551483, "rewards/format_reward": 0.9795918464660645, "step": 2196 }, { "completion_length": 296.51019287109375, "epoch": 0.22108176100628932, "grad_norm": 0.9639984369277954, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7961961030960083, "reward_std": 0.1732400618493557, "rewards/accuracy_reward": 0.8064002096652985, "rewards/format_reward": 0.9897959232330322, "step": 2197 }, { "completion_length": 205.11223602294922, "epoch": 0.22118238993710693, "grad_norm": 0.7788791060447693, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7291989922523499, "reward_std": 0.1887742318212986, "rewards/accuracy_reward": 0.749607115983963, "rewards/format_reward": 0.9795918166637421, "step": 2198 }, { "completion_length": 235.07141876220703, "epoch": 0.22128301886792454, "grad_norm": 0.46261146664619446, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6525753140449524, "reward_std": 0.09870181232690811, "rewards/accuracy_reward": 0.6525753140449524, "rewards/format_reward": 1.0, "step": 2199 }, { "completion_length": 265.27549743652344, "epoch": 0.22138364779874214, "grad_norm": 1.0041108131408691, "kl": 0.0484619140625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6155400276184082, "reward_std": 0.3254287540912628, "rewards/accuracy_reward": 0.6461523175239563, "rewards/format_reward": 0.9693877398967743, "step": 2200 }, { "completion_length": 202.07142639160156, "epoch": 0.22148427672955975, "grad_norm": 1.0845304727554321, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7602128982543945, "reward_std": 0.19599568098783493, "rewards/accuracy_reward": 0.7704169452190399, "rewards/format_reward": 0.9897959232330322, "step": 2201 }, { "completion_length": 195.40816497802734, "epoch": 0.22158490566037736, "grad_norm": 0.7013297080993652, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7857142090797424, "reward_std": 0.10788732394576073, "rewards/accuracy_reward": 0.7857142686843872, "rewards/format_reward": 1.0, "step": 2202 }, { "completion_length": 211.54080963134766, "epoch": 0.22168553459119497, "grad_norm": 0.9373869895935059, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6122448444366455, "reward_std": 0.14721053838729858, "rewards/accuracy_reward": 0.6326530277729034, "rewards/format_reward": 0.9795918166637421, "step": 2203 }, { "completion_length": 201.2040786743164, "epoch": 0.22178616352201258, "grad_norm": 0.9679313898086548, "kl": 0.100341796875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7682216167449951, "reward_std": 0.17656289786100388, "rewards/accuracy_reward": 0.7886297106742859, "rewards/format_reward": 0.9795918464660645, "step": 2204 }, { "completion_length": 230.56122589111328, "epoch": 0.2218867924528302, "grad_norm": 1.1267973184585571, "kl": 0.0662841796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7348130941390991, "reward_std": 0.24765567481517792, "rewards/accuracy_reward": 0.775629460811615, "rewards/format_reward": 0.9591836631298065, "step": 2205 }, { "completion_length": 203.92857360839844, "epoch": 0.2219874213836478, "grad_norm": 1.1928696632385254, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6768706440925598, "reward_std": 0.1791251376271248, "rewards/accuracy_reward": 0.6870747804641724, "rewards/format_reward": 0.9897959232330322, "step": 2206 }, { "completion_length": 221.7040786743164, "epoch": 0.2220880503144654, "grad_norm": 0.9900704622268677, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7018263339996338, "reward_std": 0.18510745465755463, "rewards/accuracy_reward": 0.712030440568924, "rewards/format_reward": 0.9897959232330322, "step": 2207 }, { "completion_length": 229.32652282714844, "epoch": 0.22218867924528302, "grad_norm": 0.6655951142311096, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6705134510993958, "reward_std": 0.12124881893396378, "rewards/accuracy_reward": 0.6705134212970734, "rewards/format_reward": 1.0, "step": 2208 }, { "completion_length": 165.28571319580078, "epoch": 0.22228930817610063, "grad_norm": 1.4134836196899414, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5969387292861938, "reward_std": 0.10088848695158958, "rewards/accuracy_reward": 0.5969387590885162, "rewards/format_reward": 1.0, "step": 2209 }, { "completion_length": 199.55101013183594, "epoch": 0.22238993710691823, "grad_norm": 0.9849177598953247, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6170759797096252, "reward_std": 0.20773260295391083, "rewards/accuracy_reward": 0.6170759797096252, "rewards/format_reward": 1.0, "step": 2210 }, { "completion_length": 211.59182739257812, "epoch": 0.22249056603773584, "grad_norm": 0.6434733867645264, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6662130951881409, "reward_std": 0.0919932872056961, "rewards/accuracy_reward": 0.676417201757431, "rewards/format_reward": 0.9897959232330322, "step": 2211 }, { "completion_length": 205.4897918701172, "epoch": 0.22259119496855345, "grad_norm": 0.8885044455528259, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6353417038917542, "reward_std": 0.18566761910915375, "rewards/accuracy_reward": 0.6455458402633667, "rewards/format_reward": 0.9897959232330322, "step": 2212 }, { "completion_length": 245.08162689208984, "epoch": 0.22269182389937106, "grad_norm": 2.571061849594116, "kl": 0.0589599609375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6582295298576355, "reward_std": 0.08152149245142937, "rewards/accuracy_reward": 0.6582295000553131, "rewards/format_reward": 1.0, "step": 2213 }, { "completion_length": 255.8163299560547, "epoch": 0.22279245283018867, "grad_norm": 1.123381495475769, "kl": 0.071533203125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.5682458281517029, "reward_std": 0.12783636432141066, "rewards/accuracy_reward": 0.578449934720993, "rewards/format_reward": 0.9897959232330322, "step": 2214 }, { "completion_length": 240.41836547851562, "epoch": 0.22289308176100628, "grad_norm": 3.9551520347595215, "kl": 0.10546875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.666517734527588, "reward_std": 0.24062101542949677, "rewards/accuracy_reward": 0.6665177792310715, "rewards/format_reward": 1.0, "step": 2215 }, { "completion_length": 212.06121826171875, "epoch": 0.2229937106918239, "grad_norm": 5.0462446212768555, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6775957942008972, "reward_std": 0.24217405915260315, "rewards/accuracy_reward": 0.6980039179325104, "rewards/format_reward": 0.9795918464660645, "step": 2216 }, { "completion_length": 196.73468780517578, "epoch": 0.2230943396226415, "grad_norm": 0.31547242403030396, "kl": 0.0574951171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8571428060531616, "reward_std": 0.07636035233736038, "rewards/accuracy_reward": 0.857142835855484, "rewards/format_reward": 1.0, "step": 2217 }, { "completion_length": 261.8367233276367, "epoch": 0.22319496855345913, "grad_norm": 0.7386794686317444, "kl": 0.0625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5629858374595642, "reward_std": 0.17077262699604034, "rewards/accuracy_reward": 0.5833940505981445, "rewards/format_reward": 0.9795918166637421, "step": 2218 }, { "completion_length": 183.2142791748047, "epoch": 0.22329559748427674, "grad_norm": 0.9067921042442322, "kl": 0.089111328125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.501695990562439, "reward_std": 0.08306868933141232, "rewards/accuracy_reward": 0.5016960054636002, "rewards/format_reward": 1.0, "step": 2219 }, { "completion_length": 266.9795913696289, "epoch": 0.22339622641509435, "grad_norm": 0.9560195803642273, "kl": 0.046142578125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.5933518409729004, "reward_std": 0.25173769146203995, "rewards/accuracy_reward": 0.5933518707752228, "rewards/format_reward": 1.0, "step": 2220 }, { "completion_length": 120.6224479675293, "epoch": 0.22349685534591196, "grad_norm": 0.9449621438980103, "kl": 0.124755859375, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.8265305757522583, "reward_std": 0.09217509999871254, "rewards/accuracy_reward": 0.8265305757522583, "rewards/format_reward": 1.0, "step": 2221 }, { "completion_length": 204.22447967529297, "epoch": 0.22359748427672957, "grad_norm": 0.8370823860168457, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.689679741859436, "reward_std": 0.21129482239484787, "rewards/accuracy_reward": 0.6896798610687256, "rewards/format_reward": 1.0, "step": 2222 }, { "completion_length": 240.85713958740234, "epoch": 0.22369811320754718, "grad_norm": 0.9627939462661743, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.62351793050766, "reward_std": 0.26738137751817703, "rewards/accuracy_reward": 0.6541301906108856, "rewards/format_reward": 0.9693877398967743, "step": 2223 }, { "completion_length": 225.7244873046875, "epoch": 0.2237987421383648, "grad_norm": 1.0585522651672363, "kl": 0.062255859375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7369614839553833, "reward_std": 0.19923672825098038, "rewards/accuracy_reward": 0.7369614243507385, "rewards/format_reward": 1.0, "step": 2224 }, { "completion_length": 248.2653045654297, "epoch": 0.2238993710691824, "grad_norm": 0.8851826786994934, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6817785501480103, "reward_std": 0.14303772896528244, "rewards/accuracy_reward": 0.6919826567173004, "rewards/format_reward": 0.9897959232330322, "step": 2225 }, { "completion_length": 210.78570556640625, "epoch": 0.224, "grad_norm": 0.8671273589134216, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.9144165515899658, "reward_std": 0.11670610308647156, "rewards/accuracy_reward": 0.934824675321579, "rewards/format_reward": 0.9795918464660645, "step": 2226 }, { "completion_length": 197.4693832397461, "epoch": 0.2241006289308176, "grad_norm": 1.1340569257736206, "kl": 0.09521484375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.750437319278717, "reward_std": 0.15491390414536, "rewards/accuracy_reward": 0.7606413960456848, "rewards/format_reward": 0.9897959232330322, "step": 2227 }, { "completion_length": 291.72447967529297, "epoch": 0.22420125786163522, "grad_norm": 0.5567606687545776, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6300778985023499, "reward_std": 0.12157323956489563, "rewards/accuracy_reward": 0.6300778985023499, "rewards/format_reward": 1.0, "step": 2228 }, { "completion_length": 222.1836700439453, "epoch": 0.22430188679245283, "grad_norm": 1.0814141035079956, "kl": 0.0718994140625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7558308243751526, "reward_std": 0.17909520864486694, "rewards/accuracy_reward": 0.7558308839797974, "rewards/format_reward": 1.0, "step": 2229 }, { "completion_length": 260.5918273925781, "epoch": 0.22440251572327044, "grad_norm": 0.8666971325874329, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6670311093330383, "reward_std": 0.11336430534720421, "rewards/accuracy_reward": 0.6772351264953613, "rewards/format_reward": 0.9897959232330322, "step": 2230 }, { "completion_length": 270.3877410888672, "epoch": 0.22450314465408805, "grad_norm": 0.7219316363334656, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.9253320097923279, "reward_std": 0.15399916842579842, "rewards/accuracy_reward": 0.9457401633262634, "rewards/format_reward": 0.9795918464660645, "step": 2231 }, { "completion_length": 258.79591369628906, "epoch": 0.22460377358490566, "grad_norm": 0.8410083055496216, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5110316276550293, "reward_std": 0.18425299227237701, "rewards/accuracy_reward": 0.5212356746196747, "rewards/format_reward": 0.9897959232330322, "step": 2232 }, { "completion_length": 149.16326522827148, "epoch": 0.22470440251572327, "grad_norm": 1.105323314666748, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8352503776550293, "reward_std": 0.1563473455607891, "rewards/accuracy_reward": 0.8454545140266418, "rewards/format_reward": 0.9897959232330322, "step": 2233 }, { "completion_length": 234.9897918701172, "epoch": 0.22480503144654088, "grad_norm": 1.2925105094909668, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.5510203838348389, "reward_std": 0.272076852619648, "rewards/accuracy_reward": 0.5714285671710968, "rewards/format_reward": 0.9795918464660645, "step": 2234 }, { "completion_length": 273.65306091308594, "epoch": 0.22490566037735849, "grad_norm": 0.7515420317649841, "kl": 0.0718994140625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7870423793792725, "reward_std": 0.2108517587184906, "rewards/accuracy_reward": 0.8074505925178528, "rewards/format_reward": 0.9795918166637421, "step": 2235 }, { "completion_length": 182.41836547851562, "epoch": 0.2250062893081761, "grad_norm": 0.9483771920204163, "kl": 0.10498046875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7283858060836792, "reward_std": 0.11781171336770058, "rewards/accuracy_reward": 0.728385865688324, "rewards/format_reward": 1.0, "step": 2236 }, { "completion_length": 223.2551040649414, "epoch": 0.2251069182389937, "grad_norm": 0.8289863467216492, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6465824246406555, "reward_std": 0.18473180383443832, "rewards/accuracy_reward": 0.6567865312099457, "rewards/format_reward": 0.9897959232330322, "step": 2237 }, { "completion_length": 195.13265228271484, "epoch": 0.2252075471698113, "grad_norm": 0.7818146347999573, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7837706208229065, "reward_std": 0.186021126806736, "rewards/accuracy_reward": 0.7939747273921967, "rewards/format_reward": 0.9897959232330322, "step": 2238 }, { "completion_length": 247.23468780517578, "epoch": 0.22530817610062892, "grad_norm": 1.6632956266403198, "kl": 0.0614013671875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.639455795288086, "reward_std": 0.14504818618297577, "rewards/accuracy_reward": 0.6598638892173767, "rewards/format_reward": 0.9795918166637421, "step": 2239 }, { "completion_length": 263.39794921875, "epoch": 0.22540880503144653, "grad_norm": 4.117835521697998, "kl": 0.0733642578125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6244661808013916, "reward_std": 0.20831774175167084, "rewards/accuracy_reward": 0.6346702873706818, "rewards/format_reward": 0.9897959232330322, "step": 2240 }, { "completion_length": 188.1326446533203, "epoch": 0.22550943396226414, "grad_norm": 1.6012507677078247, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.836734652519226, "reward_std": 0.19220630824565887, "rewards/accuracy_reward": 0.8367346823215485, "rewards/format_reward": 1.0, "step": 2241 }, { "completion_length": 183.23468780517578, "epoch": 0.22561006289308175, "grad_norm": 3.2154245376586914, "kl": 0.0743408203125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8095238208770752, "reward_std": 0.1652299165725708, "rewards/accuracy_reward": 0.819727897644043, "rewards/format_reward": 0.9897959232330322, "step": 2242 }, { "completion_length": 234.84693908691406, "epoch": 0.22571069182389938, "grad_norm": 0.6303834319114685, "kl": 0.054443359375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.727065086364746, "reward_std": 0.19431953877210617, "rewards/accuracy_reward": 0.7270651161670685, "rewards/format_reward": 1.0, "step": 2243 }, { "completion_length": 276.0306091308594, "epoch": 0.225811320754717, "grad_norm": 1.4846177101135254, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6029923558235168, "reward_std": 0.2530987560749054, "rewards/accuracy_reward": 0.6234005391597748, "rewards/format_reward": 0.9795918166637421, "step": 2244 }, { "completion_length": 263.46937561035156, "epoch": 0.2259119496855346, "grad_norm": 0.5529593825340271, "kl": 0.0615234375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6416908502578735, "reward_std": 0.15868261456489563, "rewards/accuracy_reward": 0.6416909396648407, "rewards/format_reward": 1.0, "step": 2245 }, { "completion_length": 324.1632537841797, "epoch": 0.2260125786163522, "grad_norm": 1.1246259212493896, "kl": 0.056396484375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6597922444343567, "reward_std": 0.2690821960568428, "rewards/accuracy_reward": 0.6597923040390015, "rewards/format_reward": 1.0, "step": 2246 }, { "completion_length": 228.43877410888672, "epoch": 0.22611320754716982, "grad_norm": 1.1040239334106445, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8098064064979553, "reward_std": 0.08917771279811859, "rewards/accuracy_reward": 0.8098063468933105, "rewards/format_reward": 1.0, "step": 2247 }, { "completion_length": 196.02040100097656, "epoch": 0.22621383647798743, "grad_norm": 0.9495697021484375, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7653060555458069, "reward_std": 0.1652088537812233, "rewards/accuracy_reward": 0.7755101919174194, "rewards/format_reward": 0.9897959232330322, "step": 2248 }, { "completion_length": 177.88774871826172, "epoch": 0.22631446540880504, "grad_norm": 0.6788946986198425, "kl": 0.0755615234375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.749729335308075, "reward_std": 0.12531941384077072, "rewards/accuracy_reward": 0.7701375782489777, "rewards/format_reward": 0.9795918166637421, "step": 2249 }, { "completion_length": 231.21428680419922, "epoch": 0.22641509433962265, "grad_norm": 0.3343229591846466, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6948360800743103, "reward_std": 0.14436165243387222, "rewards/accuracy_reward": 0.7050401866436005, "rewards/format_reward": 0.9897959232330322, "step": 2250 }, { "completion_length": 293.82652282714844, "epoch": 0.22651572327044026, "grad_norm": 2.2709035873413086, "kl": 0.0472412109375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7244897484779358, "reward_std": 0.24740415066480637, "rewards/accuracy_reward": 0.7448979318141937, "rewards/format_reward": 0.9795918464660645, "step": 2251 }, { "completion_length": 243.2142791748047, "epoch": 0.22661635220125786, "grad_norm": 1.5668377876281738, "kl": 0.09423828125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6592970490455627, "reward_std": 0.25949639081954956, "rewards/accuracy_reward": 0.6592970490455627, "rewards/format_reward": 1.0, "step": 2252 }, { "completion_length": 253.33673858642578, "epoch": 0.22671698113207547, "grad_norm": 0.5336302518844604, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8594912886619568, "reward_std": 0.1512260138988495, "rewards/accuracy_reward": 0.8594914078712463, "rewards/format_reward": 1.0, "step": 2253 }, { "completion_length": 278.2244873046875, "epoch": 0.22681761006289308, "grad_norm": 1.4151555299758911, "kl": 0.0699462890625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.735759198665619, "reward_std": 0.20855076611042023, "rewards/accuracy_reward": 0.7459632754325867, "rewards/format_reward": 0.9897959232330322, "step": 2254 }, { "completion_length": 329.4387664794922, "epoch": 0.2269182389937107, "grad_norm": 2.836303472518921, "kl": 0.0489501953125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.743027150630951, "reward_std": 0.21239224821329117, "rewards/accuracy_reward": 0.7532312572002411, "rewards/format_reward": 0.9897959232330322, "step": 2255 }, { "completion_length": 299.82652282714844, "epoch": 0.2270188679245283, "grad_norm": 0.49806615710258484, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.711972713470459, "reward_std": 0.13354108110070229, "rewards/accuracy_reward": 0.7323808968067169, "rewards/format_reward": 0.9795918464660645, "step": 2256 }, { "completion_length": 287.02040100097656, "epoch": 0.2271194968553459, "grad_norm": 1.0664737224578857, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7102278470993042, "reward_std": 0.2646605297923088, "rewards/accuracy_reward": 0.7204319536685944, "rewards/format_reward": 0.9897959232330322, "step": 2257 }, { "completion_length": 211.7346954345703, "epoch": 0.22722012578616352, "grad_norm": 1.0079957246780396, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.862973690032959, "reward_std": 0.19685640186071396, "rewards/accuracy_reward": 0.8629737198352814, "rewards/format_reward": 1.0, "step": 2258 }, { "completion_length": 234.26529693603516, "epoch": 0.22732075471698113, "grad_norm": 6.617306709289551, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.738404393196106, "reward_std": 0.19837739691138268, "rewards/accuracy_reward": 0.7588126063346863, "rewards/format_reward": 0.9795918166637421, "step": 2259 }, { "completion_length": 240.27550506591797, "epoch": 0.22742138364779874, "grad_norm": 0.3664441704750061, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8024118542671204, "reward_std": 0.07146486267447472, "rewards/accuracy_reward": 0.8024118840694427, "rewards/format_reward": 1.0, "step": 2260 }, { "completion_length": 281.2755126953125, "epoch": 0.22752201257861634, "grad_norm": 0.7259037494659424, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.646267056465149, "reward_std": 0.30209487676620483, "rewards/accuracy_reward": 0.68708336353302, "rewards/format_reward": 0.9591836631298065, "step": 2261 }, { "completion_length": 227.7448959350586, "epoch": 0.22762264150943395, "grad_norm": 1.1191822290420532, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7043907642364502, "reward_std": 0.21925894170999527, "rewards/accuracy_reward": 0.704390823841095, "rewards/format_reward": 1.0, "step": 2262 }, { "completion_length": 273.10203552246094, "epoch": 0.22772327044025156, "grad_norm": 1.0746150016784668, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.723831057548523, "reward_std": 0.20137657225131989, "rewards/accuracy_reward": 0.7238309681415558, "rewards/format_reward": 1.0, "step": 2263 }, { "completion_length": 272.32653045654297, "epoch": 0.22782389937106917, "grad_norm": 0.9649458527565002, "kl": 0.094970703125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.4159234762191772, "reward_std": 0.24313632398843765, "rewards/accuracy_reward": 0.4363315999507904, "rewards/format_reward": 0.9795918464660645, "step": 2264 }, { "completion_length": 320.5408020019531, "epoch": 0.22792452830188678, "grad_norm": 0.5734569430351257, "kl": 0.0555419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.601328194141388, "reward_std": 0.1552184522151947, "rewards/accuracy_reward": 0.6013281047344208, "rewards/format_reward": 1.0, "step": 2265 }, { "completion_length": 217.6530532836914, "epoch": 0.2280251572327044, "grad_norm": 0.9077173471450806, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6614512205123901, "reward_std": 0.18470449000597, "rewards/accuracy_reward": 0.6614512503147125, "rewards/format_reward": 1.0, "step": 2266 }, { "completion_length": 194.9081573486328, "epoch": 0.22812578616352203, "grad_norm": 4.429165840148926, "kl": 0.10888671875, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.6173469424247742, "reward_std": 0.21501988545060158, "rewards/accuracy_reward": 0.6173469424247742, "rewards/format_reward": 1.0, "step": 2267 }, { "completion_length": 236.16326141357422, "epoch": 0.22822641509433964, "grad_norm": 2.2618026733398438, "kl": 0.1573486328125, "learning_rate": 1e-06, "loss": 0.0063, "reward": 1.7158058881759644, "reward_std": 0.24186152964830399, "rewards/accuracy_reward": 0.7260099351406097, "rewards/format_reward": 0.9897959232330322, "step": 2268 }, { "completion_length": 244.6734619140625, "epoch": 0.22832704402515724, "grad_norm": 1.0191185474395752, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5693877339363098, "reward_std": 0.20233186334371567, "rewards/accuracy_reward": 0.5693877637386322, "rewards/format_reward": 1.0, "step": 2269 }, { "completion_length": 239.16326141357422, "epoch": 0.22842767295597485, "grad_norm": 0.8310889005661011, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7139352560043335, "reward_std": 0.14356299489736557, "rewards/accuracy_reward": 0.7139352560043335, "rewards/format_reward": 1.0, "step": 2270 }, { "completion_length": 318.4897918701172, "epoch": 0.22852830188679246, "grad_norm": 0.5303008556365967, "kl": 0.0555419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5620715022087097, "reward_std": 0.1870739459991455, "rewards/accuracy_reward": 0.5824796259403229, "rewards/format_reward": 0.9795918166637421, "step": 2271 }, { "completion_length": 308.5816192626953, "epoch": 0.22862893081761007, "grad_norm": 2.424631357192993, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.5369614362716675, "reward_std": 0.2611277922987938, "rewards/accuracy_reward": 0.5471655279397964, "rewards/format_reward": 0.9897959232330322, "step": 2272 }, { "completion_length": 328.0305938720703, "epoch": 0.22872955974842768, "grad_norm": 0.6518153548240662, "kl": 0.0478515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7585034370422363, "reward_std": 0.22791855037212372, "rewards/accuracy_reward": 0.7789115905761719, "rewards/format_reward": 0.9795918464660645, "step": 2273 }, { "completion_length": 274.7142791748047, "epoch": 0.2288301886792453, "grad_norm": 0.6925265789031982, "kl": 0.0670166015625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8096694946289062, "reward_std": 0.16716091707348824, "rewards/accuracy_reward": 0.809669554233551, "rewards/format_reward": 1.0, "step": 2274 }, { "completion_length": 206.37754821777344, "epoch": 0.2289308176100629, "grad_norm": 0.6355696320533752, "kl": 0.10400390625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.6153624653816223, "reward_std": 0.16078584641218185, "rewards/accuracy_reward": 0.6153625249862671, "rewards/format_reward": 1.0, "step": 2275 }, { "completion_length": 248.22447967529297, "epoch": 0.2290314465408805, "grad_norm": 0.45068591833114624, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8187540173530579, "reward_std": 0.1844261921942234, "rewards/accuracy_reward": 0.8391622006893158, "rewards/format_reward": 0.9795918166637421, "step": 2276 }, { "completion_length": 278.2244873046875, "epoch": 0.22913207547169812, "grad_norm": 1.4829522371292114, "kl": 0.0716552734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.592515230178833, "reward_std": 0.18681024014949799, "rewards/accuracy_reward": 0.592515230178833, "rewards/format_reward": 1.0, "step": 2277 }, { "completion_length": 306.15306091308594, "epoch": 0.22923270440251572, "grad_norm": 1.5866857767105103, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5886621475219727, "reward_std": 0.22535506635904312, "rewards/accuracy_reward": 0.598866194486618, "rewards/format_reward": 0.9897959232330322, "step": 2278 }, { "completion_length": 313.83673095703125, "epoch": 0.22933333333333333, "grad_norm": 0.9415793418884277, "kl": 0.046630859375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5533190965652466, "reward_std": 0.2422148436307907, "rewards/accuracy_reward": 0.5737272948026657, "rewards/format_reward": 0.9795918166637421, "step": 2279 }, { "completion_length": 315.56121826171875, "epoch": 0.22943396226415094, "grad_norm": 1.228273630142212, "kl": 0.0589599609375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.562171995639801, "reward_std": 0.2970357611775398, "rewards/accuracy_reward": 0.5927842259407043, "rewards/format_reward": 0.9693877398967743, "step": 2280 }, { "completion_length": 275.7346954345703, "epoch": 0.22953459119496855, "grad_norm": 0.7697174549102783, "kl": 0.0670166015625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7180110216140747, "reward_std": 0.24914813041687012, "rewards/accuracy_reward": 0.7180109918117523, "rewards/format_reward": 1.0, "step": 2281 }, { "completion_length": 209.448974609375, "epoch": 0.22963522012578616, "grad_norm": 0.5433375239372253, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7551020383834839, "reward_std": 0.14153798669576645, "rewards/accuracy_reward": 0.7551020383834839, "rewards/format_reward": 1.0, "step": 2282 }, { "completion_length": 236.77550506591797, "epoch": 0.22973584905660377, "grad_norm": 0.649813711643219, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7013605237007141, "reward_std": 0.125474501401186, "rewards/accuracy_reward": 0.7115646153688431, "rewards/format_reward": 0.9897959232330322, "step": 2283 }, { "completion_length": 212.54080963134766, "epoch": 0.22983647798742138, "grad_norm": 0.8778161406517029, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8223446011543274, "reward_std": 0.1775718256831169, "rewards/accuracy_reward": 0.8427528440952301, "rewards/format_reward": 0.9795918464660645, "step": 2284 }, { "completion_length": 277.8163146972656, "epoch": 0.229937106918239, "grad_norm": 0.6858577728271484, "kl": 0.0753173828125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.792325735092163, "reward_std": 0.18535996973514557, "rewards/accuracy_reward": 0.812733918428421, "rewards/format_reward": 0.9795918464660645, "step": 2285 }, { "completion_length": 253.0408172607422, "epoch": 0.2300377358490566, "grad_norm": 0.9556591510772705, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6450666785240173, "reward_std": 0.23202566802501678, "rewards/accuracy_reward": 0.6450667977333069, "rewards/format_reward": 1.0, "step": 2286 }, { "completion_length": 287.34693908691406, "epoch": 0.2301383647798742, "grad_norm": 0.7817034125328064, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7062894701957703, "reward_std": 0.22168191522359848, "rewards/accuracy_reward": 0.7164935767650604, "rewards/format_reward": 0.9897959232330322, "step": 2287 }, { "completion_length": 322.346923828125, "epoch": 0.2302389937106918, "grad_norm": 0.7592135071754456, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6638615727424622, "reward_std": 0.23073800653219223, "rewards/accuracy_reward": 0.6740657091140747, "rewards/format_reward": 0.9897959232330322, "step": 2288 }, { "completion_length": 219.9387664794922, "epoch": 0.23033962264150942, "grad_norm": 0.5218398571014404, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8040816187858582, "reward_std": 0.12910156324505806, "rewards/accuracy_reward": 0.8142856955528259, "rewards/format_reward": 0.9897959232330322, "step": 2289 }, { "completion_length": 205.03060913085938, "epoch": 0.23044025157232703, "grad_norm": 0.7041828036308289, "kl": 0.08740234375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7533060908317566, "reward_std": 0.11771904304623604, "rewards/accuracy_reward": 0.7635101079940796, "rewards/format_reward": 0.9897959232330322, "step": 2290 }, { "completion_length": 268.46937561035156, "epoch": 0.23054088050314464, "grad_norm": 24.78972053527832, "kl": 0.57763671875, "learning_rate": 1e-06, "loss": 0.0232, "reward": 1.661519706249237, "reward_std": 0.29338911175727844, "rewards/accuracy_reward": 0.6921319663524628, "rewards/format_reward": 0.9693877398967743, "step": 2291 }, { "completion_length": 275.5408172607422, "epoch": 0.23064150943396228, "grad_norm": 0.4766346216201782, "kl": 0.0595703125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6742630004882812, "reward_std": 0.162953183054924, "rewards/accuracy_reward": 0.6946711540222168, "rewards/format_reward": 0.9795918464660645, "step": 2292 }, { "completion_length": 304.4183578491211, "epoch": 0.2307421383647799, "grad_norm": 2.4673006534576416, "kl": 0.0595703125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.520602524280548, "reward_std": 0.2748645842075348, "rewards/accuracy_reward": 0.5308066010475159, "rewards/format_reward": 0.9897959232330322, "step": 2293 }, { "completion_length": 284.62245178222656, "epoch": 0.2308427672955975, "grad_norm": 0.8443219661712646, "kl": 0.050537109375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.657476544380188, "reward_std": 0.17160499468445778, "rewards/accuracy_reward": 0.6778847277164459, "rewards/format_reward": 0.9795918166637421, "step": 2294 }, { "completion_length": 367.948974609375, "epoch": 0.2309433962264151, "grad_norm": 0.9124691486358643, "kl": 0.052001953125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6347529888153076, "reward_std": 0.33251629769802094, "rewards/accuracy_reward": 0.6857734620571136, "rewards/format_reward": 0.9489795863628387, "step": 2295 }, { "completion_length": 279.94898223876953, "epoch": 0.2310440251572327, "grad_norm": 0.7387452721595764, "kl": 0.093017578125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6114068627357483, "reward_std": 0.1921621784567833, "rewards/accuracy_reward": 0.6114068627357483, "rewards/format_reward": 1.0, "step": 2296 }, { "completion_length": 255.29590606689453, "epoch": 0.23114465408805032, "grad_norm": 0.9479809403419495, "kl": 0.056640625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6272108554840088, "reward_std": 0.22487632930278778, "rewards/accuracy_reward": 0.6476190388202667, "rewards/format_reward": 0.9795918166637421, "step": 2297 }, { "completion_length": 260.6428451538086, "epoch": 0.23124528301886793, "grad_norm": 0.6132417321205139, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6142572164535522, "reward_std": 0.177944615483284, "rewards/accuracy_reward": 0.62446129322052, "rewards/format_reward": 0.9897959232330322, "step": 2298 }, { "completion_length": 295.34693908691406, "epoch": 0.23134591194968554, "grad_norm": 0.5631912350654602, "kl": 0.0704345703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7417462468147278, "reward_std": 0.1890873908996582, "rewards/accuracy_reward": 0.7519503831863403, "rewards/format_reward": 0.9897959232330322, "step": 2299 }, { "completion_length": 207.90816497802734, "epoch": 0.23144654088050315, "grad_norm": 1.3197004795074463, "kl": 0.0701904296875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7287009954452515, "reward_std": 0.26894713938236237, "rewards/accuracy_reward": 0.7287010252475739, "rewards/format_reward": 1.0, "step": 2300 }, { "completion_length": 258.27550506591797, "epoch": 0.23154716981132076, "grad_norm": 0.8768389821052551, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7097798585891724, "reward_std": 0.21833652257919312, "rewards/accuracy_reward": 0.719983845949173, "rewards/format_reward": 0.9897959232330322, "step": 2301 }, { "completion_length": 338.5408172607422, "epoch": 0.23164779874213837, "grad_norm": 0.9116306900978088, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6666048169136047, "reward_std": 0.2085193619132042, "rewards/accuracy_reward": 0.6768089234828949, "rewards/format_reward": 0.9897959232330322, "step": 2302 }, { "completion_length": 252.1428451538086, "epoch": 0.23174842767295598, "grad_norm": 1.30701744556427, "kl": 0.0684814453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6021979451179504, "reward_std": 0.2846352607011795, "rewards/accuracy_reward": 0.6226061284542084, "rewards/format_reward": 0.9795918166637421, "step": 2303 }, { "completion_length": 277.6530532836914, "epoch": 0.23184905660377358, "grad_norm": 1.2333908081054688, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6537039279937744, "reward_std": 0.27638792246580124, "rewards/accuracy_reward": 0.67411208152771, "rewards/format_reward": 0.9795918166637421, "step": 2304 }, { "completion_length": 306.98978424072266, "epoch": 0.2319496855345912, "grad_norm": 0.6450001001358032, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.639455795288086, "reward_std": 0.17420800030231476, "rewards/accuracy_reward": 0.6394557952880859, "rewards/format_reward": 1.0, "step": 2305 }, { "completion_length": 308.34693908691406, "epoch": 0.2320503144654088, "grad_norm": 0.6130468249320984, "kl": 0.0595703125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6578540205955505, "reward_std": 0.18565236404538155, "rewards/accuracy_reward": 0.6680581569671631, "rewards/format_reward": 0.9897959232330322, "step": 2306 }, { "completion_length": 307.92857360839844, "epoch": 0.2321509433962264, "grad_norm": 0.8901839852333069, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6025753021240234, "reward_std": 0.14767751842737198, "rewards/accuracy_reward": 0.6127793788909912, "rewards/format_reward": 0.9897959232330322, "step": 2307 }, { "completion_length": 319.35713958740234, "epoch": 0.23225157232704402, "grad_norm": 0.6923562288284302, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.728584885597229, "reward_std": 0.16863242536783218, "rewards/accuracy_reward": 0.7489930391311646, "rewards/format_reward": 0.9795918166637421, "step": 2308 }, { "completion_length": 211.7244873046875, "epoch": 0.23235220125786163, "grad_norm": 1.1732070446014404, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8278911113739014, "reward_std": 0.20895089954137802, "rewards/accuracy_reward": 0.8380952179431915, "rewards/format_reward": 0.9897959232330322, "step": 2309 }, { "completion_length": 291.9591827392578, "epoch": 0.23245283018867924, "grad_norm": 1.0233041048049927, "kl": 0.0640869140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6065112352371216, "reward_std": 0.3205288052558899, "rewards/accuracy_reward": 0.6269193291664124, "rewards/format_reward": 0.9795918464660645, "step": 2310 }, { "completion_length": 286.7653045654297, "epoch": 0.23255345911949685, "grad_norm": 0.7775588035583496, "kl": 0.0548095703125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7659010291099548, "reward_std": 0.16105026006698608, "rewards/accuracy_reward": 0.7659009695053101, "rewards/format_reward": 1.0, "step": 2311 }, { "completion_length": 254.84693145751953, "epoch": 0.23265408805031446, "grad_norm": 0.6866114139556885, "kl": 0.05712890625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.836248755455017, "reward_std": 0.19889332354068756, "rewards/accuracy_reward": 0.8566569089889526, "rewards/format_reward": 0.9795918166637421, "step": 2312 }, { "completion_length": 266.9387741088867, "epoch": 0.23275471698113206, "grad_norm": 0.5028848648071289, "kl": 0.048095703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.825453519821167, "reward_std": 0.13392265513539314, "rewards/accuracy_reward": 0.8254534900188446, "rewards/format_reward": 1.0, "step": 2313 }, { "completion_length": 248.9285659790039, "epoch": 0.23285534591194967, "grad_norm": 0.9210641384124756, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8061224222183228, "reward_std": 0.16984087973833084, "rewards/accuracy_reward": 0.8061224222183228, "rewards/format_reward": 1.0, "step": 2314 }, { "completion_length": 338.7040710449219, "epoch": 0.23295597484276728, "grad_norm": 0.8617719411849976, "kl": 0.05712890625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.3858243823051453, "reward_std": 0.2172009013593197, "rewards/accuracy_reward": 0.42664074897766113, "rewards/format_reward": 0.9591836631298065, "step": 2315 }, { "completion_length": 236.78571319580078, "epoch": 0.23305660377358492, "grad_norm": 1.1064252853393555, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.566666603088379, "reward_std": 0.19945135712623596, "rewards/accuracy_reward": 0.5870748460292816, "rewards/format_reward": 0.9795918166637421, "step": 2316 }, { "completion_length": 200.19387817382812, "epoch": 0.23315723270440253, "grad_norm": 0.5230211615562439, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.9146566987037659, "reward_std": 0.11738584190607071, "rewards/accuracy_reward": 0.9248608648777008, "rewards/format_reward": 0.9897959232330322, "step": 2317 }, { "completion_length": 265.55101776123047, "epoch": 0.23325786163522014, "grad_norm": 0.5670405030250549, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6836734414100647, "reward_std": 0.16984088718891144, "rewards/accuracy_reward": 0.7040816247463226, "rewards/format_reward": 0.9795918166637421, "step": 2318 }, { "completion_length": 248.87754821777344, "epoch": 0.23335849056603775, "grad_norm": 0.8473193049430847, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.5704346895217896, "reward_std": 0.18804314360022545, "rewards/accuracy_reward": 0.5806387662887573, "rewards/format_reward": 0.9897959232330322, "step": 2319 }, { "completion_length": 221.80612182617188, "epoch": 0.23345911949685536, "grad_norm": 0.659975528717041, "kl": 0.0718994140625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7429382801055908, "reward_std": 0.1833592988550663, "rewards/accuracy_reward": 0.7429382801055908, "rewards/format_reward": 1.0, "step": 2320 }, { "completion_length": 310.72447204589844, "epoch": 0.23355974842767296, "grad_norm": 0.7696061134338379, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.596103847026825, "reward_std": 0.21194026619195938, "rewards/accuracy_reward": 0.6165120750665665, "rewards/format_reward": 0.9795918464660645, "step": 2321 }, { "completion_length": 258.53060150146484, "epoch": 0.23366037735849057, "grad_norm": 1.428857445716858, "kl": 0.0740966796875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7486084699630737, "reward_std": 0.2982350140810013, "rewards/accuracy_reward": 0.7792207598686218, "rewards/format_reward": 0.9693877398967743, "step": 2322 }, { "completion_length": 283.39794921875, "epoch": 0.23376100628930818, "grad_norm": 0.9492547512054443, "kl": 0.046142578125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6136053800582886, "reward_std": 0.30919960141181946, "rewards/accuracy_reward": 0.6340135931968689, "rewards/format_reward": 0.9795918464660645, "step": 2323 }, { "completion_length": 270.39796447753906, "epoch": 0.2338616352201258, "grad_norm": 1.463058590888977, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6973130106925964, "reward_std": 0.2201031967997551, "rewards/accuracy_reward": 0.7075170874595642, "rewards/format_reward": 0.9897959232330322, "step": 2324 }, { "completion_length": 134.4591827392578, "epoch": 0.2339622641509434, "grad_norm": 1.1107538938522339, "kl": 0.09033203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8163264989852905, "reward_std": 0.08884849026799202, "rewards/accuracy_reward": 0.8163264989852905, "rewards/format_reward": 1.0, "step": 2325 }, { "completion_length": 217.5204086303711, "epoch": 0.234062893081761, "grad_norm": 0.6164371371269226, "kl": 0.059326171875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.646088421344757, "reward_std": 0.13334324955940247, "rewards/accuracy_reward": 0.6460884660482407, "rewards/format_reward": 1.0, "step": 2326 }, { "completion_length": 232.31632232666016, "epoch": 0.23416352201257862, "grad_norm": 2.226872444152832, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5482993125915527, "reward_std": 0.16843585669994354, "rewards/accuracy_reward": 0.5687074661254883, "rewards/format_reward": 0.9795918464660645, "step": 2327 }, { "completion_length": 343.8061218261719, "epoch": 0.23426415094339623, "grad_norm": 0.5958755016326904, "kl": 0.05712890625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.3988338112831116, "reward_std": 0.14001647010445595, "rewards/accuracy_reward": 0.3988337963819504, "rewards/format_reward": 1.0, "step": 2328 }, { "completion_length": 164.19387817382812, "epoch": 0.23436477987421384, "grad_norm": 1.2058415412902832, "kl": 0.10009765625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7831323146820068, "reward_std": 0.07150986418128014, "rewards/accuracy_reward": 0.7831323444843292, "rewards/format_reward": 1.0, "step": 2329 }, { "completion_length": 233.51019668579102, "epoch": 0.23446540880503144, "grad_norm": 1.0172486305236816, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8041375279426575, "reward_std": 0.16731977462768555, "rewards/accuracy_reward": 0.8041375875473022, "rewards/format_reward": 1.0, "step": 2330 }, { "completion_length": 298.4387741088867, "epoch": 0.23456603773584905, "grad_norm": 0.9853776693344116, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6020407676696777, "reward_std": 0.288910910487175, "rewards/accuracy_reward": 0.6326530575752258, "rewards/format_reward": 0.9693877398967743, "step": 2331 }, { "completion_length": 211.31632232666016, "epoch": 0.23466666666666666, "grad_norm": 0.524371325969696, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7006802558898926, "reward_std": 0.09351196140050888, "rewards/accuracy_reward": 0.710884302854538, "rewards/format_reward": 0.9897959232330322, "step": 2332 }, { "completion_length": 375.89794921875, "epoch": 0.23476729559748427, "grad_norm": 0.22509650886058807, "kl": 0.0506591796875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6824041604995728, "reward_std": 0.06883978843688965, "rewards/accuracy_reward": 0.6926082074642181, "rewards/format_reward": 0.9897959232330322, "step": 2333 }, { "completion_length": 303.4897766113281, "epoch": 0.23486792452830188, "grad_norm": 1.00477933883667, "kl": 0.0447998046875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.60517156124115, "reward_std": 0.1319853775203228, "rewards/accuracy_reward": 0.6153756678104401, "rewards/format_reward": 0.9897959232330322, "step": 2334 }, { "completion_length": 245.0816192626953, "epoch": 0.2349685534591195, "grad_norm": 1.608890414237976, "kl": 0.0689697265625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7238181829452515, "reward_std": 0.2543286234140396, "rewards/accuracy_reward": 0.7340221703052521, "rewards/format_reward": 0.9897959232330322, "step": 2335 }, { "completion_length": 295.49998474121094, "epoch": 0.2350691823899371, "grad_norm": 0.5986706018447876, "kl": 0.0579833984375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6938774585723877, "reward_std": 0.23038649559020996, "rewards/accuracy_reward": 0.6938775181770325, "rewards/format_reward": 1.0, "step": 2336 }, { "completion_length": 179.34693145751953, "epoch": 0.2351698113207547, "grad_norm": 0.8121424913406372, "kl": 0.0467529296875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7983478903770447, "reward_std": 0.10485968738794327, "rewards/accuracy_reward": 0.7983478903770447, "rewards/format_reward": 1.0, "step": 2337 }, { "completion_length": 247.25509643554688, "epoch": 0.23527044025157232, "grad_norm": 0.8598008155822754, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.4897959232330322, "reward_std": 0.19220630824565887, "rewards/accuracy_reward": 0.4897959232330322, "rewards/format_reward": 1.0, "step": 2338 }, { "completion_length": 319.6122283935547, "epoch": 0.23537106918238992, "grad_norm": 0.7105712294578552, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6885964274406433, "reward_std": 0.2197653278708458, "rewards/accuracy_reward": 0.6885964572429657, "rewards/format_reward": 1.0, "step": 2339 }, { "completion_length": 305.2550964355469, "epoch": 0.23547169811320753, "grad_norm": 1.4928010702133179, "kl": 0.0753173828125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5816102623939514, "reward_std": 0.29729539155960083, "rewards/accuracy_reward": 0.6326307356357574, "rewards/format_reward": 0.9489795565605164, "step": 2340 }, { "completion_length": 254.48978424072266, "epoch": 0.23557232704402517, "grad_norm": 0.945231556892395, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.78950434923172, "reward_std": 0.18749652802944183, "rewards/accuracy_reward": 0.7997084558010101, "rewards/format_reward": 0.9897959232330322, "step": 2341 }, { "completion_length": 260.29590606689453, "epoch": 0.23567295597484278, "grad_norm": 1.0825648307800293, "kl": 0.0635986328125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6448339223861694, "reward_std": 0.31463225185871124, "rewards/accuracy_reward": 0.6754461526870728, "rewards/format_reward": 0.9693877398967743, "step": 2342 }, { "completion_length": 230.66326141357422, "epoch": 0.2357735849056604, "grad_norm": 0.7627806663513184, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7622946500778198, "reward_std": 0.20410548895597458, "rewards/accuracy_reward": 0.7929068803787231, "rewards/format_reward": 0.9693877398967743, "step": 2343 }, { "completion_length": 228.3571319580078, "epoch": 0.235874213836478, "grad_norm": 0.5483154058456421, "kl": 0.0596923828125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7361515164375305, "reward_std": 0.1569828912615776, "rewards/accuracy_reward": 0.7361516058444977, "rewards/format_reward": 1.0, "step": 2344 }, { "completion_length": 238.14285278320312, "epoch": 0.2359748427672956, "grad_norm": 0.5124054551124573, "kl": 0.0626220703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8061224222183228, "reward_std": 0.09217509999871254, "rewards/accuracy_reward": 0.8163264989852905, "rewards/format_reward": 0.9897959232330322, "step": 2345 }, { "completion_length": 174.7959213256836, "epoch": 0.23607547169811322, "grad_norm": 0.9616469144821167, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.780825674533844, "reward_std": 0.053174134343862534, "rewards/accuracy_reward": 0.7808257341384888, "rewards/format_reward": 1.0, "step": 2346 }, { "completion_length": 238.75509643554688, "epoch": 0.23617610062893082, "grad_norm": 0.5881441235542297, "kl": 0.0576171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6615429520606995, "reward_std": 0.11308180540800095, "rewards/accuracy_reward": 0.6615430414676666, "rewards/format_reward": 1.0, "step": 2347 }, { "completion_length": 277.4081573486328, "epoch": 0.23627672955974843, "grad_norm": 0.5817590355873108, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7623085975646973, "reward_std": 0.18397199362516403, "rewards/accuracy_reward": 0.7623086273670197, "rewards/format_reward": 1.0, "step": 2348 }, { "completion_length": 263.9183654785156, "epoch": 0.23637735849056604, "grad_norm": 0.9473773837089539, "kl": 0.0640869140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.694403052330017, "reward_std": 0.1993589997291565, "rewards/accuracy_reward": 0.6944030225276947, "rewards/format_reward": 1.0, "step": 2349 }, { "completion_length": 220.20407104492188, "epoch": 0.23647798742138365, "grad_norm": 1.1073143482208252, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6696144938468933, "reward_std": 0.15222929418087006, "rewards/accuracy_reward": 0.6798186004161835, "rewards/format_reward": 0.9897959232330322, "step": 2350 }, { "completion_length": 269.4081573486328, "epoch": 0.23657861635220126, "grad_norm": 0.5225408673286438, "kl": 0.0726318359375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7031705379486084, "reward_std": 0.11464637890458107, "rewards/accuracy_reward": 0.7133745849132538, "rewards/format_reward": 0.9897959232330322, "step": 2351 }, { "completion_length": 245.09182739257812, "epoch": 0.23667924528301887, "grad_norm": 0.8116879463195801, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.673112392425537, "reward_std": 0.2069701850414276, "rewards/accuracy_reward": 0.683316558599472, "rewards/format_reward": 0.9897959232330322, "step": 2352 }, { "completion_length": 289.03060150146484, "epoch": 0.23677987421383648, "grad_norm": 0.7734878659248352, "kl": 0.0753173828125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5312924981117249, "reward_std": 0.23145636916160583, "rewards/accuracy_reward": 0.5414965748786926, "rewards/format_reward": 0.9897959232330322, "step": 2353 }, { "completion_length": 201.02040100097656, "epoch": 0.2368805031446541, "grad_norm": 0.63570636510849, "kl": 0.0584716796875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8588435053825378, "reward_std": 0.16070926934480667, "rewards/accuracy_reward": 0.8588435053825378, "rewards/format_reward": 1.0, "step": 2354 }, { "completion_length": 251.10203552246094, "epoch": 0.2369811320754717, "grad_norm": 1.0712369680404663, "kl": 0.0704345703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7466261982917786, "reward_std": 0.17783334851264954, "rewards/accuracy_reward": 0.7772384583950043, "rewards/format_reward": 0.9693877398967743, "step": 2355 }, { "completion_length": 228.52040100097656, "epoch": 0.2370817610062893, "grad_norm": 0.6769901514053345, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7182170748710632, "reward_std": 0.1397838220000267, "rewards/accuracy_reward": 0.7284212112426758, "rewards/format_reward": 0.9897959232330322, "step": 2356 }, { "completion_length": 225.948974609375, "epoch": 0.2371823899371069, "grad_norm": 0.8262597918510437, "kl": 0.0594482421875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7867104411125183, "reward_std": 0.2088654860854149, "rewards/accuracy_reward": 0.7969144582748413, "rewards/format_reward": 0.9897959232330322, "step": 2357 }, { "completion_length": 246.38774871826172, "epoch": 0.23728301886792452, "grad_norm": 1.2766445875167847, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6417545080184937, "reward_std": 0.2492651864886284, "rewards/accuracy_reward": 0.6519585847854614, "rewards/format_reward": 0.9897959232330322, "step": 2358 }, { "completion_length": 238.6530532836914, "epoch": 0.23738364779874213, "grad_norm": 0.7865627408027649, "kl": 0.0709228515625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7899659872055054, "reward_std": 0.04775561671704054, "rewards/accuracy_reward": 0.7899660170078278, "rewards/format_reward": 1.0, "step": 2359 }, { "completion_length": 213.38775634765625, "epoch": 0.23748427672955974, "grad_norm": 1.4068121910095215, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6768706440925598, "reward_std": 0.19167154282331467, "rewards/accuracy_reward": 0.6870748102664948, "rewards/format_reward": 0.9897959232330322, "step": 2360 }, { "completion_length": 224.62244415283203, "epoch": 0.23758490566037735, "grad_norm": 0.6545631289482117, "kl": 0.0794677734375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6837043166160583, "reward_std": 0.09922244399785995, "rewards/accuracy_reward": 0.6837043464183807, "rewards/format_reward": 1.0, "step": 2361 }, { "completion_length": 283.5305938720703, "epoch": 0.23768553459119496, "grad_norm": 0.5033406615257263, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6594310998916626, "reward_std": 0.1437322050333023, "rewards/accuracy_reward": 0.6594310104846954, "rewards/format_reward": 1.0, "step": 2362 }, { "completion_length": 215.9795913696289, "epoch": 0.23778616352201257, "grad_norm": 0.5508577227592468, "kl": 0.0706787109375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7021767497062683, "reward_std": 0.1281835436820984, "rewards/accuracy_reward": 0.7225848883390427, "rewards/format_reward": 0.9795918166637421, "step": 2363 }, { "completion_length": 271.6938705444336, "epoch": 0.23788679245283018, "grad_norm": 0.9366590976715088, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7419589757919312, "reward_std": 0.18880998343229294, "rewards/accuracy_reward": 0.7521630525588989, "rewards/format_reward": 0.9897959232330322, "step": 2364 }, { "completion_length": 238.16326141357422, "epoch": 0.2379874213836478, "grad_norm": 0.8166690468788147, "kl": 0.056640625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6323736310005188, "reward_std": 0.3191141411662102, "rewards/accuracy_reward": 0.6527817845344543, "rewards/format_reward": 0.9795918166637421, "step": 2365 }, { "completion_length": 173.17346954345703, "epoch": 0.23808805031446542, "grad_norm": 0.9407158493995667, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7928571701049805, "reward_std": 0.11021963134407997, "rewards/accuracy_reward": 0.7928571403026581, "rewards/format_reward": 1.0, "step": 2366 }, { "completion_length": 224.6734619140625, "epoch": 0.23818867924528303, "grad_norm": 0.7900789380073547, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7473657131195068, "reward_std": 0.20877981930971146, "rewards/accuracy_reward": 0.7575699090957642, "rewards/format_reward": 0.9897959232330322, "step": 2367 }, { "completion_length": 276.3571319580078, "epoch": 0.23828930817610064, "grad_norm": 0.8503798842430115, "kl": 0.0672607421875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.4396510124206543, "reward_std": 0.1590457558631897, "rewards/accuracy_reward": 0.4396510422229767, "rewards/format_reward": 1.0, "step": 2368 }, { "completion_length": 282.2244873046875, "epoch": 0.23838993710691825, "grad_norm": 1.0722503662109375, "kl": 0.0430908203125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.771754264831543, "reward_std": 0.18187753856182098, "rewards/accuracy_reward": 0.7819583415985107, "rewards/format_reward": 0.9897959232330322, "step": 2369 }, { "completion_length": 230.12244415283203, "epoch": 0.23849056603773586, "grad_norm": 0.6162856221199036, "kl": 0.0596923828125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8142509460449219, "reward_std": 0.11650194600224495, "rewards/accuracy_reward": 0.8142509758472443, "rewards/format_reward": 1.0, "step": 2370 }, { "completion_length": 260.7244873046875, "epoch": 0.23859119496855347, "grad_norm": 0.8643386960029602, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8272108435630798, "reward_std": 0.1217300109565258, "rewards/accuracy_reward": 0.8272108435630798, "rewards/format_reward": 1.0, "step": 2371 }, { "completion_length": 247.27550506591797, "epoch": 0.23869182389937108, "grad_norm": 0.62510746717453, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8929380774497986, "reward_std": 0.16771623492240906, "rewards/accuracy_reward": 0.8929380774497986, "rewards/format_reward": 1.0, "step": 2372 }, { "completion_length": 286.3775405883789, "epoch": 0.23879245283018868, "grad_norm": 8.573556900024414, "kl": 0.0914306640625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.665816307067871, "reward_std": 0.2488970160484314, "rewards/accuracy_reward": 0.6760204136371613, "rewards/format_reward": 0.9897959232330322, "step": 2373 }, { "completion_length": 223.35713958740234, "epoch": 0.2388930817610063, "grad_norm": 0.6944313645362854, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7871719002723694, "reward_std": 0.11941486597061157, "rewards/accuracy_reward": 0.7871719896793365, "rewards/format_reward": 1.0, "step": 2374 }, { "completion_length": 170.14286041259766, "epoch": 0.2389937106918239, "grad_norm": 1.6740789413452148, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8480927348136902, "reward_std": 0.10186957195401192, "rewards/accuracy_reward": 0.8480927646160126, "rewards/format_reward": 1.0, "step": 2375 }, { "completion_length": 204.95917510986328, "epoch": 0.2390943396226415, "grad_norm": 0.614091157913208, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.747165560722351, "reward_std": 0.14398646354675293, "rewards/accuracy_reward": 0.7573696374893188, "rewards/format_reward": 0.9897959232330322, "step": 2376 }, { "completion_length": 273.83673095703125, "epoch": 0.23919496855345912, "grad_norm": 0.8442245721817017, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5775798559188843, "reward_std": 0.14089996367692947, "rewards/accuracy_reward": 0.5775798857212067, "rewards/format_reward": 1.0, "step": 2377 }, { "completion_length": 168.4897918701172, "epoch": 0.23929559748427673, "grad_norm": 13.507450103759766, "kl": 0.449951171875, "learning_rate": 1e-06, "loss": 0.018, "reward": 1.7438533902168274, "reward_std": 0.1569199562072754, "rewards/accuracy_reward": 0.7540575861930847, "rewards/format_reward": 0.9897959232330322, "step": 2378 }, { "completion_length": 166.05101776123047, "epoch": 0.23939622641509434, "grad_norm": 1.0119961500167847, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8601440787315369, "reward_std": 0.1031133383512497, "rewards/accuracy_reward": 0.8703481554985046, "rewards/format_reward": 0.9897959232330322, "step": 2379 }, { "completion_length": 212.58163452148438, "epoch": 0.23949685534591195, "grad_norm": 1.092867374420166, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8037415146827698, "reward_std": 0.23430738598108292, "rewards/accuracy_reward": 0.8139455914497375, "rewards/format_reward": 0.9897959232330322, "step": 2380 }, { "completion_length": 208.1734619140625, "epoch": 0.23959748427672956, "grad_norm": 1.0663968324661255, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6858914494514465, "reward_std": 0.10759404301643372, "rewards/accuracy_reward": 0.6858914792537689, "rewards/format_reward": 1.0, "step": 2381 }, { "completion_length": 244.53060913085938, "epoch": 0.23969811320754716, "grad_norm": 0.9473475217819214, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.710702121257782, "reward_std": 0.15184936299920082, "rewards/accuracy_reward": 0.7107021510601044, "rewards/format_reward": 1.0, "step": 2382 }, { "completion_length": 197.4693832397461, "epoch": 0.23979874213836477, "grad_norm": 1.3604813814163208, "kl": 0.105224609375, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8619667887687683, "reward_std": 0.17977871745824814, "rewards/accuracy_reward": 0.8619668185710907, "rewards/format_reward": 1.0, "step": 2383 }, { "completion_length": 309.1530456542969, "epoch": 0.23989937106918238, "grad_norm": 0.6650534272193909, "kl": 0.0479736328125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7028668522834778, "reward_std": 0.1498907320201397, "rewards/accuracy_reward": 0.7028668522834778, "rewards/format_reward": 1.0, "step": 2384 }, { "completion_length": 288.4795913696289, "epoch": 0.24, "grad_norm": 0.7479869723320007, "kl": 0.0924072265625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6328076124191284, "reward_std": 0.16891717910766602, "rewards/accuracy_reward": 0.6328076720237732, "rewards/format_reward": 1.0, "step": 2385 }, { "completion_length": 238.67346954345703, "epoch": 0.2401006289308176, "grad_norm": 0.8096860647201538, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6466914415359497, "reward_std": 0.18863681703805923, "rewards/accuracy_reward": 0.6466914117336273, "rewards/format_reward": 1.0, "step": 2386 }, { "completion_length": 223.8775405883789, "epoch": 0.2402012578616352, "grad_norm": 0.7943024039268494, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.854786217212677, "reward_std": 0.15185461938381195, "rewards/accuracy_reward": 0.854786217212677, "rewards/format_reward": 1.0, "step": 2387 }, { "completion_length": 251.22447967529297, "epoch": 0.24030188679245282, "grad_norm": 1.4859791994094849, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6448979377746582, "reward_std": 0.28887785226106644, "rewards/accuracy_reward": 0.6755101978778839, "rewards/format_reward": 0.9693877398967743, "step": 2388 }, { "completion_length": 205.66326141357422, "epoch": 0.24040251572327043, "grad_norm": 0.9418445825576782, "kl": 0.048583984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.8481391668319702, "reward_std": 0.16999131068587303, "rewards/accuracy_reward": 0.8685473799705505, "rewards/format_reward": 0.9795918166637421, "step": 2389 }, { "completion_length": 284.5204162597656, "epoch": 0.24050314465408806, "grad_norm": 3.2495977878570557, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.761904776096344, "reward_std": 0.24995000660419464, "rewards/accuracy_reward": 0.7823128998279572, "rewards/format_reward": 0.9795918464660645, "step": 2390 }, { "completion_length": 266.4387741088867, "epoch": 0.24060377358490567, "grad_norm": 0.742795467376709, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6423712372779846, "reward_std": 0.29299743473529816, "rewards/accuracy_reward": 0.6729834973812103, "rewards/format_reward": 0.9693877398967743, "step": 2391 }, { "completion_length": 213.85714721679688, "epoch": 0.24070440251572328, "grad_norm": 2.2632253170013428, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7634353041648865, "reward_std": 0.1150091402232647, "rewards/accuracy_reward": 0.7736393809318542, "rewards/format_reward": 0.9897959232330322, "step": 2392 }, { "completion_length": 192.6836700439453, "epoch": 0.2408050314465409, "grad_norm": 1.1465610265731812, "kl": 0.0650634765625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8915114998817444, "reward_std": 0.16350459307432175, "rewards/accuracy_reward": 0.9017156362533569, "rewards/format_reward": 0.9897959232330322, "step": 2393 }, { "completion_length": 268.77549743652344, "epoch": 0.2409056603773585, "grad_norm": 0.8969971537590027, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7448979020118713, "reward_std": 0.1292334496974945, "rewards/accuracy_reward": 0.7653060853481293, "rewards/format_reward": 0.9795918166637421, "step": 2394 }, { "completion_length": 256.11224365234375, "epoch": 0.2410062893081761, "grad_norm": 0.6603329181671143, "kl": 0.0657958984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7348395586013794, "reward_std": 0.11692693829536438, "rewards/accuracy_reward": 0.7348396182060242, "rewards/format_reward": 1.0, "step": 2395 }, { "completion_length": 220.91836547851562, "epoch": 0.24110691823899372, "grad_norm": 0.8195822834968567, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.797732412815094, "reward_std": 0.17064954340457916, "rewards/accuracy_reward": 0.8079364895820618, "rewards/format_reward": 0.9897959232330322, "step": 2396 }, { "completion_length": 173.6734619140625, "epoch": 0.24120754716981133, "grad_norm": 0.9184010624885559, "kl": 0.1103515625, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7300091981887817, "reward_std": 0.13988545536994934, "rewards/accuracy_reward": 0.7402133345603943, "rewards/format_reward": 0.9897959232330322, "step": 2397 }, { "completion_length": 225.78571319580078, "epoch": 0.24130817610062893, "grad_norm": 0.3920661211013794, "kl": 0.0474853515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7576530575752258, "reward_std": 0.09735377505421638, "rewards/accuracy_reward": 0.7678571343421936, "rewards/format_reward": 0.9897959232330322, "step": 2398 }, { "completion_length": 283.6428527832031, "epoch": 0.24140880503144654, "grad_norm": 1.0022791624069214, "kl": 0.0657958984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.719008982181549, "reward_std": 0.2249496653676033, "rewards/accuracy_reward": 0.7496212422847748, "rewards/format_reward": 0.9693877398967743, "step": 2399 }, { "completion_length": 307.5306091308594, "epoch": 0.24150943396226415, "grad_norm": 0.5823793411254883, "kl": 0.0594482421875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.3787171244621277, "reward_std": 0.0413987822830677, "rewards/accuracy_reward": 0.37871718406677246, "rewards/format_reward": 1.0, "step": 2400 }, { "completion_length": 248.63265228271484, "epoch": 0.24161006289308176, "grad_norm": 1.9824553728103638, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5981373190879822, "reward_std": 0.15673363953828812, "rewards/accuracy_reward": 0.6083414256572723, "rewards/format_reward": 0.9897959232330322, "step": 2401 }, { "completion_length": 207.83673095703125, "epoch": 0.24171069182389937, "grad_norm": 0.30424201488494873, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8755102157592773, "reward_std": 0.04025306552648544, "rewards/accuracy_reward": 0.875510185956955, "rewards/format_reward": 1.0, "step": 2402 }, { "completion_length": 218.7244873046875, "epoch": 0.24181132075471698, "grad_norm": 0.8278974294662476, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8240562677383423, "reward_std": 0.23331283405423164, "rewards/accuracy_reward": 0.8444644808769226, "rewards/format_reward": 0.9795918166637421, "step": 2403 }, { "completion_length": 215.83672332763672, "epoch": 0.2419119496855346, "grad_norm": 6.4234466552734375, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7200371026992798, "reward_std": 0.2394474446773529, "rewards/accuracy_reward": 0.7302411794662476, "rewards/format_reward": 0.9897959232330322, "step": 2404 }, { "completion_length": 269.1428527832031, "epoch": 0.2420125786163522, "grad_norm": 1.1594860553741455, "kl": 0.0592041015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7007214426994324, "reward_std": 0.1993509754538536, "rewards/accuracy_reward": 0.7211296558380127, "rewards/format_reward": 0.9795918464660645, "step": 2405 }, { "completion_length": 241.15306091308594, "epoch": 0.2421132075471698, "grad_norm": 0.16371016204357147, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7040815949440002, "reward_std": 0.026997461915016174, "rewards/accuracy_reward": 0.7040815949440002, "rewards/format_reward": 1.0, "step": 2406 }, { "completion_length": 281.6122360229492, "epoch": 0.24221383647798742, "grad_norm": 0.6366117000579834, "kl": 0.05615234375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.472551167011261, "reward_std": 0.12246682122349739, "rewards/accuracy_reward": 0.47255125641822815, "rewards/format_reward": 1.0, "step": 2407 }, { "completion_length": 259.2040786743164, "epoch": 0.24231446540880502, "grad_norm": 0.7191949486732483, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6810230016708374, "reward_std": 0.18964799493551254, "rewards/accuracy_reward": 0.69122713804245, "rewards/format_reward": 0.9897959232330322, "step": 2408 }, { "completion_length": 211.32652282714844, "epoch": 0.24241509433962263, "grad_norm": 0.983849823474884, "kl": 0.116455078125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7653060555458069, "reward_std": 0.1968383491039276, "rewards/accuracy_reward": 0.7653061151504517, "rewards/format_reward": 1.0, "step": 2409 }, { "completion_length": 221.6428451538086, "epoch": 0.24251572327044024, "grad_norm": 1.165387511253357, "kl": 0.056884765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.885811448097229, "reward_std": 0.12747101671993732, "rewards/accuracy_reward": 0.8858114182949066, "rewards/format_reward": 1.0, "step": 2410 }, { "completion_length": 202.1530532836914, "epoch": 0.24261635220125785, "grad_norm": 1.7300411462783813, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8571428060531616, "reward_std": 0.1842476800084114, "rewards/accuracy_reward": 0.857142835855484, "rewards/format_reward": 1.0, "step": 2411 }, { "completion_length": 286.85713958740234, "epoch": 0.24271698113207546, "grad_norm": 0.69474858045578, "kl": 0.0631103515625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6186850666999817, "reward_std": 0.19777143746614456, "rewards/accuracy_reward": 0.6288890838623047, "rewards/format_reward": 0.9897959232330322, "step": 2412 }, { "completion_length": 225.97958374023438, "epoch": 0.24281761006289307, "grad_norm": 0.6181349158287048, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.792964518070221, "reward_std": 0.19631556048989296, "rewards/accuracy_reward": 0.792964518070221, "rewards/format_reward": 1.0, "step": 2413 }, { "completion_length": 215.9387664794922, "epoch": 0.2429182389937107, "grad_norm": 1.0305416584014893, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7471655011177063, "reward_std": 0.1979946419596672, "rewards/accuracy_reward": 0.7675736844539642, "rewards/format_reward": 0.9795918464660645, "step": 2414 }, { "completion_length": 307.37754821777344, "epoch": 0.24301886792452831, "grad_norm": 0.8382862210273743, "kl": 0.0513916015625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6502451300621033, "reward_std": 0.18871047347784042, "rewards/accuracy_reward": 0.6502451747655869, "rewards/format_reward": 1.0, "step": 2415 }, { "completion_length": 245.98979949951172, "epoch": 0.24311949685534592, "grad_norm": 0.6882683038711548, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6768707633018494, "reward_std": 0.21473411843180656, "rewards/accuracy_reward": 0.6870747804641724, "rewards/format_reward": 0.9897959232330322, "step": 2416 }, { "completion_length": 266.2040710449219, "epoch": 0.24322012578616353, "grad_norm": 0.590350329875946, "kl": 0.0574951171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5806607604026794, "reward_std": 0.07078767288476229, "rewards/accuracy_reward": 0.590864896774292, "rewards/format_reward": 0.9897959232330322, "step": 2417 }, { "completion_length": 289.6938781738281, "epoch": 0.24332075471698114, "grad_norm": 0.6649776101112366, "kl": 0.0655517578125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.4854251146316528, "reward_std": 0.24952631443738937, "rewards/accuracy_reward": 0.5058332681655884, "rewards/format_reward": 0.9795918464660645, "step": 2418 }, { "completion_length": 298.84693908691406, "epoch": 0.24342138364779875, "grad_norm": 1.1406903266906738, "kl": 0.0504150390625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6707263588905334, "reward_std": 0.30613547563552856, "rewards/accuracy_reward": 0.691134512424469, "rewards/format_reward": 0.9795918464660645, "step": 2419 }, { "completion_length": 252.06122589111328, "epoch": 0.24352201257861636, "grad_norm": 1.1432011127471924, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.596103847026825, "reward_std": 0.19838455319404602, "rewards/accuracy_reward": 0.6165120601654053, "rewards/format_reward": 0.9795918166637421, "step": 2420 }, { "completion_length": 278.6530532836914, "epoch": 0.24362264150943397, "grad_norm": 2.47381591796875, "kl": 0.172607421875, "learning_rate": 1e-06, "loss": 0.0069, "reward": 1.479432463645935, "reward_std": 0.16932859271764755, "rewards/accuracy_reward": 0.48963651061058044, "rewards/format_reward": 0.9897959232330322, "step": 2421 }, { "completion_length": 270.89794921875, "epoch": 0.24372327044025158, "grad_norm": 0.4098820984363556, "kl": 0.0538330078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6874149441719055, "reward_std": 0.06910638883709908, "rewards/accuracy_reward": 0.6874149441719055, "rewards/format_reward": 1.0, "step": 2422 }, { "completion_length": 227.4795913696289, "epoch": 0.24382389937106919, "grad_norm": 1.1055463552474976, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.742807686328888, "reward_std": 0.2002391442656517, "rewards/accuracy_reward": 0.7428077161312103, "rewards/format_reward": 1.0, "step": 2423 }, { "completion_length": 267.4795837402344, "epoch": 0.2439245283018868, "grad_norm": 0.994946300983429, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.692801833152771, "reward_std": 0.2341349795460701, "rewards/accuracy_reward": 0.7234141230583191, "rewards/format_reward": 0.9693877398967743, "step": 2424 }, { "completion_length": 223.61223602294922, "epoch": 0.2440251572327044, "grad_norm": 1.2252345085144043, "kl": 0.099609375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7262643575668335, "reward_std": 0.18751110136508942, "rewards/accuracy_reward": 0.7262643873691559, "rewards/format_reward": 1.0, "step": 2425 }, { "completion_length": 219.07142639160156, "epoch": 0.244125786163522, "grad_norm": 0.8768607974052429, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5731292366981506, "reward_std": 0.22377263009548187, "rewards/accuracy_reward": 0.6139455735683441, "rewards/format_reward": 0.9591836333274841, "step": 2426 }, { "completion_length": 208.24488830566406, "epoch": 0.24422641509433962, "grad_norm": 0.7841758728027344, "kl": 0.0662841796875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7396501302719116, "reward_std": 0.1008220836520195, "rewards/accuracy_reward": 0.7498542070388794, "rewards/format_reward": 0.9897959232330322, "step": 2427 }, { "completion_length": 206.2040786743164, "epoch": 0.24432704402515723, "grad_norm": 0.7781946063041687, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7653061151504517, "reward_std": 0.0952342338860035, "rewards/accuracy_reward": 0.7755101919174194, "rewards/format_reward": 0.9897959232330322, "step": 2428 }, { "completion_length": 215.89795684814453, "epoch": 0.24442767295597484, "grad_norm": 1.0013372898101807, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6435518860816956, "reward_std": 0.3844367265701294, "rewards/accuracy_reward": 0.71498042345047, "rewards/format_reward": 0.9285714030265808, "step": 2429 }, { "completion_length": 229.79591369628906, "epoch": 0.24452830188679245, "grad_norm": 0.9240533113479614, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6203286051750183, "reward_std": 0.28348881751298904, "rewards/accuracy_reward": 0.6917572021484375, "rewards/format_reward": 0.9285714030265808, "step": 2430 }, { "completion_length": 203.9795913696289, "epoch": 0.24462893081761006, "grad_norm": 0.9512033462524414, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7857142686843872, "reward_std": 0.20016494393348694, "rewards/accuracy_reward": 0.8163265287876129, "rewards/format_reward": 0.9693877398967743, "step": 2431 }, { "completion_length": 222.91836547851562, "epoch": 0.24472955974842767, "grad_norm": 0.9697009325027466, "kl": 0.0677490234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.622448980808258, "reward_std": 0.18887970224022865, "rewards/accuracy_reward": 0.6326530426740646, "rewards/format_reward": 0.9897959232330322, "step": 2432 }, { "completion_length": 263.7959213256836, "epoch": 0.24483018867924528, "grad_norm": 0.7742537260055542, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6037436127662659, "reward_std": 0.16567827761173248, "rewards/accuracy_reward": 0.6241517663002014, "rewards/format_reward": 0.9795918166637421, "step": 2433 }, { "completion_length": 215.89795684814453, "epoch": 0.24493081761006288, "grad_norm": 1.3405545949935913, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6735435724258423, "reward_std": 0.27531111240386963, "rewards/accuracy_reward": 0.7449721395969391, "rewards/format_reward": 0.9285714030265808, "step": 2434 }, { "completion_length": 282.6428527832031, "epoch": 0.2450314465408805, "grad_norm": 0.6695588827133179, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.610823392868042, "reward_std": 0.19176309555768967, "rewards/accuracy_reward": 0.6312316358089447, "rewards/format_reward": 0.9795918166637421, "step": 2435 }, { "completion_length": 211.7551040649414, "epoch": 0.2451320754716981, "grad_norm": 0.6471662521362305, "kl": 0.0582275390625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8045674562454224, "reward_std": 0.19865145534276962, "rewards/accuracy_reward": 0.8249756693840027, "rewards/format_reward": 0.9795918464660645, "step": 2436 }, { "completion_length": 262.1938781738281, "epoch": 0.2452327044025157, "grad_norm": 0.9660041928291321, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5674587488174438, "reward_std": 0.18197201192378998, "rewards/accuracy_reward": 0.5776629000902176, "rewards/format_reward": 0.9897959232330322, "step": 2437 }, { "completion_length": 232.51020050048828, "epoch": 0.24533333333333332, "grad_norm": 1.2993649244308472, "kl": 0.0711669921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.806122362613678, "reward_std": 0.20312153548002243, "rewards/accuracy_reward": 0.8571428656578064, "rewards/format_reward": 0.9489795565605164, "step": 2438 }, { "completion_length": 261.64286041259766, "epoch": 0.24543396226415096, "grad_norm": 1.4612596035003662, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5564900040626526, "reward_std": 0.372116819024086, "rewards/accuracy_reward": 0.5871022939682007, "rewards/format_reward": 0.9693877398967743, "step": 2439 }, { "completion_length": 223.88774871826172, "epoch": 0.24553459119496857, "grad_norm": 1.3713712692260742, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5734001994132996, "reward_std": 0.20605242252349854, "rewards/accuracy_reward": 0.6040124446153641, "rewards/format_reward": 0.9693877398967743, "step": 2440 }, { "completion_length": 219.33673095703125, "epoch": 0.24563522012578617, "grad_norm": 0.9948596954345703, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.4837717413902283, "reward_std": 0.22405144572257996, "rewards/accuracy_reward": 0.514384001493454, "rewards/format_reward": 0.9693877398967743, "step": 2441 }, { "completion_length": 207.99999237060547, "epoch": 0.24573584905660378, "grad_norm": 1.1443240642547607, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7557822465896606, "reward_std": 0.14502763003110886, "rewards/accuracy_reward": 0.7557823061943054, "rewards/format_reward": 1.0, "step": 2442 }, { "completion_length": 242.03060150146484, "epoch": 0.2458364779874214, "grad_norm": 0.6956671476364136, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7965015172958374, "reward_std": 0.21728675067424774, "rewards/accuracy_reward": 0.8067055344581604, "rewards/format_reward": 0.9897959232330322, "step": 2443 }, { "completion_length": 211.7755126953125, "epoch": 0.245937106918239, "grad_norm": 1.9231857061386108, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7219629883766174, "reward_std": 0.22610720992088318, "rewards/accuracy_reward": 0.7423712611198425, "rewards/format_reward": 0.9795918166637421, "step": 2444 }, { "completion_length": 256.3673324584961, "epoch": 0.2460377358490566, "grad_norm": 0.6569967865943909, "kl": 0.046875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5813410878181458, "reward_std": 0.1335512027144432, "rewards/accuracy_reward": 0.6017492711544037, "rewards/format_reward": 0.9795918464660645, "step": 2445 }, { "completion_length": 188.448974609375, "epoch": 0.24613836477987422, "grad_norm": 0.7269783020019531, "kl": 0.0550537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7827574610710144, "reward_std": 0.12481331452727318, "rewards/accuracy_reward": 0.8133697509765625, "rewards/format_reward": 0.9693877398967743, "step": 2446 }, { "completion_length": 267.55101013183594, "epoch": 0.24623899371069183, "grad_norm": 1.721987247467041, "kl": 0.0474853515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.5319109559059143, "reward_std": 0.24508290737867355, "rewards/accuracy_reward": 0.5319109559059143, "rewards/format_reward": 1.0, "step": 2447 }, { "completion_length": 225.39795684814453, "epoch": 0.24633962264150944, "grad_norm": 1.2253296375274658, "kl": 0.1123046875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.714285671710968, "reward_std": 0.21289166808128357, "rewards/accuracy_reward": 0.7346938848495483, "rewards/format_reward": 0.9795918166637421, "step": 2448 }, { "completion_length": 180.9081573486328, "epoch": 0.24644025157232705, "grad_norm": 0.7608236074447632, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8221967816352844, "reward_std": 0.2082248404622078, "rewards/accuracy_reward": 0.8426049947738647, "rewards/format_reward": 0.9795918464660645, "step": 2449 }, { "completion_length": 265.2550964355469, "epoch": 0.24654088050314465, "grad_norm": 0.8367362022399902, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6388753652572632, "reward_std": 0.19936193898320198, "rewards/accuracy_reward": 0.6694875955581665, "rewards/format_reward": 0.9693877398967743, "step": 2450 }, { "completion_length": 251.66326904296875, "epoch": 0.24664150943396226, "grad_norm": 1.2061413526535034, "kl": 0.0531005859375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6122448444366455, "reward_std": 0.2260405644774437, "rewards/accuracy_reward": 0.6734693646430969, "rewards/format_reward": 0.9387754797935486, "step": 2451 }, { "completion_length": 206.01020050048828, "epoch": 0.24674213836477987, "grad_norm": 0.7220525145530701, "kl": 0.0548095703125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6836734414100647, "reward_std": 0.12662474811077118, "rewards/accuracy_reward": 0.6938775181770325, "rewards/format_reward": 0.9897959232330322, "step": 2452 }, { "completion_length": 325.39796447753906, "epoch": 0.24684276729559748, "grad_norm": 0.8021589517593384, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.634110689163208, "reward_std": 0.3151169568300247, "rewards/accuracy_reward": 0.6443148553371429, "rewards/format_reward": 0.9897959232330322, "step": 2453 }, { "completion_length": 314.7653045654297, "epoch": 0.2469433962264151, "grad_norm": 0.6338931322097778, "kl": 0.059814453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.778911530971527, "reward_std": 0.1726902350783348, "rewards/accuracy_reward": 0.7993196845054626, "rewards/format_reward": 0.9795918464660645, "step": 2454 }, { "completion_length": 296.4285583496094, "epoch": 0.2470440251572327, "grad_norm": 0.5291550755500793, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6222641468048096, "reward_std": 0.12285229191184044, "rewards/accuracy_reward": 0.6324682235717773, "rewards/format_reward": 0.9897959232330322, "step": 2455 }, { "completion_length": 216.70407104492188, "epoch": 0.2471446540880503, "grad_norm": 0.5084607005119324, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.836734652519226, "reward_std": 0.1427408866584301, "rewards/accuracy_reward": 0.8367346823215485, "rewards/format_reward": 1.0, "step": 2456 }, { "completion_length": 278.66326904296875, "epoch": 0.24724528301886792, "grad_norm": 0.36095643043518066, "kl": 0.0689697265625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.754638135433197, "reward_std": 0.15698172897100449, "rewards/accuracy_reward": 0.7852504253387451, "rewards/format_reward": 0.9693877398967743, "step": 2457 }, { "completion_length": 272.0816192626953, "epoch": 0.24734591194968553, "grad_norm": 0.7128535509109497, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7424576878547668, "reward_std": 0.19376573711633682, "rewards/accuracy_reward": 0.7424577474594116, "rewards/format_reward": 1.0, "step": 2458 }, { "completion_length": 231.95917510986328, "epoch": 0.24744654088050314, "grad_norm": 0.7822930216789246, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8392710089683533, "reward_std": 0.17930163070559502, "rewards/accuracy_reward": 0.8494750559329987, "rewards/format_reward": 0.9897959232330322, "step": 2459 }, { "completion_length": 227.2244873046875, "epoch": 0.24754716981132074, "grad_norm": 0.6785222291946411, "kl": 0.062744140625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7089687585830688, "reward_std": 0.14105485752224922, "rewards/accuracy_reward": 0.7089687287807465, "rewards/format_reward": 1.0, "step": 2460 }, { "completion_length": 270.57142639160156, "epoch": 0.24764779874213835, "grad_norm": 1.1103999614715576, "kl": 0.051025390625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6535905599594116, "reward_std": 0.3872256428003311, "rewards/accuracy_reward": 0.6842029094696045, "rewards/format_reward": 0.9693877398967743, "step": 2461 }, { "completion_length": 215.2448959350586, "epoch": 0.24774842767295596, "grad_norm": 0.5331820249557495, "kl": 0.05078125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.9545499682426453, "reward_std": 0.08466050587594509, "rewards/accuracy_reward": 0.95455002784729, "rewards/format_reward": 1.0, "step": 2462 }, { "completion_length": 223.4183578491211, "epoch": 0.2478490566037736, "grad_norm": 0.7705176472663879, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7755101919174194, "reward_std": 0.15069952607154846, "rewards/accuracy_reward": 0.7755101919174194, "rewards/format_reward": 1.0, "step": 2463 }, { "completion_length": 192.81632232666016, "epoch": 0.2479496855345912, "grad_norm": 0.5813769698143005, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7081632614135742, "reward_std": 0.09460558742284775, "rewards/accuracy_reward": 0.718367338180542, "rewards/format_reward": 0.9897959232330322, "step": 2464 }, { "completion_length": 259.72447967529297, "epoch": 0.24805031446540882, "grad_norm": 1.3291155099868774, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.516836702823639, "reward_std": 0.23960064351558685, "rewards/accuracy_reward": 0.5270408242940903, "rewards/format_reward": 0.9897959232330322, "step": 2465 }, { "completion_length": 316.8673400878906, "epoch": 0.24815094339622643, "grad_norm": 0.8010284900665283, "kl": 0.0391845703125, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.7712385654449463, "reward_std": 0.2257491648197174, "rewards/accuracy_reward": 0.7712386250495911, "rewards/format_reward": 1.0, "step": 2466 }, { "completion_length": 236.12244415283203, "epoch": 0.24825157232704403, "grad_norm": 0.595206081867218, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7858113646507263, "reward_std": 0.08671493455767632, "rewards/accuracy_reward": 0.7858114242553711, "rewards/format_reward": 1.0, "step": 2467 }, { "completion_length": 225.82653045654297, "epoch": 0.24835220125786164, "grad_norm": 0.8008219003677368, "kl": 0.052001953125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.594745934009552, "reward_std": 0.16061829030513763, "rewards/accuracy_reward": 0.6049500405788422, "rewards/format_reward": 0.9897959232330322, "step": 2468 }, { "completion_length": 216.81632232666016, "epoch": 0.24845283018867925, "grad_norm": 0.5358607769012451, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.4609239101409912, "reward_std": 0.11423376947641373, "rewards/accuracy_reward": 0.4711280018091202, "rewards/format_reward": 0.9897959232330322, "step": 2469 }, { "completion_length": 168.39794921875, "epoch": 0.24855345911949686, "grad_norm": 2.1487369537353516, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.625244677066803, "reward_std": 0.24219270050525665, "rewards/accuracy_reward": 0.6354486644268036, "rewards/format_reward": 0.9897959232330322, "step": 2470 }, { "completion_length": 257.55101013183594, "epoch": 0.24865408805031447, "grad_norm": 0.6267234683036804, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6090874671936035, "reward_std": 0.14732308685779572, "rewards/accuracy_reward": 0.6090875267982483, "rewards/format_reward": 1.0, "step": 2471 }, { "completion_length": 285.948974609375, "epoch": 0.24875471698113208, "grad_norm": 0.8008359670639038, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6266241669654846, "reward_std": 0.2862475737929344, "rewards/accuracy_reward": 0.6470324397087097, "rewards/format_reward": 0.9795918166637421, "step": 2472 }, { "completion_length": 304.14283752441406, "epoch": 0.2488553459119497, "grad_norm": 2.179551124572754, "kl": 0.059326171875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.483949065208435, "reward_std": 0.22242549434304237, "rewards/accuracy_reward": 0.4839489758014679, "rewards/format_reward": 1.0, "step": 2473 }, { "completion_length": 199.62244415283203, "epoch": 0.2489559748427673, "grad_norm": 0.44851163029670715, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8515509366989136, "reward_std": 0.11042015627026558, "rewards/accuracy_reward": 0.8515509068965912, "rewards/format_reward": 1.0, "step": 2474 }, { "completion_length": 148.91836547851562, "epoch": 0.2490566037735849, "grad_norm": 3.0006585121154785, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7756222486495972, "reward_std": 0.19522815942764282, "rewards/accuracy_reward": 0.7960304915904999, "rewards/format_reward": 0.9795918464660645, "step": 2475 }, { "completion_length": 205.5204086303711, "epoch": 0.24915723270440251, "grad_norm": 0.3494037687778473, "kl": 0.0635986328125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8673469424247742, "reward_std": 0.09217510372400284, "rewards/accuracy_reward": 0.8775510191917419, "rewards/format_reward": 0.9897959232330322, "step": 2476 }, { "completion_length": 182.38774871826172, "epoch": 0.24925786163522012, "grad_norm": 0.8063612580299377, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7273192405700684, "reward_std": 0.153999425470829, "rewards/accuracy_reward": 0.7477274537086487, "rewards/format_reward": 0.9795918166637421, "step": 2477 }, { "completion_length": 278.55101776123047, "epoch": 0.24935849056603773, "grad_norm": 0.6200331449508667, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5961304306983948, "reward_std": 0.08906252682209015, "rewards/accuracy_reward": 0.6063344776630402, "rewards/format_reward": 0.9897959232330322, "step": 2478 }, { "completion_length": 217.06122589111328, "epoch": 0.24945911949685534, "grad_norm": 1.5921852588653564, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.578705608844757, "reward_std": 0.0659047793596983, "rewards/accuracy_reward": 0.5787057727575302, "rewards/format_reward": 1.0, "step": 2479 }, { "completion_length": 218.17346954345703, "epoch": 0.24955974842767295, "grad_norm": 1.102763056755066, "kl": 0.0567626953125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7406846284866333, "reward_std": 0.16049721091985703, "rewards/accuracy_reward": 0.7406845390796661, "rewards/format_reward": 1.0, "step": 2480 }, { "completion_length": 162.37754821777344, "epoch": 0.24966037735849056, "grad_norm": 0.702271580696106, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8774883151054382, "reward_std": 0.16818199306726456, "rewards/accuracy_reward": 0.8876924216747284, "rewards/format_reward": 0.9897959232330322, "step": 2481 }, { "completion_length": 295.5918273925781, "epoch": 0.24976100628930817, "grad_norm": 0.9381111264228821, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6706753373146057, "reward_std": 0.175599105656147, "rewards/accuracy_reward": 0.6910835206508636, "rewards/format_reward": 0.9795918464660645, "step": 2482 }, { "completion_length": 202.09183502197266, "epoch": 0.24986163522012578, "grad_norm": 0.7064167261123657, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.867779791355133, "reward_std": 0.1512804739177227, "rewards/accuracy_reward": 0.8779838979244232, "rewards/format_reward": 0.9897959232330322, "step": 2483 }, { "completion_length": 170.66326141357422, "epoch": 0.2499622641509434, "grad_norm": 8.542491912841797, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8287537097930908, "reward_std": 0.17239182442426682, "rewards/accuracy_reward": 0.8389578461647034, "rewards/format_reward": 0.9897959232330322, "step": 2484 }, { "completion_length": 188.64285278320312, "epoch": 0.250062893081761, "grad_norm": 14.601374626159668, "kl": 0.2479248046875, "learning_rate": 1e-06, "loss": 0.0099, "reward": 1.9387754201889038, "reward_std": 0.09899068437516689, "rewards/accuracy_reward": 0.9591836631298065, "rewards/format_reward": 0.9795918166637421, "step": 2485 }, { "completion_length": 164.03060913085938, "epoch": 0.25016352201257863, "grad_norm": 21.405004501342773, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.804213047027588, "reward_std": 0.14892616122961044, "rewards/accuracy_reward": 0.8042130172252655, "rewards/format_reward": 1.0, "step": 2486 }, { "completion_length": 196.4591827392578, "epoch": 0.2502641509433962, "grad_norm": 0.6055439710617065, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6971938014030457, "reward_std": 0.13325489684939384, "rewards/accuracy_reward": 0.6971938163042068, "rewards/format_reward": 1.0, "step": 2487 }, { "completion_length": 203.2040786743164, "epoch": 0.25036477987421385, "grad_norm": 0.8034802675247192, "kl": 0.0684814453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7194201350212097, "reward_std": 0.20909802615642548, "rewards/accuracy_reward": 0.7296242415904999, "rewards/format_reward": 0.9897959232330322, "step": 2488 }, { "completion_length": 227.4591827392578, "epoch": 0.25046540880503143, "grad_norm": 0.6929945945739746, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7341372966766357, "reward_std": 0.12151839211583138, "rewards/accuracy_reward": 0.7341372668743134, "rewards/format_reward": 1.0, "step": 2489 }, { "completion_length": 294.6326446533203, "epoch": 0.25056603773584907, "grad_norm": 1.050443410873413, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.5568218231201172, "reward_std": 0.18596754223108292, "rewards/accuracy_reward": 0.5670259147882462, "rewards/format_reward": 0.9897959232330322, "step": 2490 }, { "completion_length": 213.6938705444336, "epoch": 0.25066666666666665, "grad_norm": 0.4250561594963074, "kl": 0.048095703125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6239066123962402, "reward_std": 0.06903441390022635, "rewards/accuracy_reward": 0.623906672000885, "rewards/format_reward": 1.0, "step": 2491 }, { "completion_length": 217.91836547851562, "epoch": 0.2507672955974843, "grad_norm": 1.3031562566757202, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6605583429336548, "reward_std": 0.14158008992671967, "rewards/accuracy_reward": 0.6605583429336548, "rewards/format_reward": 1.0, "step": 2492 }, { "completion_length": 199.38775634765625, "epoch": 0.25086792452830187, "grad_norm": 0.6610713601112366, "kl": 0.0687255859375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.60966295003891, "reward_std": 0.17707154899835587, "rewards/accuracy_reward": 0.6198670268058777, "rewards/format_reward": 0.9897959232330322, "step": 2493 }, { "completion_length": 277.4693908691406, "epoch": 0.2509685534591195, "grad_norm": 0.6252409219741821, "kl": 0.059814453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7755101919174194, "reward_std": 0.1270286701619625, "rewards/accuracy_reward": 0.7755101919174194, "rewards/format_reward": 1.0, "step": 2494 }, { "completion_length": 125.95917892456055, "epoch": 0.2510691823899371, "grad_norm": 0.33180779218673706, "kl": 0.0692138671875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8885639309883118, "reward_std": 0.04260341078042984, "rewards/accuracy_reward": 0.8885640501976013, "rewards/format_reward": 1.0, "step": 2495 }, { "completion_length": 202.2040786743164, "epoch": 0.2511698113207547, "grad_norm": 0.6314974427223206, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6836734414100647, "reward_std": 0.2642207369208336, "rewards/accuracy_reward": 0.7346938848495483, "rewards/format_reward": 0.9489795863628387, "step": 2496 }, { "completion_length": 220.95917510986328, "epoch": 0.2512704402515723, "grad_norm": 0.9444795846939087, "kl": 0.0614013671875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5474320650100708, "reward_std": 0.281751312315464, "rewards/accuracy_reward": 0.5576362013816833, "rewards/format_reward": 0.9897959232330322, "step": 2497 }, { "completion_length": 218.36734008789062, "epoch": 0.25137106918238994, "grad_norm": 2.0953071117401123, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.59183669090271, "reward_std": 0.2057349532842636, "rewards/accuracy_reward": 0.5918367356061935, "rewards/format_reward": 1.0, "step": 2498 }, { "completion_length": 232.03060150146484, "epoch": 0.2514716981132075, "grad_norm": 0.5273289084434509, "kl": 0.06103515625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6934688687324524, "reward_std": 0.12354278936982155, "rewards/accuracy_reward": 0.7036730051040649, "rewards/format_reward": 0.9897959232330322, "step": 2499 }, { "completion_length": 277.86734771728516, "epoch": 0.25157232704402516, "grad_norm": 3.287761688232422, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7567864656448364, "reward_std": 0.16716469824314117, "rewards/accuracy_reward": 0.7567865252494812, "rewards/format_reward": 1.0, "step": 2500 }, { "completion_length": 285.2244873046875, "epoch": 0.2516729559748428, "grad_norm": 0.7432020902633667, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.4488338232040405, "reward_std": 0.23254158347845078, "rewards/accuracy_reward": 0.489650160074234, "rewards/format_reward": 0.9591836333274841, "step": 2501 }, { "completion_length": 212.8571319580078, "epoch": 0.2517735849056604, "grad_norm": 1.0234845876693726, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7306121587753296, "reward_std": 0.1461247205734253, "rewards/accuracy_reward": 0.7306122481822968, "rewards/format_reward": 1.0, "step": 2502 }, { "completion_length": 222.75509643554688, "epoch": 0.251874213836478, "grad_norm": 1.0366374254226685, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.62627774477005, "reward_std": 0.15410403162240982, "rewards/accuracy_reward": 0.6364818513393402, "rewards/format_reward": 0.9897959232330322, "step": 2503 }, { "completion_length": 286.7244873046875, "epoch": 0.2519748427672956, "grad_norm": 0.8564579486846924, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.583802342414856, "reward_std": 0.2168661653995514, "rewards/accuracy_reward": 0.6144146025180817, "rewards/format_reward": 0.9693877398967743, "step": 2504 }, { "completion_length": 209.26529693603516, "epoch": 0.25207547169811323, "grad_norm": 0.9490923881530762, "kl": 0.0587158203125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7576530575752258, "reward_std": 0.18489515781402588, "rewards/accuracy_reward": 0.7576530575752258, "rewards/format_reward": 1.0, "step": 2505 }, { "completion_length": 184.2244873046875, "epoch": 0.2521761006289308, "grad_norm": 0.8917577266693115, "kl": 0.064697265625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8252679705619812, "reward_std": 0.12237133085727692, "rewards/accuracy_reward": 0.8252679705619812, "rewards/format_reward": 1.0, "step": 2506 }, { "completion_length": 225.62244415283203, "epoch": 0.25227672955974845, "grad_norm": 0.690298318862915, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6798667907714844, "reward_std": 0.1378183215856552, "rewards/accuracy_reward": 0.710479199886322, "rewards/format_reward": 0.9693877398967743, "step": 2507 }, { "completion_length": 267.07142639160156, "epoch": 0.25237735849056603, "grad_norm": 0.8300545811653137, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5643867254257202, "reward_std": 0.15746884793043137, "rewards/accuracy_reward": 0.5643865764141083, "rewards/format_reward": 1.0, "step": 2508 }, { "completion_length": 248.1938705444336, "epoch": 0.25247798742138367, "grad_norm": 0.7966108918190002, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6683065295219421, "reward_std": 0.16202708333730698, "rewards/accuracy_reward": 0.6785105764865875, "rewards/format_reward": 0.9897959232330322, "step": 2509 }, { "completion_length": 208.88774871826172, "epoch": 0.25257861635220125, "grad_norm": 0.704328179359436, "kl": 0.056884765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.831713616847992, "reward_std": 0.15934231132268906, "rewards/accuracy_reward": 0.8521217703819275, "rewards/format_reward": 0.9795918166637421, "step": 2510 }, { "completion_length": 221.36734008789062, "epoch": 0.2526792452830189, "grad_norm": 0.9246214032173157, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6320026516914368, "reward_std": 0.1594044491648674, "rewards/accuracy_reward": 0.6422067582607269, "rewards/format_reward": 0.9897959232330322, "step": 2511 }, { "completion_length": 270.52040100097656, "epoch": 0.25277987421383646, "grad_norm": 0.5959354639053345, "kl": 0.0592041015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6678429245948792, "reward_std": 0.11268695816397667, "rewards/accuracy_reward": 0.6882511377334595, "rewards/format_reward": 0.9795918464660645, "step": 2512 }, { "completion_length": 290.37754821777344, "epoch": 0.2528805031446541, "grad_norm": 0.6118751168251038, "kl": 0.0457763671875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6206666827201843, "reward_std": 0.2522326558828354, "rewards/accuracy_reward": 0.65127894282341, "rewards/format_reward": 0.9693877398967743, "step": 2513 }, { "completion_length": 270.31632232666016, "epoch": 0.2529811320754717, "grad_norm": 0.7435104846954346, "kl": 0.0567626953125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6348440051078796, "reward_std": 0.19658073782920837, "rewards/accuracy_reward": 0.6348440647125244, "rewards/format_reward": 1.0, "step": 2514 }, { "completion_length": 218.62245178222656, "epoch": 0.2530817610062893, "grad_norm": 1.7125662565231323, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.5765565037727356, "reward_std": 0.19783930480480194, "rewards/accuracy_reward": 0.5867606401443481, "rewards/format_reward": 0.9897959232330322, "step": 2515 }, { "completion_length": 256.64286041259766, "epoch": 0.2531823899371069, "grad_norm": 1.1332577466964722, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6272345781326294, "reward_std": 0.2091360166668892, "rewards/accuracy_reward": 0.6272346377372742, "rewards/format_reward": 1.0, "step": 2516 }, { "completion_length": 265.29590606689453, "epoch": 0.25328301886792454, "grad_norm": 0.8383105397224426, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7979591488838196, "reward_std": 0.14580762386322021, "rewards/accuracy_reward": 0.8081632852554321, "rewards/format_reward": 0.9897959232330322, "step": 2517 }, { "completion_length": 289.3877487182617, "epoch": 0.2533836477987421, "grad_norm": 0.7916789054870605, "kl": 0.0677490234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.673469364643097, "reward_std": 0.2428746223449707, "rewards/accuracy_reward": 0.6836734712123871, "rewards/format_reward": 0.9897959232330322, "step": 2518 }, { "completion_length": 238.70407104492188, "epoch": 0.25348427672955975, "grad_norm": 1.2547756433486938, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6073858141899109, "reward_std": 0.17375026643276215, "rewards/accuracy_reward": 0.6175898760557175, "rewards/format_reward": 0.9897959232330322, "step": 2519 }, { "completion_length": 248.15306091308594, "epoch": 0.25358490566037734, "grad_norm": 0.9924020767211914, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7970521450042725, "reward_std": 0.20502134412527084, "rewards/accuracy_reward": 0.7970521152019501, "rewards/format_reward": 1.0, "step": 2520 }, { "completion_length": 303.9897918701172, "epoch": 0.25368553459119497, "grad_norm": 0.8448767066001892, "kl": 0.052001953125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6880660653114319, "reward_std": 0.1783740632236004, "rewards/accuracy_reward": 0.6880661845207214, "rewards/format_reward": 1.0, "step": 2521 }, { "completion_length": 237.02040100097656, "epoch": 0.25378616352201255, "grad_norm": 0.6954077482223511, "kl": 0.0738525390625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6744412183761597, "reward_std": 0.08356357552111149, "rewards/accuracy_reward": 0.684645265340805, "rewards/format_reward": 0.9897959232330322, "step": 2522 }, { "completion_length": 211.10203552246094, "epoch": 0.2538867924528302, "grad_norm": 1.2346177101135254, "kl": 0.1175537109375, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.844995141029358, "reward_std": 0.18998994678258896, "rewards/accuracy_reward": 0.8551992177963257, "rewards/format_reward": 0.9897959232330322, "step": 2523 }, { "completion_length": 292.69386291503906, "epoch": 0.25398742138364777, "grad_norm": 0.5029512047767639, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7263761162757874, "reward_std": 0.10782880336046219, "rewards/accuracy_reward": 0.7365802228450775, "rewards/format_reward": 0.9897959232330322, "step": 2524 }, { "completion_length": 338.9387664794922, "epoch": 0.2540880503144654, "grad_norm": 0.8220095634460449, "kl": 0.0596923828125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.696790337562561, "reward_std": 0.14996028877794743, "rewards/accuracy_reward": 0.7171984910964966, "rewards/format_reward": 0.9795918166637421, "step": 2525 }, { "completion_length": 222.4693832397461, "epoch": 0.25418867924528304, "grad_norm": 1.0558563470840454, "kl": 0.093017578125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.591927945613861, "reward_std": 0.10753342509269714, "rewards/accuracy_reward": 0.5919279456138611, "rewards/format_reward": 1.0, "step": 2526 }, { "completion_length": 199.87754821777344, "epoch": 0.2542893081761006, "grad_norm": 0.8900222778320312, "kl": 0.0941162109375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.82822984457016, "reward_std": 0.16261007636785507, "rewards/accuracy_reward": 0.8384338319301605, "rewards/format_reward": 0.9897959232330322, "step": 2527 }, { "completion_length": 211.14286041259766, "epoch": 0.25438993710691826, "grad_norm": 0.7230311036109924, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.726627767086029, "reward_std": 0.09755883365869522, "rewards/accuracy_reward": 0.726627767086029, "rewards/format_reward": 1.0, "step": 2528 }, { "completion_length": 241.04080963134766, "epoch": 0.25449056603773584, "grad_norm": 1.8963618278503418, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6602115035057068, "reward_std": 0.24836893379688263, "rewards/accuracy_reward": 0.670415610074997, "rewards/format_reward": 0.9897959232330322, "step": 2529 }, { "completion_length": 239.2040786743164, "epoch": 0.2545911949685535, "grad_norm": 1.1987463235855103, "kl": 0.0626220703125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7557944059371948, "reward_std": 0.21875247359275818, "rewards/accuracy_reward": 0.765998512506485, "rewards/format_reward": 0.9897959232330322, "step": 2530 }, { "completion_length": 244.4591827392578, "epoch": 0.25469182389937106, "grad_norm": 1.0590553283691406, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5958622694015503, "reward_std": 0.19171466678380966, "rewards/accuracy_reward": 0.5958622992038727, "rewards/format_reward": 1.0, "step": 2531 }, { "completion_length": 243.39794921875, "epoch": 0.2547924528301887, "grad_norm": 0.7143878936767578, "kl": 0.045166015625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7741496562957764, "reward_std": 0.10398724675178528, "rewards/accuracy_reward": 0.7741495966911316, "rewards/format_reward": 1.0, "step": 2532 }, { "completion_length": 204.7244873046875, "epoch": 0.2548930817610063, "grad_norm": 0.7671089768409729, "kl": 0.0550537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7898489236831665, "reward_std": 0.14850438386201859, "rewards/accuracy_reward": 0.7898489236831665, "rewards/format_reward": 1.0, "step": 2533 }, { "completion_length": 264.1836700439453, "epoch": 0.2549937106918239, "grad_norm": 52.58388900756836, "kl": 1.9296875, "learning_rate": 1e-06, "loss": 0.0769, "reward": 1.6787940859794617, "reward_std": 0.19679765403270721, "rewards/accuracy_reward": 0.6992023289203644, "rewards/format_reward": 0.9795918166637421, "step": 2534 }, { "completion_length": 236.6530532836914, "epoch": 0.2550943396226415, "grad_norm": 26.449941635131836, "kl": 0.3016357421875, "learning_rate": 1e-06, "loss": 0.0121, "reward": 1.7675294876098633, "reward_std": 0.2009505331516266, "rewards/accuracy_reward": 0.777733564376831, "rewards/format_reward": 0.9897959232330322, "step": 2535 }, { "completion_length": 206.35713958740234, "epoch": 0.25519496855345913, "grad_norm": 1.48262619972229, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6862244606018066, "reward_std": 0.1431613489985466, "rewards/accuracy_reward": 0.6862244606018066, "rewards/format_reward": 1.0, "step": 2536 }, { "completion_length": 242.32653045654297, "epoch": 0.2552955974842767, "grad_norm": 1.0376980304718018, "kl": 0.044921875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6910430192947388, "reward_std": 0.3039446771144867, "rewards/accuracy_reward": 0.6910430490970612, "rewards/format_reward": 1.0, "step": 2537 }, { "completion_length": 220.7244873046875, "epoch": 0.25539622641509435, "grad_norm": 1.0496333837509155, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6842575073242188, "reward_std": 0.1895861104130745, "rewards/accuracy_reward": 0.6842575073242188, "rewards/format_reward": 1.0, "step": 2538 }, { "completion_length": 231.28570556640625, "epoch": 0.25549685534591193, "grad_norm": 0.6257492899894714, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7125040888786316, "reward_std": 0.12285054102540016, "rewards/accuracy_reward": 0.7227080762386322, "rewards/format_reward": 0.9897959232330322, "step": 2539 }, { "completion_length": 207.7142791748047, "epoch": 0.25559748427672957, "grad_norm": 0.9731805324554443, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7827674746513367, "reward_std": 0.2363586127758026, "rewards/accuracy_reward": 0.7827673554420471, "rewards/format_reward": 1.0, "step": 2540 }, { "completion_length": 241.38774871826172, "epoch": 0.25569811320754715, "grad_norm": 1.4057953357696533, "kl": 0.0753173828125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.602638065814972, "reward_std": 0.23227312415838242, "rewards/accuracy_reward": 0.6128421723842621, "rewards/format_reward": 0.9897959232330322, "step": 2541 }, { "completion_length": 253.9387664794922, "epoch": 0.2557987421383648, "grad_norm": 0.6522101163864136, "kl": 0.0418701171875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.61648690700531, "reward_std": 0.1884230375289917, "rewards/accuracy_reward": 0.6266909539699554, "rewards/format_reward": 0.9897959232330322, "step": 2542 }, { "completion_length": 271.1632614135742, "epoch": 0.25589937106918237, "grad_norm": 1.7136070728302002, "kl": 0.115966796875, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.6822764873504639, "reward_std": 0.1812443658709526, "rewards/accuracy_reward": 0.6924805343151093, "rewards/format_reward": 0.9897959232330322, "step": 2543 }, { "completion_length": 181.2959213256836, "epoch": 0.256, "grad_norm": 0.3981655538082123, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.90136057138443, "reward_std": 0.07199322991073132, "rewards/accuracy_reward": 0.9115646481513977, "rewards/format_reward": 0.9897959232330322, "step": 2544 }, { "completion_length": 214.75509643554688, "epoch": 0.2561006289308176, "grad_norm": 0.7068315744400024, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.761489450931549, "reward_std": 0.19989661127328873, "rewards/accuracy_reward": 0.7614895403385162, "rewards/format_reward": 1.0, "step": 2545 }, { "completion_length": 239.40816497802734, "epoch": 0.2562012578616352, "grad_norm": 0.6424553990364075, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.525474727153778, "reward_std": 0.17071111500263214, "rewards/accuracy_reward": 0.5356788635253906, "rewards/format_reward": 0.9897959232330322, "step": 2546 }, { "completion_length": 193.2653045654297, "epoch": 0.2563018867924528, "grad_norm": 0.8254822492599487, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.701900064945221, "reward_std": 0.17405905947089195, "rewards/accuracy_reward": 0.7019001096487045, "rewards/format_reward": 1.0, "step": 2547 }, { "completion_length": 217.04080963134766, "epoch": 0.25640251572327044, "grad_norm": 4.0289692878723145, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6097270250320435, "reward_std": 0.20333971083164215, "rewards/accuracy_reward": 0.6403392553329468, "rewards/format_reward": 0.9693877398967743, "step": 2548 }, { "completion_length": 260.9387664794922, "epoch": 0.2565031446540881, "grad_norm": 0.46902570128440857, "kl": 0.05908203125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6632652878761292, "reward_std": 0.1045607216656208, "rewards/accuracy_reward": 0.6632653027772903, "rewards/format_reward": 1.0, "step": 2549 }, { "completion_length": 295.20408630371094, "epoch": 0.25660377358490566, "grad_norm": 1.2971261739730835, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.559912621974945, "reward_std": 0.24190246313810349, "rewards/accuracy_reward": 0.5599125325679779, "rewards/format_reward": 1.0, "step": 2550 }, { "completion_length": 247.10204315185547, "epoch": 0.2567044025157233, "grad_norm": 0.9252428412437439, "kl": 0.0762939453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.749013900756836, "reward_std": 0.3324781209230423, "rewards/accuracy_reward": 0.7694220542907715, "rewards/format_reward": 0.9795918464660645, "step": 2551 }, { "completion_length": 241.9693832397461, "epoch": 0.2568050314465409, "grad_norm": 0.9374364018440247, "kl": 0.0533447265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5056688785552979, "reward_std": 0.1078929714858532, "rewards/accuracy_reward": 0.5158730000257492, "rewards/format_reward": 0.9897959232330322, "step": 2552 }, { "completion_length": 292.4897918701172, "epoch": 0.2569056603773585, "grad_norm": 0.5716007351875305, "kl": 0.044677734375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7190322279930115, "reward_std": 0.2329864725470543, "rewards/accuracy_reward": 0.7496444582939148, "rewards/format_reward": 0.9693877398967743, "step": 2553 }, { "completion_length": 272.948974609375, "epoch": 0.2570062893081761, "grad_norm": 0.6233465075492859, "kl": 0.0567626953125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.678458571434021, "reward_std": 0.22174514830112457, "rewards/accuracy_reward": 0.7192749381065369, "rewards/format_reward": 0.9591836631298065, "step": 2554 }, { "completion_length": 245.23468017578125, "epoch": 0.25710691823899373, "grad_norm": 1.0239051580429077, "kl": 0.04638671875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7122449278831482, "reward_std": 0.2772202715277672, "rewards/accuracy_reward": 0.7326530516147614, "rewards/format_reward": 0.9795918464660645, "step": 2555 }, { "completion_length": 167.05101776123047, "epoch": 0.2572075471698113, "grad_norm": 0.7331268787384033, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8188583254814148, "reward_std": 0.0837540328502655, "rewards/accuracy_reward": 0.8188583850860596, "rewards/format_reward": 1.0, "step": 2556 }, { "completion_length": 242.40816497802734, "epoch": 0.25730817610062895, "grad_norm": 1.298459768295288, "kl": 0.0716552734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.655479907989502, "reward_std": 0.2559491842985153, "rewards/accuracy_reward": 0.6860921084880829, "rewards/format_reward": 0.9693877398967743, "step": 2557 }, { "completion_length": 277.8367233276367, "epoch": 0.25740880503144653, "grad_norm": 1.183247447013855, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5714285373687744, "reward_std": 0.25005796551704407, "rewards/accuracy_reward": 0.6122449040412903, "rewards/format_reward": 0.9591836333274841, "step": 2558 }, { "completion_length": 219.78571319580078, "epoch": 0.25750943396226417, "grad_norm": 0.5520246028900146, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6193546056747437, "reward_std": 0.09878174494951963, "rewards/accuracy_reward": 0.6193546056747437, "rewards/format_reward": 1.0, "step": 2559 }, { "completion_length": 341.79591369628906, "epoch": 0.25761006289308175, "grad_norm": 1.8429110050201416, "kl": 0.0552978515625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.4734681844711304, "reward_std": 0.2532818168401718, "rewards/accuracy_reward": 0.4836723208427429, "rewards/format_reward": 0.9897959232330322, "step": 2560 }, { "completion_length": 211.01020050048828, "epoch": 0.2577106918238994, "grad_norm": 0.6058098077774048, "kl": 0.0570068359375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7668522000312805, "reward_std": 0.15088435262441635, "rewards/accuracy_reward": 0.7668522000312805, "rewards/format_reward": 1.0, "step": 2561 }, { "completion_length": 264.3367233276367, "epoch": 0.25781132075471697, "grad_norm": 0.848211944103241, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6296703219413757, "reward_std": 0.29529012739658356, "rewards/accuracy_reward": 0.6398743987083435, "rewards/format_reward": 0.9897959232330322, "step": 2562 }, { "completion_length": 209.4897918701172, "epoch": 0.2579119496855346, "grad_norm": 1.1369483470916748, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7216553092002869, "reward_std": 0.18177351355552673, "rewards/accuracy_reward": 0.7318593859672546, "rewards/format_reward": 0.9897959232330322, "step": 2563 }, { "completion_length": 199.56122589111328, "epoch": 0.2580125786163522, "grad_norm": 1.1588729619979858, "kl": 0.0538330078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6686264872550964, "reward_std": 0.20769929140806198, "rewards/accuracy_reward": 0.6788305640220642, "rewards/format_reward": 0.9897959232330322, "step": 2564 }, { "completion_length": 218.03060913085938, "epoch": 0.2581132075471698, "grad_norm": 0.8559512495994568, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.750012218952179, "reward_std": 0.17825184017419815, "rewards/accuracy_reward": 0.7704204320907593, "rewards/format_reward": 0.9795918464660645, "step": 2565 }, { "completion_length": 180.87754821777344, "epoch": 0.2582138364779874, "grad_norm": 0.7780479192733765, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.670238196849823, "reward_std": 0.16581640392541885, "rewards/accuracy_reward": 0.6702382266521454, "rewards/format_reward": 1.0, "step": 2566 }, { "completion_length": 210.59182739257812, "epoch": 0.25831446540880504, "grad_norm": 0.6089844703674316, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7902494072914124, "reward_std": 0.14988336712121964, "rewards/accuracy_reward": 0.7902494072914124, "rewards/format_reward": 1.0, "step": 2567 }, { "completion_length": 205.38774871826172, "epoch": 0.2584150943396226, "grad_norm": 0.7025158405303955, "kl": 0.057861328125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.9384839534759521, "reward_std": 0.10396317392587662, "rewards/accuracy_reward": 0.9486880302429199, "rewards/format_reward": 0.9897959232330322, "step": 2568 }, { "completion_length": 195.83673095703125, "epoch": 0.25851572327044026, "grad_norm": 0.6429625749588013, "kl": 0.0498046875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7040815949440002, "reward_std": 0.13155816867947578, "rewards/accuracy_reward": 0.7040816247463226, "rewards/format_reward": 1.0, "step": 2569 }, { "completion_length": 288.98978424072266, "epoch": 0.25861635220125784, "grad_norm": 0.8306273221969604, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5979333519935608, "reward_std": 0.14986104518175125, "rewards/accuracy_reward": 0.6081375181674957, "rewards/format_reward": 0.9897959232330322, "step": 2570 }, { "completion_length": 257.7040710449219, "epoch": 0.2587169811320755, "grad_norm": 0.614905834197998, "kl": 0.0506591796875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.677551031112671, "reward_std": 0.20003793388605118, "rewards/accuracy_reward": 0.6775510311126709, "rewards/format_reward": 1.0, "step": 2571 }, { "completion_length": 241.95917510986328, "epoch": 0.25881761006289306, "grad_norm": 0.5318043828010559, "kl": 0.0567626953125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8375678658485413, "reward_std": 0.12276042625308037, "rewards/accuracy_reward": 0.8375678360462189, "rewards/format_reward": 1.0, "step": 2572 }, { "completion_length": 236.53060913085938, "epoch": 0.2589182389937107, "grad_norm": 0.8590052127838135, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.674448013305664, "reward_std": 0.12496738322079182, "rewards/accuracy_reward": 0.6744480431079865, "rewards/format_reward": 1.0, "step": 2573 }, { "completion_length": 213.55101013183594, "epoch": 0.25901886792452833, "grad_norm": 2.457002639770508, "kl": 0.0750732421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7734693884849548, "reward_std": 0.20262155681848526, "rewards/accuracy_reward": 0.7938775718212128, "rewards/format_reward": 0.9795918166637421, "step": 2574 }, { "completion_length": 275.52040100097656, "epoch": 0.2591194968553459, "grad_norm": 1.0774561166763306, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6014224290847778, "reward_std": 0.2042771503329277, "rewards/accuracy_reward": 0.6218304932117462, "rewards/format_reward": 0.9795918166637421, "step": 2575 }, { "completion_length": 280.05101013183594, "epoch": 0.25922012578616355, "grad_norm": 0.40068790316581726, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6413994431495667, "reward_std": 0.11349018663167953, "rewards/accuracy_reward": 0.6618075668811798, "rewards/format_reward": 0.9795918166637421, "step": 2576 }, { "completion_length": 290.4285659790039, "epoch": 0.25932075471698113, "grad_norm": 1.9242098331451416, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6754571795463562, "reward_std": 0.22199314087629318, "rewards/accuracy_reward": 0.7060694098472595, "rewards/format_reward": 0.9693877398967743, "step": 2577 }, { "completion_length": 213.53060913085938, "epoch": 0.25942138364779876, "grad_norm": 1.04486882686615, "kl": 0.065185546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8136220574378967, "reward_std": 0.20107095688581467, "rewards/accuracy_reward": 0.8340302109718323, "rewards/format_reward": 0.9795918464660645, "step": 2578 }, { "completion_length": 287.56121826171875, "epoch": 0.25952201257861635, "grad_norm": 0.7579090595245361, "kl": 0.0570068359375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5610432624816895, "reward_std": 0.11534945294260979, "rewards/accuracy_reward": 0.5610433369874954, "rewards/format_reward": 1.0, "step": 2579 }, { "completion_length": 162.88774871826172, "epoch": 0.259622641509434, "grad_norm": 1.1952896118164062, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6672011017799377, "reward_std": 0.22583352029323578, "rewards/accuracy_reward": 0.6774052679538727, "rewards/format_reward": 0.9897959232330322, "step": 2580 }, { "completion_length": 244.28571319580078, "epoch": 0.25972327044025156, "grad_norm": 0.8995382189750671, "kl": 0.05712890625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5683677792549133, "reward_std": 0.1554986536502838, "rewards/accuracy_reward": 0.5785717964172363, "rewards/format_reward": 0.9897959232330322, "step": 2581 }, { "completion_length": 209.6734619140625, "epoch": 0.2598238993710692, "grad_norm": 0.6691484451293945, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.730961263179779, "reward_std": 0.17343929409980774, "rewards/accuracy_reward": 0.7309613823890686, "rewards/format_reward": 1.0, "step": 2582 }, { "completion_length": 273.94898223876953, "epoch": 0.2599245283018868, "grad_norm": 0.7372006177902222, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5119107365608215, "reward_std": 0.2083553448319435, "rewards/accuracy_reward": 0.5119107514619827, "rewards/format_reward": 1.0, "step": 2583 }, { "completion_length": 207.7142791748047, "epoch": 0.2600251572327044, "grad_norm": 1.0552940368652344, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7184724807739258, "reward_std": 0.2212049439549446, "rewards/accuracy_reward": 0.7184726297855377, "rewards/format_reward": 1.0, "step": 2584 }, { "completion_length": 208.84693145751953, "epoch": 0.260125786163522, "grad_norm": 0.6326049566268921, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7346938848495483, "reward_std": 0.18887969478964806, "rewards/accuracy_reward": 0.734693855047226, "rewards/format_reward": 1.0, "step": 2585 }, { "completion_length": 244.61223602294922, "epoch": 0.26022641509433964, "grad_norm": 0.47619956731796265, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8506411910057068, "reward_std": 0.14631860703229904, "rewards/accuracy_reward": 0.8608453273773193, "rewards/format_reward": 0.9897959232330322, "step": 2586 }, { "completion_length": 196.14285278320312, "epoch": 0.2603270440251572, "grad_norm": 1.213678002357483, "kl": 0.0535888671875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.628499984741211, "reward_std": 0.1158481240272522, "rewards/accuracy_reward": 0.6387040913105011, "rewards/format_reward": 0.9897959232330322, "step": 2587 }, { "completion_length": 225.7142791748047, "epoch": 0.26042767295597485, "grad_norm": 0.6666717529296875, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7975881695747375, "reward_std": 0.14820383116602898, "rewards/accuracy_reward": 0.8077921867370605, "rewards/format_reward": 0.9897959232330322, "step": 2588 }, { "completion_length": 186.4591827392578, "epoch": 0.26052830188679243, "grad_norm": 0.3918542265892029, "kl": 0.05712890625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6632652878761292, "reward_std": 0.06517764180898666, "rewards/accuracy_reward": 0.6632652878761292, "rewards/format_reward": 1.0, "step": 2589 }, { "completion_length": 211.7551040649414, "epoch": 0.26062893081761007, "grad_norm": 0.7842116355895996, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.828571379184723, "reward_std": 0.11263052374124527, "rewards/accuracy_reward": 0.8285714089870453, "rewards/format_reward": 1.0, "step": 2590 }, { "completion_length": 241.846923828125, "epoch": 0.26072955974842765, "grad_norm": 0.7570644021034241, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7426970601081848, "reward_std": 0.17958441376686096, "rewards/accuracy_reward": 0.7631052136421204, "rewards/format_reward": 0.9795918464660645, "step": 2591 }, { "completion_length": 215.9285659790039, "epoch": 0.2608301886792453, "grad_norm": 0.6943016052246094, "kl": 0.0596923828125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7431126236915588, "reward_std": 0.11324851587414742, "rewards/accuracy_reward": 0.7533166706562042, "rewards/format_reward": 0.9897959232330322, "step": 2592 }, { "completion_length": 211.6836700439453, "epoch": 0.26093081761006287, "grad_norm": 0.986964762210846, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6954935193061829, "reward_std": 0.20688024908304214, "rewards/accuracy_reward": 0.705697625875473, "rewards/format_reward": 0.9897959232330322, "step": 2593 }, { "completion_length": 217.06121826171875, "epoch": 0.2610314465408805, "grad_norm": 5.230151653289795, "kl": 0.10107421875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7482993006706238, "reward_std": 0.103792954236269, "rewards/accuracy_reward": 0.7482992708683014, "rewards/format_reward": 1.0, "step": 2594 }, { "completion_length": 192.1530532836914, "epoch": 0.2611320754716981, "grad_norm": 1.4287208318710327, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.573829472064972, "reward_std": 0.28200291097164154, "rewards/accuracy_reward": 0.5840336084365845, "rewards/format_reward": 0.9897959232330322, "step": 2595 }, { "completion_length": 207.32652282714844, "epoch": 0.2612327044025157, "grad_norm": 0.4770963191986084, "kl": 0.0692138671875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6845480799674988, "reward_std": 0.09879057481884956, "rewards/accuracy_reward": 0.6845480799674988, "rewards/format_reward": 1.0, "step": 2596 }, { "completion_length": 218.9795913696289, "epoch": 0.2613333333333333, "grad_norm": 0.4393633008003235, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.689990222454071, "reward_std": 0.04901278018951416, "rewards/accuracy_reward": 0.6899902522563934, "rewards/format_reward": 1.0, "step": 2597 }, { "completion_length": 182.25509643554688, "epoch": 0.26143396226415094, "grad_norm": 0.954679548740387, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.9150874614715576, "reward_std": 0.10391998756676912, "rewards/accuracy_reward": 0.9252915680408478, "rewards/format_reward": 0.9897959232330322, "step": 2598 }, { "completion_length": 207.56122589111328, "epoch": 0.2615345911949686, "grad_norm": 0.863519012928009, "kl": 0.0780029296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7133046388626099, "reward_std": 0.19464822113513947, "rewards/accuracy_reward": 0.733712911605835, "rewards/format_reward": 0.9795918464660645, "step": 2599 }, { "completion_length": 140.09183502197266, "epoch": 0.26163522012578616, "grad_norm": 1.008435606956482, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8109533786773682, "reward_std": 0.08724979311227798, "rewards/accuracy_reward": 0.8109534084796906, "rewards/format_reward": 1.0, "step": 2600 }, { "completion_length": 159.04080963134766, "epoch": 0.2617358490566038, "grad_norm": 0.9523258805274963, "kl": 0.10205078125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8080660104751587, "reward_std": 0.14320295304059982, "rewards/accuracy_reward": 0.8080660700798035, "rewards/format_reward": 1.0, "step": 2601 }, { "completion_length": 253.31631469726562, "epoch": 0.2618364779874214, "grad_norm": 0.6842663884162903, "kl": 0.0653076171875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8061224222183228, "reward_std": 0.20006240159273148, "rewards/accuracy_reward": 0.8163264989852905, "rewards/format_reward": 0.9897959232330322, "step": 2602 }, { "completion_length": 214.89795684814453, "epoch": 0.261937106918239, "grad_norm": 1.0037723779678345, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7680143117904663, "reward_std": 0.2549610584974289, "rewards/accuracy_reward": 0.788422554731369, "rewards/format_reward": 0.9795918166637421, "step": 2603 }, { "completion_length": 224.35713958740234, "epoch": 0.2620377358490566, "grad_norm": 0.7052575349807739, "kl": 0.0540771484375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8609328866004944, "reward_std": 0.14098702743649483, "rewards/accuracy_reward": 0.8711369931697845, "rewards/format_reward": 0.9897959232330322, "step": 2604 }, { "completion_length": 219.73468780517578, "epoch": 0.26213836477987423, "grad_norm": 0.6481660604476929, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6895424723625183, "reward_std": 0.18349624425172806, "rewards/accuracy_reward": 0.6997465640306473, "rewards/format_reward": 0.9897959232330322, "step": 2605 }, { "completion_length": 217.01019287109375, "epoch": 0.2622389937106918, "grad_norm": 1.0231597423553467, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6937485933303833, "reward_std": 0.16668188571929932, "rewards/accuracy_reward": 0.7039527893066406, "rewards/format_reward": 0.9897959232330322, "step": 2606 }, { "completion_length": 260.6632537841797, "epoch": 0.26233962264150945, "grad_norm": 0.5490532517433167, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.58933424949646, "reward_std": 0.10252774134278297, "rewards/accuracy_reward": 0.5995383858680725, "rewards/format_reward": 0.9897959232330322, "step": 2607 }, { "completion_length": 204.41836547851562, "epoch": 0.26244025157232703, "grad_norm": 0.7720689177513123, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8385851979255676, "reward_std": 0.14481863752007484, "rewards/accuracy_reward": 0.8385851979255676, "rewards/format_reward": 1.0, "step": 2608 }, { "completion_length": 158.93877410888672, "epoch": 0.26254088050314467, "grad_norm": 4.072691440582275, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7551020383834839, "reward_std": 0.09217509627342224, "rewards/accuracy_reward": 0.7551020383834839, "rewards/format_reward": 1.0, "step": 2609 }, { "completion_length": 214.61224365234375, "epoch": 0.26264150943396225, "grad_norm": 0.6578095555305481, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6809162497520447, "reward_std": 0.17448274791240692, "rewards/accuracy_reward": 0.6809163391590118, "rewards/format_reward": 1.0, "step": 2610 }, { "completion_length": 201.4795913696289, "epoch": 0.2627421383647799, "grad_norm": 0.8796641826629639, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7817872762680054, "reward_std": 0.13204486668109894, "rewards/accuracy_reward": 0.781787246465683, "rewards/format_reward": 1.0, "step": 2611 }, { "completion_length": 191.89795684814453, "epoch": 0.26284276729559747, "grad_norm": 11.810832023620605, "kl": 0.1005859375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7704081535339355, "reward_std": 0.22943148016929626, "rewards/accuracy_reward": 0.8112244606018066, "rewards/format_reward": 0.9591836631298065, "step": 2612 }, { "completion_length": 255.48979949951172, "epoch": 0.2629433962264151, "grad_norm": 0.74309903383255, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5700878500938416, "reward_std": 0.162979394197464, "rewards/accuracy_reward": 0.5802919864654541, "rewards/format_reward": 0.9897959232330322, "step": 2613 }, { "completion_length": 213.95917510986328, "epoch": 0.2630440251572327, "grad_norm": 0.6891154050827026, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6824586987495422, "reward_std": 0.1370229721069336, "rewards/accuracy_reward": 0.7028668522834778, "rewards/format_reward": 0.9795918166637421, "step": 2614 }, { "completion_length": 259.7653045654297, "epoch": 0.2631446540880503, "grad_norm": 0.4349522292613983, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6591836214065552, "reward_std": 0.09924803301692009, "rewards/accuracy_reward": 0.6591836810112, "rewards/format_reward": 1.0, "step": 2615 }, { "completion_length": 177.60203552246094, "epoch": 0.2632452830188679, "grad_norm": 0.7488835453987122, "kl": 0.09423828125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7511614561080933, "reward_std": 0.14795075356960297, "rewards/accuracy_reward": 0.781773716211319, "rewards/format_reward": 0.9693877398967743, "step": 2616 }, { "completion_length": 257.9183654785156, "epoch": 0.26334591194968554, "grad_norm": 0.9359920024871826, "kl": 0.10009765625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6673126816749573, "reward_std": 0.19698835164308548, "rewards/accuracy_reward": 0.6775167882442474, "rewards/format_reward": 0.9897959232330322, "step": 2617 }, { "completion_length": 235.24488830566406, "epoch": 0.2634465408805031, "grad_norm": 1.656056523323059, "kl": 0.0538330078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7369614243507385, "reward_std": 0.13302811607718468, "rewards/accuracy_reward": 0.7369614243507385, "rewards/format_reward": 1.0, "step": 2618 }, { "completion_length": 253.1632537841797, "epoch": 0.26354716981132076, "grad_norm": 2.175610303878784, "kl": 0.169921875, "learning_rate": 1e-06, "loss": 0.0068, "reward": 1.6061224937438965, "reward_std": 0.21432608366012573, "rewards/accuracy_reward": 0.6163264811038971, "rewards/format_reward": 0.9897959232330322, "step": 2619 }, { "completion_length": 201.57142639160156, "epoch": 0.26364779874213834, "grad_norm": 0.6951988935470581, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.683948814868927, "reward_std": 0.10631632804870605, "rewards/accuracy_reward": 0.6941529214382172, "rewards/format_reward": 0.9897959232330322, "step": 2620 }, { "completion_length": 249.18366241455078, "epoch": 0.263748427672956, "grad_norm": 1.3206915855407715, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6008657813072205, "reward_std": 0.14545519649982452, "rewards/accuracy_reward": 0.6110698580741882, "rewards/format_reward": 0.9897959232330322, "step": 2621 }, { "completion_length": 190.84693908691406, "epoch": 0.26384905660377356, "grad_norm": 1.4741817712783813, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.4830076098442078, "reward_std": 0.14589298889040947, "rewards/accuracy_reward": 0.49321168661117554, "rewards/format_reward": 0.9897959232330322, "step": 2622 }, { "completion_length": 259.5, "epoch": 0.2639496855345912, "grad_norm": 1.101829171180725, "kl": 0.10205078125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7937383651733398, "reward_std": 0.15414827316999435, "rewards/accuracy_reward": 0.8039424419403076, "rewards/format_reward": 0.9897959232330322, "step": 2623 }, { "completion_length": 249.7040786743164, "epoch": 0.26405031446540883, "grad_norm": 0.8077969551086426, "kl": 0.05810546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5503724217414856, "reward_std": 0.23316887766122818, "rewards/accuracy_reward": 0.5605766475200653, "rewards/format_reward": 0.9897959232330322, "step": 2624 }, { "completion_length": 175.61223602294922, "epoch": 0.2641509433962264, "grad_norm": 0.9280982613563538, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6427181363105774, "reward_std": 0.15466181933879852, "rewards/accuracy_reward": 0.6631264090538025, "rewards/format_reward": 0.9795918166637421, "step": 2625 }, { "completion_length": 229.78570556640625, "epoch": 0.26425157232704405, "grad_norm": 0.7121678590774536, "kl": 0.09130859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6794191598892212, "reward_std": 0.1178041510283947, "rewards/accuracy_reward": 0.6998273432254791, "rewards/format_reward": 0.9795918464660645, "step": 2626 }, { "completion_length": 213.32652282714844, "epoch": 0.26435220125786163, "grad_norm": 0.45330971479415894, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8290815949440002, "reward_std": 0.06860040873289108, "rewards/accuracy_reward": 0.8290816247463226, "rewards/format_reward": 1.0, "step": 2627 }, { "completion_length": 175.5, "epoch": 0.26445283018867927, "grad_norm": 0.6821607351303101, "kl": 0.0736083984375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8775509595870972, "reward_std": 0.09670460596680641, "rewards/accuracy_reward": 0.8775509893894196, "rewards/format_reward": 1.0, "step": 2628 }, { "completion_length": 273.2857131958008, "epoch": 0.26455345911949685, "grad_norm": 0.7569553256034851, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6198143362998962, "reward_std": 0.2597558721899986, "rewards/accuracy_reward": 0.6504266560077667, "rewards/format_reward": 0.9693877398967743, "step": 2629 }, { "completion_length": 207.4591827392578, "epoch": 0.2646540880503145, "grad_norm": 0.5592883229255676, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8320132493972778, "reward_std": 0.1706225872039795, "rewards/accuracy_reward": 0.8728295862674713, "rewards/format_reward": 0.9591836631298065, "step": 2630 }, { "completion_length": 180.4897918701172, "epoch": 0.26475471698113207, "grad_norm": 1.0735450983047485, "kl": 0.0552978515625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7250728607177734, "reward_std": 0.2143169417977333, "rewards/accuracy_reward": 0.7352769076824188, "rewards/format_reward": 0.9897959232330322, "step": 2631 }, { "completion_length": 291.56121826171875, "epoch": 0.2648553459119497, "grad_norm": 0.8187764286994934, "kl": 0.05517578125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.522303819656372, "reward_std": 0.17119525745511055, "rewards/accuracy_reward": 0.5631200671195984, "rewards/format_reward": 0.9591836631298065, "step": 2632 }, { "completion_length": 285.73468017578125, "epoch": 0.2649559748427673, "grad_norm": 1.047930359840393, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5139941573143005, "reward_std": 0.24902304261922836, "rewards/accuracy_reward": 0.544606402516365, "rewards/format_reward": 0.9693877398967743, "step": 2633 }, { "completion_length": 212.6938705444336, "epoch": 0.2650566037735849, "grad_norm": 1.1011667251586914, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.70298171043396, "reward_std": 0.20684194564819336, "rewards/accuracy_reward": 0.7131857872009277, "rewards/format_reward": 0.9897959232330322, "step": 2634 }, { "completion_length": 211.7040786743164, "epoch": 0.2651572327044025, "grad_norm": 0.5926901698112488, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8814706802368164, "reward_std": 0.08842267841100693, "rewards/accuracy_reward": 0.8814706802368164, "rewards/format_reward": 1.0, "step": 2635 }, { "completion_length": 148.7551040649414, "epoch": 0.26525786163522014, "grad_norm": 0.9775146245956421, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7441205382347107, "reward_std": 0.09544982016086578, "rewards/accuracy_reward": 0.7441204786300659, "rewards/format_reward": 1.0, "step": 2636 }, { "completion_length": 225.95917510986328, "epoch": 0.2653584905660377, "grad_norm": 0.7735543847084045, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5780174136161804, "reward_std": 0.26383721083402634, "rewards/accuracy_reward": 0.6086297035217285, "rewards/format_reward": 0.9693877398967743, "step": 2637 }, { "completion_length": 186.55101776123047, "epoch": 0.26545911949685536, "grad_norm": 0.9134131669998169, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8095237612724304, "reward_std": 0.19378448650240898, "rewards/accuracy_reward": 0.829931914806366, "rewards/format_reward": 0.9795918166637421, "step": 2638 }, { "completion_length": 182.81632232666016, "epoch": 0.26555974842767294, "grad_norm": 0.9428406953811646, "kl": 0.09521484375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6719979643821716, "reward_std": 0.20885030925273895, "rewards/accuracy_reward": 0.7026102840900421, "rewards/format_reward": 0.9693877398967743, "step": 2639 }, { "completion_length": 266.7346954345703, "epoch": 0.2656603773584906, "grad_norm": 0.760504424571991, "kl": 0.0555419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.649829924106598, "reward_std": 0.23545081168413162, "rewards/accuracy_reward": 0.6804421544075012, "rewards/format_reward": 0.9693877398967743, "step": 2640 }, { "completion_length": 186.34693908691406, "epoch": 0.26576100628930815, "grad_norm": 0.8422397375106812, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7055658102035522, "reward_std": 0.12995292991399765, "rewards/accuracy_reward": 0.7157699167728424, "rewards/format_reward": 0.9897959232330322, "step": 2641 }, { "completion_length": 190.32653045654297, "epoch": 0.2658616352201258, "grad_norm": 1.8632677793502808, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.4947521686553955, "reward_std": 0.1657322496175766, "rewards/accuracy_reward": 0.5049562454223633, "rewards/format_reward": 0.9897959232330322, "step": 2642 }, { "completion_length": 190.5204086303711, "epoch": 0.2659622641509434, "grad_norm": 0.8324465155601501, "kl": 0.066162109375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8515042066574097, "reward_std": 0.19672931730747223, "rewards/accuracy_reward": 0.8821164071559906, "rewards/format_reward": 0.9693877398967743, "step": 2643 }, { "completion_length": 223.2653045654297, "epoch": 0.266062893081761, "grad_norm": 0.7859304547309875, "kl": 0.0767822265625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7269355058670044, "reward_std": 0.10124587267637253, "rewards/accuracy_reward": 0.7371396124362946, "rewards/format_reward": 0.9897959232330322, "step": 2644 }, { "completion_length": 274.3673400878906, "epoch": 0.2661635220125786, "grad_norm": 0.9855813384056091, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6425291299819946, "reward_std": 0.19139529764652252, "rewards/accuracy_reward": 0.6731413900852203, "rewards/format_reward": 0.9693877398967743, "step": 2645 }, { "completion_length": 197.11224365234375, "epoch": 0.2662641509433962, "grad_norm": 0.785517692565918, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8433089852333069, "reward_std": 0.16064095497131348, "rewards/accuracy_reward": 0.8637171983718872, "rewards/format_reward": 0.9795918464660645, "step": 2646 }, { "completion_length": 187.1530532836914, "epoch": 0.26636477987421386, "grad_norm": 1.2225526571273804, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6870887875556946, "reward_std": 0.26544172316789627, "rewards/accuracy_reward": 0.7074969708919525, "rewards/format_reward": 0.9795918464660645, "step": 2647 }, { "completion_length": 214.2142791748047, "epoch": 0.26646540880503145, "grad_norm": 5.837754726409912, "kl": 0.05712890625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6632652878761292, "reward_std": 0.25449708849191666, "rewards/accuracy_reward": 0.7040816247463226, "rewards/format_reward": 0.9591836631298065, "step": 2648 }, { "completion_length": 221.31632232666016, "epoch": 0.2665660377358491, "grad_norm": 1.3459407091140747, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7857142686843872, "reward_std": 0.23158938437700272, "rewards/accuracy_reward": 0.8163265287876129, "rewards/format_reward": 0.9693877398967743, "step": 2649 }, { "completion_length": 236.91836547851562, "epoch": 0.26666666666666666, "grad_norm": 0.60841965675354, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.5571181178092957, "reward_std": 0.050715671153739095, "rewards/accuracy_reward": 0.557118222117424, "rewards/format_reward": 1.0, "step": 2650 }, { "completion_length": 254.1530532836914, "epoch": 0.2667672955974843, "grad_norm": 1.1602402925491333, "kl": 0.056884765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6333333253860474, "reward_std": 0.22663909196853638, "rewards/accuracy_reward": 0.6333333253860474, "rewards/format_reward": 1.0, "step": 2651 }, { "completion_length": 196.9693832397461, "epoch": 0.2668679245283019, "grad_norm": 1.1060634851455688, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.685780942440033, "reward_std": 0.19436807185411453, "rewards/accuracy_reward": 0.7061891555786133, "rewards/format_reward": 0.9795918464660645, "step": 2652 }, { "completion_length": 211.448974609375, "epoch": 0.2669685534591195, "grad_norm": 0.8992443084716797, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6796227097511292, "reward_std": 0.13145430013537407, "rewards/accuracy_reward": 0.6796227097511292, "rewards/format_reward": 1.0, "step": 2653 }, { "completion_length": 237.23468780517578, "epoch": 0.2670691823899371, "grad_norm": 1.2518603801727295, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6909620761871338, "reward_std": 0.1703890711069107, "rewards/accuracy_reward": 0.7113702595233917, "rewards/format_reward": 0.9795918464660645, "step": 2654 }, { "completion_length": 243.23468017578125, "epoch": 0.26716981132075474, "grad_norm": 1.0961214303970337, "kl": 0.094970703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7979149222373962, "reward_std": 0.1629481166601181, "rewards/accuracy_reward": 0.8081191182136536, "rewards/format_reward": 0.9897959232330322, "step": 2655 }, { "completion_length": 296.1224365234375, "epoch": 0.2672704402515723, "grad_norm": 1.668418288230896, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.540816307067871, "reward_std": 0.20457448437809944, "rewards/accuracy_reward": 0.5918367207050323, "rewards/format_reward": 0.9489795863628387, "step": 2656 }, { "completion_length": 167.61223602294922, "epoch": 0.26737106918238995, "grad_norm": 0.8441951870918274, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7448979020118713, "reward_std": 0.17508357018232346, "rewards/accuracy_reward": 0.7551020085811615, "rewards/format_reward": 0.9897959232330322, "step": 2657 }, { "completion_length": 195.07141876220703, "epoch": 0.26747169811320753, "grad_norm": 0.6771680116653442, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8103983998298645, "reward_std": 0.1531810723245144, "rewards/accuracy_reward": 0.8103983998298645, "rewards/format_reward": 1.0, "step": 2658 }, { "completion_length": 160.33673095703125, "epoch": 0.26757232704402517, "grad_norm": 0.5597769618034363, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.814285695552826, "reward_std": 0.06481523811817169, "rewards/accuracy_reward": 0.8142856955528259, "rewards/format_reward": 1.0, "step": 2659 }, { "completion_length": 204.59182739257812, "epoch": 0.26767295597484275, "grad_norm": 0.9586927890777588, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7599257230758667, "reward_std": 0.2154315784573555, "rewards/accuracy_reward": 0.770129919052124, "rewards/format_reward": 0.9897959232330322, "step": 2660 }, { "completion_length": 309.89794921875, "epoch": 0.2677735849056604, "grad_norm": 0.6561679840087891, "kl": 0.0435791015625, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.6611952781677246, "reward_std": 0.1492389738559723, "rewards/accuracy_reward": 0.6816034615039825, "rewards/format_reward": 0.9795918464660645, "step": 2661 }, { "completion_length": 218.09183502197266, "epoch": 0.26787421383647797, "grad_norm": 0.5086390376091003, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.9183672666549683, "reward_std": 0.08393208123743534, "rewards/accuracy_reward": 0.9285714030265808, "rewards/format_reward": 0.9897959232330322, "step": 2662 }, { "completion_length": 260.8163146972656, "epoch": 0.2679748427672956, "grad_norm": 0.8022351264953613, "kl": 0.0643310546875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6233423352241516, "reward_std": 0.2131153792142868, "rewards/accuracy_reward": 0.6437504589557648, "rewards/format_reward": 0.9795918166637421, "step": 2663 }, { "completion_length": 186.7244873046875, "epoch": 0.2680754716981132, "grad_norm": 1.8231104612350464, "kl": 0.06689453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6644657254219055, "reward_std": 0.1347343511879444, "rewards/accuracy_reward": 0.6644657552242279, "rewards/format_reward": 1.0, "step": 2664 }, { "completion_length": 239.60203552246094, "epoch": 0.2681761006289308, "grad_norm": 1.007061243057251, "kl": 0.0938720703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8181414604187012, "reward_std": 0.1720147430896759, "rewards/accuracy_reward": 0.8181414902210236, "rewards/format_reward": 1.0, "step": 2665 }, { "completion_length": 220.54080963134766, "epoch": 0.2682767295597484, "grad_norm": 2.075072765350342, "kl": 0.095703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7372448444366455, "reward_std": 0.19842474907636642, "rewards/accuracy_reward": 0.7474489510059357, "rewards/format_reward": 0.9897959232330322, "step": 2666 }, { "completion_length": 250.89794921875, "epoch": 0.26837735849056604, "grad_norm": 0.7376219034194946, "kl": 0.0601806640625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5209844708442688, "reward_std": 0.15609079599380493, "rewards/accuracy_reward": 0.5209844708442688, "rewards/format_reward": 1.0, "step": 2667 }, { "completion_length": 248.32652282714844, "epoch": 0.2684779874213836, "grad_norm": 0.567412257194519, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.5678327679634094, "reward_std": 0.0463423365727067, "rewards/accuracy_reward": 0.5678328424692154, "rewards/format_reward": 1.0, "step": 2668 }, { "completion_length": 276.17345428466797, "epoch": 0.26857861635220126, "grad_norm": 0.7594252824783325, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7124303579330444, "reward_std": 0.21496668457984924, "rewards/accuracy_reward": 0.7124303877353668, "rewards/format_reward": 1.0, "step": 2669 }, { "completion_length": 156.75510025024414, "epoch": 0.26867924528301884, "grad_norm": 1.2237569093704224, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8130178451538086, "reward_std": 0.1613936945796013, "rewards/accuracy_reward": 0.8232220411300659, "rewards/format_reward": 0.9897959232330322, "step": 2670 }, { "completion_length": 274.9897918701172, "epoch": 0.2687798742138365, "grad_norm": 0.8016756772994995, "kl": 0.0672607421875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7244898080825806, "reward_std": 0.2585868537425995, "rewards/accuracy_reward": 0.7346938848495483, "rewards/format_reward": 0.9897959232330322, "step": 2671 }, { "completion_length": 182.42857360839844, "epoch": 0.2688805031446541, "grad_norm": 1.4035636186599731, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6782312989234924, "reward_std": 0.2281077727675438, "rewards/accuracy_reward": 0.6884353458881378, "rewards/format_reward": 0.9897959232330322, "step": 2672 }, { "completion_length": 193.83673095703125, "epoch": 0.2689811320754717, "grad_norm": 0.8731691241264343, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8306167125701904, "reward_std": 0.07436206378042698, "rewards/accuracy_reward": 0.8306166529655457, "rewards/format_reward": 1.0, "step": 2673 }, { "completion_length": 237.82652282714844, "epoch": 0.26908176100628933, "grad_norm": 1.88546884059906, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7392029166221619, "reward_std": 0.2613200917840004, "rewards/accuracy_reward": 0.7392028272151947, "rewards/format_reward": 1.0, "step": 2674 }, { "completion_length": 150.27550506591797, "epoch": 0.2691823899371069, "grad_norm": 3.814176559448242, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6963268518447876, "reward_std": 0.21492524072527885, "rewards/accuracy_reward": 0.7065309882164001, "rewards/format_reward": 0.9897959232330322, "step": 2675 }, { "completion_length": 240.81632232666016, "epoch": 0.26928301886792455, "grad_norm": 0.44827091693878174, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6904114484786987, "reward_std": 0.0976775735616684, "rewards/accuracy_reward": 0.7006154656410217, "rewards/format_reward": 0.9897959232330322, "step": 2676 }, { "completion_length": 213.37754821777344, "epoch": 0.26938364779874213, "grad_norm": 0.6024577021598816, "kl": 0.0443115234375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.8063992261886597, "reward_std": 0.05002867244184017, "rewards/accuracy_reward": 0.8063992559909821, "rewards/format_reward": 1.0, "step": 2677 }, { "completion_length": 264.06121826171875, "epoch": 0.26948427672955977, "grad_norm": 0.90274578332901, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6528080701828003, "reward_std": 0.22434739023447037, "rewards/accuracy_reward": 0.6732161641120911, "rewards/format_reward": 0.9795918464660645, "step": 2678 }, { "completion_length": 304.7346878051758, "epoch": 0.26958490566037735, "grad_norm": 0.8318151831626892, "kl": 0.0684814453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5490606427192688, "reward_std": 0.28635426610708237, "rewards/accuracy_reward": 0.5592647343873978, "rewards/format_reward": 0.9897959232330322, "step": 2679 }, { "completion_length": 195.12244415283203, "epoch": 0.269685534591195, "grad_norm": 5.271459102630615, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.815816342830658, "reward_std": 0.13166808430105448, "rewards/accuracy_reward": 0.826020359992981, "rewards/format_reward": 0.9897959232330322, "step": 2680 }, { "completion_length": 174.86734771728516, "epoch": 0.26978616352201257, "grad_norm": 0.9838129878044128, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7796509265899658, "reward_std": 0.16122691705822945, "rewards/accuracy_reward": 0.7796509265899658, "rewards/format_reward": 1.0, "step": 2681 }, { "completion_length": 182.4693832397461, "epoch": 0.2698867924528302, "grad_norm": 0.7540095448493958, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8095238208770752, "reward_std": 0.14284342527389526, "rewards/accuracy_reward": 0.8197278678417206, "rewards/format_reward": 0.9897959232330322, "step": 2682 }, { "completion_length": 230.7959213256836, "epoch": 0.2699874213836478, "grad_norm": 0.923637330532074, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8214335441589355, "reward_std": 0.1517888717353344, "rewards/accuracy_reward": 0.8316375613212585, "rewards/format_reward": 0.9897959232330322, "step": 2683 }, { "completion_length": 260.94898223876953, "epoch": 0.2700880503144654, "grad_norm": 0.8922394514083862, "kl": 0.0653076171875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7196305990219116, "reward_std": 0.1395331360399723, "rewards/accuracy_reward": 0.7196306884288788, "rewards/format_reward": 1.0, "step": 2684 }, { "completion_length": 263.14286041259766, "epoch": 0.270188679245283, "grad_norm": 0.8703047633171082, "kl": 0.05224609375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7316789031028748, "reward_std": 0.13565117120742798, "rewards/accuracy_reward": 0.7316788733005524, "rewards/format_reward": 1.0, "step": 2685 }, { "completion_length": 231.27550506591797, "epoch": 0.27028930817610064, "grad_norm": 0.49647217988967896, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.67201167345047, "reward_std": 0.04429508559405804, "rewards/accuracy_reward": 0.67201167345047, "rewards/format_reward": 1.0, "step": 2686 }, { "completion_length": 218.39794921875, "epoch": 0.2703899371069182, "grad_norm": 0.6572145223617554, "kl": 0.0799560546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6468271017074585, "reward_std": 0.1313377246260643, "rewards/accuracy_reward": 0.6468271613121033, "rewards/format_reward": 1.0, "step": 2687 }, { "completion_length": 274.9897918701172, "epoch": 0.27049056603773586, "grad_norm": 0.41731420159339905, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5906818509101868, "reward_std": 0.07022402621805668, "rewards/accuracy_reward": 0.5906818509101868, "rewards/format_reward": 1.0, "step": 2688 }, { "completion_length": 224.76531219482422, "epoch": 0.27059119496855344, "grad_norm": 0.4902452528476715, "kl": 0.065185546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7353742122650146, "reward_std": 0.08704865351319313, "rewards/accuracy_reward": 0.7353741228580475, "rewards/format_reward": 1.0, "step": 2689 }, { "completion_length": 199.6734619140625, "epoch": 0.2706918238993711, "grad_norm": 0.6336991786956787, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.777142882347107, "reward_std": 0.02978574763983488, "rewards/accuracy_reward": 0.7771428227424622, "rewards/format_reward": 1.0, "step": 2690 }, { "completion_length": 281.9795837402344, "epoch": 0.27079245283018866, "grad_norm": 0.5593507289886475, "kl": 0.0421142578125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7088288068771362, "reward_std": 0.11133787035942078, "rewards/accuracy_reward": 0.7088288068771362, "rewards/format_reward": 1.0, "step": 2691 }, { "completion_length": 227.94898223876953, "epoch": 0.2708930817610063, "grad_norm": 0.8324235081672668, "kl": 0.056640625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6740182042121887, "reward_std": 0.2382897436618805, "rewards/accuracy_reward": 0.7046304643154144, "rewards/format_reward": 0.9693877398967743, "step": 2692 }, { "completion_length": 195.1836700439453, "epoch": 0.2709937106918239, "grad_norm": 1.151808738708496, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8321428298950195, "reward_std": 0.18175313621759415, "rewards/accuracy_reward": 0.8525510132312775, "rewards/format_reward": 0.9795918464660645, "step": 2693 }, { "completion_length": 256.2550964355469, "epoch": 0.2710943396226415, "grad_norm": 0.5198100805282593, "kl": 0.05615234375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6729348301887512, "reward_std": 0.08012615516781807, "rewards/accuracy_reward": 0.672934889793396, "rewards/format_reward": 1.0, "step": 2694 }, { "completion_length": 171.7551040649414, "epoch": 0.2711949685534591, "grad_norm": 1.193337321281433, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7244897484779358, "reward_std": 0.026997461915016174, "rewards/accuracy_reward": 0.7244897782802582, "rewards/format_reward": 1.0, "step": 2695 }, { "completion_length": 299.57142639160156, "epoch": 0.27129559748427673, "grad_norm": 1.1174414157867432, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5297057628631592, "reward_std": 0.23687326163053513, "rewards/accuracy_reward": 0.5399098545312881, "rewards/format_reward": 0.9897959232330322, "step": 2696 }, { "completion_length": 282.2142791748047, "epoch": 0.27139622641509437, "grad_norm": 0.6614917516708374, "kl": 0.0430908203125, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.8711804151535034, "reward_std": 0.20071996003389359, "rewards/accuracy_reward": 0.871180385351181, "rewards/format_reward": 1.0, "step": 2697 }, { "completion_length": 304.07142639160156, "epoch": 0.27149685534591195, "grad_norm": 0.443770170211792, "kl": 0.0535888671875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8163264989852905, "reward_std": 0.1652088463306427, "rewards/accuracy_reward": 0.8265306055545807, "rewards/format_reward": 0.9897959232330322, "step": 2698 }, { "completion_length": 201.55101776123047, "epoch": 0.2715974842767296, "grad_norm": 0.8229562044143677, "kl": 0.0576171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7179234623908997, "reward_std": 0.19929467141628265, "rewards/accuracy_reward": 0.7179234027862549, "rewards/format_reward": 1.0, "step": 2699 }, { "completion_length": 187.84693908691406, "epoch": 0.27169811320754716, "grad_norm": 0.6721490025520325, "kl": 0.0657958984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8459669351577759, "reward_std": 0.11077907308936119, "rewards/accuracy_reward": 0.8459669351577759, "rewards/format_reward": 1.0, "step": 2700 }, { "completion_length": 208.4795913696289, "epoch": 0.2717987421383648, "grad_norm": 0.9532689452171326, "kl": 0.0704345703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.707646667957306, "reward_std": 0.15653981268405914, "rewards/accuracy_reward": 0.7076466083526611, "rewards/format_reward": 1.0, "step": 2701 }, { "completion_length": 259.0816192626953, "epoch": 0.2718993710691824, "grad_norm": 9.97035026550293, "kl": 0.240966796875, "learning_rate": 1e-06, "loss": 0.0097, "reward": 1.6385669112205505, "reward_std": 0.19837148487567902, "rewards/accuracy_reward": 0.6589751243591309, "rewards/format_reward": 0.9795918166637421, "step": 2702 }, { "completion_length": 279.72447967529297, "epoch": 0.272, "grad_norm": 1.278135061264038, "kl": 0.0504150390625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.652774691581726, "reward_std": 0.26549358665943146, "rewards/accuracy_reward": 0.6527747809886932, "rewards/format_reward": 1.0, "step": 2703 }, { "completion_length": 294.9183578491211, "epoch": 0.2721006289308176, "grad_norm": 0.8463279604911804, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5899376273155212, "reward_std": 0.17864550277590752, "rewards/accuracy_reward": 0.600141704082489, "rewards/format_reward": 0.9897959232330322, "step": 2704 }, { "completion_length": 240.61224365234375, "epoch": 0.27220125786163524, "grad_norm": 0.6680793762207031, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6987366676330566, "reward_std": 0.20307405665516853, "rewards/accuracy_reward": 0.7191447913646698, "rewards/format_reward": 0.9795918464660645, "step": 2705 }, { "completion_length": 332.8061218261719, "epoch": 0.2723018867924528, "grad_norm": 1.5003505945205688, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.629108190536499, "reward_std": 0.28642235696315765, "rewards/accuracy_reward": 0.639312207698822, "rewards/format_reward": 0.9897959232330322, "step": 2706 }, { "completion_length": 338.07142639160156, "epoch": 0.27240251572327046, "grad_norm": 0.9953498840332031, "kl": 0.0467529296875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.570638358592987, "reward_std": 0.3610706031322479, "rewards/accuracy_reward": 0.5808425396680832, "rewards/format_reward": 0.9897959232330322, "step": 2707 }, { "completion_length": 343.79591369628906, "epoch": 0.27250314465408804, "grad_norm": 0.7341569066047668, "kl": 0.0484619140625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5812102556228638, "reward_std": 0.23370712250471115, "rewards/accuracy_reward": 0.5914143621921539, "rewards/format_reward": 0.9897959232330322, "step": 2708 }, { "completion_length": 270.94896697998047, "epoch": 0.2726037735849057, "grad_norm": 0.8725680708885193, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6750741004943848, "reward_std": 0.1654334980994463, "rewards/accuracy_reward": 0.6852783262729645, "rewards/format_reward": 0.9897959232330322, "step": 2709 }, { "completion_length": 252.06121826171875, "epoch": 0.27270440251572325, "grad_norm": 0.6219586730003357, "kl": 0.0579833984375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5394583940505981, "reward_std": 0.11642525345087051, "rewards/accuracy_reward": 0.5496624708175659, "rewards/format_reward": 0.9897959232330322, "step": 2710 }, { "completion_length": 271.99999237060547, "epoch": 0.2728050314465409, "grad_norm": 1.942328929901123, "kl": 0.071533203125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7661281824111938, "reward_std": 0.18894431740045547, "rewards/accuracy_reward": 0.7865363657474518, "rewards/format_reward": 0.9795918464660645, "step": 2711 }, { "completion_length": 225.34693145751953, "epoch": 0.27290566037735847, "grad_norm": 1.3107051849365234, "kl": 0.0780029296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.745383858680725, "reward_std": 0.2448716126382351, "rewards/accuracy_reward": 0.7657919824123383, "rewards/format_reward": 0.9795918166637421, "step": 2712 }, { "completion_length": 274.4285659790039, "epoch": 0.2730062893081761, "grad_norm": 0.4765796363353729, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.612925112247467, "reward_std": 0.14069632440805435, "rewards/accuracy_reward": 0.6129251569509506, "rewards/format_reward": 1.0, "step": 2713 }, { "completion_length": 251.68367767333984, "epoch": 0.2731069182389937, "grad_norm": 1.356801152229309, "kl": 0.0965576171875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6496598720550537, "reward_std": 0.2127891294658184, "rewards/accuracy_reward": 0.6598639488220215, "rewards/format_reward": 0.9897959232330322, "step": 2714 }, { "completion_length": 339.4183654785156, "epoch": 0.2732075471698113, "grad_norm": 0.8424758315086365, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5126387476921082, "reward_std": 0.20100298523902893, "rewards/accuracy_reward": 0.5228428691625595, "rewards/format_reward": 0.9897959232330322, "step": 2715 }, { "completion_length": 253.0204086303711, "epoch": 0.2733081761006289, "grad_norm": 1.6528267860412598, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.73478764295578, "reward_std": 0.22062375396490097, "rewards/accuracy_reward": 0.7449917197227478, "rewards/format_reward": 0.9897959232330322, "step": 2716 }, { "completion_length": 249.69387817382812, "epoch": 0.27340880503144654, "grad_norm": 1.126015305519104, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7831632494926453, "reward_std": 0.2782270163297653, "rewards/accuracy_reward": 0.8137754797935486, "rewards/format_reward": 0.9693877398967743, "step": 2717 }, { "completion_length": 239.11224365234375, "epoch": 0.2735094339622641, "grad_norm": 0.6710326671600342, "kl": 0.13232421875, "learning_rate": 1e-06, "loss": 0.0053, "reward": 1.8458482027053833, "reward_std": 0.2105853334069252, "rewards/accuracy_reward": 0.876460462808609, "rewards/format_reward": 0.9693877398967743, "step": 2718 }, { "completion_length": 280.37755584716797, "epoch": 0.27361006289308176, "grad_norm": 2.3374576568603516, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6715986132621765, "reward_std": 0.18373256921768188, "rewards/accuracy_reward": 0.6818026900291443, "rewards/format_reward": 0.9897959232330322, "step": 2719 }, { "completion_length": 285.29591369628906, "epoch": 0.27371069182389934, "grad_norm": 0.9605891108512878, "kl": 0.0694580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6007421016693115, "reward_std": 0.3342879116535187, "rewards/accuracy_reward": 0.6925788819789886, "rewards/format_reward": 0.9081632494926453, "step": 2720 }, { "completion_length": 170.23468780517578, "epoch": 0.273811320754717, "grad_norm": 0.8254306316375732, "kl": 0.10400390625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.714285671710968, "reward_std": 0.16198477149009705, "rewards/accuracy_reward": 0.7244898080825806, "rewards/format_reward": 0.9897959232330322, "step": 2721 }, { "completion_length": 298.23468017578125, "epoch": 0.2739119496855346, "grad_norm": 0.7274024486541748, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7363945841789246, "reward_std": 0.15821239352226257, "rewards/accuracy_reward": 0.7568026781082153, "rewards/format_reward": 0.9795918464660645, "step": 2722 }, { "completion_length": 175.01020050048828, "epoch": 0.2740125786163522, "grad_norm": 0.7882195711135864, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7070500254631042, "reward_std": 0.07949572801589966, "rewards/accuracy_reward": 0.7172541618347168, "rewards/format_reward": 0.9897959232330322, "step": 2723 }, { "completion_length": 247.7551040649414, "epoch": 0.27411320754716983, "grad_norm": 1.1854808330535889, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6483336091041565, "reward_std": 0.1618269458413124, "rewards/accuracy_reward": 0.6483336389064789, "rewards/format_reward": 1.0, "step": 2724 }, { "completion_length": 238.29590606689453, "epoch": 0.2742138364779874, "grad_norm": 1.5635967254638672, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.807987093925476, "reward_std": 0.22485066950321198, "rewards/accuracy_reward": 0.828395277261734, "rewards/format_reward": 0.9795918464660645, "step": 2725 }, { "completion_length": 268.80611419677734, "epoch": 0.27431446540880505, "grad_norm": 0.8185741901397705, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7551020383834839, "reward_std": 0.16408701613545418, "rewards/accuracy_reward": 0.7653061151504517, "rewards/format_reward": 0.9897959232330322, "step": 2726 }, { "completion_length": 211.54080963134766, "epoch": 0.27441509433962263, "grad_norm": 1.0035685300827026, "kl": 0.08740234375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6775509715080261, "reward_std": 0.3004048466682434, "rewards/accuracy_reward": 0.718367338180542, "rewards/format_reward": 0.9591836631298065, "step": 2727 }, { "completion_length": 195.7142791748047, "epoch": 0.27451572327044027, "grad_norm": 0.9331764578819275, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.5904434323310852, "reward_std": 0.0902504064142704, "rewards/accuracy_reward": 0.6006475239992142, "rewards/format_reward": 0.9897959232330322, "step": 2728 }, { "completion_length": 240.10203552246094, "epoch": 0.27461635220125785, "grad_norm": 2.0522356033325195, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7761104106903076, "reward_std": 0.23650409281253815, "rewards/accuracy_reward": 0.8067226707935333, "rewards/format_reward": 0.9693877398967743, "step": 2729 }, { "completion_length": 207.2040786743164, "epoch": 0.2747169811320755, "grad_norm": 1.0538675785064697, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6703514456748962, "reward_std": 0.22569436579942703, "rewards/accuracy_reward": 0.6907596290111542, "rewards/format_reward": 0.9795918166637421, "step": 2730 }, { "completion_length": 206.448974609375, "epoch": 0.27481761006289307, "grad_norm": 0.8596115708351135, "kl": 0.105224609375, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.664932131767273, "reward_std": 0.254615843296051, "rewards/accuracy_reward": 0.7159524857997894, "rewards/format_reward": 0.9489795863628387, "step": 2731 }, { "completion_length": 290.2857131958008, "epoch": 0.2749182389937107, "grad_norm": 1.0764737129211426, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.582617461681366, "reward_std": 0.2705649510025978, "rewards/accuracy_reward": 0.6438419371843338, "rewards/format_reward": 0.9387754797935486, "step": 2732 }, { "completion_length": 269.31632232666016, "epoch": 0.2750188679245283, "grad_norm": 1.9080805778503418, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6555142998695374, "reward_std": 0.24031279236078262, "rewards/accuracy_reward": 0.7167388498783112, "rewards/format_reward": 0.9387754797935486, "step": 2733 }, { "completion_length": 368.8775329589844, "epoch": 0.2751194968553459, "grad_norm": 1.2051763534545898, "kl": 0.0960693359375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.5060062408447266, "reward_std": 0.2855432257056236, "rewards/accuracy_reward": 0.5876389145851135, "rewards/format_reward": 0.918367326259613, "step": 2734 }, { "completion_length": 248.39794921875, "epoch": 0.2752201257861635, "grad_norm": 0.9185336828231812, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.4887612462043762, "reward_std": 0.28796179592609406, "rewards/accuracy_reward": 0.6010061204433441, "rewards/format_reward": 0.8877550959587097, "step": 2735 }, { "completion_length": 355.82652282714844, "epoch": 0.27532075471698114, "grad_norm": 0.6382092833518982, "kl": 0.0487060546875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5349091291427612, "reward_std": 0.1808183416724205, "rewards/accuracy_reward": 0.5553172677755356, "rewards/format_reward": 0.9795918166637421, "step": 2736 }, { "completion_length": 234.24489212036133, "epoch": 0.2754213836477987, "grad_norm": 0.6220177412033081, "kl": 0.0455322265625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7639334201812744, "reward_std": 0.16105270385742188, "rewards/accuracy_reward": 0.7945457100868225, "rewards/format_reward": 0.9693877398967743, "step": 2737 }, { "completion_length": 187.6938705444336, "epoch": 0.27552201257861636, "grad_norm": 0.9209526181221008, "kl": 0.057861328125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7734110355377197, "reward_std": 0.17980916798114777, "rewards/accuracy_reward": 0.7734110951423645, "rewards/format_reward": 1.0, "step": 2738 }, { "completion_length": 263.7244873046875, "epoch": 0.27562264150943394, "grad_norm": 1.1170759201049805, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5234472155570984, "reward_std": 0.21416283398866653, "rewards/accuracy_reward": 0.5336513519287109, "rewards/format_reward": 0.9897959232330322, "step": 2739 }, { "completion_length": 280.4183654785156, "epoch": 0.2757232704402516, "grad_norm": 1.3157621622085571, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7244897484779358, "reward_std": 0.2068992257118225, "rewards/accuracy_reward": 0.7448979318141937, "rewards/format_reward": 0.9795918166637421, "step": 2740 }, { "completion_length": 336.1836700439453, "epoch": 0.27582389937106916, "grad_norm": 0.6329345703125, "kl": 0.0513916015625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.45947265625, "reward_std": 0.22786551713943481, "rewards/accuracy_reward": 0.4798808693885803, "rewards/format_reward": 0.9795918166637421, "step": 2741 }, { "completion_length": 267.5918273925781, "epoch": 0.2759245283018868, "grad_norm": 0.862964391708374, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7227890491485596, "reward_std": 0.2302909716963768, "rewards/accuracy_reward": 0.75340136885643, "rewards/format_reward": 0.9693877398967743, "step": 2742 }, { "completion_length": 261.1836624145508, "epoch": 0.2760251572327044, "grad_norm": 0.5419114232063293, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.772594690322876, "reward_std": 0.12994526326656342, "rewards/accuracy_reward": 0.7930029034614563, "rewards/format_reward": 0.9795918166637421, "step": 2743 }, { "completion_length": 288.89794921875, "epoch": 0.276125786163522, "grad_norm": 0.45994657278060913, "kl": 0.062255859375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8614948987960815, "reward_std": 0.155817661434412, "rewards/accuracy_reward": 0.8716990053653717, "rewards/format_reward": 0.9897959232330322, "step": 2744 }, { "completion_length": 278.45916748046875, "epoch": 0.27622641509433965, "grad_norm": 0.411215603351593, "kl": 0.0615234375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.728316307067871, "reward_std": 0.17616861313581467, "rewards/accuracy_reward": 0.7487244606018066, "rewards/format_reward": 0.9795918166637421, "step": 2745 }, { "completion_length": 273.6938781738281, "epoch": 0.27632704402515723, "grad_norm": 0.8555026054382324, "kl": 0.058837890625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6997526288032532, "reward_std": 0.30345942825078964, "rewards/accuracy_reward": 0.7609771192073822, "rewards/format_reward": 0.938775509595871, "step": 2746 }, { "completion_length": 300.4387664794922, "epoch": 0.27642767295597487, "grad_norm": 0.6306878328323364, "kl": 0.0711669921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6241793036460876, "reward_std": 0.1896706223487854, "rewards/accuracy_reward": 0.6649957001209259, "rewards/format_reward": 0.9591836631298065, "step": 2747 }, { "completion_length": 324.1734619140625, "epoch": 0.27652830188679245, "grad_norm": 0.7519368529319763, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.5466305613517761, "reward_std": 0.20536860823631287, "rewards/accuracy_reward": 0.5568347573280334, "rewards/format_reward": 0.9897959232330322, "step": 2748 }, { "completion_length": 249.25509643554688, "epoch": 0.2766289308176101, "grad_norm": 0.8938739895820618, "kl": 0.099853515625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6391550302505493, "reward_std": 0.09535333327949047, "rewards/accuracy_reward": 0.6493591368198395, "rewards/format_reward": 0.9897959232330322, "step": 2749 }, { "completion_length": 250.4285659790039, "epoch": 0.27672955974842767, "grad_norm": 1.0790084600448608, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6592859625816345, "reward_std": 0.20550784468650818, "rewards/accuracy_reward": 0.6694900691509247, "rewards/format_reward": 0.9897959232330322, "step": 2750 }, { "completion_length": 267.7448959350586, "epoch": 0.2768301886792453, "grad_norm": 1.0063060522079468, "kl": 0.0714111328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7828211188316345, "reward_std": 0.08127975463867188, "rewards/accuracy_reward": 0.7828210890293121, "rewards/format_reward": 1.0, "step": 2751 }, { "completion_length": 266.0714111328125, "epoch": 0.2769308176100629, "grad_norm": 0.5734320282936096, "kl": 0.06103515625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.811424195766449, "reward_std": 0.1938941478729248, "rewards/accuracy_reward": 0.8114242553710938, "rewards/format_reward": 1.0, "step": 2752 }, { "completion_length": 223.1734619140625, "epoch": 0.2770314465408805, "grad_norm": 0.42171016335487366, "kl": 0.0723876953125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.785351276397705, "reward_std": 0.10273881070315838, "rewards/accuracy_reward": 0.8057594299316406, "rewards/format_reward": 0.9795918166637421, "step": 2753 }, { "completion_length": 268.53060150146484, "epoch": 0.2771320754716981, "grad_norm": 0.5878670811653137, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6507287621498108, "reward_std": 0.10795031487941742, "rewards/accuracy_reward": 0.6711369752883911, "rewards/format_reward": 0.9795918464660645, "step": 2754 }, { "completion_length": 236.44896697998047, "epoch": 0.27723270440251574, "grad_norm": 0.4769333004951477, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6984053254127502, "reward_std": 0.1223892830312252, "rewards/accuracy_reward": 0.7086094617843628, "rewards/format_reward": 0.9897959232330322, "step": 2755 }, { "completion_length": 289.4591827392578, "epoch": 0.2773333333333333, "grad_norm": 0.7174396514892578, "kl": 0.05859375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.390864908695221, "reward_std": 0.1310713067650795, "rewards/accuracy_reward": 0.4010689854621887, "rewards/format_reward": 0.9897959232330322, "step": 2756 }, { "completion_length": 308.30611419677734, "epoch": 0.27743396226415096, "grad_norm": 0.9991602897644043, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.451824963092804, "reward_std": 0.15234778448939323, "rewards/accuracy_reward": 0.4722331166267395, "rewards/format_reward": 0.9795918166637421, "step": 2757 }, { "completion_length": 315.24488830566406, "epoch": 0.27753459119496854, "grad_norm": 0.9956981539726257, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.586734652519226, "reward_std": 0.2548293396830559, "rewards/accuracy_reward": 0.6071428507566452, "rewards/format_reward": 0.9795918464660645, "step": 2758 }, { "completion_length": 214.75509643554688, "epoch": 0.2776352201257862, "grad_norm": 1.224664568901062, "kl": 0.0966796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7730401158332825, "reward_std": 0.27168673276901245, "rewards/accuracy_reward": 0.7832442224025726, "rewards/format_reward": 0.9897959232330322, "step": 2759 }, { "completion_length": 307.3163146972656, "epoch": 0.27773584905660376, "grad_norm": 1.1456588506698608, "kl": 0.0533447265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5596169233322144, "reward_std": 0.2788782864809036, "rewards/accuracy_reward": 0.5596170425415039, "rewards/format_reward": 1.0, "step": 2760 }, { "completion_length": 245.76529693603516, "epoch": 0.2778364779874214, "grad_norm": 0.6541807055473328, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7346938848495483, "reward_std": 0.09670460969209671, "rewards/accuracy_reward": 0.7448979318141937, "rewards/format_reward": 0.9897959232330322, "step": 2761 }, { "completion_length": 266.24488830566406, "epoch": 0.277937106918239, "grad_norm": 1.1713588237762451, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7397959232330322, "reward_std": 0.1708514615893364, "rewards/accuracy_reward": 0.7602040767669678, "rewards/format_reward": 0.9795918166637421, "step": 2762 }, { "completion_length": 271.2551040649414, "epoch": 0.2780377358490566, "grad_norm": 1.236437201499939, "kl": 0.065185546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7233559489250183, "reward_std": 0.2076856642961502, "rewards/accuracy_reward": 0.7335600852966309, "rewards/format_reward": 0.9897959232330322, "step": 2763 }, { "completion_length": 244.0, "epoch": 0.2781383647798742, "grad_norm": 0.6909499764442444, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8304523229599, "reward_std": 0.16754445061087608, "rewards/accuracy_reward": 0.8610645234584808, "rewards/format_reward": 0.9693877398967743, "step": 2764 }, { "completion_length": 249.2346954345703, "epoch": 0.27823899371069183, "grad_norm": 1.3029894828796387, "kl": 0.0662841796875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6764333844184875, "reward_std": 0.17042957246303558, "rewards/accuracy_reward": 0.6764334142208099, "rewards/format_reward": 1.0, "step": 2765 }, { "completion_length": 223.78571319580078, "epoch": 0.2783396226415094, "grad_norm": 1.050074577331543, "kl": 0.095703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6353740692138672, "reward_std": 0.2303837463259697, "rewards/accuracy_reward": 0.6455782055854797, "rewards/format_reward": 0.9897959232330322, "step": 2766 }, { "completion_length": 271.14286041259766, "epoch": 0.27844025157232705, "grad_norm": 0.4918425381183624, "kl": 0.0579833984375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.702443778514862, "reward_std": 0.14090033620595932, "rewards/accuracy_reward": 0.7024437785148621, "rewards/format_reward": 1.0, "step": 2767 }, { "completion_length": 242.53060913085938, "epoch": 0.27854088050314463, "grad_norm": 1.5534913539886475, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6727068424224854, "reward_std": 0.18155473470687866, "rewards/accuracy_reward": 0.6727068871259689, "rewards/format_reward": 1.0, "step": 2768 }, { "completion_length": 287.60204315185547, "epoch": 0.27864150943396226, "grad_norm": 0.8589257001876831, "kl": 0.0531005859375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6576645970344543, "reward_std": 0.2948238253593445, "rewards/accuracy_reward": 0.6780728101730347, "rewards/format_reward": 0.9795918166637421, "step": 2769 }, { "completion_length": 243.49999237060547, "epoch": 0.2787421383647799, "grad_norm": 0.9340376257896423, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6869097352027893, "reward_std": 0.2094186693429947, "rewards/accuracy_reward": 0.6971138715744019, "rewards/format_reward": 0.9897959232330322, "step": 2770 }, { "completion_length": 231.33672332763672, "epoch": 0.2788427672955975, "grad_norm": 0.7219240665435791, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8060091137886047, "reward_std": 0.17253787070512772, "rewards/accuracy_reward": 0.8162131607532501, "rewards/format_reward": 0.9897959232330322, "step": 2771 }, { "completion_length": 194.10203552246094, "epoch": 0.2789433962264151, "grad_norm": 0.6056773066520691, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8067773580551147, "reward_std": 0.15769196301698685, "rewards/accuracy_reward": 0.8169814348220825, "rewards/format_reward": 0.9897959232330322, "step": 2772 }, { "completion_length": 273.7755126953125, "epoch": 0.2790440251572327, "grad_norm": 0.9599092602729797, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6476367115974426, "reward_std": 0.2323230803012848, "rewards/accuracy_reward": 0.6578407883644104, "rewards/format_reward": 0.9897959232330322, "step": 2773 }, { "completion_length": 267.3673400878906, "epoch": 0.27914465408805034, "grad_norm": 1.0465036630630493, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7074506282806396, "reward_std": 0.27512678503990173, "rewards/accuracy_reward": 0.7074505686759949, "rewards/format_reward": 1.0, "step": 2774 }, { "completion_length": 164.63265228271484, "epoch": 0.2792452830188679, "grad_norm": 0.9968351721763611, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.9081632494926453, "reward_std": 0.12370207160711288, "rewards/accuracy_reward": 0.9285714328289032, "rewards/format_reward": 0.9795918464660645, "step": 2775 }, { "completion_length": 263.02040100097656, "epoch": 0.27934591194968555, "grad_norm": 0.44958287477493286, "kl": 0.0653076171875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7915450930595398, "reward_std": 0.10427561402320862, "rewards/accuracy_reward": 0.8017492294311523, "rewards/format_reward": 0.9897959232330322, "step": 2776 }, { "completion_length": 269.79590606689453, "epoch": 0.27944654088050314, "grad_norm": 0.5526366233825684, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6895485520362854, "reward_std": 0.09518550662323833, "rewards/accuracy_reward": 0.689548522233963, "rewards/format_reward": 1.0, "step": 2777 }, { "completion_length": 276.9081573486328, "epoch": 0.2795471698113208, "grad_norm": 0.8144568204879761, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5728214383125305, "reward_std": 0.17736732959747314, "rewards/accuracy_reward": 0.5830255895853043, "rewards/format_reward": 0.9897959232330322, "step": 2778 }, { "completion_length": 262.67345428466797, "epoch": 0.27964779874213835, "grad_norm": 1.0792771577835083, "kl": 0.0662841796875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.702721118927002, "reward_std": 0.2212061733007431, "rewards/accuracy_reward": 0.7333332896232605, "rewards/format_reward": 0.9693877398967743, "step": 2779 }, { "completion_length": 299.39794921875, "epoch": 0.279748427672956, "grad_norm": 0.8824490308761597, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8269193172454834, "reward_std": 0.2713307961821556, "rewards/accuracy_reward": 0.8575315475463867, "rewards/format_reward": 0.9693877398967743, "step": 2780 }, { "completion_length": 128.78570938110352, "epoch": 0.27984905660377357, "grad_norm": 0.4385693371295929, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.744764804840088, "reward_std": 0.056269681081175804, "rewards/accuracy_reward": 0.7447648048400879, "rewards/format_reward": 1.0, "step": 2781 }, { "completion_length": 231.78571319580078, "epoch": 0.2799496855345912, "grad_norm": 0.962510347366333, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7884972095489502, "reward_std": 0.14842950832098722, "rewards/accuracy_reward": 0.798701286315918, "rewards/format_reward": 0.9897959232330322, "step": 2782 }, { "completion_length": 227.4285659790039, "epoch": 0.2800503144654088, "grad_norm": 0.7141614556312561, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7829313278198242, "reward_std": 0.11254712752997875, "rewards/accuracy_reward": 0.7829313278198242, "rewards/format_reward": 1.0, "step": 2783 }, { "completion_length": 259.06121826171875, "epoch": 0.2801509433962264, "grad_norm": 0.5834965109825134, "kl": 0.10693359375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.5721184015274048, "reward_std": 0.10125355515629053, "rewards/accuracy_reward": 0.582322508096695, "rewards/format_reward": 0.9897959232330322, "step": 2784 }, { "completion_length": 281.6326446533203, "epoch": 0.280251572327044, "grad_norm": 0.6759525537490845, "kl": 0.064697265625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6366546750068665, "reward_std": 0.22154808044433594, "rewards/accuracy_reward": 0.6468587219715118, "rewards/format_reward": 0.9897959232330322, "step": 2785 }, { "completion_length": 408.69386291503906, "epoch": 0.28035220125786164, "grad_norm": 0.6134850978851318, "kl": 0.046875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.4703214764595032, "reward_std": 0.2448902428150177, "rewards/accuracy_reward": 0.4907296746969223, "rewards/format_reward": 0.9795918464660645, "step": 2786 }, { "completion_length": 247.28570556640625, "epoch": 0.2804528301886792, "grad_norm": 0.6561992764472961, "kl": 0.0596923828125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7387025356292725, "reward_std": 0.20550940930843353, "rewards/accuracy_reward": 0.7591107189655304, "rewards/format_reward": 0.9795918166637421, "step": 2787 }, { "completion_length": 201.33673095703125, "epoch": 0.28055345911949686, "grad_norm": 1.0323408842086792, "kl": 0.09521484375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6978861689567566, "reward_std": 0.17374641448259354, "rewards/accuracy_reward": 0.6978862583637238, "rewards/format_reward": 1.0, "step": 2788 }, { "completion_length": 223.05101776123047, "epoch": 0.28065408805031444, "grad_norm": 0.9267804026603699, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.689504325389862, "reward_std": 0.16559647396206856, "rewards/accuracy_reward": 0.7201165854930878, "rewards/format_reward": 0.9693877398967743, "step": 2789 }, { "completion_length": 196.32653045654297, "epoch": 0.2807547169811321, "grad_norm": 0.609680712223053, "kl": 0.0531005859375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.807141900062561, "reward_std": 0.11181196570396423, "rewards/accuracy_reward": 0.807141900062561, "rewards/format_reward": 1.0, "step": 2790 }, { "completion_length": 257.57141876220703, "epoch": 0.28085534591194966, "grad_norm": 0.7828918099403381, "kl": 0.049560546875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5768746733665466, "reward_std": 0.2268533557653427, "rewards/accuracy_reward": 0.5972827672958374, "rewards/format_reward": 0.9795918166637421, "step": 2791 }, { "completion_length": 200.89795684814453, "epoch": 0.2809559748427673, "grad_norm": 0.8561755418777466, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7278027534484863, "reward_std": 0.10248774103820324, "rewards/accuracy_reward": 0.7278028130531311, "rewards/format_reward": 1.0, "step": 2792 }, { "completion_length": 267.39794921875, "epoch": 0.2810566037735849, "grad_norm": 0.3978388011455536, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7110019326210022, "reward_std": 0.12684814259409904, "rewards/accuracy_reward": 0.7212060391902924, "rewards/format_reward": 0.9897959232330322, "step": 2793 }, { "completion_length": 269.5306091308594, "epoch": 0.2811572327044025, "grad_norm": 0.8690406680107117, "kl": 0.0516357421875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.669632613658905, "reward_std": 0.2622845694422722, "rewards/accuracy_reward": 0.6798367500305176, "rewards/format_reward": 0.9897959232330322, "step": 2794 }, { "completion_length": 200.26529693603516, "epoch": 0.28125786163522015, "grad_norm": 0.7443576455116272, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8709985613822937, "reward_std": 0.17151319235563278, "rewards/accuracy_reward": 0.891406774520874, "rewards/format_reward": 0.9795918166637421, "step": 2795 }, { "completion_length": 182.4897918701172, "epoch": 0.28135849056603773, "grad_norm": 0.8744491338729858, "kl": 0.0604248046875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.9054421782493591, "reward_std": 0.12418831512331963, "rewards/accuracy_reward": 0.9258503615856171, "rewards/format_reward": 0.9795918464660645, "step": 2796 }, { "completion_length": 246.91836547851562, "epoch": 0.28145911949685537, "grad_norm": 0.7434391975402832, "kl": 0.0589599609375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6457719206809998, "reward_std": 0.14707902818918228, "rewards/accuracy_reward": 0.645771935582161, "rewards/format_reward": 1.0, "step": 2797 }, { "completion_length": 279.17346954345703, "epoch": 0.28155974842767295, "grad_norm": 0.5339738130569458, "kl": 0.0545654296875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6338080167770386, "reward_std": 0.0983477421104908, "rewards/accuracy_reward": 0.644012063741684, "rewards/format_reward": 0.9897959232330322, "step": 2798 }, { "completion_length": 203.9591827392578, "epoch": 0.2816603773584906, "grad_norm": 0.43986210227012634, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8231292366981506, "reward_std": 0.0916704311966896, "rewards/accuracy_reward": 0.8231292366981506, "rewards/format_reward": 1.0, "step": 2799 }, { "completion_length": 228.68366241455078, "epoch": 0.28176100628930817, "grad_norm": 0.7607265710830688, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5018552541732788, "reward_std": 0.11289847642183304, "rewards/accuracy_reward": 0.5120593458414078, "rewards/format_reward": 0.9897959232330322, "step": 2800 }, { "completion_length": 220.10203552246094, "epoch": 0.2818616352201258, "grad_norm": 2.332254648208618, "kl": 0.111083984375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.6627498865127563, "reward_std": 0.16920720040798187, "rewards/accuracy_reward": 0.6627498269081116, "rewards/format_reward": 1.0, "step": 2801 }, { "completion_length": 180.08162689208984, "epoch": 0.2819622641509434, "grad_norm": 1.3477028608322144, "kl": 0.103271484375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6809714436531067, "reward_std": 0.23024101555347443, "rewards/accuracy_reward": 0.6911756098270416, "rewards/format_reward": 0.9897959232330322, "step": 2802 }, { "completion_length": 267.0714340209961, "epoch": 0.282062893081761, "grad_norm": 0.7643663883209229, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6181817650794983, "reward_std": 0.14556938409805298, "rewards/accuracy_reward": 0.6283858716487885, "rewards/format_reward": 0.9897959232330322, "step": 2803 }, { "completion_length": 190.69387817382812, "epoch": 0.2821635220125786, "grad_norm": 0.6590152382850647, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7206246256828308, "reward_std": 0.15166759118437767, "rewards/accuracy_reward": 0.7206245958805084, "rewards/format_reward": 1.0, "step": 2804 }, { "completion_length": 190.51020050048828, "epoch": 0.28226415094339624, "grad_norm": 0.7924579381942749, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8673468828201294, "reward_std": 0.10003120824694633, "rewards/accuracy_reward": 0.8775509893894196, "rewards/format_reward": 0.9897959232330322, "step": 2805 }, { "completion_length": 187.63265228271484, "epoch": 0.2823647798742138, "grad_norm": 0.3811144530773163, "kl": 0.062744140625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.9081632494926453, "reward_std": 0.05399492383003235, "rewards/accuracy_reward": 0.9285714328289032, "rewards/format_reward": 0.9795918166637421, "step": 2806 }, { "completion_length": 173.76529693603516, "epoch": 0.28246540880503146, "grad_norm": 0.6852083802223206, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.902397096157074, "reward_std": 0.10265733301639557, "rewards/accuracy_reward": 0.902397096157074, "rewards/format_reward": 1.0, "step": 2807 }, { "completion_length": 232.36734008789062, "epoch": 0.28256603773584904, "grad_norm": 0.6115841865539551, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7331987023353577, "reward_std": 0.16746747493743896, "rewards/accuracy_reward": 0.7434028685092926, "rewards/format_reward": 0.9897959232330322, "step": 2808 }, { "completion_length": 159.88774871826172, "epoch": 0.2826666666666667, "grad_norm": 0.5423650741577148, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8504615426063538, "reward_std": 0.053914714604616165, "rewards/accuracy_reward": 0.8504615724086761, "rewards/format_reward": 1.0, "step": 2809 }, { "completion_length": 270.26529693603516, "epoch": 0.28276729559748426, "grad_norm": 0.6268299221992493, "kl": 0.0552978515625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7785913944244385, "reward_std": 0.17806699872016907, "rewards/accuracy_reward": 0.7785914540290833, "rewards/format_reward": 1.0, "step": 2810 }, { "completion_length": 236.9795913696289, "epoch": 0.2828679245283019, "grad_norm": 0.8989265561103821, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6487998962402344, "reward_std": 0.14122004806995392, "rewards/accuracy_reward": 0.6590040028095245, "rewards/format_reward": 0.9897959232330322, "step": 2811 }, { "completion_length": 244.0816192626953, "epoch": 0.2829685534591195, "grad_norm": 2.29382061958313, "kl": 0.101318359375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6125048995018005, "reward_std": 0.25714661180973053, "rewards/accuracy_reward": 0.653321236371994, "rewards/format_reward": 0.9591836631298065, "step": 2812 }, { "completion_length": 191.41836547851562, "epoch": 0.2830691823899371, "grad_norm": 0.7296483516693115, "kl": 0.0477294921875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.8553557395935059, "reward_std": 0.10410026833415031, "rewards/accuracy_reward": 0.8553557991981506, "rewards/format_reward": 1.0, "step": 2813 }, { "completion_length": 269.7040710449219, "epoch": 0.2831698113207547, "grad_norm": 0.898521900177002, "kl": 0.045166015625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.667867124080658, "reward_std": 0.19024567306041718, "rewards/accuracy_reward": 0.6780711710453033, "rewards/format_reward": 0.9897959232330322, "step": 2814 }, { "completion_length": 304.2244873046875, "epoch": 0.28327044025157233, "grad_norm": 0.5464866161346436, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6574344038963318, "reward_std": 0.18940884992480278, "rewards/accuracy_reward": 0.6676384657621384, "rewards/format_reward": 0.9897959232330322, "step": 2815 }, { "completion_length": 323.8367156982422, "epoch": 0.2833710691823899, "grad_norm": 0.592836320400238, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6123273968696594, "reward_std": 0.12655790895223618, "rewards/accuracy_reward": 0.6327354907989502, "rewards/format_reward": 0.9795918166637421, "step": 2816 }, { "completion_length": 253.6326446533203, "epoch": 0.28347169811320755, "grad_norm": 0.44377923011779785, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6707109808921814, "reward_std": 0.1243201307952404, "rewards/accuracy_reward": 0.6809151470661163, "rewards/format_reward": 0.9897959232330322, "step": 2817 }, { "completion_length": 254.67345428466797, "epoch": 0.28357232704402513, "grad_norm": 0.7229545712471008, "kl": 0.0572509765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7156462669372559, "reward_std": 0.1832295060157776, "rewards/accuracy_reward": 0.7258503139019012, "rewards/format_reward": 0.9897959232330322, "step": 2818 }, { "completion_length": 185.57142639160156, "epoch": 0.28367295597484277, "grad_norm": 0.5298323631286621, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7530612349510193, "reward_std": 0.08669047430157661, "rewards/accuracy_reward": 0.7530612051486969, "rewards/format_reward": 1.0, "step": 2819 }, { "completion_length": 275.09183502197266, "epoch": 0.2837735849056604, "grad_norm": 1.1673716306686401, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.5037415027618408, "reward_std": 0.16715434938669205, "rewards/accuracy_reward": 0.5037414878606796, "rewards/format_reward": 1.0, "step": 2820 }, { "completion_length": 171.448974609375, "epoch": 0.283874213836478, "grad_norm": 1.4353209733963013, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7918529510498047, "reward_std": 0.13169988989830017, "rewards/accuracy_reward": 0.8020570576190948, "rewards/format_reward": 0.9897959232330322, "step": 2821 }, { "completion_length": 244.30612182617188, "epoch": 0.2839748427672956, "grad_norm": 0.5115576982498169, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7257254719734192, "reward_std": 0.17202896997332573, "rewards/accuracy_reward": 0.7563377320766449, "rewards/format_reward": 0.9693877398967743, "step": 2822 }, { "completion_length": 282.01019287109375, "epoch": 0.2840754716981132, "grad_norm": 1.003760814666748, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6160836815834045, "reward_std": 0.17217513173818588, "rewards/accuracy_reward": 0.6262878775596619, "rewards/format_reward": 0.9897959232330322, "step": 2823 }, { "completion_length": 221.13265228271484, "epoch": 0.28417610062893084, "grad_norm": 0.36831870675086975, "kl": 0.0521240234375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.714285671710968, "reward_std": 0.07636035233736038, "rewards/accuracy_reward": 0.7142857015132904, "rewards/format_reward": 1.0, "step": 2824 }, { "completion_length": 203.2142791748047, "epoch": 0.2842767295597484, "grad_norm": 0.5184892416000366, "kl": 0.0726318359375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6683782935142517, "reward_std": 0.06857097800821066, "rewards/accuracy_reward": 0.6683783233165741, "rewards/format_reward": 1.0, "step": 2825 }, { "completion_length": 191.8775405883789, "epoch": 0.28437735849056606, "grad_norm": 2.0866177082061768, "kl": 0.0574951171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7381085753440857, "reward_std": 0.20519083738327026, "rewards/accuracy_reward": 0.7483126521110535, "rewards/format_reward": 0.9897959232330322, "step": 2826 }, { "completion_length": 204.35713958740234, "epoch": 0.28447798742138364, "grad_norm": 2.3237922191619873, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.5657251477241516, "reward_std": 0.21345235407352448, "rewards/accuracy_reward": 0.565725103020668, "rewards/format_reward": 1.0, "step": 2827 }, { "completion_length": 182.24488830566406, "epoch": 0.2845786163522013, "grad_norm": 1.2526087760925293, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8027978539466858, "reward_std": 0.23369820788502693, "rewards/accuracy_reward": 0.8130020499229431, "rewards/format_reward": 0.9897959232330322, "step": 2828 }, { "completion_length": 193.7244873046875, "epoch": 0.28467924528301886, "grad_norm": 1.012292742729187, "kl": 0.096923828125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.663739264011383, "reward_std": 0.24064038693904877, "rewards/accuracy_reward": 0.6841474771499634, "rewards/format_reward": 0.9795918464660645, "step": 2829 }, { "completion_length": 287.0408172607422, "epoch": 0.2847798742138365, "grad_norm": 1.1953991651535034, "kl": 0.0635986328125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5332275032997131, "reward_std": 0.2854916453361511, "rewards/accuracy_reward": 0.5638398230075836, "rewards/format_reward": 0.9693877398967743, "step": 2830 }, { "completion_length": 322.6428527832031, "epoch": 0.2848805031446541, "grad_norm": 0.3857422173023224, "kl": 0.048828125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.693286955356598, "reward_std": 0.12112241983413696, "rewards/accuracy_reward": 0.7136951684951782, "rewards/format_reward": 0.9795918166637421, "step": 2831 }, { "completion_length": 235.59182739257812, "epoch": 0.2849811320754717, "grad_norm": 0.5764009952545166, "kl": 0.0482177734375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7205127477645874, "reward_std": 0.10276953130960464, "rewards/accuracy_reward": 0.7307168841362, "rewards/format_reward": 0.9897959232330322, "step": 2832 }, { "completion_length": 301.6530532836914, "epoch": 0.2850817610062893, "grad_norm": 0.5802711844444275, "kl": 0.05029296875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6500405669212341, "reward_std": 0.23133235424757004, "rewards/accuracy_reward": 0.6704487204551697, "rewards/format_reward": 0.9795918464660645, "step": 2833 }, { "completion_length": 257.63265228271484, "epoch": 0.28518238993710693, "grad_norm": 0.38473135232925415, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8645123839378357, "reward_std": 0.07645900174975395, "rewards/accuracy_reward": 0.8645124137401581, "rewards/format_reward": 1.0, "step": 2834 }, { "completion_length": 238.57141876220703, "epoch": 0.2852830188679245, "grad_norm": 0.6972058415412903, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.818343698978424, "reward_std": 0.17862210236489773, "rewards/accuracy_reward": 0.8183437585830688, "rewards/format_reward": 1.0, "step": 2835 }, { "completion_length": 328.3571472167969, "epoch": 0.28538364779874215, "grad_norm": 2.0765764713287354, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5098631978034973, "reward_std": 0.22199367731809616, "rewards/accuracy_reward": 0.5302713960409164, "rewards/format_reward": 0.9795918464660645, "step": 2836 }, { "completion_length": 241.90816497802734, "epoch": 0.2854842767295597, "grad_norm": 0.7035696506500244, "kl": 0.0677490234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.739431381225586, "reward_std": 0.23065009713172913, "rewards/accuracy_reward": 0.7598396241664886, "rewards/format_reward": 0.9795918166637421, "step": 2837 }, { "completion_length": 227.34693145751953, "epoch": 0.28558490566037736, "grad_norm": 0.8637508749961853, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.819587230682373, "reward_std": 0.14336150512099266, "rewards/accuracy_reward": 0.8195872604846954, "rewards/format_reward": 1.0, "step": 2838 }, { "completion_length": 233.6836700439453, "epoch": 0.28568553459119495, "grad_norm": 1.378706693649292, "kl": 0.0736083984375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6696144938468933, "reward_std": 0.2864307314157486, "rewards/accuracy_reward": 0.700226753950119, "rewards/format_reward": 0.9693877398967743, "step": 2839 }, { "completion_length": 179.74488830566406, "epoch": 0.2857861635220126, "grad_norm": 0.8595296740531921, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7721089124679565, "reward_std": 0.20015977323055267, "rewards/accuracy_reward": 0.7721088230609894, "rewards/format_reward": 1.0, "step": 2840 }, { "completion_length": 264.34693908691406, "epoch": 0.28588679245283016, "grad_norm": 1.1338191032409668, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.630816102027893, "reward_std": 0.3127112612128258, "rewards/accuracy_reward": 0.6614283919334412, "rewards/format_reward": 0.9693877398967743, "step": 2841 }, { "completion_length": 275.1632537841797, "epoch": 0.2859874213836478, "grad_norm": 0.6928310394287109, "kl": 0.055908203125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.792999267578125, "reward_std": 0.19471249729394913, "rewards/accuracy_reward": 0.8032034039497375, "rewards/format_reward": 0.9897959232330322, "step": 2842 }, { "completion_length": 249.62245178222656, "epoch": 0.28608805031446544, "grad_norm": 0.5547121167182922, "kl": 0.05517578125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8469387888908386, "reward_std": 0.21347995102405548, "rewards/accuracy_reward": 0.8877550959587097, "rewards/format_reward": 0.9591836631298065, "step": 2843 }, { "completion_length": 190.9081573486328, "epoch": 0.286188679245283, "grad_norm": 0.6534651517868042, "kl": 0.0584716796875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7240831851959229, "reward_std": 0.1393636018037796, "rewards/accuracy_reward": 0.7444914281368256, "rewards/format_reward": 0.9795918166637421, "step": 2844 }, { "completion_length": 358.02040100097656, "epoch": 0.28628930817610065, "grad_norm": 0.5872966051101685, "kl": 0.04541015625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.713265299797058, "reward_std": 0.25921911746263504, "rewards/accuracy_reward": 0.7540816366672516, "rewards/format_reward": 0.9591836631298065, "step": 2845 }, { "completion_length": 294.1836700439453, "epoch": 0.28638993710691824, "grad_norm": 0.8314541578292847, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.73338383436203, "reward_std": 0.14608263969421387, "rewards/accuracy_reward": 0.7333838641643524, "rewards/format_reward": 1.0, "step": 2846 }, { "completion_length": 229.81632232666016, "epoch": 0.28649056603773587, "grad_norm": 1.2969216108322144, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7348798513412476, "reward_std": 0.24468954652547836, "rewards/accuracy_reward": 0.775696188211441, "rewards/format_reward": 0.9591836631298065, "step": 2847 }, { "completion_length": 189.01020050048828, "epoch": 0.28659119496855345, "grad_norm": 0.5011176466941833, "kl": 0.0572509765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8326706886291504, "reward_std": 0.10111489146947861, "rewards/accuracy_reward": 0.8428747951984406, "rewards/format_reward": 0.9897959232330322, "step": 2848 }, { "completion_length": 319.4081497192383, "epoch": 0.2866918238993711, "grad_norm": 0.500006914138794, "kl": 0.054931640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.705396831035614, "reward_std": 0.2625638619065285, "rewards/accuracy_reward": 0.7564172446727753, "rewards/format_reward": 0.9489795565605164, "step": 2849 }, { "completion_length": 259.27550506591797, "epoch": 0.28679245283018867, "grad_norm": 0.798796534538269, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6166990399360657, "reward_std": 0.2036084607243538, "rewards/accuracy_reward": 0.6269031316041946, "rewards/format_reward": 0.9897959232330322, "step": 2850 }, { "completion_length": 302.5, "epoch": 0.2868930817610063, "grad_norm": 0.9300217032432556, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6231354475021362, "reward_std": 0.21020638942718506, "rewards/accuracy_reward": 0.6435436606407166, "rewards/format_reward": 0.9795918464660645, "step": 2851 }, { "completion_length": 309.5204086303711, "epoch": 0.2869937106918239, "grad_norm": 0.638471245765686, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5829080939292908, "reward_std": 0.3108081966638565, "rewards/accuracy_reward": 0.6339285671710968, "rewards/format_reward": 0.9489795863628387, "step": 2852 }, { "completion_length": 300.4081573486328, "epoch": 0.2870943396226415, "grad_norm": 0.9609585404396057, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7244951725006104, "reward_std": 0.20015063881874084, "rewards/accuracy_reward": 0.7551074624061584, "rewards/format_reward": 0.9693877398967743, "step": 2853 }, { "completion_length": 156.4081573486328, "epoch": 0.2871949685534591, "grad_norm": 1.1458337306976318, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.88532555103302, "reward_std": 0.05656610894948244, "rewards/accuracy_reward": 0.8853255212306976, "rewards/format_reward": 1.0, "step": 2854 }, { "completion_length": 231.7448959350586, "epoch": 0.28729559748427674, "grad_norm": 0.7163671851158142, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7280015349388123, "reward_std": 0.16397801786661148, "rewards/accuracy_reward": 0.7382056415081024, "rewards/format_reward": 0.9897959232330322, "step": 2855 }, { "completion_length": 224.61223602294922, "epoch": 0.2873962264150943, "grad_norm": 0.5359477996826172, "kl": 0.0555419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7585437893867493, "reward_std": 0.12539758533239365, "rewards/accuracy_reward": 0.7687479853630066, "rewards/format_reward": 0.9897959232330322, "step": 2856 }, { "completion_length": 230.03060913085938, "epoch": 0.28749685534591196, "grad_norm": 0.6810061931610107, "kl": 0.055419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7718366980552673, "reward_std": 0.23041526973247528, "rewards/accuracy_reward": 0.7718366980552673, "rewards/format_reward": 1.0, "step": 2857 }, { "completion_length": 184.06122589111328, "epoch": 0.28759748427672954, "grad_norm": 0.8773625493049622, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7405669689178467, "reward_std": 0.11876624450087547, "rewards/accuracy_reward": 0.7507711052894592, "rewards/format_reward": 0.9897959232330322, "step": 2858 }, { "completion_length": 253.28571319580078, "epoch": 0.2876981132075472, "grad_norm": 0.811010479927063, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6487755179405212, "reward_std": 0.281294371932745, "rewards/accuracy_reward": 0.6793877482414246, "rewards/format_reward": 0.9693877398967743, "step": 2859 }, { "completion_length": 231.9897918701172, "epoch": 0.28779874213836476, "grad_norm": 0.27458474040031433, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7852436900138855, "reward_std": 0.0827547088265419, "rewards/accuracy_reward": 0.7954477965831757, "rewards/format_reward": 0.9897959232330322, "step": 2860 }, { "completion_length": 270.76529693603516, "epoch": 0.2878993710691824, "grad_norm": 0.7835866212844849, "kl": 0.0543212890625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.689990222454071, "reward_std": 0.18143831938505173, "rewards/accuracy_reward": 0.7001943290233612, "rewards/format_reward": 0.9897959232330322, "step": 2861 }, { "completion_length": 361.51019287109375, "epoch": 0.288, "grad_norm": 0.6873385310173035, "kl": 0.0657958984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.615825116634369, "reward_std": 0.18610727041959763, "rewards/accuracy_reward": 0.6158251464366913, "rewards/format_reward": 1.0, "step": 2862 }, { "completion_length": 227.26529693603516, "epoch": 0.2881006289308176, "grad_norm": 1.3290232419967651, "kl": 0.0682373046875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6715252995491028, "reward_std": 0.18975215032696724, "rewards/accuracy_reward": 0.6817293167114258, "rewards/format_reward": 0.9897959232330322, "step": 2863 }, { "completion_length": 234.75509643554688, "epoch": 0.2882012578616352, "grad_norm": 1.3096998929977417, "kl": 0.0762939453125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6273759603500366, "reward_std": 0.2128499299287796, "rewards/accuracy_reward": 0.6477841436862946, "rewards/format_reward": 0.9795918464660645, "step": 2864 }, { "completion_length": 253.23468780517578, "epoch": 0.28830188679245283, "grad_norm": 2.0222933292388916, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6496599316596985, "reward_std": 0.2670467346906662, "rewards/accuracy_reward": 0.6598639488220215, "rewards/format_reward": 0.9897959232330322, "step": 2865 }, { "completion_length": 218.12244415283203, "epoch": 0.2884025157232704, "grad_norm": 0.5743470191955566, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7665349245071411, "reward_std": 0.16230690106749535, "rewards/accuracy_reward": 0.7767390012741089, "rewards/format_reward": 0.9897959232330322, "step": 2866 }, { "completion_length": 255.43877410888672, "epoch": 0.28850314465408805, "grad_norm": 0.890373706817627, "kl": 0.0567626953125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6587706208229065, "reward_std": 0.23077217489480972, "rewards/accuracy_reward": 0.6689746975898743, "rewards/format_reward": 0.9897959232330322, "step": 2867 }, { "completion_length": 273.92857360839844, "epoch": 0.2886037735849057, "grad_norm": 1.3422387838363647, "kl": 0.0606689453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.3896846771240234, "reward_std": 0.29916054010391235, "rewards/accuracy_reward": 0.410092830657959, "rewards/format_reward": 0.9795918166637421, "step": 2868 }, { "completion_length": 243.23468017578125, "epoch": 0.28870440251572327, "grad_norm": 0.7079533934593201, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6990929245948792, "reward_std": 0.15700653195381165, "rewards/accuracy_reward": 0.6990929543972015, "rewards/format_reward": 1.0, "step": 2869 }, { "completion_length": 188.4693832397461, "epoch": 0.2888050314465409, "grad_norm": 1.2998404502868652, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.716957688331604, "reward_std": 0.20861977338790894, "rewards/accuracy_reward": 0.7271617352962494, "rewards/format_reward": 0.9897959232330322, "step": 2870 }, { "completion_length": 225.62244415283203, "epoch": 0.2889056603773585, "grad_norm": 1.2079594135284424, "kl": 0.0794677734375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7252842783927917, "reward_std": 0.2348419949412346, "rewards/accuracy_reward": 0.725284218788147, "rewards/format_reward": 1.0, "step": 2871 }, { "completion_length": 260.9183578491211, "epoch": 0.2890062893081761, "grad_norm": 0.5127943158149719, "kl": 0.064697265625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8226638436317444, "reward_std": 0.1649615578353405, "rewards/accuracy_reward": 0.8430721163749695, "rewards/format_reward": 0.9795918464660645, "step": 2872 }, { "completion_length": 209.06121826171875, "epoch": 0.2891069182389937, "grad_norm": 1.7896513938903809, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8444488048553467, "reward_std": 0.17235205322504044, "rewards/accuracy_reward": 0.8546528816223145, "rewards/format_reward": 0.9897959232330322, "step": 2873 }, { "completion_length": 259.6224365234375, "epoch": 0.28920754716981134, "grad_norm": 0.9639541506767273, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6178473234176636, "reward_std": 0.1153445653617382, "rewards/accuracy_reward": 0.6178473830223083, "rewards/format_reward": 1.0, "step": 2874 }, { "completion_length": 275.7244873046875, "epoch": 0.2893081761006289, "grad_norm": 0.320874959230423, "kl": 0.0546875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8277612924575806, "reward_std": 0.08050109818577766, "rewards/accuracy_reward": 0.8277612626552582, "rewards/format_reward": 1.0, "step": 2875 }, { "completion_length": 237.9387664794922, "epoch": 0.28940880503144656, "grad_norm": 1.01832115650177, "kl": 0.066162109375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7312925457954407, "reward_std": 0.24724173545837402, "rewards/accuracy_reward": 0.7312925159931183, "rewards/format_reward": 1.0, "step": 2876 }, { "completion_length": 230.54080963134766, "epoch": 0.28950943396226414, "grad_norm": 12.2073335647583, "kl": 0.538330078125, "learning_rate": 1e-06, "loss": 0.0216, "reward": 1.6258503794670105, "reward_std": 0.21060558408498764, "rewards/accuracy_reward": 0.6258503198623657, "rewards/format_reward": 1.0, "step": 2877 }, { "completion_length": 212.7346954345703, "epoch": 0.2896100628930818, "grad_norm": 0.8424152731895447, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.9081632494926453, "reward_std": 0.15402613580226898, "rewards/accuracy_reward": 0.9183673560619354, "rewards/format_reward": 0.9897959232330322, "step": 2878 }, { "completion_length": 235.88774871826172, "epoch": 0.28971069182389936, "grad_norm": 1.0659732818603516, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.73316890001297, "reward_std": 0.17676535993814468, "rewards/accuracy_reward": 0.73316890001297, "rewards/format_reward": 1.0, "step": 2879 }, { "completion_length": 200.03060913085938, "epoch": 0.289811320754717, "grad_norm": 0.39460599422454834, "kl": 0.0821533203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8705084323883057, "reward_std": 0.08030646480619907, "rewards/accuracy_reward": 0.8807125389575958, "rewards/format_reward": 0.9897959232330322, "step": 2880 }, { "completion_length": 172.11224365234375, "epoch": 0.2899119496855346, "grad_norm": 1.2456117868423462, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6997886300086975, "reward_std": 0.12630866467952728, "rewards/accuracy_reward": 0.7099926769733429, "rewards/format_reward": 0.9897959232330322, "step": 2881 }, { "completion_length": 205.38775634765625, "epoch": 0.2900125786163522, "grad_norm": 0.9469463229179382, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.725170075893402, "reward_std": 0.24396871030330658, "rewards/accuracy_reward": 0.7353741228580475, "rewards/format_reward": 0.9897959232330322, "step": 2882 }, { "completion_length": 316.6938781738281, "epoch": 0.2901132075471698, "grad_norm": 0.7756762504577637, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7151034474372864, "reward_std": 0.22388286888599396, "rewards/accuracy_reward": 0.7355116307735443, "rewards/format_reward": 0.9795918166637421, "step": 2883 }, { "completion_length": 263.52040100097656, "epoch": 0.29021383647798743, "grad_norm": 1.246130108833313, "kl": 0.0655517578125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7013604640960693, "reward_std": 0.19822321087121964, "rewards/accuracy_reward": 0.7013605237007141, "rewards/format_reward": 1.0, "step": 2884 }, { "completion_length": 292.3061218261719, "epoch": 0.290314465408805, "grad_norm": 0.5518955588340759, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6632092595100403, "reward_std": 0.15372367203235626, "rewards/accuracy_reward": 0.6632092297077179, "rewards/format_reward": 1.0, "step": 2885 }, { "completion_length": 218.59183502197266, "epoch": 0.29041509433962265, "grad_norm": 0.5659672021865845, "kl": 0.062255859375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.9081632494926453, "reward_std": 0.17990176007151604, "rewards/accuracy_reward": 0.9285714030265808, "rewards/format_reward": 0.9795918166637421, "step": 2886 }, { "completion_length": 237.84693908691406, "epoch": 0.29051572327044023, "grad_norm": 2.6203091144561768, "kl": 0.0653076171875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.722449004650116, "reward_std": 0.20862357318401337, "rewards/accuracy_reward": 0.7632652819156647, "rewards/format_reward": 0.9591836631298065, "step": 2887 }, { "completion_length": 210.47958374023438, "epoch": 0.29061635220125787, "grad_norm": 8.444917678833008, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8599423170089722, "reward_std": 0.14339318685233593, "rewards/accuracy_reward": 0.8701465427875519, "rewards/format_reward": 0.9897959232330322, "step": 2888 }, { "completion_length": 291.7652893066406, "epoch": 0.29071698113207545, "grad_norm": 0.8159670829772949, "kl": 0.052978515625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7133580446243286, "reward_std": 0.22165807336568832, "rewards/accuracy_reward": 0.7235621511936188, "rewards/format_reward": 0.9897959232330322, "step": 2889 }, { "completion_length": 237.60203552246094, "epoch": 0.2908176100628931, "grad_norm": 1.6658965349197388, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8272343277931213, "reward_std": 0.22012990713119507, "rewards/accuracy_reward": 0.8476425111293793, "rewards/format_reward": 0.9795918464660645, "step": 2890 }, { "completion_length": 280.34693145751953, "epoch": 0.29091823899371066, "grad_norm": 0.6672889590263367, "kl": 0.0579833984375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6780675649642944, "reward_std": 0.2415163740515709, "rewards/accuracy_reward": 0.6984757781028748, "rewards/format_reward": 0.9795918464660645, "step": 2891 }, { "completion_length": 223.37755584716797, "epoch": 0.2910188679245283, "grad_norm": 1.8277573585510254, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7185025811195374, "reward_std": 0.13417518883943558, "rewards/accuracy_reward": 0.7185026109218597, "rewards/format_reward": 1.0, "step": 2892 }, { "completion_length": 224.14285278320312, "epoch": 0.29111949685534594, "grad_norm": 0.40323054790496826, "kl": 0.0706787109375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.826530635356903, "reward_std": 0.09848719835281372, "rewards/accuracy_reward": 0.8265306353569031, "rewards/format_reward": 1.0, "step": 2893 }, { "completion_length": 249.65306091308594, "epoch": 0.2912201257861635, "grad_norm": 0.23119057714939117, "kl": 0.0474853515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7551020383834839, "reward_std": 0.03818017616868019, "rewards/accuracy_reward": 0.7653061151504517, "rewards/format_reward": 0.9897959232330322, "step": 2894 }, { "completion_length": 303.551025390625, "epoch": 0.29132075471698116, "grad_norm": 0.8216590881347656, "kl": 0.0550537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6453285813331604, "reward_std": 0.19278056919574738, "rewards/accuracy_reward": 0.6453286409378052, "rewards/format_reward": 1.0, "step": 2895 }, { "completion_length": 220.1938705444336, "epoch": 0.29142138364779874, "grad_norm": 0.3597620129585266, "kl": 0.052734375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8979591727256775, "reward_std": 0.061851032078266144, "rewards/accuracy_reward": 0.8979591727256775, "rewards/format_reward": 1.0, "step": 2896 }, { "completion_length": 267.6632614135742, "epoch": 0.2915220125786164, "grad_norm": 0.7581238746643066, "kl": 0.057861328125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7467200756072998, "reward_std": 0.17514152452349663, "rewards/accuracy_reward": 0.7671282887458801, "rewards/format_reward": 0.9795918464660645, "step": 2897 }, { "completion_length": 318.6428527832031, "epoch": 0.29162264150943396, "grad_norm": 0.5514572262763977, "kl": 0.0587158203125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5552720427513123, "reward_std": 0.1541694477200508, "rewards/accuracy_reward": 0.5654762089252472, "rewards/format_reward": 0.9897959232330322, "step": 2898 }, { "completion_length": 214.5408172607422, "epoch": 0.2917232704402516, "grad_norm": 0.322153776884079, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.957337200641632, "reward_std": 0.051565589383244514, "rewards/accuracy_reward": 0.9573372304439545, "rewards/format_reward": 1.0, "step": 2899 }, { "completion_length": 295.33673095703125, "epoch": 0.2918238993710692, "grad_norm": 0.38422155380249023, "kl": 0.0552978515625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7455781698226929, "reward_std": 0.16720585525035858, "rewards/accuracy_reward": 0.755782276391983, "rewards/format_reward": 0.9897959232330322, "step": 2900 }, { "completion_length": 191.62244415283203, "epoch": 0.2919245283018868, "grad_norm": 0.833770215511322, "kl": 0.0704345703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7726500034332275, "reward_std": 0.14982720836997032, "rewards/accuracy_reward": 0.7726499140262604, "rewards/format_reward": 1.0, "step": 2901 }, { "completion_length": 296.0, "epoch": 0.2920251572327044, "grad_norm": 2.7834482192993164, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7246018648147583, "reward_std": 0.1774664893746376, "rewards/accuracy_reward": 0.7450100779533386, "rewards/format_reward": 0.9795918464660645, "step": 2902 }, { "completion_length": 237.29591369628906, "epoch": 0.292125786163522, "grad_norm": 0.9952675104141235, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7762660384178162, "reward_std": 0.16103573143482208, "rewards/accuracy_reward": 0.7762660384178162, "rewards/format_reward": 1.0, "step": 2903 }, { "completion_length": 256.04080963134766, "epoch": 0.2922264150943396, "grad_norm": 1.002472162246704, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6671067476272583, "reward_std": 0.23301222920417786, "rewards/accuracy_reward": 0.6875150203704834, "rewards/format_reward": 0.9795918166637421, "step": 2904 }, { "completion_length": 256.9183654785156, "epoch": 0.29232704402515725, "grad_norm": 0.6564284563064575, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.763411045074463, "reward_std": 0.2510726973414421, "rewards/accuracy_reward": 0.7736150920391083, "rewards/format_reward": 0.9897959232330322, "step": 2905 }, { "completion_length": 347.39794921875, "epoch": 0.2924276729559748, "grad_norm": 1.0372964143753052, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7298080921173096, "reward_std": 0.1800697147846222, "rewards/accuracy_reward": 0.7400120496749878, "rewards/format_reward": 0.9897959232330322, "step": 2906 }, { "completion_length": 138.01020050048828, "epoch": 0.29252830188679246, "grad_norm": 0.30393290519714355, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8979591727256775, "reward_std": 0.019216934219002724, "rewards/accuracy_reward": 0.8979591727256775, "rewards/format_reward": 1.0, "step": 2907 }, { "completion_length": 344.3061218261719, "epoch": 0.29262893081761004, "grad_norm": 0.8937037587165833, "kl": 0.059814453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5617783069610596, "reward_std": 0.3137504607439041, "rewards/accuracy_reward": 0.5821864306926727, "rewards/format_reward": 0.9795918464660645, "step": 2908 }, { "completion_length": 189.7142791748047, "epoch": 0.2927295597484277, "grad_norm": 0.7666600346565247, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7021865248680115, "reward_std": 0.09977025166153908, "rewards/accuracy_reward": 0.7021865844726562, "rewards/format_reward": 1.0, "step": 2909 }, { "completion_length": 269.78570556640625, "epoch": 0.29283018867924526, "grad_norm": 0.7527707815170288, "kl": 0.0400390625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.8582170009613037, "reward_std": 0.09236405417323112, "rewards/accuracy_reward": 0.8684210777282715, "rewards/format_reward": 0.9897959232330322, "step": 2910 }, { "completion_length": 362.65306091308594, "epoch": 0.2929308176100629, "grad_norm": 0.6881676912307739, "kl": 0.056640625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6470715999603271, "reward_std": 0.2185651734471321, "rewards/accuracy_reward": 0.6470716893672943, "rewards/format_reward": 1.0, "step": 2911 }, { "completion_length": 195.4795913696289, "epoch": 0.2930314465408805, "grad_norm": 1.0498095750808716, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7269193530082703, "reward_std": 0.16204652190208435, "rewards/accuracy_reward": 0.7473274767398834, "rewards/format_reward": 0.9795918464660645, "step": 2912 }, { "completion_length": 234.65304565429688, "epoch": 0.2931320754716981, "grad_norm": 0.6006370186805725, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.9401036500930786, "reward_std": 0.11288385838270187, "rewards/accuracy_reward": 0.9401036202907562, "rewards/format_reward": 1.0, "step": 2913 }, { "completion_length": 273.7448959350586, "epoch": 0.2932327044025157, "grad_norm": 0.3668316900730133, "kl": 0.0689697265625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.682785451412201, "reward_std": 0.14414288476109505, "rewards/accuracy_reward": 0.6929895877838135, "rewards/format_reward": 0.9897959232330322, "step": 2914 }, { "completion_length": 259.56121826171875, "epoch": 0.29333333333333333, "grad_norm": 0.696276843547821, "kl": 0.0601806640625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6556397676467896, "reward_std": 0.15620749816298485, "rewards/accuracy_reward": 0.6556398868560791, "rewards/format_reward": 1.0, "step": 2915 }, { "completion_length": 311.77549743652344, "epoch": 0.2934339622641509, "grad_norm": 0.5668615698814392, "kl": 0.0518798828125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8176546096801758, "reward_std": 0.19308824837207794, "rewards/accuracy_reward": 0.8278587758541107, "rewards/format_reward": 0.9897959232330322, "step": 2916 }, { "completion_length": 303.8877487182617, "epoch": 0.29353459119496855, "grad_norm": 0.9065306782722473, "kl": 0.0633544921875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.621147096157074, "reward_std": 0.23760556429624557, "rewards/accuracy_reward": 0.6313512325286865, "rewards/format_reward": 0.9897959232330322, "step": 2917 }, { "completion_length": 273.26529693603516, "epoch": 0.2936352201257862, "grad_norm": 0.9449777007102966, "kl": 0.05419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7990580797195435, "reward_std": 0.20724573731422424, "rewards/accuracy_reward": 0.8092621564865112, "rewards/format_reward": 0.9897959232330322, "step": 2918 }, { "completion_length": 248.25509643554688, "epoch": 0.29373584905660377, "grad_norm": 0.9766262769699097, "kl": 0.0535888671875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7242706418037415, "reward_std": 0.09405258111655712, "rewards/accuracy_reward": 0.7242706418037415, "rewards/format_reward": 1.0, "step": 2919 }, { "completion_length": 241.26529693603516, "epoch": 0.2938364779874214, "grad_norm": 1.3080545663833618, "kl": 0.0833740234375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7400793433189392, "reward_std": 0.16673611477017403, "rewards/accuracy_reward": 0.750283420085907, "rewards/format_reward": 0.9897959232330322, "step": 2920 }, { "completion_length": 260.1734619140625, "epoch": 0.293937106918239, "grad_norm": 1.1709442138671875, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.5131344199180603, "reward_std": 0.1932271085679531, "rewards/accuracy_reward": 0.523338571190834, "rewards/format_reward": 0.9897959232330322, "step": 2921 }, { "completion_length": 267.6020278930664, "epoch": 0.2940377358490566, "grad_norm": 4.110478401184082, "kl": 0.141845703125, "learning_rate": 1e-06, "loss": 0.0057, "reward": 1.5171834230422974, "reward_std": 0.14196586981415749, "rewards/accuracy_reward": 0.5477957129478455, "rewards/format_reward": 0.9693877398967743, "step": 2922 }, { "completion_length": 216.82653045654297, "epoch": 0.2941383647798742, "grad_norm": 0.7485381364822388, "kl": 0.0697021484375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6977921724319458, "reward_std": 0.1649053692817688, "rewards/accuracy_reward": 0.7079962193965912, "rewards/format_reward": 0.9897959232330322, "step": 2923 }, { "completion_length": 224.01020050048828, "epoch": 0.29423899371069184, "grad_norm": 1.166304111480713, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.806681215763092, "reward_std": 0.21620512008666992, "rewards/accuracy_reward": 0.8066812455654144, "rewards/format_reward": 1.0, "step": 2924 }, { "completion_length": 291.65306091308594, "epoch": 0.2943396226415094, "grad_norm": 0.5475339293479919, "kl": 0.0526123046875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5822510719299316, "reward_std": 0.14470204710960388, "rewards/accuracy_reward": 0.5822510719299316, "rewards/format_reward": 1.0, "step": 2925 }, { "completion_length": 218.80611419677734, "epoch": 0.29444025157232706, "grad_norm": 0.9287305474281311, "kl": 0.099365234375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.787609338760376, "reward_std": 0.13556963950395584, "rewards/accuracy_reward": 0.7978134155273438, "rewards/format_reward": 0.9897959232330322, "step": 2926 }, { "completion_length": 200.71428680419922, "epoch": 0.29454088050314464, "grad_norm": 0.7490028142929077, "kl": 0.0579833984375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7517006993293762, "reward_std": 0.2110065594315529, "rewards/accuracy_reward": 0.761904776096344, "rewards/format_reward": 0.9897959232330322, "step": 2927 }, { "completion_length": 211.61223602294922, "epoch": 0.2946415094339623, "grad_norm": 3.9773151874542236, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8469387292861938, "reward_std": 0.19673580676317215, "rewards/accuracy_reward": 0.8469387590885162, "rewards/format_reward": 1.0, "step": 2928 }, { "completion_length": 229.53060913085938, "epoch": 0.29474213836477986, "grad_norm": 0.6468946933746338, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8232709765434265, "reward_std": 0.13604169338941574, "rewards/accuracy_reward": 0.8334750533103943, "rewards/format_reward": 0.9897959232330322, "step": 2929 }, { "completion_length": 208.23468780517578, "epoch": 0.2948427672955975, "grad_norm": 2.029322862625122, "kl": 0.1124267578125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.8085519671440125, "reward_std": 0.17374929785728455, "rewards/accuracy_reward": 0.8085519969463348, "rewards/format_reward": 1.0, "step": 2930 }, { "completion_length": 244.80611419677734, "epoch": 0.2949433962264151, "grad_norm": 0.5209866762161255, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7302234768867493, "reward_std": 0.132848858833313, "rewards/accuracy_reward": 0.7302235066890717, "rewards/format_reward": 1.0, "step": 2931 }, { "completion_length": 227.77550506591797, "epoch": 0.2950440251572327, "grad_norm": 1.073272705078125, "kl": 0.0650634765625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8119533061981201, "reward_std": 0.1576378419995308, "rewards/accuracy_reward": 0.8119533061981201, "rewards/format_reward": 1.0, "step": 2932 }, { "completion_length": 238.65306091308594, "epoch": 0.2951446540880503, "grad_norm": 0.8154543042182922, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6077502369880676, "reward_std": 0.16146091371774673, "rewards/accuracy_reward": 0.607750192284584, "rewards/format_reward": 1.0, "step": 2933 }, { "completion_length": 325.1734619140625, "epoch": 0.29524528301886793, "grad_norm": 0.668099045753479, "kl": 0.0596923828125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.450547218322754, "reward_std": 0.08265525475144386, "rewards/accuracy_reward": 0.4607512801885605, "rewards/format_reward": 0.9897959232330322, "step": 2934 }, { "completion_length": 228.88774871826172, "epoch": 0.2953459119496855, "grad_norm": 0.8256535530090332, "kl": 0.05859375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6053996086120605, "reward_std": 0.23088057339191437, "rewards/accuracy_reward": 0.6360118985176086, "rewards/format_reward": 0.9693877398967743, "step": 2935 }, { "completion_length": 181.59183502197266, "epoch": 0.29544654088050315, "grad_norm": 0.9221993088722229, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8136054277420044, "reward_std": 0.15112144500017166, "rewards/accuracy_reward": 0.8544217646121979, "rewards/format_reward": 0.9591836631298065, "step": 2936 }, { "completion_length": 267.1938781738281, "epoch": 0.29554716981132073, "grad_norm": 0.5458527207374573, "kl": 0.065185546875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.631869375705719, "reward_std": 0.14691413938999176, "rewards/accuracy_reward": 0.631869375705719, "rewards/format_reward": 1.0, "step": 2937 }, { "completion_length": 204.54080963134766, "epoch": 0.29564779874213837, "grad_norm": 0.9630061388015747, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8282312750816345, "reward_std": 0.17912016808986664, "rewards/accuracy_reward": 0.8384353220462799, "rewards/format_reward": 0.9897959232330322, "step": 2938 }, { "completion_length": 299.2040710449219, "epoch": 0.29574842767295595, "grad_norm": 0.7077988386154175, "kl": 0.097900390625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.509616494178772, "reward_std": 0.20180658251047134, "rewards/accuracy_reward": 0.5096165537834167, "rewards/format_reward": 1.0, "step": 2939 }, { "completion_length": 247.93877410888672, "epoch": 0.2958490566037736, "grad_norm": 0.5861605405807495, "kl": 0.0733642578125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.744155764579773, "reward_std": 0.097624272108078, "rewards/accuracy_reward": 0.7543599009513855, "rewards/format_reward": 0.9897959232330322, "step": 2940 }, { "completion_length": 237.41836547851562, "epoch": 0.2959496855345912, "grad_norm": 2.2948522567749023, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5362554788589478, "reward_std": 0.20310967415571213, "rewards/accuracy_reward": 0.5770718306303024, "rewards/format_reward": 0.9591836631298065, "step": 2941 }, { "completion_length": 211.20407104492188, "epoch": 0.2960503144654088, "grad_norm": 0.9363822340965271, "kl": 0.0804443359375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7697683572769165, "reward_std": 0.20558517426252365, "rewards/accuracy_reward": 0.7799724638462067, "rewards/format_reward": 0.9897959232330322, "step": 2942 }, { "completion_length": 246.66326141357422, "epoch": 0.29615094339622644, "grad_norm": 1.4823291301727295, "kl": 0.0714111328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6363635659217834, "reward_std": 0.21768148615956306, "rewards/accuracy_reward": 0.6771799623966217, "rewards/format_reward": 0.9591836333274841, "step": 2943 }, { "completion_length": 264.8877487182617, "epoch": 0.296251572327044, "grad_norm": 0.7477564811706543, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7257570624351501, "reward_std": 0.1490781120955944, "rewards/accuracy_reward": 0.7257570922374725, "rewards/format_reward": 1.0, "step": 2944 }, { "completion_length": 213.57142639160156, "epoch": 0.29635220125786166, "grad_norm": 0.6906836628913879, "kl": 0.05908203125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7653061151504517, "reward_std": 0.12370206415653229, "rewards/accuracy_reward": 0.7755101919174194, "rewards/format_reward": 0.9897959232330322, "step": 2945 }, { "completion_length": 220.59182739257812, "epoch": 0.29645283018867924, "grad_norm": 0.4777489900588989, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.795464813709259, "reward_std": 0.16292136535048485, "rewards/accuracy_reward": 0.815872997045517, "rewards/format_reward": 0.9795918166637421, "step": 2946 }, { "completion_length": 242.61223602294922, "epoch": 0.2965534591194969, "grad_norm": 1.6087710857391357, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5764896273612976, "reward_std": 0.28348807990550995, "rewards/accuracy_reward": 0.5866937190294266, "rewards/format_reward": 0.9897959232330322, "step": 2947 }, { "completion_length": 207.16326141357422, "epoch": 0.29665408805031446, "grad_norm": 0.9498071670532227, "kl": 0.099853515625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6938775181770325, "reward_std": 0.14153798297047615, "rewards/accuracy_reward": 0.7142857015132904, "rewards/format_reward": 0.9795918166637421, "step": 2948 }, { "completion_length": 222.12244415283203, "epoch": 0.2967547169811321, "grad_norm": 0.6149774193763733, "kl": 0.10107421875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8061224222183228, "reward_std": 0.15402613580226898, "rewards/accuracy_reward": 0.8163265287876129, "rewards/format_reward": 0.9897959232330322, "step": 2949 }, { "completion_length": 275.1734619140625, "epoch": 0.2968553459119497, "grad_norm": 0.758592963218689, "kl": 0.053466796875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.516945719718933, "reward_std": 0.23025447130203247, "rewards/accuracy_reward": 0.5169457048177719, "rewards/format_reward": 1.0, "step": 2950 }, { "completion_length": 212.48979949951172, "epoch": 0.2969559748427673, "grad_norm": 0.8184898495674133, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7919501066207886, "reward_std": 0.24659457057714462, "rewards/accuracy_reward": 0.8123582303524017, "rewards/format_reward": 0.9795918464660645, "step": 2951 }, { "completion_length": 301.32653045654297, "epoch": 0.2970566037735849, "grad_norm": 0.6600590348243713, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6022899150848389, "reward_std": 0.12560251727700233, "rewards/accuracy_reward": 0.6022898852825165, "rewards/format_reward": 1.0, "step": 2952 }, { "completion_length": 272.6530456542969, "epoch": 0.29715723270440253, "grad_norm": 0.7487476468086243, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6611192226409912, "reward_std": 0.18620219081640244, "rewards/accuracy_reward": 0.6713232845067978, "rewards/format_reward": 0.9897959232330322, "step": 2953 }, { "completion_length": 268.08162689208984, "epoch": 0.2972578616352201, "grad_norm": 0.6675273776054382, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.718294382095337, "reward_std": 0.20043817162513733, "rewards/accuracy_reward": 0.7387025952339172, "rewards/format_reward": 0.9795918464660645, "step": 2954 }, { "completion_length": 187.29591369628906, "epoch": 0.29735849056603775, "grad_norm": 1.4550368785858154, "kl": 0.099609375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7383686304092407, "reward_std": 0.338304728269577, "rewards/accuracy_reward": 0.768980860710144, "rewards/format_reward": 0.9693877398967743, "step": 2955 }, { "completion_length": 307.2550964355469, "epoch": 0.29745911949685533, "grad_norm": 0.7233182787895203, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.617490291595459, "reward_std": 0.2042030319571495, "rewards/accuracy_reward": 0.6378985047340393, "rewards/format_reward": 0.9795918166637421, "step": 2956 }, { "completion_length": 282.24488830566406, "epoch": 0.29755974842767297, "grad_norm": 0.41253727674484253, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6666666865348816, "reward_std": 0.13293716311454773, "rewards/accuracy_reward": 0.6870748400688171, "rewards/format_reward": 0.9795918166637421, "step": 2957 }, { "completion_length": 320.06121826171875, "epoch": 0.29766037735849055, "grad_norm": 1.5542476177215576, "kl": 0.0635986328125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5815304517745972, "reward_std": 0.2265602871775627, "rewards/accuracy_reward": 0.5917345136404037, "rewards/format_reward": 0.9897959232330322, "step": 2958 }, { "completion_length": 245.6530532836914, "epoch": 0.2977610062893082, "grad_norm": 0.9165502786636353, "kl": 0.0789794921875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.5037432312965393, "reward_std": 0.18755362555384636, "rewards/accuracy_reward": 0.5139473676681519, "rewards/format_reward": 0.9897959232330322, "step": 2959 }, { "completion_length": 272.78570556640625, "epoch": 0.29786163522012576, "grad_norm": 0.5267979502677917, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8163264989852905, "reward_std": 0.16041186451911926, "rewards/accuracy_reward": 0.8265306055545807, "rewards/format_reward": 0.9897959232330322, "step": 2960 }, { "completion_length": 208.7448959350586, "epoch": 0.2979622641509434, "grad_norm": 0.7163049578666687, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8415451645851135, "reward_std": 0.04354347102344036, "rewards/accuracy_reward": 0.8415451645851135, "rewards/format_reward": 1.0, "step": 2961 }, { "completion_length": 209.89794921875, "epoch": 0.298062893081761, "grad_norm": 0.555728554725647, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.5663067698478699, "reward_std": 0.13198605179786682, "rewards/accuracy_reward": 0.5663067996501923, "rewards/format_reward": 1.0, "step": 2962 }, { "completion_length": 268.08162689208984, "epoch": 0.2981635220125786, "grad_norm": 1.2434223890304565, "kl": 0.100830078125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.65879487991333, "reward_std": 0.26575350761413574, "rewards/accuracy_reward": 0.6894071698188782, "rewards/format_reward": 0.9693877398967743, "step": 2963 }, { "completion_length": 265.0408020019531, "epoch": 0.2982641509433962, "grad_norm": 0.570894718170166, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.819727897644043, "reward_std": 0.12484510987997055, "rewards/accuracy_reward": 0.819727897644043, "rewards/format_reward": 1.0, "step": 2964 }, { "completion_length": 184.29591369628906, "epoch": 0.29836477987421384, "grad_norm": 1.1977425813674927, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7727567553520203, "reward_std": 0.21728211641311646, "rewards/accuracy_reward": 0.8033689558506012, "rewards/format_reward": 0.9693877398967743, "step": 2965 }, { "completion_length": 171.79591369628906, "epoch": 0.2984654088050315, "grad_norm": 1.0809998512268066, "kl": 0.122802734375, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.707096517086029, "reward_std": 0.1938622072339058, "rewards/accuracy_reward": 0.7070964574813843, "rewards/format_reward": 1.0, "step": 2966 }, { "completion_length": 223.58162689208984, "epoch": 0.29856603773584905, "grad_norm": 0.6185457706451416, "kl": 0.0657958984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7100984454154968, "reward_std": 0.15401425957679749, "rewards/accuracy_reward": 0.7305066585540771, "rewards/format_reward": 0.9795918166637421, "step": 2967 }, { "completion_length": 254.5102081298828, "epoch": 0.2986666666666667, "grad_norm": 0.8037659525871277, "kl": 0.0904541015625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6219623684883118, "reward_std": 0.24227221310138702, "rewards/accuracy_reward": 0.6525746881961823, "rewards/format_reward": 0.9693877398967743, "step": 2968 }, { "completion_length": 199.10203552246094, "epoch": 0.2987672955974843, "grad_norm": 0.5507922768592834, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.789198100566864, "reward_std": 0.14162556733936071, "rewards/accuracy_reward": 0.8096062242984772, "rewards/format_reward": 0.9795918166637421, "step": 2969 }, { "completion_length": 239.04080963134766, "epoch": 0.2988679245283019, "grad_norm": 0.717168927192688, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6775509715080261, "reward_std": 0.16702989488840103, "rewards/accuracy_reward": 0.7081632614135742, "rewards/format_reward": 0.9693877398967743, "step": 2970 }, { "completion_length": 236.25509643554688, "epoch": 0.2989685534591195, "grad_norm": 0.7905473709106445, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7387754917144775, "reward_std": 0.20726174116134644, "rewards/accuracy_reward": 0.7693877518177032, "rewards/format_reward": 0.9693877398967743, "step": 2971 }, { "completion_length": 273.73468017578125, "epoch": 0.2990691823899371, "grad_norm": 0.7576705813407898, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6323614716529846, "reward_std": 0.2324654683470726, "rewards/accuracy_reward": 0.6527696549892426, "rewards/format_reward": 0.9795918166637421, "step": 2972 }, { "completion_length": 213.9795913696289, "epoch": 0.2991698113207547, "grad_norm": 0.5642437934875488, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6778432726860046, "reward_std": 0.10569995641708374, "rewards/accuracy_reward": 0.708455502986908, "rewards/format_reward": 0.9693877398967743, "step": 2973 }, { "completion_length": 273.51019287109375, "epoch": 0.29927044025157234, "grad_norm": 0.6883410811424255, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.5510203838348389, "reward_std": 0.2314886748790741, "rewards/accuracy_reward": 0.5918367207050323, "rewards/format_reward": 0.9591836333274841, "step": 2974 }, { "completion_length": 236.14285278320312, "epoch": 0.2993710691823899, "grad_norm": 0.6096125841140747, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7424197793006897, "reward_std": 0.17585786432027817, "rewards/accuracy_reward": 0.7526238858699799, "rewards/format_reward": 0.9897959232330322, "step": 2975 }, { "completion_length": 168.5, "epoch": 0.29947169811320756, "grad_norm": 1.0690667629241943, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7627550959587097, "reward_std": 0.21083995699882507, "rewards/accuracy_reward": 0.7729591727256775, "rewards/format_reward": 0.9897959232330322, "step": 2976 }, { "completion_length": 144.16326522827148, "epoch": 0.29957232704402514, "grad_norm": 1.407199501991272, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.652442753314972, "reward_std": 0.14362921565771103, "rewards/accuracy_reward": 0.6626468896865845, "rewards/format_reward": 0.9897959232330322, "step": 2977 }, { "completion_length": 260.05101013183594, "epoch": 0.2996729559748428, "grad_norm": 0.8039801120758057, "kl": 0.0650634765625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6315840482711792, "reward_std": 0.1969287246465683, "rewards/accuracy_reward": 0.6621963381767273, "rewards/format_reward": 0.9693877398967743, "step": 2978 }, { "completion_length": 256.4795913696289, "epoch": 0.29977358490566036, "grad_norm": 0.8027560710906982, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7313775420188904, "reward_std": 0.14518877118825912, "rewards/accuracy_reward": 0.7415816187858582, "rewards/format_reward": 0.9897959232330322, "step": 2979 }, { "completion_length": 187.12244415283203, "epoch": 0.299874213836478, "grad_norm": 1.402963399887085, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6324281692504883, "reward_std": 0.18226032704114914, "rewards/accuracy_reward": 0.6324281990528107, "rewards/format_reward": 1.0, "step": 2980 }, { "completion_length": 202.53060913085938, "epoch": 0.2999748427672956, "grad_norm": 0.7348253130912781, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.80098956823349, "reward_std": 0.11464005336165428, "rewards/accuracy_reward": 0.8111935257911682, "rewards/format_reward": 0.9897959232330322, "step": 2981 }, { "completion_length": 277.76529693603516, "epoch": 0.3000754716981132, "grad_norm": 0.9680716395378113, "kl": 0.097412109375, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.629145622253418, "reward_std": 0.18002599105238914, "rewards/accuracy_reward": 0.6495537757873535, "rewards/format_reward": 0.9795918166637421, "step": 2982 }, { "completion_length": 249.3775405883789, "epoch": 0.3001761006289308, "grad_norm": 0.7560871839523315, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6584224104881287, "reward_std": 0.22435056418180466, "rewards/accuracy_reward": 0.6584223806858063, "rewards/format_reward": 1.0, "step": 2983 }, { "completion_length": 272.16326904296875, "epoch": 0.30027672955974843, "grad_norm": 0.5923909544944763, "kl": 0.055419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6632652878761292, "reward_std": 0.16288189217448235, "rewards/accuracy_reward": 0.6836734712123871, "rewards/format_reward": 0.9795918166637421, "step": 2984 }, { "completion_length": 204.16326141357422, "epoch": 0.300377358490566, "grad_norm": 0.5671856999397278, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8834466338157654, "reward_std": 0.11062349006533623, "rewards/accuracy_reward": 0.8936507701873779, "rewards/format_reward": 0.9897959232330322, "step": 2985 }, { "completion_length": 288.23468017578125, "epoch": 0.30047798742138365, "grad_norm": 1.3071978092193604, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.5365654230117798, "reward_std": 0.22604082897305489, "rewards/accuracy_reward": 0.5569736063480377, "rewards/format_reward": 0.9795918464660645, "step": 2986 }, { "completion_length": 233.08162689208984, "epoch": 0.30057861635220123, "grad_norm": 0.7852640748023987, "kl": 0.066162109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7157359719276428, "reward_std": 0.14363494142889977, "rewards/accuracy_reward": 0.7361441850662231, "rewards/format_reward": 0.9795918166637421, "step": 2987 }, { "completion_length": 193.7653045654297, "epoch": 0.30067924528301887, "grad_norm": 1.5493255853652954, "kl": 0.1134033203125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.8135101795196533, "reward_std": 0.1868113875389099, "rewards/accuracy_reward": 0.8237142264842987, "rewards/format_reward": 0.9897959232330322, "step": 2988 }, { "completion_length": 284.5408172607422, "epoch": 0.30077987421383645, "grad_norm": 1.764747142791748, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.5129339694976807, "reward_std": 0.2579113841056824, "rewards/accuracy_reward": 0.512933999300003, "rewards/format_reward": 1.0, "step": 2989 }, { "completion_length": 160.2448959350586, "epoch": 0.3008805031446541, "grad_norm": 0.9163230061531067, "kl": 0.0745849609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7887755036354065, "reward_std": 0.08864165842533112, "rewards/accuracy_reward": 0.8193877339363098, "rewards/format_reward": 0.9693877398967743, "step": 2990 }, { "completion_length": 310.32653045654297, "epoch": 0.3009811320754717, "grad_norm": 0.7621347904205322, "kl": 0.0528564453125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7714285254478455, "reward_std": 0.13242815807461739, "rewards/accuracy_reward": 0.7714285254478455, "rewards/format_reward": 1.0, "step": 2991 }, { "completion_length": 188.17346954345703, "epoch": 0.3010817610062893, "grad_norm": 2.5688862800598145, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8338192105293274, "reward_std": 0.12688611447811127, "rewards/accuracy_reward": 0.8440232872962952, "rewards/format_reward": 0.9897959232330322, "step": 2992 }, { "completion_length": 152.15306091308594, "epoch": 0.30118238993710694, "grad_norm": 0.5907439589500427, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.9387754797935486, "reward_std": 0.14284341782331467, "rewards/accuracy_reward": 0.9489795565605164, "rewards/format_reward": 0.9897959232330322, "step": 2993 }, { "completion_length": 217.7142791748047, "epoch": 0.3012830188679245, "grad_norm": 0.6054811477661133, "kl": 0.0576171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7073283791542053, "reward_std": 0.1670079603791237, "rewards/accuracy_reward": 0.7073284089565277, "rewards/format_reward": 1.0, "step": 2994 }, { "completion_length": 235.56122589111328, "epoch": 0.30138364779874216, "grad_norm": 1.4601069688796997, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.728571355342865, "reward_std": 0.27869081497192383, "rewards/accuracy_reward": 0.7591836750507355, "rewards/format_reward": 0.9693877398967743, "step": 2995 }, { "completion_length": 255.75509643554688, "epoch": 0.30148427672955974, "grad_norm": 0.8075354099273682, "kl": 0.0526123046875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7943722605705261, "reward_std": 0.15633957087993622, "rewards/accuracy_reward": 0.8147804439067841, "rewards/format_reward": 0.9795918464660645, "step": 2996 }, { "completion_length": 224.4591827392578, "epoch": 0.3015849056603774, "grad_norm": 0.891042172908783, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6696792244911194, "reward_std": 0.3028307110071182, "rewards/accuracy_reward": 0.6798833608627319, "rewards/format_reward": 0.9897959232330322, "step": 2997 }, { "completion_length": 259.39794921875, "epoch": 0.30168553459119496, "grad_norm": 0.3788590133190155, "kl": 0.062255859375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7563168406486511, "reward_std": 0.06638868851587176, "rewards/accuracy_reward": 0.7563168108463287, "rewards/format_reward": 1.0, "step": 2998 }, { "completion_length": 176.15306091308594, "epoch": 0.3017861635220126, "grad_norm": 1.3874074220657349, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7178416848182678, "reward_std": 0.17359068244695663, "rewards/accuracy_reward": 0.7484539151191711, "rewards/format_reward": 0.9693877398967743, "step": 2999 }, { "completion_length": 202.06122589111328, "epoch": 0.3018867924528302, "grad_norm": 0.6467328071594238, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8286113739013672, "reward_std": 0.13758406788110733, "rewards/accuracy_reward": 0.8388154804706573, "rewards/format_reward": 0.9897959232330322, "step": 3000 }, { "completion_length": 266.73468017578125, "epoch": 0.3019874213836478, "grad_norm": 1.6619398593902588, "kl": 0.057861328125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.54178124666214, "reward_std": 0.2227102555334568, "rewards/accuracy_reward": 0.5825976133346558, "rewards/format_reward": 0.9591836631298065, "step": 3001 }, { "completion_length": 245.5306167602539, "epoch": 0.3020880503144654, "grad_norm": 1.267175555229187, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6231291890144348, "reward_std": 0.2374672368168831, "rewards/accuracy_reward": 0.6639455407857895, "rewards/format_reward": 0.9591836631298065, "step": 3002 }, { "completion_length": 260.1224365234375, "epoch": 0.30218867924528303, "grad_norm": 1.2360247373580933, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5039281249046326, "reward_std": 0.3926295042037964, "rewards/accuracy_reward": 0.5243362784385681, "rewards/format_reward": 0.9795918166637421, "step": 3003 }, { "completion_length": 212.2448959350586, "epoch": 0.3022893081761006, "grad_norm": 3.9840400218963623, "kl": 0.06689453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8142857551574707, "reward_std": 0.11338933557271957, "rewards/accuracy_reward": 0.8244897723197937, "rewards/format_reward": 0.9897959232330322, "step": 3004 }, { "completion_length": 274.06121826171875, "epoch": 0.30238993710691825, "grad_norm": 1.1514173746109009, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.66722571849823, "reward_std": 0.3330572694540024, "rewards/accuracy_reward": 0.6876339316368103, "rewards/format_reward": 0.9795918464660645, "step": 3005 }, { "completion_length": 214.78571319580078, "epoch": 0.30249056603773583, "grad_norm": 0.9255841970443726, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8435373306274414, "reward_std": 0.20453206449747086, "rewards/accuracy_reward": 0.8435373902320862, "rewards/format_reward": 1.0, "step": 3006 }, { "completion_length": 226.448974609375, "epoch": 0.30259119496855347, "grad_norm": 0.8615579009056091, "kl": 0.0595703125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6079828143119812, "reward_std": 0.14864902198314667, "rewards/accuracy_reward": 0.6283909380435944, "rewards/format_reward": 0.9795918464660645, "step": 3007 }, { "completion_length": 224.43877410888672, "epoch": 0.30269182389937105, "grad_norm": 0.6266952753067017, "kl": 0.0550537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8095238208770752, "reward_std": 0.18320715054869652, "rewards/accuracy_reward": 0.8095238208770752, "rewards/format_reward": 1.0, "step": 3008 }, { "completion_length": 168.9897918701172, "epoch": 0.3027924528301887, "grad_norm": 1.3872647285461426, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8084973692893982, "reward_std": 0.12475195527076721, "rewards/accuracy_reward": 0.808497428894043, "rewards/format_reward": 1.0, "step": 3009 }, { "completion_length": 265.76529693603516, "epoch": 0.30289308176100627, "grad_norm": 0.6829997301101685, "kl": 0.0633544921875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5658008456230164, "reward_std": 0.17280448228120804, "rewards/accuracy_reward": 0.5658008456230164, "rewards/format_reward": 1.0, "step": 3010 }, { "completion_length": 197.4591827392578, "epoch": 0.3029937106918239, "grad_norm": 0.8674067854881287, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6276294589042664, "reward_std": 0.18520589172840118, "rewards/accuracy_reward": 0.6378335952758789, "rewards/format_reward": 0.9897959232330322, "step": 3011 }, { "completion_length": 212.1326446533203, "epoch": 0.3030943396226415, "grad_norm": 0.9945318698883057, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7448979616165161, "reward_std": 0.2049330323934555, "rewards/accuracy_reward": 0.7551020085811615, "rewards/format_reward": 0.9897959232330322, "step": 3012 }, { "completion_length": 235.3775405883789, "epoch": 0.3031949685534591, "grad_norm": 0.37901946902275085, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8780369758605957, "reward_std": 0.08656877279281616, "rewards/accuracy_reward": 0.8882410526275635, "rewards/format_reward": 0.9897959232330322, "step": 3013 }, { "completion_length": 218.68366241455078, "epoch": 0.3032955974842767, "grad_norm": 0.8410727977752686, "kl": 0.0682373046875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7647958993911743, "reward_std": 0.18480989336967468, "rewards/accuracy_reward": 0.7647958993911743, "rewards/format_reward": 1.0, "step": 3014 }, { "completion_length": 249.16326141357422, "epoch": 0.30339622641509434, "grad_norm": 0.6657050848007202, "kl": 0.0556640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7539681792259216, "reward_std": 0.1188422366976738, "rewards/accuracy_reward": 0.7539682388305664, "rewards/format_reward": 1.0, "step": 3015 }, { "completion_length": 174.7346954345703, "epoch": 0.303496855345912, "grad_norm": 0.7408180236816406, "kl": 0.0528564453125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8386297225952148, "reward_std": 0.08169577643275261, "rewards/accuracy_reward": 0.8386297225952148, "rewards/format_reward": 1.0, "step": 3016 }, { "completion_length": 183.24488830566406, "epoch": 0.30359748427672956, "grad_norm": 1.3099256753921509, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7551020383834839, "reward_std": 0.13155816495418549, "rewards/accuracy_reward": 0.7551020085811615, "rewards/format_reward": 1.0, "step": 3017 }, { "completion_length": 169.2040786743164, "epoch": 0.3036981132075472, "grad_norm": 1.2263284921646118, "kl": 0.08740234375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8163264989852905, "reward_std": 0.16188225150108337, "rewards/accuracy_reward": 0.8163265287876129, "rewards/format_reward": 1.0, "step": 3018 }, { "completion_length": 180.1836700439453, "epoch": 0.3037987421383648, "grad_norm": 1.6901499032974243, "kl": 0.103271484375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7753673195838928, "reward_std": 0.18311788141727448, "rewards/accuracy_reward": 0.7957754731178284, "rewards/format_reward": 0.9795918166637421, "step": 3019 }, { "completion_length": 290.9081573486328, "epoch": 0.3038993710691824, "grad_norm": 0.8775841593742371, "kl": 0.0477294921875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.657123327255249, "reward_std": 0.2559032365679741, "rewards/accuracy_reward": 0.6673273742198944, "rewards/format_reward": 0.9897959232330322, "step": 3020 }, { "completion_length": 204.84693145751953, "epoch": 0.304, "grad_norm": 2.578996181488037, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.606477975845337, "reward_std": 0.29041943699121475, "rewards/accuracy_reward": 0.6268861889839172, "rewards/format_reward": 0.9795918464660645, "step": 3021 }, { "completion_length": 217.2244873046875, "epoch": 0.30410062893081763, "grad_norm": 0.507729172706604, "kl": 0.0494384765625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7708719372749329, "reward_std": 0.16931767761707306, "rewards/accuracy_reward": 0.7912801206111908, "rewards/format_reward": 0.9795918166637421, "step": 3022 }, { "completion_length": 236.37754821777344, "epoch": 0.3042012578616352, "grad_norm": 0.8650121092796326, "kl": 0.089111328125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.693730652332306, "reward_std": 0.10225018113851547, "rewards/accuracy_reward": 0.6937306523323059, "rewards/format_reward": 1.0, "step": 3023 }, { "completion_length": 213.16326141357422, "epoch": 0.30430188679245285, "grad_norm": 0.838270366191864, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6874443888664246, "reward_std": 0.1192173883318901, "rewards/accuracy_reward": 0.6874443888664246, "rewards/format_reward": 1.0, "step": 3024 }, { "completion_length": 187.13265228271484, "epoch": 0.30440251572327043, "grad_norm": 0.8335762619972229, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7413728833198547, "reward_std": 0.11535020917654037, "rewards/accuracy_reward": 0.7413729429244995, "rewards/format_reward": 1.0, "step": 3025 }, { "completion_length": 264.9183654785156, "epoch": 0.30450314465408806, "grad_norm": 0.7193173766136169, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.512479305267334, "reward_std": 0.22560687363147736, "rewards/accuracy_reward": 0.5328875035047531, "rewards/format_reward": 0.9795918166637421, "step": 3026 }, { "completion_length": 203.89795684814453, "epoch": 0.30460377358490565, "grad_norm": 1.1956512928009033, "kl": 0.108154296875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7612022757530212, "reward_std": 0.09045856073498726, "rewards/accuracy_reward": 0.7612024247646332, "rewards/format_reward": 1.0, "step": 3027 }, { "completion_length": 223.80611419677734, "epoch": 0.3047044025157233, "grad_norm": 0.77596515417099, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6585033535957336, "reward_std": 0.1887364760041237, "rewards/accuracy_reward": 0.6891156136989594, "rewards/format_reward": 0.9693877398967743, "step": 3028 }, { "completion_length": 207.25509643554688, "epoch": 0.30480503144654086, "grad_norm": 2.1212711334228516, "kl": 0.0576171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7010365724563599, "reward_std": 0.20136407017707825, "rewards/accuracy_reward": 0.7010366022586823, "rewards/format_reward": 1.0, "step": 3029 }, { "completion_length": 189.6836700439453, "epoch": 0.3049056603773585, "grad_norm": 1.509225606918335, "kl": 0.0667724609375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6692096590995789, "reward_std": 0.21212871372699738, "rewards/accuracy_reward": 0.6692096590995789, "rewards/format_reward": 1.0, "step": 3030 }, { "completion_length": 228.10203552246094, "epoch": 0.3050062893081761, "grad_norm": 0.9837355017662048, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.759205400943756, "reward_std": 0.2074560672044754, "rewards/accuracy_reward": 0.7694095075130463, "rewards/format_reward": 0.9897959232330322, "step": 3031 }, { "completion_length": 225.0408172607422, "epoch": 0.3051069182389937, "grad_norm": 0.7663949728012085, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6628037095069885, "reward_std": 0.2204049974679947, "rewards/accuracy_reward": 0.6730077862739563, "rewards/format_reward": 0.9897959232330322, "step": 3032 }, { "completion_length": 227.14285278320312, "epoch": 0.3052075471698113, "grad_norm": 0.45585763454437256, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7656098008155823, "reward_std": 0.1599969007074833, "rewards/accuracy_reward": 0.7758138477802277, "rewards/format_reward": 0.9897959232330322, "step": 3033 }, { "completion_length": 208.51020050048828, "epoch": 0.30530817610062894, "grad_norm": 1.2472431659698486, "kl": 0.08740234375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.715316116809845, "reward_std": 0.14887145534157753, "rewards/accuracy_reward": 0.715316116809845, "rewards/format_reward": 1.0, "step": 3034 }, { "completion_length": 251.24488830566406, "epoch": 0.3054088050314465, "grad_norm": 0.6025377511978149, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6382896304130554, "reward_std": 0.18179303035140038, "rewards/accuracy_reward": 0.6586977243423462, "rewards/format_reward": 0.9795918464660645, "step": 3035 }, { "completion_length": 254.02040100097656, "epoch": 0.30550943396226415, "grad_norm": 1.884740948677063, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5874370336532593, "reward_std": 0.2109866440296173, "rewards/accuracy_reward": 0.5874370485544205, "rewards/format_reward": 1.0, "step": 3036 }, { "completion_length": 201.99999237060547, "epoch": 0.30561006289308174, "grad_norm": 6.258775234222412, "kl": 0.1988525390625, "learning_rate": 1e-06, "loss": 0.008, "reward": 1.6859409809112549, "reward_std": 0.1289878748357296, "rewards/accuracy_reward": 0.6859410107135773, "rewards/format_reward": 1.0, "step": 3037 }, { "completion_length": 240.88774871826172, "epoch": 0.30571069182389937, "grad_norm": 0.9176744818687439, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7829446196556091, "reward_std": 0.17617133259773254, "rewards/accuracy_reward": 0.7931486368179321, "rewards/format_reward": 0.9897959232330322, "step": 3038 }, { "completion_length": 221.1530532836914, "epoch": 0.305811320754717, "grad_norm": 0.5642598867416382, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.843537449836731, "reward_std": 0.08999153599143028, "rewards/accuracy_reward": 0.8435373902320862, "rewards/format_reward": 1.0, "step": 3039 }, { "completion_length": 256.52040100097656, "epoch": 0.3059119496855346, "grad_norm": 1.1146767139434814, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.4732194542884827, "reward_std": 0.16116609424352646, "rewards/accuracy_reward": 0.47321951389312744, "rewards/format_reward": 1.0, "step": 3040 }, { "completion_length": 321.8877410888672, "epoch": 0.3060125786163522, "grad_norm": 3.1750783920288086, "kl": 0.0579833984375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6769774556159973, "reward_std": 0.19954124838113785, "rewards/accuracy_reward": 0.6871815323829651, "rewards/format_reward": 0.9897959232330322, "step": 3041 }, { "completion_length": 248.33672332763672, "epoch": 0.3061132075471698, "grad_norm": 1.522246241569519, "kl": 0.05322265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7648855447769165, "reward_std": 0.12135158479213715, "rewards/accuracy_reward": 0.7750896215438843, "rewards/format_reward": 0.9897959232330322, "step": 3042 }, { "completion_length": 193.7142791748047, "epoch": 0.30621383647798744, "grad_norm": 1.751471757888794, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7916380763053894, "reward_std": 0.1869942769408226, "rewards/accuracy_reward": 0.7916381061077118, "rewards/format_reward": 1.0, "step": 3043 }, { "completion_length": 235.9081573486328, "epoch": 0.306314465408805, "grad_norm": 0.53684401512146, "kl": 0.05810546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.714285671710968, "reward_std": 0.16188225150108337, "rewards/accuracy_reward": 0.7244897782802582, "rewards/format_reward": 0.9897959232330322, "step": 3044 }, { "completion_length": 225.01019287109375, "epoch": 0.30641509433962266, "grad_norm": 0.846274197101593, "kl": 0.093994140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8371469378471375, "reward_std": 0.13813448697328568, "rewards/accuracy_reward": 0.8473510146141052, "rewards/format_reward": 0.9897959232330322, "step": 3045 }, { "completion_length": 251.33672332763672, "epoch": 0.30651572327044024, "grad_norm": 0.9107704758644104, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6502267122268677, "reward_std": 0.20906932651996613, "rewards/accuracy_reward": 0.6706348955631256, "rewards/format_reward": 0.9795918464660645, "step": 3046 }, { "completion_length": 208.9387664794922, "epoch": 0.3066163522012579, "grad_norm": 4.216726779937744, "kl": 0.295166015625, "learning_rate": 1e-06, "loss": 0.0118, "reward": 1.7934792041778564, "reward_std": 0.20760007202625275, "rewards/accuracy_reward": 0.8036831915378571, "rewards/format_reward": 0.9897959232330322, "step": 3047 }, { "completion_length": 255.23468017578125, "epoch": 0.30671698113207546, "grad_norm": 1.751708745956421, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.4336858987808228, "reward_std": 0.2635085806250572, "rewards/accuracy_reward": 0.4438900202512741, "rewards/format_reward": 0.9897959232330322, "step": 3048 }, { "completion_length": 190.2040786743164, "epoch": 0.3068176100628931, "grad_norm": 1.3063892126083374, "kl": 0.0743408203125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7246443629264832, "reward_std": 0.13601897656917572, "rewards/accuracy_reward": 0.7348484694957733, "rewards/format_reward": 0.9897959232330322, "step": 3049 }, { "completion_length": 259.1530532836914, "epoch": 0.3069182389937107, "grad_norm": 0.8010584115982056, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6607850193977356, "reward_std": 0.19627662748098373, "rewards/accuracy_reward": 0.660785049200058, "rewards/format_reward": 1.0, "step": 3050 }, { "completion_length": 221.71428680419922, "epoch": 0.3070188679245283, "grad_norm": 1.8924497365951538, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.749343991279602, "reward_std": 0.20948296785354614, "rewards/accuracy_reward": 0.76975217461586, "rewards/format_reward": 0.9795918464660645, "step": 3051 }, { "completion_length": 280.32652282714844, "epoch": 0.3071194968553459, "grad_norm": 0.6501570343971252, "kl": 0.058837890625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7181379795074463, "reward_std": 0.29180267453193665, "rewards/accuracy_reward": 0.7385461926460266, "rewards/format_reward": 0.9795918166637421, "step": 3052 }, { "completion_length": 214.37754821777344, "epoch": 0.30722012578616353, "grad_norm": 0.5808720588684082, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7224618792533875, "reward_std": 0.12163068167865276, "rewards/accuracy_reward": 0.7224618792533875, "rewards/format_reward": 1.0, "step": 3053 }, { "completion_length": 236.84693145751953, "epoch": 0.3073207547169811, "grad_norm": 0.4744364619255066, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.71746027469635, "reward_std": 0.0970672108232975, "rewards/accuracy_reward": 0.7174603044986725, "rewards/format_reward": 1.0, "step": 3054 }, { "completion_length": 170.16326141357422, "epoch": 0.30742138364779875, "grad_norm": 2.9984405040740967, "kl": 0.1123046875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.845918357372284, "reward_std": 0.17474541068077087, "rewards/accuracy_reward": 0.8561224639415741, "rewards/format_reward": 0.9897959232330322, "step": 3055 }, { "completion_length": 227.35714721679688, "epoch": 0.30752201257861633, "grad_norm": 1.389436960220337, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.711224377155304, "reward_std": 0.1922934725880623, "rewards/accuracy_reward": 0.7214285135269165, "rewards/format_reward": 0.9897959232330322, "step": 3056 }, { "completion_length": 247.2551040649414, "epoch": 0.30762264150943397, "grad_norm": 0.45009633898735046, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6690961718559265, "reward_std": 0.1410134993493557, "rewards/accuracy_reward": 0.6793002784252167, "rewards/format_reward": 0.9897959232330322, "step": 3057 }, { "completion_length": 210.7551040649414, "epoch": 0.30772327044025155, "grad_norm": 0.8981218934059143, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.835076093673706, "reward_std": 0.20137270539999008, "rewards/accuracy_reward": 0.8452802002429962, "rewards/format_reward": 0.9897959232330322, "step": 3058 }, { "completion_length": 224.77550506591797, "epoch": 0.3078238993710692, "grad_norm": 0.9007765650749207, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.728774607181549, "reward_std": 0.11683881748467684, "rewards/accuracy_reward": 0.7287745773792267, "rewards/format_reward": 1.0, "step": 3059 }, { "completion_length": 234.6428451538086, "epoch": 0.30792452830188677, "grad_norm": 0.5281760692596436, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8191094398498535, "reward_std": 0.1181054450571537, "rewards/accuracy_reward": 0.8191094398498535, "rewards/format_reward": 1.0, "step": 3060 }, { "completion_length": 203.0408172607422, "epoch": 0.3080251572327044, "grad_norm": 1.113753080368042, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6603498458862305, "reward_std": 0.09382757544517517, "rewards/accuracy_reward": 0.6603498756885529, "rewards/format_reward": 1.0, "step": 3061 }, { "completion_length": 233.1632537841797, "epoch": 0.308125786163522, "grad_norm": 0.7268760800361633, "kl": 0.05078125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.8673468828201294, "reward_std": 0.20016494393348694, "rewards/accuracy_reward": 0.8877550661563873, "rewards/format_reward": 0.9795918464660645, "step": 3062 }, { "completion_length": 188.4897918701172, "epoch": 0.3082264150943396, "grad_norm": 1.1290802955627441, "kl": 0.10546875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7332996726036072, "reward_std": 0.158887829631567, "rewards/accuracy_reward": 0.7435038089752197, "rewards/format_reward": 0.9897959232330322, "step": 3063 }, { "completion_length": 247.4285659790039, "epoch": 0.30832704402515726, "grad_norm": 0.8273931741714478, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.648508608341217, "reward_std": 0.20764076709747314, "rewards/accuracy_reward": 0.6587126851081848, "rewards/format_reward": 0.9897959232330322, "step": 3064 }, { "completion_length": 261.3367385864258, "epoch": 0.30842767295597484, "grad_norm": 0.841251015663147, "kl": 0.066162109375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6591836214065552, "reward_std": 0.21087896078824997, "rewards/accuracy_reward": 0.6693877279758453, "rewards/format_reward": 0.9897959232330322, "step": 3065 }, { "completion_length": 313.93878173828125, "epoch": 0.3085283018867925, "grad_norm": 2.7997965812683105, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6946195363998413, "reward_std": 0.27317485213279724, "rewards/accuracy_reward": 0.7252319157123566, "rewards/format_reward": 0.9693877398967743, "step": 3066 }, { "completion_length": 238.9795913696289, "epoch": 0.30862893081761006, "grad_norm": 0.9511704444885254, "kl": 0.054443359375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7260203957557678, "reward_std": 0.15072598308324814, "rewards/accuracy_reward": 0.7260203957557678, "rewards/format_reward": 1.0, "step": 3067 }, { "completion_length": 280.6836700439453, "epoch": 0.3087295597484277, "grad_norm": 0.6764246225357056, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.626901626586914, "reward_std": 0.2711077481508255, "rewards/accuracy_reward": 0.6269016861915588, "rewards/format_reward": 1.0, "step": 3068 }, { "completion_length": 206.9795913696289, "epoch": 0.3088301886792453, "grad_norm": 0.7782183289527893, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7331956624984741, "reward_std": 0.15055802464485168, "rewards/accuracy_reward": 0.7331956923007965, "rewards/format_reward": 1.0, "step": 3069 }, { "completion_length": 197.95917510986328, "epoch": 0.3089308176100629, "grad_norm": 4.4207072257995605, "kl": 0.21044921875, "learning_rate": 1e-06, "loss": 0.0084, "reward": 1.8404022455215454, "reward_std": 0.11227639019489288, "rewards/accuracy_reward": 0.8506063222885132, "rewards/format_reward": 0.9897959232330322, "step": 3070 }, { "completion_length": 256.9183654785156, "epoch": 0.3090314465408805, "grad_norm": 1.4153292179107666, "kl": 0.099853515625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6060670614242554, "reward_std": 0.33456844091415405, "rewards/accuracy_reward": 0.6264752745628357, "rewards/format_reward": 0.9795918464660645, "step": 3071 }, { "completion_length": 257.0408172607422, "epoch": 0.30913207547169813, "grad_norm": 0.5476493239402771, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.663388967514038, "reward_std": 0.1565018817782402, "rewards/accuracy_reward": 0.6940012276172638, "rewards/format_reward": 0.9693877398967743, "step": 3072 }, { "completion_length": 233.57142639160156, "epoch": 0.3092327044025157, "grad_norm": 1.4254995584487915, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.513822078704834, "reward_std": 0.23090488463640213, "rewards/accuracy_reward": 0.6158629357814789, "rewards/format_reward": 0.8979591727256775, "step": 3073 }, { "completion_length": 146.79591751098633, "epoch": 0.30933333333333335, "grad_norm": 1.3760756254196167, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7683849334716797, "reward_std": 0.22536008805036545, "rewards/accuracy_reward": 0.7887931168079376, "rewards/format_reward": 0.9795918464660645, "step": 3074 }, { "completion_length": 162.65306091308594, "epoch": 0.30943396226415093, "grad_norm": 0.9956346750259399, "kl": 0.0670166015625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7521865963935852, "reward_std": 0.16215252503752708, "rewards/accuracy_reward": 0.7827987968921661, "rewards/format_reward": 0.9693877398967743, "step": 3075 }, { "completion_length": 202.1836700439453, "epoch": 0.30953459119496857, "grad_norm": 0.6527113914489746, "kl": 0.0966796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.4961933493614197, "reward_std": 0.08146555721759796, "rewards/accuracy_reward": 0.5676219463348389, "rewards/format_reward": 0.9285714328289032, "step": 3076 }, { "completion_length": 241.6428451538086, "epoch": 0.30963522012578615, "grad_norm": 0.8367912173271179, "kl": 0.054443359375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.622016727924347, "reward_std": 0.2805536240339279, "rewards/accuracy_reward": 0.6832412779331207, "rewards/format_reward": 0.938775509595871, "step": 3077 }, { "completion_length": 218.33672332763672, "epoch": 0.3097358490566038, "grad_norm": 0.5594661235809326, "kl": 0.064697265625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.510440707206726, "reward_std": 0.1524266004562378, "rewards/accuracy_reward": 0.5410529524087906, "rewards/format_reward": 0.9693877398967743, "step": 3078 }, { "completion_length": 246.448974609375, "epoch": 0.30983647798742137, "grad_norm": 0.6560444235801697, "kl": 0.057373046875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8025487065315247, "reward_std": 0.18430569022893906, "rewards/accuracy_reward": 0.822956919670105, "rewards/format_reward": 0.9795918166637421, "step": 3079 }, { "completion_length": 195.948974609375, "epoch": 0.309937106918239, "grad_norm": 0.807963490486145, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6991081833839417, "reward_std": 0.10464478703215718, "rewards/accuracy_reward": 0.7705366313457489, "rewards/format_reward": 0.9285714328289032, "step": 3080 }, { "completion_length": 275.9387741088867, "epoch": 0.3100377358490566, "grad_norm": 0.8704630136489868, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.2941789031028748, "reward_std": 0.2926231101155281, "rewards/accuracy_reward": 0.5696891844272614, "rewards/format_reward": 0.7244897782802582, "step": 3081 }, { "completion_length": 241.5204086303711, "epoch": 0.3101383647798742, "grad_norm": 0.6989539265632629, "kl": 0.096923828125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.390720248222351, "reward_std": 0.21182651817798615, "rewards/accuracy_reward": 0.6152100563049316, "rewards/format_reward": 0.7755101919174194, "step": 3082 }, { "completion_length": 229.04080963134766, "epoch": 0.3102389937106918, "grad_norm": 1.057680368423462, "kl": 0.097412109375, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.4955585598945618, "reward_std": 0.2914409562945366, "rewards/accuracy_reward": 0.6690279841423035, "rewards/format_reward": 0.8265305757522583, "step": 3083 }, { "completion_length": 246.42855834960938, "epoch": 0.31033962264150944, "grad_norm": 1.4359116554260254, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6576728224754333, "reward_std": 0.2048913985490799, "rewards/accuracy_reward": 0.6678769886493683, "rewards/format_reward": 0.9897959232330322, "step": 3084 }, { "completion_length": 237.08162689208984, "epoch": 0.310440251572327, "grad_norm": 1.0288584232330322, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6765329837799072, "reward_std": 0.20037023723125458, "rewards/accuracy_reward": 0.6867371201515198, "rewards/format_reward": 0.9897959232330322, "step": 3085 }, { "completion_length": 252.84693145751953, "epoch": 0.31054088050314466, "grad_norm": 1.1797869205474854, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7745382189750671, "reward_std": 0.2159847393631935, "rewards/accuracy_reward": 0.8357626497745514, "rewards/format_reward": 0.938775509595871, "step": 3086 }, { "completion_length": 241.05101776123047, "epoch": 0.31064150943396224, "grad_norm": 0.746430516242981, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7066094875335693, "reward_std": 0.18757322430610657, "rewards/accuracy_reward": 0.7270176708698273, "rewards/format_reward": 0.9795918166637421, "step": 3087 }, { "completion_length": 195.1530532836914, "epoch": 0.3107421383647799, "grad_norm": 1.1431777477264404, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7259394526481628, "reward_std": 0.18006456270813942, "rewards/accuracy_reward": 0.7361434698104858, "rewards/format_reward": 0.9897959232330322, "step": 3088 }, { "completion_length": 228.6938705444336, "epoch": 0.3108427672955975, "grad_norm": 0.7396615147590637, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7323349714279175, "reward_std": 0.20775265246629715, "rewards/accuracy_reward": 0.7425390481948853, "rewards/format_reward": 0.9897959232330322, "step": 3089 }, { "completion_length": 204.60203552246094, "epoch": 0.3109433962264151, "grad_norm": 0.9505957365036011, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7341306805610657, "reward_std": 0.2502560690045357, "rewards/accuracy_reward": 0.7545388042926788, "rewards/format_reward": 0.9795918166637421, "step": 3090 }, { "completion_length": 248.36734771728516, "epoch": 0.31104402515723273, "grad_norm": 0.7720499634742737, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.765120506286621, "reward_std": 0.11475479602813721, "rewards/accuracy_reward": 0.7753246426582336, "rewards/format_reward": 0.9897959232330322, "step": 3091 }, { "completion_length": 272.1122360229492, "epoch": 0.3111446540880503, "grad_norm": 1.101637363433838, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6749559044837952, "reward_std": 0.22601518407464027, "rewards/accuracy_reward": 0.6851600706577301, "rewards/format_reward": 0.9897959232330322, "step": 3092 }, { "completion_length": 256.2040710449219, "epoch": 0.31124528301886795, "grad_norm": 0.9107975363731384, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7238094806671143, "reward_std": 0.10652362555265427, "rewards/accuracy_reward": 0.7340135872364044, "rewards/format_reward": 0.9897959232330322, "step": 3093 }, { "completion_length": 237.05101776123047, "epoch": 0.3113459119496855, "grad_norm": 0.816616415977478, "kl": 0.106201171875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.713208556175232, "reward_std": 0.1895415484905243, "rewards/accuracy_reward": 0.7234126627445221, "rewards/format_reward": 0.9897959232330322, "step": 3094 }, { "completion_length": 241.09182739257812, "epoch": 0.31144654088050316, "grad_norm": 0.5162771940231323, "kl": 0.0589599609375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8061224222183228, "reward_std": 0.09217509627342224, "rewards/accuracy_reward": 0.8163264989852905, "rewards/format_reward": 0.9897959232330322, "step": 3095 }, { "completion_length": 203.09183502197266, "epoch": 0.31154716981132075, "grad_norm": 0.9992022514343262, "kl": 0.091552734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.675775170326233, "reward_std": 0.11465809866786003, "rewards/accuracy_reward": 0.6757752299308777, "rewards/format_reward": 1.0, "step": 3096 }, { "completion_length": 200.25509643554688, "epoch": 0.3116477987421384, "grad_norm": 1.2834434509277344, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7606516480445862, "reward_std": 0.16642288118600845, "rewards/accuracy_reward": 0.7606516182422638, "rewards/format_reward": 1.0, "step": 3097 }, { "completion_length": 238.91836547851562, "epoch": 0.31174842767295596, "grad_norm": 0.6987563371658325, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.836734652519226, "reward_std": 0.23038648068904877, "rewards/accuracy_reward": 0.8571428656578064, "rewards/format_reward": 0.9795918166637421, "step": 3098 }, { "completion_length": 216.7244873046875, "epoch": 0.3118490566037736, "grad_norm": 0.8971416354179382, "kl": 0.0615234375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8097909688949585, "reward_std": 0.17644242197275162, "rewards/accuracy_reward": 0.8097910583019257, "rewards/format_reward": 1.0, "step": 3099 }, { "completion_length": 201.09183502197266, "epoch": 0.3119496855345912, "grad_norm": 1.7695636749267578, "kl": 0.0855712890625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8456135392189026, "reward_std": 0.21152720600366592, "rewards/accuracy_reward": 0.8558176159858704, "rewards/format_reward": 0.9897959232330322, "step": 3100 }, { "completion_length": 197.7448959350586, "epoch": 0.3120503144654088, "grad_norm": 0.8897714614868164, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7363094687461853, "reward_std": 0.13822411000728607, "rewards/accuracy_reward": 0.7465135753154755, "rewards/format_reward": 0.9897959232330322, "step": 3101 }, { "completion_length": 221.2448959350586, "epoch": 0.3121509433962264, "grad_norm": 3.085505247116089, "kl": 0.06689453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6476190090179443, "reward_std": 0.12476392090320587, "rewards/accuracy_reward": 0.6578231155872345, "rewards/format_reward": 0.9897959232330322, "step": 3102 }, { "completion_length": 278.3367233276367, "epoch": 0.31225157232704404, "grad_norm": 0.9604330062866211, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6805066466331482, "reward_std": 0.1945801079273224, "rewards/accuracy_reward": 0.680506706237793, "rewards/format_reward": 1.0, "step": 3103 }, { "completion_length": 246.29590606689453, "epoch": 0.3123522012578616, "grad_norm": 1.2037097215652466, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6759122610092163, "reward_std": 0.22009103000164032, "rewards/accuracy_reward": 0.6963204145431519, "rewards/format_reward": 0.9795918166637421, "step": 3104 }, { "completion_length": 261.4285583496094, "epoch": 0.31245283018867925, "grad_norm": 0.7387257218360901, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7103984355926514, "reward_std": 0.26631639897823334, "rewards/accuracy_reward": 0.7206025421619415, "rewards/format_reward": 0.9897959232330322, "step": 3105 }, { "completion_length": 296.9897918701172, "epoch": 0.31255345911949683, "grad_norm": 0.6288784146308899, "kl": 0.0548095703125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7063491344451904, "reward_std": 0.16341137886047363, "rewards/accuracy_reward": 0.7063491940498352, "rewards/format_reward": 1.0, "step": 3106 }, { "completion_length": 251.62244415283203, "epoch": 0.31265408805031447, "grad_norm": 1.02860689163208, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.761451244354248, "reward_std": 0.15523266792297363, "rewards/accuracy_reward": 0.761451244354248, "rewards/format_reward": 1.0, "step": 3107 }, { "completion_length": 255.05101776123047, "epoch": 0.31275471698113205, "grad_norm": 1.6090538501739502, "kl": 0.05126953125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.631828248500824, "reward_std": 0.09014379233121872, "rewards/accuracy_reward": 0.631828248500824, "rewards/format_reward": 1.0, "step": 3108 }, { "completion_length": 180.7551040649414, "epoch": 0.3128553459119497, "grad_norm": 1.207939624786377, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8028571605682373, "reward_std": 0.09473760984838009, "rewards/accuracy_reward": 0.8028571307659149, "rewards/format_reward": 1.0, "step": 3109 }, { "completion_length": 243.9387664794922, "epoch": 0.31295597484276727, "grad_norm": 2.578819751739502, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.694039523601532, "reward_std": 0.20583079755306244, "rewards/accuracy_reward": 0.7144477069377899, "rewards/format_reward": 0.9795918464660645, "step": 3110 }, { "completion_length": 210.32653045654297, "epoch": 0.3130566037735849, "grad_norm": 1.0777581930160522, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7176870107650757, "reward_std": 0.1257719285786152, "rewards/accuracy_reward": 0.7176870405673981, "rewards/format_reward": 1.0, "step": 3111 }, { "completion_length": 318.20408630371094, "epoch": 0.3131572327044025, "grad_norm": 0.8513752222061157, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6127650737762451, "reward_std": 0.2034919261932373, "rewards/accuracy_reward": 0.6229691803455353, "rewards/format_reward": 0.9897959232330322, "step": 3112 }, { "completion_length": 290.02040100097656, "epoch": 0.3132578616352201, "grad_norm": 0.6722230911254883, "kl": 0.052734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7269517183303833, "reward_std": 0.18839478120207787, "rewards/accuracy_reward": 0.7575640082359314, "rewards/format_reward": 0.9693877398967743, "step": 3113 }, { "completion_length": 191.51020050048828, "epoch": 0.31335849056603776, "grad_norm": 0.6739418506622314, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.868016004562378, "reward_std": 0.1039479449391365, "rewards/accuracy_reward": 0.8782200813293457, "rewards/format_reward": 0.9897959232330322, "step": 3114 }, { "completion_length": 250.34693145751953, "epoch": 0.31345911949685534, "grad_norm": 0.7431116104125977, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8498541712760925, "reward_std": 0.22598276287317276, "rewards/accuracy_reward": 0.8702623844146729, "rewards/format_reward": 0.9795918166637421, "step": 3115 }, { "completion_length": 228.6734619140625, "epoch": 0.313559748427673, "grad_norm": 0.9991176724433899, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6700680255889893, "reward_std": 0.1866961345076561, "rewards/accuracy_reward": 0.6904761791229248, "rewards/format_reward": 0.9795918464660645, "step": 3116 }, { "completion_length": 297.6326446533203, "epoch": 0.31366037735849056, "grad_norm": 1.8476296663284302, "kl": 0.161865234375, "learning_rate": 1e-06, "loss": 0.0065, "reward": 1.7202380895614624, "reward_std": 0.23780755698680878, "rewards/accuracy_reward": 0.7202380895614624, "rewards/format_reward": 1.0, "step": 3117 }, { "completion_length": 218.4285659790039, "epoch": 0.3137610062893082, "grad_norm": 0.9391062259674072, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8351090550422668, "reward_std": 0.1287002395838499, "rewards/accuracy_reward": 0.8351091146469116, "rewards/format_reward": 1.0, "step": 3118 }, { "completion_length": 298.39796447753906, "epoch": 0.3138616352201258, "grad_norm": 0.7714946269989014, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6947038173675537, "reward_std": 0.2798044830560684, "rewards/accuracy_reward": 0.7151120007038116, "rewards/format_reward": 0.9795918464660645, "step": 3119 }, { "completion_length": 264.9387664794922, "epoch": 0.3139622641509434, "grad_norm": 0.4670509099960327, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7301586866378784, "reward_std": 0.19108448922634125, "rewards/accuracy_reward": 0.7505668997764587, "rewards/format_reward": 0.9795918166637421, "step": 3120 }, { "completion_length": 197.7346954345703, "epoch": 0.314062893081761, "grad_norm": 1.0154504776000977, "kl": 0.0723876953125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7029144763946533, "reward_std": 0.09783101826906204, "rewards/accuracy_reward": 0.7029144763946533, "rewards/format_reward": 1.0, "step": 3121 }, { "completion_length": 192.79591369628906, "epoch": 0.31416352201257863, "grad_norm": 0.671910285949707, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8141947388648987, "reward_std": 0.06896547647193074, "rewards/accuracy_reward": 0.8141947686672211, "rewards/format_reward": 1.0, "step": 3122 }, { "completion_length": 277.8673400878906, "epoch": 0.3142641509433962, "grad_norm": 0.6079310178756714, "kl": 0.0675048828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6776239275932312, "reward_std": 0.14504285156726837, "rewards/accuracy_reward": 0.6776238977909088, "rewards/format_reward": 1.0, "step": 3123 }, { "completion_length": 240.85714721679688, "epoch": 0.31436477987421385, "grad_norm": 0.8976648449897766, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6499911546707153, "reward_std": 0.17136459052562714, "rewards/accuracy_reward": 0.6601952314376831, "rewards/format_reward": 0.9897959232330322, "step": 3124 }, { "completion_length": 199.448974609375, "epoch": 0.31446540880503143, "grad_norm": 0.5023066401481628, "kl": 0.0667724609375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8114673495292664, "reward_std": 0.07356078177690506, "rewards/accuracy_reward": 0.8114674389362335, "rewards/format_reward": 1.0, "step": 3125 }, { "completion_length": 250.54080963134766, "epoch": 0.31456603773584907, "grad_norm": 1.3943687677383423, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7380959391593933, "reward_std": 0.2031891942024231, "rewards/accuracy_reward": 0.7380959391593933, "rewards/format_reward": 1.0, "step": 3126 }, { "completion_length": 203.0408172607422, "epoch": 0.31466666666666665, "grad_norm": 1.970870018005371, "kl": 0.09912109375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.73312509059906, "reward_std": 0.21619630604982376, "rewards/accuracy_reward": 0.7331250905990601, "rewards/format_reward": 1.0, "step": 3127 }, { "completion_length": 217.58162689208984, "epoch": 0.3147672955974843, "grad_norm": 0.7598640322685242, "kl": 0.062744140625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8002111315727234, "reward_std": 0.13148586824536324, "rewards/accuracy_reward": 0.8104151487350464, "rewards/format_reward": 0.9897959232330322, "step": 3128 }, { "completion_length": 246.7448959350586, "epoch": 0.31486792452830187, "grad_norm": 0.987359344959259, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6993898153305054, "reward_std": 0.16919348388910294, "rewards/accuracy_reward": 0.7095939517021179, "rewards/format_reward": 0.9897959232330322, "step": 3129 }, { "completion_length": 261.4183654785156, "epoch": 0.3149685534591195, "grad_norm": 0.3366769254207611, "kl": 0.0479736328125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.8381186723709106, "reward_std": 0.09390595555305481, "rewards/accuracy_reward": 0.8483227491378784, "rewards/format_reward": 0.9897959232330322, "step": 3130 }, { "completion_length": 254.68366241455078, "epoch": 0.3150691823899371, "grad_norm": 0.8125492930412292, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.732296645641327, "reward_std": 0.23057229816913605, "rewards/accuracy_reward": 0.7425008416175842, "rewards/format_reward": 0.9897959232330322, "step": 3131 }, { "completion_length": 211.62244415283203, "epoch": 0.3151698113207547, "grad_norm": 0.6982691287994385, "kl": 0.0689697265625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.694747507572174, "reward_std": 0.15500563383102417, "rewards/accuracy_reward": 0.7049515247344971, "rewards/format_reward": 0.9897959232330322, "step": 3132 }, { "completion_length": 260.31632232666016, "epoch": 0.3152704402515723, "grad_norm": 1.2787069082260132, "kl": 0.058837890625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5507288575172424, "reward_std": 0.17987989634275436, "rewards/accuracy_reward": 0.5609329342842102, "rewards/format_reward": 0.9897959232330322, "step": 3133 }, { "completion_length": 171.1530532836914, "epoch": 0.31537106918238994, "grad_norm": 0.7117949724197388, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8997525572776794, "reward_std": 0.12131073698401451, "rewards/accuracy_reward": 0.8997526168823242, "rewards/format_reward": 1.0, "step": 3134 }, { "completion_length": 268.82652282714844, "epoch": 0.3154716981132075, "grad_norm": 0.9321375489234924, "kl": 0.0643310546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7009088397026062, "reward_std": 0.20219558477401733, "rewards/accuracy_reward": 0.7111129462718964, "rewards/format_reward": 0.9897959232330322, "step": 3135 }, { "completion_length": 250.80611419677734, "epoch": 0.31557232704402516, "grad_norm": 0.8692997694015503, "kl": 0.053466796875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.768356204032898, "reward_std": 0.18147026002407074, "rewards/accuracy_reward": 0.7785603106021881, "rewards/format_reward": 0.9897959232330322, "step": 3136 }, { "completion_length": 311.0204086303711, "epoch": 0.3156729559748428, "grad_norm": 0.6661897301673889, "kl": 0.0670166015625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7259424328804016, "reward_std": 0.22155410796403885, "rewards/accuracy_reward": 0.7259424924850464, "rewards/format_reward": 1.0, "step": 3137 }, { "completion_length": 195.20407104492188, "epoch": 0.3157735849056604, "grad_norm": 0.2571313977241516, "kl": 0.0762939453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.768007516860962, "reward_std": 0.05036865174770355, "rewards/accuracy_reward": 0.7680074870586395, "rewards/format_reward": 1.0, "step": 3138 }, { "completion_length": 206.39794921875, "epoch": 0.315874213836478, "grad_norm": 1.4094874858856201, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6881776452064514, "reward_std": 0.26190898567438126, "rewards/accuracy_reward": 0.7085858285427094, "rewards/format_reward": 0.9795918166637421, "step": 3139 }, { "completion_length": 256.5204086303711, "epoch": 0.3159748427672956, "grad_norm": 0.8090223670005798, "kl": 0.097900390625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.5363327264785767, "reward_std": 0.02251934725791216, "rewards/accuracy_reward": 0.5363326817750931, "rewards/format_reward": 1.0, "step": 3140 }, { "completion_length": 197.84693908691406, "epoch": 0.31607547169811323, "grad_norm": 0.5947872996330261, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7271324396133423, "reward_std": 0.12919362634420395, "rewards/accuracy_reward": 0.7475405931472778, "rewards/format_reward": 0.9795918166637421, "step": 3141 }, { "completion_length": 179.32653045654297, "epoch": 0.3161761006289308, "grad_norm": 1.381108045578003, "kl": 0.0882568359375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6899902820587158, "reward_std": 0.17604144662618637, "rewards/accuracy_reward": 0.6899902820587158, "rewards/format_reward": 1.0, "step": 3142 }, { "completion_length": 175.84693908691406, "epoch": 0.31627672955974845, "grad_norm": 0.6960960030555725, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6700862646102905, "reward_std": 0.14405550435185432, "rewards/accuracy_reward": 0.6802903711795807, "rewards/format_reward": 0.9897959232330322, "step": 3143 }, { "completion_length": 267.3877487182617, "epoch": 0.31637735849056603, "grad_norm": 4.937726974487305, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.724953532218933, "reward_std": 0.24453136324882507, "rewards/accuracy_reward": 0.7351577281951904, "rewards/format_reward": 0.9897959232330322, "step": 3144 }, { "completion_length": 260.6836700439453, "epoch": 0.31647798742138367, "grad_norm": 0.7760584354400635, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.611953318119049, "reward_std": 0.19615063816308975, "rewards/accuracy_reward": 0.6221573948860168, "rewards/format_reward": 0.9897959232330322, "step": 3145 }, { "completion_length": 200.21428298950195, "epoch": 0.31657861635220125, "grad_norm": 1.565746784210205, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7945268750190735, "reward_std": 0.18016919493675232, "rewards/accuracy_reward": 0.7945268750190735, "rewards/format_reward": 1.0, "step": 3146 }, { "completion_length": 202.1530532836914, "epoch": 0.3166792452830189, "grad_norm": 0.6824851036071777, "kl": 0.091064453125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8134920001029968, "reward_std": 0.1896578148007393, "rewards/accuracy_reward": 0.8134920299053192, "rewards/format_reward": 1.0, "step": 3147 }, { "completion_length": 190.2346954345703, "epoch": 0.31677987421383647, "grad_norm": 1.1092945337295532, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.774052381515503, "reward_std": 0.12166209705173969, "rewards/accuracy_reward": 0.7740524411201477, "rewards/format_reward": 1.0, "step": 3148 }, { "completion_length": 309.1632537841797, "epoch": 0.3168805031446541, "grad_norm": 0.5450891852378845, "kl": 0.054443359375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6258503198623657, "reward_std": 0.15620969235897064, "rewards/accuracy_reward": 0.6258503198623657, "rewards/format_reward": 1.0, "step": 3149 }, { "completion_length": 217.80612182617188, "epoch": 0.3169811320754717, "grad_norm": 0.6871381998062134, "kl": 0.0594482421875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8626201152801514, "reward_std": 0.10768258199095726, "rewards/accuracy_reward": 0.8626199960708618, "rewards/format_reward": 1.0, "step": 3150 }, { "completion_length": 243.4897918701172, "epoch": 0.3170817610062893, "grad_norm": 0.6106624007225037, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7540330290794373, "reward_std": 0.20453251525759697, "rewards/accuracy_reward": 0.7744411826133728, "rewards/format_reward": 0.9795918166637421, "step": 3151 }, { "completion_length": 251.30611419677734, "epoch": 0.3171823899371069, "grad_norm": 0.7137296795845032, "kl": 0.0699462890625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.658892035484314, "reward_std": 0.15774036943912506, "rewards/accuracy_reward": 0.6588920950889587, "rewards/format_reward": 1.0, "step": 3152 }, { "completion_length": 222.09182739257812, "epoch": 0.31728301886792454, "grad_norm": 0.7355990409851074, "kl": 0.0498046875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7023809552192688, "reward_std": 0.14981701225042343, "rewards/accuracy_reward": 0.7023809850215912, "rewards/format_reward": 1.0, "step": 3153 }, { "completion_length": 184.33673095703125, "epoch": 0.3173836477987421, "grad_norm": 0.6120811700820923, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6990723609924316, "reward_std": 0.11641592904925346, "rewards/accuracy_reward": 0.7092764377593994, "rewards/format_reward": 0.9897959232330322, "step": 3154 }, { "completion_length": 243.16326141357422, "epoch": 0.31748427672955976, "grad_norm": 0.7078474760055542, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8566568493843079, "reward_std": 0.11841582134366035, "rewards/accuracy_reward": 0.8566569089889526, "rewards/format_reward": 1.0, "step": 3155 }, { "completion_length": 266.77550506591797, "epoch": 0.31758490566037734, "grad_norm": 0.5637176632881165, "kl": 0.0550537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7072885632514954, "reward_std": 0.07606348302215338, "rewards/accuracy_reward": 0.7072885632514954, "rewards/format_reward": 1.0, "step": 3156 }, { "completion_length": 247.62245178222656, "epoch": 0.317685534591195, "grad_norm": 1.4632492065429688, "kl": 0.139404296875, "learning_rate": 1e-06, "loss": 0.0056, "reward": 1.6084312200546265, "reward_std": 0.2286609187722206, "rewards/accuracy_reward": 0.6084312796592712, "rewards/format_reward": 1.0, "step": 3157 }, { "completion_length": 243.10203552246094, "epoch": 0.31778616352201255, "grad_norm": 0.8284488916397095, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.711621344089508, "reward_std": 0.1899840235710144, "rewards/accuracy_reward": 0.7320294678211212, "rewards/format_reward": 0.9795918464660645, "step": 3158 }, { "completion_length": 234.08162689208984, "epoch": 0.3178867924528302, "grad_norm": 2.194723129272461, "kl": 0.0665283203125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8571428060531616, "reward_std": 0.1730649620294571, "rewards/accuracy_reward": 0.857142835855484, "rewards/format_reward": 1.0, "step": 3159 }, { "completion_length": 286.9387664794922, "epoch": 0.3179874213836478, "grad_norm": 0.6439885497093201, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6071794033050537, "reward_std": 0.236051507294178, "rewards/accuracy_reward": 0.637791708111763, "rewards/format_reward": 0.9693877398967743, "step": 3160 }, { "completion_length": 249.09182739257812, "epoch": 0.3180880503144654, "grad_norm": 0.9411760568618774, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6384714841842651, "reward_std": 0.26532815396785736, "rewards/accuracy_reward": 0.6588795781135559, "rewards/format_reward": 0.9795918464660645, "step": 3161 }, { "completion_length": 335.22447204589844, "epoch": 0.31818867924528305, "grad_norm": 0.8640558123588562, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.720129907131195, "reward_std": 0.11614304408431053, "rewards/accuracy_reward": 0.7303339242935181, "rewards/format_reward": 0.9897959232330322, "step": 3162 }, { "completion_length": 317.98978424072266, "epoch": 0.3182893081761006, "grad_norm": 0.8616616725921631, "kl": 0.0582275390625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5293025970458984, "reward_std": 0.23650282248854637, "rewards/accuracy_reward": 0.529302641749382, "rewards/format_reward": 1.0, "step": 3163 }, { "completion_length": 213.59183502197266, "epoch": 0.31838993710691826, "grad_norm": 0.7982630133628845, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7142857313156128, "reward_std": 0.1715220883488655, "rewards/accuracy_reward": 0.734693855047226, "rewards/format_reward": 0.9795918166637421, "step": 3164 }, { "completion_length": 227.1530532836914, "epoch": 0.31849056603773584, "grad_norm": 0.6433870196342468, "kl": 0.0556640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7972140908241272, "reward_std": 0.11852932162582874, "rewards/accuracy_reward": 0.7972141206264496, "rewards/format_reward": 1.0, "step": 3165 }, { "completion_length": 231.11223602294922, "epoch": 0.3185911949685535, "grad_norm": 0.848284125328064, "kl": 0.0614013671875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5911153554916382, "reward_std": 0.21142203360795975, "rewards/accuracy_reward": 0.5911153256893158, "rewards/format_reward": 1.0, "step": 3166 }, { "completion_length": 251.76529693603516, "epoch": 0.31869182389937106, "grad_norm": 0.6236189603805542, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6037251949310303, "reward_std": 0.15446912869811058, "rewards/accuracy_reward": 0.6037252992391586, "rewards/format_reward": 1.0, "step": 3167 }, { "completion_length": 212.11223602294922, "epoch": 0.3187924528301887, "grad_norm": 2.122434139251709, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.769862174987793, "reward_std": 0.133536696434021, "rewards/accuracy_reward": 0.7902703583240509, "rewards/format_reward": 0.9795918166637421, "step": 3168 }, { "completion_length": 243.30612182617188, "epoch": 0.3188930817610063, "grad_norm": 0.9677100777626038, "kl": 0.0533447265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7346938848495483, "reward_std": 0.23043803125619888, "rewards/accuracy_reward": 0.734693855047226, "rewards/format_reward": 1.0, "step": 3169 }, { "completion_length": 313.2550964355469, "epoch": 0.3189937106918239, "grad_norm": 0.7191066741943359, "kl": 0.051513671875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8341113924980164, "reward_std": 0.23261819034814835, "rewards/accuracy_reward": 0.8443154990673065, "rewards/format_reward": 0.9897959232330322, "step": 3170 }, { "completion_length": 188.14286041259766, "epoch": 0.3190943396226415, "grad_norm": 0.62385094165802, "kl": 0.0986328125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7534577250480652, "reward_std": 0.13112996518611908, "rewards/accuracy_reward": 0.7738659083843231, "rewards/format_reward": 0.9795918464660645, "step": 3171 }, { "completion_length": 267.4897918701172, "epoch": 0.31919496855345914, "grad_norm": 0.4768504798412323, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6326530575752258, "reward_std": 0.11584595590829849, "rewards/accuracy_reward": 0.6326530426740646, "rewards/format_reward": 1.0, "step": 3172 }, { "completion_length": 186.5204086303711, "epoch": 0.3192955974842767, "grad_norm": 1.2836743593215942, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.672265648841858, "reward_std": 0.21557464450597763, "rewards/accuracy_reward": 0.6824697256088257, "rewards/format_reward": 0.9897959232330322, "step": 3173 }, { "completion_length": 242.80611419677734, "epoch": 0.31939622641509435, "grad_norm": 0.5652376413345337, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7796536087989807, "reward_std": 0.15146857872605324, "rewards/accuracy_reward": 0.7796536386013031, "rewards/format_reward": 1.0, "step": 3174 }, { "completion_length": 358.8061065673828, "epoch": 0.31949685534591193, "grad_norm": 1.0064945220947266, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5017815828323364, "reward_std": 0.28413233160972595, "rewards/accuracy_reward": 0.5119857341051102, "rewards/format_reward": 0.9897959232330322, "step": 3175 }, { "completion_length": 296.7959213256836, "epoch": 0.31959748427672957, "grad_norm": 0.6018039584159851, "kl": 0.0574951171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.4449707865715027, "reward_std": 0.17834709584712982, "rewards/accuracy_reward": 0.45517490804195404, "rewards/format_reward": 0.9897959232330322, "step": 3176 }, { "completion_length": 270.948974609375, "epoch": 0.31969811320754715, "grad_norm": 0.7639728784561157, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7625930905342102, "reward_std": 0.15862416103482246, "rewards/accuracy_reward": 0.7625930309295654, "rewards/format_reward": 1.0, "step": 3177 }, { "completion_length": 276.7244873046875, "epoch": 0.3197987421383648, "grad_norm": 0.7121656537055969, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8448267579078674, "reward_std": 0.2164381518959999, "rewards/accuracy_reward": 0.844826877117157, "rewards/format_reward": 1.0, "step": 3178 }, { "completion_length": 215.1734619140625, "epoch": 0.31989937106918237, "grad_norm": 0.8580344915390015, "kl": 0.08740234375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7099745869636536, "reward_std": 0.1324935108423233, "rewards/accuracy_reward": 0.7099745869636536, "rewards/format_reward": 1.0, "step": 3179 }, { "completion_length": 288.9897918701172, "epoch": 0.32, "grad_norm": 0.7250635623931885, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.762576937675476, "reward_std": 0.23660355061292648, "rewards/accuracy_reward": 0.7931891679763794, "rewards/format_reward": 0.9693877398967743, "step": 3180 }, { "completion_length": 243.39795684814453, "epoch": 0.3201006289308176, "grad_norm": 0.6440001726150513, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.918367326259613, "reward_std": 0.11584595590829849, "rewards/accuracy_reward": 0.918367326259613, "rewards/format_reward": 1.0, "step": 3181 }, { "completion_length": 191.87754821777344, "epoch": 0.3202012578616352, "grad_norm": 1.1272746324539185, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7120493054389954, "reward_std": 0.2195148766040802, "rewards/accuracy_reward": 0.7324574589729309, "rewards/format_reward": 0.9795918166637421, "step": 3182 }, { "completion_length": 267.3673324584961, "epoch": 0.3203018867924528, "grad_norm": 0.9081548452377319, "kl": 0.0797119140625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6768707633018494, "reward_std": 0.25383297353982925, "rewards/accuracy_reward": 0.6870747953653336, "rewards/format_reward": 0.9897959232330322, "step": 3183 }, { "completion_length": 252.57142639160156, "epoch": 0.32040251572327044, "grad_norm": 1.6477270126342773, "kl": 0.102783203125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7050387263298035, "reward_std": 0.1726497858762741, "rewards/accuracy_reward": 0.7152428030967712, "rewards/format_reward": 0.9897959232330322, "step": 3184 }, { "completion_length": 230.5, "epoch": 0.320503144654088, "grad_norm": 0.47830504179000854, "kl": 0.099609375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7347699999809265, "reward_std": 0.1249965038150549, "rewards/accuracy_reward": 0.7449740767478943, "rewards/format_reward": 0.9897959232330322, "step": 3185 }, { "completion_length": 214.08162689208984, "epoch": 0.32060377358490566, "grad_norm": 0.4802255630493164, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8144023418426514, "reward_std": 0.13483507558703423, "rewards/accuracy_reward": 0.8144023716449738, "rewards/format_reward": 1.0, "step": 3186 }, { "completion_length": 211.79591369628906, "epoch": 0.3207044025157233, "grad_norm": 1.0287257432937622, "kl": 0.0625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.707482933998108, "reward_std": 0.1495177112519741, "rewards/accuracy_reward": 0.7074829936027527, "rewards/format_reward": 1.0, "step": 3187 }, { "completion_length": 259.29591369628906, "epoch": 0.3208050314465409, "grad_norm": 0.5198586583137512, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7243196964263916, "reward_std": 0.1316743567585945, "rewards/accuracy_reward": 0.7345238029956818, "rewards/format_reward": 0.9897959232330322, "step": 3188 }, { "completion_length": 303.1122360229492, "epoch": 0.3209056603773585, "grad_norm": 0.6863895654678345, "kl": 0.056884765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6887754797935486, "reward_std": 0.14998390525579453, "rewards/accuracy_reward": 0.6989795863628387, "rewards/format_reward": 0.9897959232330322, "step": 3189 }, { "completion_length": 235.84693908691406, "epoch": 0.3210062893081761, "grad_norm": 0.5077970027923584, "kl": 0.0760498046875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8877898454666138, "reward_std": 0.04292925167828798, "rewards/accuracy_reward": 0.8877898752689362, "rewards/format_reward": 1.0, "step": 3190 }, { "completion_length": 176.71428680419922, "epoch": 0.32110691823899373, "grad_norm": 0.9866737127304077, "kl": 0.0718994140625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.9195634126663208, "reward_std": 0.14904137700796127, "rewards/accuracy_reward": 0.9297674894332886, "rewards/format_reward": 0.9897959232330322, "step": 3191 }, { "completion_length": 231.12244415283203, "epoch": 0.3212075471698113, "grad_norm": 0.6037569046020508, "kl": 0.097412109375, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7809361815452576, "reward_std": 0.07815235294401646, "rewards/accuracy_reward": 0.7809361517429352, "rewards/format_reward": 1.0, "step": 3192 }, { "completion_length": 258.05101776123047, "epoch": 0.32130817610062895, "grad_norm": 0.7641595005989075, "kl": 0.0556640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7595734596252441, "reward_std": 0.13944949442520738, "rewards/accuracy_reward": 0.7595734596252441, "rewards/format_reward": 1.0, "step": 3193 }, { "completion_length": 306.0, "epoch": 0.32140880503144653, "grad_norm": 0.9273257851600647, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.719387710094452, "reward_std": 0.3035379648208618, "rewards/accuracy_reward": 0.7295918166637421, "rewards/format_reward": 0.9897959232330322, "step": 3194 }, { "completion_length": 220.58163452148438, "epoch": 0.32150943396226417, "grad_norm": 0.7397608757019043, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7443413734436035, "reward_std": 0.15182927623391151, "rewards/accuracy_reward": 0.7647495269775391, "rewards/format_reward": 0.9795918166637421, "step": 3195 }, { "completion_length": 228.30611419677734, "epoch": 0.32161006289308175, "grad_norm": 0.6203059554100037, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.854130208492279, "reward_std": 0.1625358983874321, "rewards/accuracy_reward": 0.8643342852592468, "rewards/format_reward": 0.9897959232330322, "step": 3196 }, { "completion_length": 224.7448959350586, "epoch": 0.3217106918238994, "grad_norm": 1.8335933685302734, "kl": 0.05517578125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7344671487808228, "reward_std": 0.1604599803686142, "rewards/accuracy_reward": 0.7344671189785004, "rewards/format_reward": 1.0, "step": 3197 }, { "completion_length": 285.30611419677734, "epoch": 0.32181132075471697, "grad_norm": 0.7657856941223145, "kl": 0.093994140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.5763118863105774, "reward_std": 0.16422457247972488, "rewards/accuracy_reward": 0.6069241464138031, "rewards/format_reward": 0.9693877398967743, "step": 3198 }, { "completion_length": 226.31632232666016, "epoch": 0.3219119496855346, "grad_norm": 0.7907391786575317, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6819727420806885, "reward_std": 0.20028963685035706, "rewards/accuracy_reward": 0.692176878452301, "rewards/format_reward": 0.9897959232330322, "step": 3199 }, { "completion_length": 260.6734619140625, "epoch": 0.3220125786163522, "grad_norm": 0.6617985963821411, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.693539798259735, "reward_std": 0.17620807886123657, "rewards/accuracy_reward": 0.713947981595993, "rewards/format_reward": 0.9795918166637421, "step": 3200 }, { "completion_length": 221.2142791748047, "epoch": 0.3221132075471698, "grad_norm": 1.4568792581558228, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7633360624313354, "reward_std": 0.13173974305391312, "rewards/accuracy_reward": 0.7735401690006256, "rewards/format_reward": 0.9897959232330322, "step": 3201 }, { "completion_length": 246.16326141357422, "epoch": 0.3222138364779874, "grad_norm": 0.705838143825531, "kl": 0.0821533203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7568644881248474, "reward_std": 0.22543224692344666, "rewards/accuracy_reward": 0.7772727012634277, "rewards/format_reward": 0.9795918464660645, "step": 3202 }, { "completion_length": 176.94897842407227, "epoch": 0.32231446540880504, "grad_norm": 0.25884467363357544, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.9650145173072815, "reward_std": 0.07957790791988373, "rewards/accuracy_reward": 0.9752186238765717, "rewards/format_reward": 0.9897959232330322, "step": 3203 }, { "completion_length": 235.948974609375, "epoch": 0.3224150943396226, "grad_norm": 0.5564208626747131, "kl": 0.062255859375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7142857313156128, "reward_std": 0.1977376937866211, "rewards/accuracy_reward": 0.734693855047226, "rewards/format_reward": 0.9795918166637421, "step": 3204 }, { "completion_length": 284.7244873046875, "epoch": 0.32251572327044026, "grad_norm": 0.945663571357727, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7417606115341187, "reward_std": 0.18121184408664703, "rewards/accuracy_reward": 0.7519647181034088, "rewards/format_reward": 0.9897959232330322, "step": 3205 }, { "completion_length": 287.76529693603516, "epoch": 0.32261635220125784, "grad_norm": 0.7855045795440674, "kl": 0.0474853515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7265305519104004, "reward_std": 0.20082516968250275, "rewards/accuracy_reward": 0.7265306115150452, "rewards/format_reward": 1.0, "step": 3206 }, { "completion_length": 317.57141876220703, "epoch": 0.3227169811320755, "grad_norm": 0.9572110772132874, "kl": 0.0670166015625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.695532202720642, "reward_std": 0.22593602538108826, "rewards/accuracy_reward": 0.7159404158592224, "rewards/format_reward": 0.9795918464660645, "step": 3207 }, { "completion_length": 228.96937561035156, "epoch": 0.32281761006289306, "grad_norm": 1.101304292678833, "kl": 0.0682373046875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7377373576164246, "reward_std": 0.2443327233195305, "rewards/accuracy_reward": 0.7581455409526825, "rewards/format_reward": 0.9795918464660645, "step": 3208 }, { "completion_length": 223.62244415283203, "epoch": 0.3229182389937107, "grad_norm": 1.2302625179290771, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8787514567375183, "reward_std": 0.16120021790266037, "rewards/accuracy_reward": 0.8889555633068085, "rewards/format_reward": 0.9897959232330322, "step": 3209 }, { "completion_length": 231.73468780517578, "epoch": 0.3230188679245283, "grad_norm": 0.7635917663574219, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.563391923904419, "reward_std": 0.11694023385643959, "rewards/accuracy_reward": 0.5633918941020966, "rewards/format_reward": 1.0, "step": 3210 }, { "completion_length": 324.0918273925781, "epoch": 0.3231194968553459, "grad_norm": 0.4909249246120453, "kl": 0.0498046875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5413021445274353, "reward_std": 0.19878144562244415, "rewards/accuracy_reward": 0.5821185111999512, "rewards/format_reward": 0.9591836333274841, "step": 3211 }, { "completion_length": 331.01019287109375, "epoch": 0.32322012578616355, "grad_norm": 7.032057285308838, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7023991346359253, "reward_std": 0.2661591023206711, "rewards/accuracy_reward": 0.7330114245414734, "rewards/format_reward": 0.9693877398967743, "step": 3212 }, { "completion_length": 247.36734771728516, "epoch": 0.32332075471698113, "grad_norm": 2.590700626373291, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6861934661865234, "reward_std": 0.25925901532173157, "rewards/accuracy_reward": 0.6963976621627808, "rewards/format_reward": 0.9897959232330322, "step": 3213 }, { "completion_length": 296.3571472167969, "epoch": 0.32342138364779877, "grad_norm": 0.39227232336997986, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7112450003623962, "reward_std": 0.08969644084572792, "rewards/accuracy_reward": 0.7214491367340088, "rewards/format_reward": 0.9897959232330322, "step": 3214 }, { "completion_length": 303.34693908691406, "epoch": 0.32352201257861635, "grad_norm": 0.5400360822677612, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.566481113433838, "reward_std": 0.18485569581389427, "rewards/accuracy_reward": 0.597093403339386, "rewards/format_reward": 0.9693877398967743, "step": 3215 }, { "completion_length": 246.91836547851562, "epoch": 0.323622641509434, "grad_norm": 0.8529428243637085, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8262909054756165, "reward_std": 0.24194075167179108, "rewards/accuracy_reward": 0.8466991186141968, "rewards/format_reward": 0.9795918166637421, "step": 3216 }, { "completion_length": 338.92857360839844, "epoch": 0.32372327044025156, "grad_norm": 5.413506031036377, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.692055344581604, "reward_std": 0.24541576951742172, "rewards/accuracy_reward": 0.7022594213485718, "rewards/format_reward": 0.9897959232330322, "step": 3217 }, { "completion_length": 290.4795837402344, "epoch": 0.3238238993710692, "grad_norm": 0.37182965874671936, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7125878930091858, "reward_std": 0.1434175744652748, "rewards/accuracy_reward": 0.7432001829147339, "rewards/format_reward": 0.9693877398967743, "step": 3218 }, { "completion_length": 286.6938705444336, "epoch": 0.3239245283018868, "grad_norm": 0.4058399498462677, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.754354476928711, "reward_std": 0.11235266178846359, "rewards/accuracy_reward": 0.7747626304626465, "rewards/format_reward": 0.9795918464660645, "step": 3219 }, { "completion_length": 250.2551040649414, "epoch": 0.3240251572327044, "grad_norm": 0.9361487030982971, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7647762894630432, "reward_std": 0.21225005388259888, "rewards/accuracy_reward": 0.7851844429969788, "rewards/format_reward": 0.9795918464660645, "step": 3220 }, { "completion_length": 283.5918273925781, "epoch": 0.324125786163522, "grad_norm": 0.8030375838279724, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.526910126209259, "reward_std": 0.2667079418897629, "rewards/accuracy_reward": 0.5473183691501617, "rewards/format_reward": 0.9795918166637421, "step": 3221 }, { "completion_length": 229.36734771728516, "epoch": 0.32422641509433964, "grad_norm": 0.5758161544799805, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.812739610671997, "reward_std": 0.13566205650568008, "rewards/accuracy_reward": 0.8127396404743195, "rewards/format_reward": 1.0, "step": 3222 }, { "completion_length": 310.3673400878906, "epoch": 0.3243270440251572, "grad_norm": 0.592074453830719, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7568519115447998, "reward_std": 0.21512135118246078, "rewards/accuracy_reward": 0.7772601246833801, "rewards/format_reward": 0.9795918464660645, "step": 3223 }, { "completion_length": 266.4897918701172, "epoch": 0.32442767295597486, "grad_norm": 1.2807592153549194, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6348251104354858, "reward_std": 0.21461563557386398, "rewards/accuracy_reward": 0.6450292468070984, "rewards/format_reward": 0.9897959232330322, "step": 3224 }, { "completion_length": 251.75509643554688, "epoch": 0.32452830188679244, "grad_norm": 0.7152183055877686, "kl": 0.0631103515625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8023987412452698, "reward_std": 0.17519892752170563, "rewards/accuracy_reward": 0.8126028478145599, "rewards/format_reward": 0.9897959232330322, "step": 3225 }, { "completion_length": 343.7652893066406, "epoch": 0.3246289308176101, "grad_norm": 0.8935769200325012, "kl": 0.0506591796875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5386499166488647, "reward_std": 0.1971932202577591, "rewards/accuracy_reward": 0.5488540381193161, "rewards/format_reward": 0.9897959232330322, "step": 3226 }, { "completion_length": 337.68365478515625, "epoch": 0.32472955974842765, "grad_norm": 0.6582101583480835, "kl": 0.0540771484375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.77557772397995, "reward_std": 0.19973544776439667, "rewards/accuracy_reward": 0.7857818901538849, "rewards/format_reward": 0.9897959232330322, "step": 3227 }, { "completion_length": 131.30612182617188, "epoch": 0.3248301886792453, "grad_norm": 0.46783554553985596, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8700120449066162, "reward_std": 0.01625758968293667, "rewards/accuracy_reward": 0.8700120449066162, "rewards/format_reward": 1.0, "step": 3228 }, { "completion_length": 231.4591827392578, "epoch": 0.32493081761006287, "grad_norm": 0.9025216102600098, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7604212760925293, "reward_std": 0.1928388699889183, "rewards/accuracy_reward": 0.7706254124641418, "rewards/format_reward": 0.9897959232330322, "step": 3229 }, { "completion_length": 239.2551040649414, "epoch": 0.3250314465408805, "grad_norm": 0.35995611548423767, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.9001237154006958, "reward_std": 0.0728151723742485, "rewards/accuracy_reward": 0.920531839132309, "rewards/format_reward": 0.9795918166637421, "step": 3230 }, { "completion_length": 337.5612030029297, "epoch": 0.3251320754716981, "grad_norm": 0.531286358833313, "kl": 0.0615234375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.691943347454071, "reward_std": 0.2056170403957367, "rewards/accuracy_reward": 0.7225556671619415, "rewards/format_reward": 0.9693877398967743, "step": 3231 }, { "completion_length": 254.61223602294922, "epoch": 0.3252327044025157, "grad_norm": 0.7979308366775513, "kl": 0.05859375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6825938820838928, "reward_std": 0.22382806986570358, "rewards/accuracy_reward": 0.692797988653183, "rewards/format_reward": 0.9897959232330322, "step": 3232 }, { "completion_length": 236.07141876220703, "epoch": 0.3253333333333333, "grad_norm": 0.519932210445404, "kl": 0.09423828125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6222702264785767, "reward_std": 0.08087743818759918, "rewards/accuracy_reward": 0.6222702115774155, "rewards/format_reward": 1.0, "step": 3233 }, { "completion_length": 271.0306167602539, "epoch": 0.32543396226415094, "grad_norm": 0.8926976919174194, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7202900648117065, "reward_std": 0.30550431460142136, "rewards/accuracy_reward": 0.7509022653102875, "rewards/format_reward": 0.9693877398967743, "step": 3234 }, { "completion_length": 282.2346878051758, "epoch": 0.3255345911949686, "grad_norm": 1.2289280891418457, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.547701895236969, "reward_std": 0.20090384781360626, "rewards/accuracy_reward": 0.5579059571027756, "rewards/format_reward": 0.9897959232330322, "step": 3235 }, { "completion_length": 276.6734619140625, "epoch": 0.32563522012578616, "grad_norm": 0.6515935063362122, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7263296246528625, "reward_std": 0.2162100225687027, "rewards/accuracy_reward": 0.7263296246528625, "rewards/format_reward": 1.0, "step": 3236 }, { "completion_length": 275.57141876220703, "epoch": 0.3257358490566038, "grad_norm": 0.6196724772453308, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7075929641723633, "reward_std": 0.15147437155246735, "rewards/accuracy_reward": 0.7382052540779114, "rewards/format_reward": 0.9693877398967743, "step": 3237 }, { "completion_length": 371.4285583496094, "epoch": 0.3258364779874214, "grad_norm": 0.973019003868103, "kl": 0.051025390625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.5388460755348206, "reward_std": 0.3214281052350998, "rewards/accuracy_reward": 0.5592541247606277, "rewards/format_reward": 0.9795918464660645, "step": 3238 }, { "completion_length": 302.4285583496094, "epoch": 0.325937106918239, "grad_norm": 0.9431812167167664, "kl": 0.0731201171875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.660452425479889, "reward_std": 0.17132777348160744, "rewards/accuracy_reward": 0.6808606684207916, "rewards/format_reward": 0.9795918464660645, "step": 3239 }, { "completion_length": 190.12244415283203, "epoch": 0.3260377358490566, "grad_norm": 0.3792257010936737, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.836734652519226, "reward_std": 0.06517763808369637, "rewards/accuracy_reward": 0.8367346823215485, "rewards/format_reward": 1.0, "step": 3240 }, { "completion_length": 254.9591827392578, "epoch": 0.32613836477987423, "grad_norm": 1.237835168838501, "kl": 0.097412109375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6228243708610535, "reward_std": 0.20487917214632034, "rewards/accuracy_reward": 0.6432325690984726, "rewards/format_reward": 0.9795918166637421, "step": 3241 }, { "completion_length": 238.32652282714844, "epoch": 0.3262389937106918, "grad_norm": 0.9810670018196106, "kl": 0.0667724609375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6598638892173767, "reward_std": 0.20692354440689087, "rewards/accuracy_reward": 0.6700679659843445, "rewards/format_reward": 0.9897959232330322, "step": 3242 }, { "completion_length": 246.4081573486328, "epoch": 0.32633962264150945, "grad_norm": 0.7787060141563416, "kl": 0.104248046875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7637187838554382, "reward_std": 0.10201245546340942, "rewards/accuracy_reward": 0.7841269671916962, "rewards/format_reward": 0.9795918464660645, "step": 3243 }, { "completion_length": 279.19386291503906, "epoch": 0.32644025157232703, "grad_norm": 0.7977600693702698, "kl": 0.0657958984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6969974637031555, "reward_std": 0.13780471682548523, "rewards/accuracy_reward": 0.7174056768417358, "rewards/format_reward": 0.9795918464660645, "step": 3244 }, { "completion_length": 259.42857360839844, "epoch": 0.32654088050314467, "grad_norm": 0.8698066473007202, "kl": 0.0721435546875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7295917868614197, "reward_std": 0.17538097500801086, "rewards/accuracy_reward": 0.7499999701976776, "rewards/format_reward": 0.9795918166637421, "step": 3245 }, { "completion_length": 244.42857360839844, "epoch": 0.32664150943396225, "grad_norm": 0.5601822137832642, "kl": 0.0604248046875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7838151454925537, "reward_std": 0.09440874680876732, "rewards/accuracy_reward": 0.7838151752948761, "rewards/format_reward": 1.0, "step": 3246 }, { "completion_length": 312.06121826171875, "epoch": 0.3267421383647799, "grad_norm": 1.415864109992981, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6428570747375488, "reward_std": 0.1270286664366722, "rewards/accuracy_reward": 0.6632652878761292, "rewards/format_reward": 0.9795918464660645, "step": 3247 }, { "completion_length": 260.1836624145508, "epoch": 0.32684276729559747, "grad_norm": 0.5004879236221313, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.747764766216278, "reward_std": 0.11406367644667625, "rewards/accuracy_reward": 0.7579689025878906, "rewards/format_reward": 0.9897959232330322, "step": 3248 }, { "completion_length": 273.46937561035156, "epoch": 0.3269433962264151, "grad_norm": 0.6293354034423828, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6645164489746094, "reward_std": 0.110186867415905, "rewards/accuracy_reward": 0.6951286643743515, "rewards/format_reward": 0.9693877398967743, "step": 3249 }, { "completion_length": 254.05101776123047, "epoch": 0.3270440251572327, "grad_norm": 1.1185611486434937, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6423953771591187, "reward_std": 0.2748396247625351, "rewards/accuracy_reward": 0.6628034710884094, "rewards/format_reward": 0.9795918464660645, "step": 3250 }, { "completion_length": 237.30611419677734, "epoch": 0.3271446540880503, "grad_norm": 1.3392748832702637, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8344670534133911, "reward_std": 0.22226588428020477, "rewards/accuracy_reward": 0.8548752665519714, "rewards/format_reward": 0.9795918166637421, "step": 3251 }, { "completion_length": 231.6326446533203, "epoch": 0.3272452830188679, "grad_norm": 0.4588637351989746, "kl": 0.064697265625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8877550959587097, "reward_std": 0.08099238574504852, "rewards/accuracy_reward": 0.8979591727256775, "rewards/format_reward": 0.9897959232330322, "step": 3252 }, { "completion_length": 288.77550506591797, "epoch": 0.32734591194968554, "grad_norm": 0.6558843851089478, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7484936714172363, "reward_std": 0.23859910666942596, "rewards/accuracy_reward": 0.7689018249511719, "rewards/format_reward": 0.9795918166637421, "step": 3253 }, { "completion_length": 205.2448959350586, "epoch": 0.3274465408805031, "grad_norm": 0.7774765491485596, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.680202305316925, "reward_std": 0.20017008483409882, "rewards/accuracy_reward": 0.700610488653183, "rewards/format_reward": 0.9795918166637421, "step": 3254 }, { "completion_length": 275.15306091308594, "epoch": 0.32754716981132076, "grad_norm": 0.4377111494541168, "kl": 0.0775146484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8498541712760925, "reward_std": 0.08074730960652232, "rewards/accuracy_reward": 0.8600583076477051, "rewards/format_reward": 0.9897959232330322, "step": 3255 }, { "completion_length": 302.28570556640625, "epoch": 0.32764779874213834, "grad_norm": 0.8757442235946655, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.695427119731903, "reward_std": 0.2234458327293396, "rewards/accuracy_reward": 0.715835303068161, "rewards/format_reward": 0.9795918464660645, "step": 3256 }, { "completion_length": 283.8673400878906, "epoch": 0.327748427672956, "grad_norm": 0.8249228596687317, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7504637837409973, "reward_std": 0.18805450946092606, "rewards/accuracy_reward": 0.7810760736465454, "rewards/format_reward": 0.9693877398967743, "step": 3257 }, { "completion_length": 321.1428527832031, "epoch": 0.32784905660377356, "grad_norm": 1.133896827697754, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6360543966293335, "reward_std": 0.3241407126188278, "rewards/accuracy_reward": 0.6870748102664948, "rewards/format_reward": 0.9489795565605164, "step": 3258 }, { "completion_length": 152.35713958740234, "epoch": 0.3279496855345912, "grad_norm": 1.0874189138412476, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8189905881881714, "reward_std": 0.14171331375837326, "rewards/accuracy_reward": 0.8393986821174622, "rewards/format_reward": 0.9795918166637421, "step": 3259 }, { "completion_length": 229.16326141357422, "epoch": 0.32805031446540883, "grad_norm": 1.0512391328811646, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8612899780273438, "reward_std": 0.10232711210846901, "rewards/accuracy_reward": 0.8612899482250214, "rewards/format_reward": 1.0, "step": 3260 }, { "completion_length": 328.95916748046875, "epoch": 0.3281509433962264, "grad_norm": 0.7134917378425598, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.57614403963089, "reward_std": 0.1930737979710102, "rewards/accuracy_reward": 0.5863481909036636, "rewards/format_reward": 0.9897959232330322, "step": 3261 }, { "completion_length": 271.57141876220703, "epoch": 0.32825157232704405, "grad_norm": 0.8021804094314575, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7131387591362, "reward_std": 0.25827378034591675, "rewards/accuracy_reward": 0.7131387889385223, "rewards/format_reward": 1.0, "step": 3262 }, { "completion_length": 224.7448959350586, "epoch": 0.32835220125786163, "grad_norm": 0.6173325777053833, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7285714149475098, "reward_std": 0.12997154146432877, "rewards/accuracy_reward": 0.7387754917144775, "rewards/format_reward": 0.9897959232330322, "step": 3263 }, { "completion_length": 330.4795837402344, "epoch": 0.32845283018867927, "grad_norm": 0.47221145033836365, "kl": 0.0428466796875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.7728557586669922, "reward_std": 0.17525216937065125, "rewards/accuracy_reward": 0.7830598652362823, "rewards/format_reward": 0.9897959232330322, "step": 3264 }, { "completion_length": 269.948974609375, "epoch": 0.32855345911949685, "grad_norm": 0.793225109577179, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.537696897983551, "reward_std": 0.1897348165512085, "rewards/accuracy_reward": 0.5376969277858734, "rewards/format_reward": 1.0, "step": 3265 }, { "completion_length": 305.3367233276367, "epoch": 0.3286540880503145, "grad_norm": 0.5831113457679749, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6636576056480408, "reward_std": 0.17892935499548912, "rewards/accuracy_reward": 0.6738616228103638, "rewards/format_reward": 0.9897959232330322, "step": 3266 }, { "completion_length": 295.0306091308594, "epoch": 0.32875471698113207, "grad_norm": 0.8260488510131836, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.579207956790924, "reward_std": 0.17999516427516937, "rewards/accuracy_reward": 0.5894120037555695, "rewards/format_reward": 0.9897959232330322, "step": 3267 }, { "completion_length": 254.12244415283203, "epoch": 0.3288553459119497, "grad_norm": 0.6173029541969299, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7210884094238281, "reward_std": 0.0708501860499382, "rewards/accuracy_reward": 0.7210884094238281, "rewards/format_reward": 1.0, "step": 3268 }, { "completion_length": 198.51020050048828, "epoch": 0.3289559748427673, "grad_norm": 0.609867513179779, "kl": 0.049072265625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.8828845024108887, "reward_std": 0.12427085265517235, "rewards/accuracy_reward": 0.8828845620155334, "rewards/format_reward": 1.0, "step": 3269 }, { "completion_length": 314.18365478515625, "epoch": 0.3290566037735849, "grad_norm": 0.4611901640892029, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6873221397399902, "reward_std": 0.15647803992033005, "rewards/accuracy_reward": 0.7179343998432159, "rewards/format_reward": 0.9693877398967743, "step": 3270 }, { "completion_length": 277.1938705444336, "epoch": 0.3291572327044025, "grad_norm": 0.6953217387199402, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.734790027141571, "reward_std": 0.2425597682595253, "rewards/accuracy_reward": 0.755198210477829, "rewards/format_reward": 0.9795918166637421, "step": 3271 }, { "completion_length": 215.59183502197266, "epoch": 0.32925786163522014, "grad_norm": 1.0434871912002563, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6598639488220215, "reward_std": 0.10490182414650917, "rewards/accuracy_reward": 0.6700679957866669, "rewards/format_reward": 0.9897959232330322, "step": 3272 }, { "completion_length": 191.35713958740234, "epoch": 0.3293584905660377, "grad_norm": 1.2731225490570068, "kl": 0.0711669921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7387754917144775, "reward_std": 0.13797033578157425, "rewards/accuracy_reward": 0.7489795982837677, "rewards/format_reward": 0.9897959232330322, "step": 3273 }, { "completion_length": 236.8775405883789, "epoch": 0.32945911949685536, "grad_norm": 0.9584758877754211, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6802720427513123, "reward_std": 0.1473044566810131, "rewards/accuracy_reward": 0.680272102355957, "rewards/format_reward": 1.0, "step": 3274 }, { "completion_length": 291.1326446533203, "epoch": 0.32955974842767294, "grad_norm": 1.0513880252838135, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7306256890296936, "reward_std": 0.16303575783967972, "rewards/accuracy_reward": 0.7408297657966614, "rewards/format_reward": 0.9897959232330322, "step": 3275 }, { "completion_length": 326.346923828125, "epoch": 0.3296603773584906, "grad_norm": 0.5118177533149719, "kl": 0.0528564453125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6723031401634216, "reward_std": 0.15973025560379028, "rewards/accuracy_reward": 0.6825072765350342, "rewards/format_reward": 0.9897959232330322, "step": 3276 }, { "completion_length": 247.78571319580078, "epoch": 0.32976100628930816, "grad_norm": 0.5286374092102051, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6859409809112549, "reward_std": 0.1255587339401245, "rewards/accuracy_reward": 0.6859410107135773, "rewards/format_reward": 1.0, "step": 3277 }, { "completion_length": 262.8571319580078, "epoch": 0.3298616352201258, "grad_norm": 2.5179991722106934, "kl": 0.054443359375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8116450905799866, "reward_std": 0.13223420456051826, "rewards/accuracy_reward": 0.811645120382309, "rewards/format_reward": 1.0, "step": 3278 }, { "completion_length": 158.12245178222656, "epoch": 0.3299622641509434, "grad_norm": 1.984377384185791, "kl": 0.065185546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8152737021446228, "reward_std": 0.08463230356574059, "rewards/accuracy_reward": 0.8152737021446228, "rewards/format_reward": 1.0, "step": 3279 }, { "completion_length": 283.31632232666016, "epoch": 0.330062893081761, "grad_norm": 1.0148959159851074, "kl": 0.0467529296875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6625850200653076, "reward_std": 0.23339122533798218, "rewards/accuracy_reward": 0.672789067029953, "rewards/format_reward": 0.9897959232330322, "step": 3280 }, { "completion_length": 264.4183654785156, "epoch": 0.3301635220125786, "grad_norm": 1.5592800378799438, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.633345901966095, "reward_std": 0.25101465731859207, "rewards/accuracy_reward": 0.6537542045116425, "rewards/format_reward": 0.9795918464660645, "step": 3281 }, { "completion_length": 230.31632232666016, "epoch": 0.33026415094339623, "grad_norm": 1.2181755304336548, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7824584245681763, "reward_std": 0.23486171662807465, "rewards/accuracy_reward": 0.8028664886951447, "rewards/format_reward": 0.9795918166637421, "step": 3282 }, { "completion_length": 305.10203552246094, "epoch": 0.3303647798742138, "grad_norm": 0.6410309672355652, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6167646050453186, "reward_std": 0.1689368113875389, "rewards/accuracy_reward": 0.6269686818122864, "rewards/format_reward": 0.9897959232330322, "step": 3283 }, { "completion_length": 258.6122360229492, "epoch": 0.33046540880503145, "grad_norm": 0.7780445218086243, "kl": 0.0625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7506390810012817, "reward_std": 0.23061606287956238, "rewards/accuracy_reward": 0.7506391704082489, "rewards/format_reward": 1.0, "step": 3284 }, { "completion_length": 266.9387741088867, "epoch": 0.3305660377358491, "grad_norm": 1.0106303691864014, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7553228735923767, "reward_std": 0.21728502213954926, "rewards/accuracy_reward": 0.7859351336956024, "rewards/format_reward": 0.9693877398967743, "step": 3285 }, { "completion_length": 248.89794921875, "epoch": 0.33066666666666666, "grad_norm": 0.6287591457366943, "kl": 0.0655517578125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.602975308895111, "reward_std": 0.10739198327064514, "rewards/accuracy_reward": 0.6029752641916275, "rewards/format_reward": 1.0, "step": 3286 }, { "completion_length": 237.7346954345703, "epoch": 0.3307672955974843, "grad_norm": 0.6945087909698486, "kl": 0.0635986328125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8350340127944946, "reward_std": 0.20780439674854279, "rewards/accuracy_reward": 0.865646243095398, "rewards/format_reward": 0.9693877398967743, "step": 3287 }, { "completion_length": 323.1530532836914, "epoch": 0.3308679245283019, "grad_norm": 0.8372417092323303, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5756202340126038, "reward_std": 0.27122724056243896, "rewards/accuracy_reward": 0.5960284769535065, "rewards/format_reward": 0.9795918166637421, "step": 3288 }, { "completion_length": 233.78570556640625, "epoch": 0.3309685534591195, "grad_norm": 1.9734609127044678, "kl": 0.0576171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6600369811058044, "reward_std": 0.21463225781917572, "rewards/accuracy_reward": 0.6600369811058044, "rewards/format_reward": 1.0, "step": 3289 }, { "completion_length": 250.55101776123047, "epoch": 0.3310691823899371, "grad_norm": 1.7023855447769165, "kl": 0.105712890625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7245515584945679, "reward_std": 0.19357897341251373, "rewards/accuracy_reward": 0.7347557246685028, "rewards/format_reward": 0.9897959232330322, "step": 3290 }, { "completion_length": 230.86734008789062, "epoch": 0.33116981132075474, "grad_norm": 0.6018826365470886, "kl": 0.0682373046875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8229425549507141, "reward_std": 0.15173610113561153, "rewards/accuracy_reward": 0.8229426741600037, "rewards/format_reward": 1.0, "step": 3291 }, { "completion_length": 307.4387664794922, "epoch": 0.3312704402515723, "grad_norm": 1.1408785581588745, "kl": 0.054931640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.615460753440857, "reward_std": 0.34209735691547394, "rewards/accuracy_reward": 0.6358688771724701, "rewards/format_reward": 0.9795918464660645, "step": 3292 }, { "completion_length": 224.27550506591797, "epoch": 0.33137106918238995, "grad_norm": 1.1113442182540894, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6248711347579956, "reward_std": 0.22250424325466156, "rewards/accuracy_reward": 0.6350752115249634, "rewards/format_reward": 0.9897959232330322, "step": 3293 }, { "completion_length": 228.1836700439453, "epoch": 0.33147169811320754, "grad_norm": 1.454234004020691, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7004858255386353, "reward_std": 0.12169606238603592, "rewards/accuracy_reward": 0.70048588514328, "rewards/format_reward": 1.0, "step": 3294 }, { "completion_length": 202.7448959350586, "epoch": 0.3315723270440252, "grad_norm": 1.4862642288208008, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.857616901397705, "reward_std": 0.18935995548963547, "rewards/accuracy_reward": 0.8882291913032532, "rewards/format_reward": 0.9693877398967743, "step": 3295 }, { "completion_length": 226.19386291503906, "epoch": 0.33167295597484275, "grad_norm": 2.1141200065612793, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.768014371395111, "reward_std": 0.2504749372601509, "rewards/accuracy_reward": 0.7884226143360138, "rewards/format_reward": 0.9795918166637421, "step": 3296 }, { "completion_length": 190.33673095703125, "epoch": 0.3317735849056604, "grad_norm": 0.7346693873405457, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.659883737564087, "reward_std": 0.15627256780862808, "rewards/accuracy_reward": 0.6598837673664093, "rewards/format_reward": 1.0, "step": 3297 }, { "completion_length": 263.7244873046875, "epoch": 0.33187421383647797, "grad_norm": 0.3334728181362152, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8105201125144958, "reward_std": 0.12911148741841316, "rewards/accuracy_reward": 0.841132402420044, "rewards/format_reward": 0.9693877398967743, "step": 3298 }, { "completion_length": 188.64285278320312, "epoch": 0.3319748427672956, "grad_norm": 0.8179052472114563, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8770831823349, "reward_std": 0.11606721207499504, "rewards/accuracy_reward": 0.8872871994972229, "rewards/format_reward": 0.9897959232330322, "step": 3299 }, { "completion_length": 261.9081573486328, "epoch": 0.3320754716981132, "grad_norm": 0.8001638650894165, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6180818676948547, "reward_std": 0.1953238621354103, "rewards/accuracy_reward": 0.6282859742641449, "rewards/format_reward": 0.9897959232330322, "step": 3300 }, { "completion_length": 193.97958755493164, "epoch": 0.3321761006289308, "grad_norm": 1.1549384593963623, "kl": 0.0770263671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7210822701454163, "reward_std": 0.08714442141354084, "rewards/accuracy_reward": 0.721082329750061, "rewards/format_reward": 1.0, "step": 3301 }, { "completion_length": 173.39794921875, "epoch": 0.3322767295597484, "grad_norm": 1.1142040491104126, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5569728016853333, "reward_std": 0.22866177558898926, "rewards/accuracy_reward": 0.587585061788559, "rewards/format_reward": 0.9693877398967743, "step": 3302 }, { "completion_length": 290.87754821777344, "epoch": 0.33237735849056604, "grad_norm": 0.774861752986908, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6188032627105713, "reward_std": 0.19129328429698944, "rewards/accuracy_reward": 0.6392114460468292, "rewards/format_reward": 0.9795918166637421, "step": 3303 }, { "completion_length": 275.0408172607422, "epoch": 0.3324779874213836, "grad_norm": 0.468062162399292, "kl": 0.054931640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8236151337623596, "reward_std": 0.1275043673813343, "rewards/accuracy_reward": 0.8338192403316498, "rewards/format_reward": 0.9897959232330322, "step": 3304 }, { "completion_length": 240.4795913696289, "epoch": 0.33257861635220126, "grad_norm": 1.1152783632278442, "kl": 0.0606689453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8204081654548645, "reward_std": 0.2073642574250698, "rewards/accuracy_reward": 0.8408163189888, "rewards/format_reward": 0.9795918166637421, "step": 3305 }, { "completion_length": 258.8061065673828, "epoch": 0.33267924528301884, "grad_norm": 0.936493992805481, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6864206790924072, "reward_std": 0.2521611601114273, "rewards/accuracy_reward": 0.686420738697052, "rewards/format_reward": 1.0, "step": 3306 }, { "completion_length": 295.0612106323242, "epoch": 0.3327798742138365, "grad_norm": 1.1497578620910645, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6740972995758057, "reward_std": 0.22207189351320267, "rewards/accuracy_reward": 0.6843014061450958, "rewards/format_reward": 0.9897959232330322, "step": 3307 }, { "completion_length": 182.4897918701172, "epoch": 0.33288050314465406, "grad_norm": 0.7531176805496216, "kl": 0.0758056640625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7358600497245789, "reward_std": 0.1552315391600132, "rewards/accuracy_reward": 0.7358600199222565, "rewards/format_reward": 1.0, "step": 3308 }, { "completion_length": 301.14286041259766, "epoch": 0.3329811320754717, "grad_norm": 1.0337450504302979, "kl": 0.057861328125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6166500449180603, "reward_std": 0.2585895359516144, "rewards/accuracy_reward": 0.6370581984519958, "rewards/format_reward": 0.9795918166637421, "step": 3309 }, { "completion_length": 163.55101013183594, "epoch": 0.33308176100628933, "grad_norm": 0.2686610519886017, "kl": 0.052978515625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8061224222183228, "reward_std": 0.07303375005722046, "rewards/accuracy_reward": 0.8061224222183228, "rewards/format_reward": 1.0, "step": 3310 }, { "completion_length": 246.9693832397461, "epoch": 0.3331823899371069, "grad_norm": 2.8217151165008545, "kl": 0.095458984375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6730932593345642, "reward_std": 0.15663763135671616, "rewards/accuracy_reward": 0.6730933487415314, "rewards/format_reward": 1.0, "step": 3311 }, { "completion_length": 219.75509643554688, "epoch": 0.33328301886792455, "grad_norm": 0.6211119294166565, "kl": 0.0504150390625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.8488823771476746, "reward_std": 0.13693980872631073, "rewards/accuracy_reward": 0.848882406949997, "rewards/format_reward": 1.0, "step": 3312 }, { "completion_length": 262.1632614135742, "epoch": 0.33338364779874213, "grad_norm": 1.128755807876587, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7149041295051575, "reward_std": 0.24535521864891052, "rewards/accuracy_reward": 0.7149041295051575, "rewards/format_reward": 1.0, "step": 3313 }, { "completion_length": 183.2653045654297, "epoch": 0.33348427672955977, "grad_norm": 0.5679741501808167, "kl": 0.059326171875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.675418734550476, "reward_std": 0.10355257987976074, "rewards/accuracy_reward": 0.6754187643527985, "rewards/format_reward": 1.0, "step": 3314 }, { "completion_length": 316.9387664794922, "epoch": 0.33358490566037735, "grad_norm": 0.8519189953804016, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7346938848495483, "reward_std": 0.21587716042995453, "rewards/accuracy_reward": 0.7448979318141937, "rewards/format_reward": 0.9897959232330322, "step": 3315 }, { "completion_length": 263.4897994995117, "epoch": 0.333685534591195, "grad_norm": 0.7250319123268127, "kl": 0.064697265625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6106818914413452, "reward_std": 0.2206813022494316, "rewards/accuracy_reward": 0.620885968208313, "rewards/format_reward": 0.9897959232330322, "step": 3316 }, { "completion_length": 281.84693908691406, "epoch": 0.33378616352201257, "grad_norm": 0.9333324432373047, "kl": 0.0718994140625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.590462565422058, "reward_std": 0.20160987973213196, "rewards/accuracy_reward": 0.5904625356197357, "rewards/format_reward": 1.0, "step": 3317 }, { "completion_length": 260.9387741088867, "epoch": 0.3338867924528302, "grad_norm": 0.898902952671051, "kl": 0.0552978515625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6596409678459167, "reward_std": 0.20591050386428833, "rewards/accuracy_reward": 0.6698450148105621, "rewards/format_reward": 0.9897959232330322, "step": 3318 }, { "completion_length": 260.1530532836914, "epoch": 0.3339874213836478, "grad_norm": 0.7058312296867371, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7690578699111938, "reward_std": 0.22972781211137772, "rewards/accuracy_reward": 0.7894660830497742, "rewards/format_reward": 0.9795918464660645, "step": 3319 }, { "completion_length": 231.4591827392578, "epoch": 0.3340880503144654, "grad_norm": 0.7338488101959229, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6594387292861938, "reward_std": 0.18232915550470352, "rewards/accuracy_reward": 0.6798469126224518, "rewards/format_reward": 0.9795918166637421, "step": 3320 }, { "completion_length": 254.29591369628906, "epoch": 0.334188679245283, "grad_norm": 0.9741212725639343, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7252200841903687, "reward_std": 0.14477086812257767, "rewards/accuracy_reward": 0.7252201735973358, "rewards/format_reward": 1.0, "step": 3321 }, { "completion_length": 218.07142639160156, "epoch": 0.33428930817610064, "grad_norm": 0.6006289720535278, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.775739312171936, "reward_std": 0.15851135179400444, "rewards/accuracy_reward": 0.7859433889389038, "rewards/format_reward": 0.9897959232330322, "step": 3322 }, { "completion_length": 353.1326446533203, "epoch": 0.3343899371069182, "grad_norm": 0.9331421256065369, "kl": 0.057861328125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5359326601028442, "reward_std": 0.21879665553569794, "rewards/accuracy_reward": 0.5563407242298126, "rewards/format_reward": 0.9795918166637421, "step": 3323 }, { "completion_length": 290.57142639160156, "epoch": 0.33449056603773586, "grad_norm": 0.7001569867134094, "kl": 0.0447998046875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6291545629501343, "reward_std": 0.19500859081745148, "rewards/accuracy_reward": 0.6495627164840698, "rewards/format_reward": 0.9795918166637421, "step": 3324 }, { "completion_length": 217.4285659790039, "epoch": 0.33459119496855344, "grad_norm": 0.4721413850784302, "kl": 0.0576171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.818154215812683, "reward_std": 0.11891549453139305, "rewards/accuracy_reward": 0.8181542158126831, "rewards/format_reward": 1.0, "step": 3325 }, { "completion_length": 320.948974609375, "epoch": 0.3346918238993711, "grad_norm": 1.9584927558898926, "kl": 0.0601806640625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.535128116607666, "reward_std": 0.30458804219961166, "rewards/accuracy_reward": 0.5555362701416016, "rewards/format_reward": 0.9795918166637421, "step": 3326 }, { "completion_length": 264.3571319580078, "epoch": 0.33479245283018866, "grad_norm": 0.8347976803779602, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.5749701857566833, "reward_std": 0.23678481578826904, "rewards/accuracy_reward": 0.625990629196167, "rewards/format_reward": 0.9489795565605164, "step": 3327 }, { "completion_length": 373.0918273925781, "epoch": 0.3348930817610063, "grad_norm": 0.5676072835922241, "kl": 0.056396484375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.408911943435669, "reward_std": 0.1986762210726738, "rewards/accuracy_reward": 0.43952421844005585, "rewards/format_reward": 0.9693877398967743, "step": 3328 }, { "completion_length": 257.5612258911133, "epoch": 0.3349937106918239, "grad_norm": 1.2428593635559082, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.691260278224945, "reward_std": 0.4080490469932556, "rewards/accuracy_reward": 0.7422806918621063, "rewards/format_reward": 0.9489795863628387, "step": 3329 }, { "completion_length": 259.0408172607422, "epoch": 0.3350943396226415, "grad_norm": 1.7385350465774536, "kl": 0.11328125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.6809730529785156, "reward_std": 0.23769664019346237, "rewards/accuracy_reward": 0.7013811469078064, "rewards/format_reward": 0.9795918166637421, "step": 3330 }, { "completion_length": 299.0714111328125, "epoch": 0.3351949685534591, "grad_norm": 0.8633918166160583, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6730157732963562, "reward_std": 0.23976489156484604, "rewards/accuracy_reward": 0.6832198202610016, "rewards/format_reward": 0.9897959232330322, "step": 3331 }, { "completion_length": 242.6836700439453, "epoch": 0.33529559748427673, "grad_norm": 0.6649437546730042, "kl": 0.0537109375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.798469364643097, "reward_std": 0.1730649545788765, "rewards/accuracy_reward": 0.7984693646430969, "rewards/format_reward": 1.0, "step": 3332 }, { "completion_length": 282.0306091308594, "epoch": 0.33539622641509437, "grad_norm": 0.7174186706542969, "kl": 0.0867919921875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7149814367294312, "reward_std": 0.2556362822651863, "rewards/accuracy_reward": 0.7455936968326569, "rewards/format_reward": 0.9693877398967743, "step": 3333 }, { "completion_length": 280.9081573486328, "epoch": 0.33549685534591195, "grad_norm": 0.9117888808250427, "kl": 0.05810546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.659823477268219, "reward_std": 0.33094553649425507, "rewards/accuracy_reward": 0.6802316009998322, "rewards/format_reward": 0.9795918464660645, "step": 3334 }, { "completion_length": 290.11224365234375, "epoch": 0.3355974842767296, "grad_norm": 0.76362544298172, "kl": 0.0777587890625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.844387710094452, "reward_std": 0.2724786251783371, "rewards/accuracy_reward": 0.8647958636283875, "rewards/format_reward": 0.9795918166637421, "step": 3335 }, { "completion_length": 260.0102005004883, "epoch": 0.33569811320754717, "grad_norm": 0.5575037002563477, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.671115756034851, "reward_std": 0.19110336154699326, "rewards/accuracy_reward": 0.691523939371109, "rewards/format_reward": 0.9795918166637421, "step": 3336 }, { "completion_length": 205.70407104492188, "epoch": 0.3357987421383648, "grad_norm": 0.6255293488502502, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6733538508415222, "reward_std": 0.1541414111852646, "rewards/accuracy_reward": 0.6835578680038452, "rewards/format_reward": 0.9897959232330322, "step": 3337 }, { "completion_length": 252.16326904296875, "epoch": 0.3358993710691824, "grad_norm": 0.6239283084869385, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7398754358291626, "reward_std": 0.19098522514104843, "rewards/accuracy_reward": 0.7500795423984528, "rewards/format_reward": 0.9897959232330322, "step": 3338 }, { "completion_length": 238.948974609375, "epoch": 0.336, "grad_norm": 3.1289100646972656, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7526469826698303, "reward_std": 0.20904295146465302, "rewards/accuracy_reward": 0.7730551958084106, "rewards/format_reward": 0.9795918464660645, "step": 3339 }, { "completion_length": 203.37754821777344, "epoch": 0.3361006289308176, "grad_norm": 0.532168447971344, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7653060555458069, "reward_std": 0.11078635975718498, "rewards/accuracy_reward": 0.7755101919174194, "rewards/format_reward": 0.9897959232330322, "step": 3340 }, { "completion_length": 305.1224365234375, "epoch": 0.33620125786163524, "grad_norm": 1.0895099639892578, "kl": 0.091064453125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6253644227981567, "reward_std": 0.2670544609427452, "rewards/accuracy_reward": 0.6253644227981567, "rewards/format_reward": 1.0, "step": 3341 }, { "completion_length": 226.6938705444336, "epoch": 0.3363018867924528, "grad_norm": 0.989996075630188, "kl": 0.0711669921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.618582308292389, "reward_std": 0.2372015342116356, "rewards/accuracy_reward": 0.6491945683956146, "rewards/format_reward": 0.9693877398967743, "step": 3342 }, { "completion_length": 197.1836700439453, "epoch": 0.33640251572327046, "grad_norm": 1.043158769607544, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8352043628692627, "reward_std": 0.20586390793323517, "rewards/accuracy_reward": 0.8556124269962311, "rewards/format_reward": 0.9795918464660645, "step": 3343 }, { "completion_length": 295.948974609375, "epoch": 0.33650314465408804, "grad_norm": 0.8597015738487244, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.4951931238174438, "reward_std": 0.2772928923368454, "rewards/accuracy_reward": 0.505397230386734, "rewards/format_reward": 0.9897959232330322, "step": 3344 }, { "completion_length": 282.89794921875, "epoch": 0.3366037735849057, "grad_norm": 1.5386539697647095, "kl": 0.057861328125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7713672518730164, "reward_std": 0.23270123451948166, "rewards/accuracy_reward": 0.7917754650115967, "rewards/format_reward": 0.9795918464660645, "step": 3345 }, { "completion_length": 232.0, "epoch": 0.33670440251572326, "grad_norm": 0.7919872999191284, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.737609326839447, "reward_std": 0.18843013793230057, "rewards/accuracy_reward": 0.7580174803733826, "rewards/format_reward": 0.9795918464660645, "step": 3346 }, { "completion_length": 214.95917510986328, "epoch": 0.3368050314465409, "grad_norm": 0.643798291683197, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7686588168144226, "reward_std": 0.11855209246277809, "rewards/accuracy_reward": 0.7686588764190674, "rewards/format_reward": 1.0, "step": 3347 }, { "completion_length": 250.55101013183594, "epoch": 0.3369056603773585, "grad_norm": 0.6297322511672974, "kl": 0.0640869140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8163264989852905, "reward_std": 0.20006242394447327, "rewards/accuracy_reward": 0.8163264989852905, "rewards/format_reward": 1.0, "step": 3348 }, { "completion_length": 246.55101013183594, "epoch": 0.3370062893081761, "grad_norm": 1.0875664949417114, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.9761905074119568, "reward_std": 0.05979302525520325, "rewards/accuracy_reward": 0.9761904776096344, "rewards/format_reward": 1.0, "step": 3349 }, { "completion_length": 268.1734619140625, "epoch": 0.3371069182389937, "grad_norm": 0.4644325375556946, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7372448444366455, "reward_std": 0.13553227484226227, "rewards/accuracy_reward": 0.7474489510059357, "rewards/format_reward": 0.9897959232330322, "step": 3350 }, { "completion_length": 260.6428527832031, "epoch": 0.33720754716981133, "grad_norm": 2.1455039978027344, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6092127561569214, "reward_std": 0.21168643608689308, "rewards/accuracy_reward": 0.6398249566555023, "rewards/format_reward": 0.9693877398967743, "step": 3351 }, { "completion_length": 252.1224365234375, "epoch": 0.3373081761006289, "grad_norm": 0.9746741652488708, "kl": 0.0562744140625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6234265565872192, "reward_std": 0.31404542922973633, "rewards/accuracy_reward": 0.6540387868881226, "rewards/format_reward": 0.9693877398967743, "step": 3352 }, { "completion_length": 185.2448959350586, "epoch": 0.33740880503144655, "grad_norm": 0.6075280904769897, "kl": 0.08740234375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7707173824310303, "reward_std": 0.13679734990000725, "rewards/accuracy_reward": 0.8013296127319336, "rewards/format_reward": 0.9693877398967743, "step": 3353 }, { "completion_length": 316.3061065673828, "epoch": 0.3375094339622641, "grad_norm": 0.6802625060081482, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.573693335056305, "reward_std": 0.23691676557064056, "rewards/accuracy_reward": 0.5838975608348846, "rewards/format_reward": 0.9897959232330322, "step": 3354 }, { "completion_length": 295.9795837402344, "epoch": 0.33761006289308176, "grad_norm": 1.0214813947677612, "kl": 0.0687255859375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8187155723571777, "reward_std": 0.18536289781332016, "rewards/accuracy_reward": 0.8289197087287903, "rewards/format_reward": 0.9897959232330322, "step": 3355 }, { "completion_length": 239.0306167602539, "epoch": 0.33771069182389934, "grad_norm": 0.6047735810279846, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7857142686843872, "reward_std": 0.13008780032396317, "rewards/accuracy_reward": 0.795918345451355, "rewards/format_reward": 0.9897959232330322, "step": 3356 }, { "completion_length": 224.34693908691406, "epoch": 0.337811320754717, "grad_norm": 0.7709329128265381, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8299319744110107, "reward_std": 0.10684680193662643, "rewards/accuracy_reward": 0.8401360511779785, "rewards/format_reward": 0.9897959232330322, "step": 3357 }, { "completion_length": 291.9183578491211, "epoch": 0.3379119496855346, "grad_norm": 0.8874433040618896, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6488500237464905, "reward_std": 0.2745252698659897, "rewards/accuracy_reward": 0.6692581474781036, "rewards/format_reward": 0.9795918166637421, "step": 3358 }, { "completion_length": 280.87754821777344, "epoch": 0.3380125786163522, "grad_norm": 1.3277461528778076, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6086270213127136, "reward_std": 0.2342430204153061, "rewards/accuracy_reward": 0.6188311874866486, "rewards/format_reward": 0.9897959232330322, "step": 3359 }, { "completion_length": 258.6836700439453, "epoch": 0.33811320754716984, "grad_norm": 0.7475393414497375, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7978620529174805, "reward_std": 0.20341191440820694, "rewards/accuracy_reward": 0.7978619635105133, "rewards/format_reward": 1.0, "step": 3360 }, { "completion_length": 269.4795913696289, "epoch": 0.3382138364779874, "grad_norm": 0.8135778903961182, "kl": 0.095703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7961145043373108, "reward_std": 0.11104968376457691, "rewards/accuracy_reward": 0.7961145639419556, "rewards/format_reward": 1.0, "step": 3361 }, { "completion_length": 304.2142791748047, "epoch": 0.33831446540880505, "grad_norm": 0.7963653802871704, "kl": 0.0516357421875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6318511962890625, "reward_std": 0.2637063041329384, "rewards/accuracy_reward": 0.6624634861946106, "rewards/format_reward": 0.9693877398967743, "step": 3362 }, { "completion_length": 283.36734771728516, "epoch": 0.33841509433962264, "grad_norm": 0.5949150919914246, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5675373673439026, "reward_std": 0.2585466802120209, "rewards/accuracy_reward": 0.6185578256845474, "rewards/format_reward": 0.9489795863628387, "step": 3363 }, { "completion_length": 204.08162689208984, "epoch": 0.33851572327044027, "grad_norm": 1.0095640420913696, "kl": 0.094970703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.802656352519989, "reward_std": 0.12624356895685196, "rewards/accuracy_reward": 0.8026562929153442, "rewards/format_reward": 1.0, "step": 3364 }, { "completion_length": 264.1734619140625, "epoch": 0.33861635220125785, "grad_norm": 0.6405665874481201, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7548348307609558, "reward_std": 0.1492389403283596, "rewards/accuracy_reward": 0.7548348009586334, "rewards/format_reward": 1.0, "step": 3365 }, { "completion_length": 301.41835021972656, "epoch": 0.3387169811320755, "grad_norm": 0.8046680688858032, "kl": 0.0533447265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.641302227973938, "reward_std": 0.21416792273521423, "rewards/accuracy_reward": 0.6515062749385834, "rewards/format_reward": 0.9897959232330322, "step": 3366 }, { "completion_length": 296.39794921875, "epoch": 0.33881761006289307, "grad_norm": 0.7108480930328369, "kl": 0.055419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5264129638671875, "reward_std": 0.14689762517809868, "rewards/accuracy_reward": 0.526413157582283, "rewards/format_reward": 1.0, "step": 3367 }, { "completion_length": 320.3673400878906, "epoch": 0.3389182389937107, "grad_norm": 0.7179298400878906, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6628838181495667, "reward_std": 0.284661665558815, "rewards/accuracy_reward": 0.6832919418811798, "rewards/format_reward": 0.9795918464660645, "step": 3368 }, { "completion_length": 214.81632232666016, "epoch": 0.3390188679245283, "grad_norm": 2.2164440155029297, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8374238014221191, "reward_std": 0.19095852226018906, "rewards/accuracy_reward": 0.8476278483867645, "rewards/format_reward": 0.9897959232330322, "step": 3369 }, { "completion_length": 243.36734008789062, "epoch": 0.3391194968553459, "grad_norm": 0.7978722453117371, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7885203957557678, "reward_std": 0.17358897253870964, "rewards/accuracy_reward": 0.7885203957557678, "rewards/format_reward": 1.0, "step": 3370 }, { "completion_length": 228.06122589111328, "epoch": 0.3392201257861635, "grad_norm": 0.9351133704185486, "kl": 0.0701904296875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.867055356502533, "reward_std": 0.11225956305861473, "rewards/accuracy_reward": 0.8772594332695007, "rewards/format_reward": 0.9897959232330322, "step": 3371 }, { "completion_length": 239.62244415283203, "epoch": 0.33932075471698114, "grad_norm": 0.9854511618614197, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7084575295448303, "reward_std": 0.2604052610695362, "rewards/accuracy_reward": 0.7390697598457336, "rewards/format_reward": 0.9693877398967743, "step": 3372 }, { "completion_length": 228.82652282714844, "epoch": 0.3394213836477987, "grad_norm": 0.5974403619766235, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7836518287658691, "reward_std": 0.14109563454985619, "rewards/accuracy_reward": 0.7836518585681915, "rewards/format_reward": 1.0, "step": 3373 }, { "completion_length": 309.82652282714844, "epoch": 0.33952201257861636, "grad_norm": 1.1939818859100342, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7678614854812622, "reward_std": 0.22135332971811295, "rewards/accuracy_reward": 0.7780655920505524, "rewards/format_reward": 0.9897959232330322, "step": 3374 }, { "completion_length": 321.6428527832031, "epoch": 0.33962264150943394, "grad_norm": 0.6226642727851868, "kl": 0.0694580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7349693775177002, "reward_std": 0.21632599085569382, "rewards/accuracy_reward": 0.7451734244823456, "rewards/format_reward": 0.9897959232330322, "step": 3375 }, { "completion_length": 277.9897918701172, "epoch": 0.3397232704402516, "grad_norm": 1.0202120542526245, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8163264989852905, "reward_std": 0.15184256434440613, "rewards/accuracy_reward": 0.8163265287876129, "rewards/format_reward": 1.0, "step": 3376 }, { "completion_length": 333.6326446533203, "epoch": 0.33982389937106916, "grad_norm": 0.6589069962501526, "kl": 0.0504150390625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.627418339252472, "reward_std": 0.27144406735897064, "rewards/accuracy_reward": 0.6478265225887299, "rewards/format_reward": 0.9795918166637421, "step": 3377 }, { "completion_length": 274.9285583496094, "epoch": 0.3399245283018868, "grad_norm": 0.5615872144699097, "kl": 0.04833984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.725623607635498, "reward_std": 0.09554945304989815, "rewards/accuracy_reward": 0.7256235778331757, "rewards/format_reward": 1.0, "step": 3378 }, { "completion_length": 310.3061218261719, "epoch": 0.3400251572327044, "grad_norm": 0.9295603632926941, "kl": 0.05908203125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6873605251312256, "reward_std": 0.2703147232532501, "rewards/accuracy_reward": 0.7077687084674835, "rewards/format_reward": 0.9795918166637421, "step": 3379 }, { "completion_length": 258.33673095703125, "epoch": 0.340125786163522, "grad_norm": 0.7284166812896729, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6579015851020813, "reward_std": 0.1730143129825592, "rewards/accuracy_reward": 0.6579016149044037, "rewards/format_reward": 1.0, "step": 3380 }, { "completion_length": 307.82652282714844, "epoch": 0.3402264150943396, "grad_norm": 1.2467657327651978, "kl": 0.05078125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.561719298362732, "reward_std": 0.29611627757549286, "rewards/accuracy_reward": 0.5821274071931839, "rewards/format_reward": 0.9795918166637421, "step": 3381 }, { "completion_length": 341.38775634765625, "epoch": 0.34032704402515723, "grad_norm": 0.5768393874168396, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7688046097755432, "reward_std": 0.19953250885009766, "rewards/accuracy_reward": 0.7790087163448334, "rewards/format_reward": 0.9897959232330322, "step": 3382 }, { "completion_length": 195.74489974975586, "epoch": 0.34042767295597487, "grad_norm": 0.36495184898376465, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.752289354801178, "reward_std": 0.029710138216614723, "rewards/accuracy_reward": 0.7522893846035004, "rewards/format_reward": 1.0, "step": 3383 }, { "completion_length": 223.7448959350586, "epoch": 0.34052830188679245, "grad_norm": 3.0533559322357178, "kl": 0.1240234375, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.6930133700370789, "reward_std": 0.256403848528862, "rewards/accuracy_reward": 0.7134215533733368, "rewards/format_reward": 0.9795918464660645, "step": 3384 }, { "completion_length": 346.6938781738281, "epoch": 0.3406289308176101, "grad_norm": 0.5071461796760559, "kl": 0.0606689453125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6147046089172363, "reward_std": 0.19884294271469116, "rewards/accuracy_reward": 0.6351127922534943, "rewards/format_reward": 0.9795918464660645, "step": 3385 }, { "completion_length": 315.8061218261719, "epoch": 0.34072955974842767, "grad_norm": 0.7204010486602783, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7252729535102844, "reward_std": 0.10509225726127625, "rewards/accuracy_reward": 0.7252729833126068, "rewards/format_reward": 1.0, "step": 3386 }, { "completion_length": 302.45916748046875, "epoch": 0.3408301886792453, "grad_norm": 0.521431028842926, "kl": 0.0545654296875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7482993006706238, "reward_std": 0.1184304915368557, "rewards/accuracy_reward": 0.7482993304729462, "rewards/format_reward": 1.0, "step": 3387 }, { "completion_length": 286.8367233276367, "epoch": 0.3409308176100629, "grad_norm": 0.6504589915275574, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6866495609283447, "reward_std": 0.1643049754202366, "rewards/accuracy_reward": 0.6866496205329895, "rewards/format_reward": 1.0, "step": 3388 }, { "completion_length": 242.90816497802734, "epoch": 0.3410314465408805, "grad_norm": 0.7017701864242554, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7588064670562744, "reward_std": 0.13849307224154472, "rewards/accuracy_reward": 0.7690105736255646, "rewards/format_reward": 0.9897959232330322, "step": 3389 }, { "completion_length": 309.8163146972656, "epoch": 0.3411320754716981, "grad_norm": 0.6094238758087158, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7268221974372864, "reward_std": 0.15905892103910446, "rewards/accuracy_reward": 0.737026184797287, "rewards/format_reward": 0.9897959232330322, "step": 3390 }, { "completion_length": 237.28570556640625, "epoch": 0.34123270440251574, "grad_norm": 1.328565001487732, "kl": 0.0687255859375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7551020383834839, "reward_std": 0.28279635310173035, "rewards/accuracy_reward": 0.7755101919174194, "rewards/format_reward": 0.9795918464660645, "step": 3391 }, { "completion_length": 202.6938705444336, "epoch": 0.3413333333333333, "grad_norm": 0.7461519241333008, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7625850439071655, "reward_std": 0.21968912333250046, "rewards/accuracy_reward": 0.7727891504764557, "rewards/format_reward": 0.9897959232330322, "step": 3392 }, { "completion_length": 274.97959899902344, "epoch": 0.34143396226415096, "grad_norm": 0.46795573830604553, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7714822888374329, "reward_std": 0.17026438564062119, "rewards/accuracy_reward": 0.8020945191383362, "rewards/format_reward": 0.9693877398967743, "step": 3393 }, { "completion_length": 231.82653045654297, "epoch": 0.34153459119496854, "grad_norm": 0.8291377425193787, "kl": 0.0704345703125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8799465894699097, "reward_std": 0.08865547180175781, "rewards/accuracy_reward": 0.8799465894699097, "rewards/format_reward": 1.0, "step": 3394 }, { "completion_length": 374.82652282714844, "epoch": 0.3416352201257862, "grad_norm": 0.6195891499519348, "kl": 0.046142578125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6923148036003113, "reward_std": 0.18080097436904907, "rewards/accuracy_reward": 0.712723046541214, "rewards/format_reward": 0.9795918166637421, "step": 3395 }, { "completion_length": 247.89794921875, "epoch": 0.34173584905660376, "grad_norm": 1.1675379276275635, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.702016532421112, "reward_std": 0.25634174048900604, "rewards/accuracy_reward": 0.7224246859550476, "rewards/format_reward": 0.9795918166637421, "step": 3396 }, { "completion_length": 238.88774871826172, "epoch": 0.3418364779874214, "grad_norm": 0.5402917861938477, "kl": 0.05908203125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.795918345451355, "reward_std": 0.16188225150108337, "rewards/accuracy_reward": 0.8061224520206451, "rewards/format_reward": 0.9897959232330322, "step": 3397 }, { "completion_length": 223.88774871826172, "epoch": 0.341937106918239, "grad_norm": 0.28306686878204346, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8571428060531616, "reward_std": 0.05399492383003235, "rewards/accuracy_reward": 0.857142835855484, "rewards/format_reward": 1.0, "step": 3398 }, { "completion_length": 305.1836700439453, "epoch": 0.3420377358490566, "grad_norm": 0.6857564449310303, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8818254470825195, "reward_std": 0.13688945770263672, "rewards/accuracy_reward": 0.9124377369880676, "rewards/format_reward": 0.9693877398967743, "step": 3399 }, { "completion_length": 245.9387664794922, "epoch": 0.3421383647798742, "grad_norm": 1.0082799196243286, "kl": 0.1005859375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7292839884757996, "reward_std": 0.20788691192865372, "rewards/accuracy_reward": 0.7496922016143799, "rewards/format_reward": 0.9795918166637421, "step": 3400 }, { "completion_length": 337.6530456542969, "epoch": 0.34223899371069183, "grad_norm": 0.603073239326477, "kl": 0.0546875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.558561086654663, "reward_std": 0.2549109682440758, "rewards/accuracy_reward": 0.5891733467578888, "rewards/format_reward": 0.9693877398967743, "step": 3401 }, { "completion_length": 177.81632232666016, "epoch": 0.3423396226415094, "grad_norm": 2.067230463027954, "kl": 0.0821533203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.722246527671814, "reward_std": 0.17149632424116135, "rewards/accuracy_reward": 0.7426546812057495, "rewards/format_reward": 0.9795918464660645, "step": 3402 }, { "completion_length": 313.948974609375, "epoch": 0.34244025157232705, "grad_norm": 0.6327210664749146, "kl": 0.0546875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6604266166687012, "reward_std": 0.08522105030715466, "rewards/accuracy_reward": 0.6604266464710236, "rewards/format_reward": 1.0, "step": 3403 }, { "completion_length": 258.48978424072266, "epoch": 0.34254088050314463, "grad_norm": 0.7421076893806458, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5216191411018372, "reward_std": 0.1913735717535019, "rewards/accuracy_reward": 0.5318232625722885, "rewards/format_reward": 0.9897959232330322, "step": 3404 }, { "completion_length": 343.7040710449219, "epoch": 0.34264150943396227, "grad_norm": 0.5189179182052612, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8234251737594604, "reward_std": 0.25257495045661926, "rewards/accuracy_reward": 0.8438333868980408, "rewards/format_reward": 0.9795918166637421, "step": 3405 }, { "completion_length": 268.9591827392578, "epoch": 0.34274213836477985, "grad_norm": 0.8108221292495728, "kl": 0.0594482421875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.9257884621620178, "reward_std": 0.11864761263132095, "rewards/accuracy_reward": 0.935992568731308, "rewards/format_reward": 0.9897959232330322, "step": 3406 }, { "completion_length": 169.58162689208984, "epoch": 0.3428427672955975, "grad_norm": 4.37560510635376, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8154080510139465, "reward_std": 0.12200292572379112, "rewards/accuracy_reward": 0.8154079914093018, "rewards/format_reward": 1.0, "step": 3407 }, { "completion_length": 235.53060913085938, "epoch": 0.3429433962264151, "grad_norm": 0.7796645164489746, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7891156077384949, "reward_std": 0.18408272415399551, "rewards/accuracy_reward": 0.7993196845054626, "rewards/format_reward": 0.9897959232330322, "step": 3408 }, { "completion_length": 250.89795684814453, "epoch": 0.3430440251572327, "grad_norm": 0.9390854835510254, "kl": 0.0538330078125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5686468482017517, "reward_std": 0.2931079864501953, "rewards/accuracy_reward": 0.5788509845733643, "rewards/format_reward": 0.9897959232330322, "step": 3409 }, { "completion_length": 289.83673095703125, "epoch": 0.34314465408805034, "grad_norm": 0.40412959456443787, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6679748892784119, "reward_std": 0.13633673265576363, "rewards/accuracy_reward": 0.6985871195793152, "rewards/format_reward": 0.9693877398967743, "step": 3410 }, { "completion_length": 191.39795684814453, "epoch": 0.3432452830188679, "grad_norm": 0.5141473412513733, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7682214975357056, "reward_std": 0.10027629137039185, "rewards/accuracy_reward": 0.7784256339073181, "rewards/format_reward": 0.9897959232330322, "step": 3411 }, { "completion_length": 200.64285278320312, "epoch": 0.34334591194968556, "grad_norm": 1.0389901399612427, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.695983111858368, "reward_std": 0.21552110463380814, "rewards/accuracy_reward": 0.6959831118583679, "rewards/format_reward": 1.0, "step": 3412 }, { "completion_length": 302.1836700439453, "epoch": 0.34344654088050314, "grad_norm": 0.5097315311431885, "kl": 0.0804443359375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7287960648536682, "reward_std": 0.11107242852449417, "rewards/accuracy_reward": 0.7492042183876038, "rewards/format_reward": 0.9795918166637421, "step": 3413 }, { "completion_length": 234.60203552246094, "epoch": 0.3435471698113208, "grad_norm": 1.381857991218567, "kl": 0.062744140625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6734524965286255, "reward_std": 0.19819729775190353, "rewards/accuracy_reward": 0.7142688930034637, "rewards/format_reward": 0.9591836333274841, "step": 3414 }, { "completion_length": 233.25509643554688, "epoch": 0.34364779874213836, "grad_norm": 0.5543058514595032, "kl": 0.089111328125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7239302396774292, "reward_std": 0.17856962978839874, "rewards/accuracy_reward": 0.7545425295829773, "rewards/format_reward": 0.9693877398967743, "step": 3415 }, { "completion_length": 141.02040481567383, "epoch": 0.343748427672956, "grad_norm": 0.29328230023384094, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8759621381759644, "reward_std": 0.036845482885837555, "rewards/accuracy_reward": 0.8759621381759644, "rewards/format_reward": 1.0, "step": 3416 }, { "completion_length": 221.7346954345703, "epoch": 0.3438490566037736, "grad_norm": 0.6500306129455566, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6704763770103455, "reward_std": 0.17790087684988976, "rewards/accuracy_reward": 0.6806805431842804, "rewards/format_reward": 0.9897959232330322, "step": 3417 }, { "completion_length": 235.4081573486328, "epoch": 0.3439496855345912, "grad_norm": 0.7532258629798889, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6111670136451721, "reward_std": 0.16539651528000832, "rewards/accuracy_reward": 0.6417793333530426, "rewards/format_reward": 0.9693877398967743, "step": 3418 }, { "completion_length": 304.5816192626953, "epoch": 0.3440503144654088, "grad_norm": 0.6287305951118469, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.668700397014618, "reward_std": 0.17280663177371025, "rewards/accuracy_reward": 0.6687004268169403, "rewards/format_reward": 1.0, "step": 3419 }, { "completion_length": 226.4795913696289, "epoch": 0.3441509433962264, "grad_norm": 1.0887887477874756, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7386375069618225, "reward_std": 0.19953081011772156, "rewards/accuracy_reward": 0.7386375665664673, "rewards/format_reward": 1.0, "step": 3420 }, { "completion_length": 243.66326141357422, "epoch": 0.344251572327044, "grad_norm": 0.9864833950996399, "kl": 0.0587158203125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6616140604019165, "reward_std": 0.23605170845985413, "rewards/accuracy_reward": 0.6616140007972717, "rewards/format_reward": 1.0, "step": 3421 }, { "completion_length": 233.90816497802734, "epoch": 0.34435220125786165, "grad_norm": 0.41325104236602783, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7791545391082764, "reward_std": 0.11792010441422462, "rewards/accuracy_reward": 0.7893585860729218, "rewards/format_reward": 0.9897959232330322, "step": 3422 }, { "completion_length": 278.2346878051758, "epoch": 0.3444528301886792, "grad_norm": 0.5740941762924194, "kl": 0.0537109375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6580781936645508, "reward_std": 0.2048810012638569, "rewards/accuracy_reward": 0.6886904537677765, "rewards/format_reward": 0.9693877398967743, "step": 3423 }, { "completion_length": 190.07142639160156, "epoch": 0.34455345911949686, "grad_norm": 0.6442269086837769, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7858163118362427, "reward_std": 0.11890258640050888, "rewards/accuracy_reward": 0.7960203886032104, "rewards/format_reward": 0.9897959232330322, "step": 3424 }, { "completion_length": 258.53060150146484, "epoch": 0.34465408805031444, "grad_norm": 2.1646900177001953, "kl": 0.102783203125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8046944737434387, "reward_std": 0.2140817642211914, "rewards/accuracy_reward": 0.8353067338466644, "rewards/format_reward": 0.9693877398967743, "step": 3425 }, { "completion_length": 215.2448959350586, "epoch": 0.3447547169811321, "grad_norm": 0.9326935410499573, "kl": 0.064697265625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7082199454307556, "reward_std": 0.1431087888777256, "rewards/accuracy_reward": 0.7082199454307556, "rewards/format_reward": 1.0, "step": 3426 }, { "completion_length": 265.3061218261719, "epoch": 0.34485534591194966, "grad_norm": 0.8610422611236572, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6204081177711487, "reward_std": 0.30973346531391144, "rewards/accuracy_reward": 0.6612244546413422, "rewards/format_reward": 0.9591836333274841, "step": 3427 }, { "completion_length": 302.948974609375, "epoch": 0.3449559748427673, "grad_norm": 1.047773003578186, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6753699779510498, "reward_std": 0.3193626254796982, "rewards/accuracy_reward": 0.7263904213905334, "rewards/format_reward": 0.9489795565605164, "step": 3428 }, { "completion_length": 205.1836700439453, "epoch": 0.3450566037735849, "grad_norm": 1.3001912832260132, "kl": 0.1044921875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7104952335357666, "reward_std": 0.27607448399066925, "rewards/accuracy_reward": 0.7513115704059601, "rewards/format_reward": 0.9591836631298065, "step": 3429 }, { "completion_length": 242.09183502197266, "epoch": 0.3451572327044025, "grad_norm": 0.5736391544342041, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8979591131210327, "reward_std": 0.11584595590829849, "rewards/accuracy_reward": 0.8979591727256775, "rewards/format_reward": 1.0, "step": 3430 }, { "completion_length": 265.1938781738281, "epoch": 0.34525786163522015, "grad_norm": 1.3691996335983276, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6955782771110535, "reward_std": 0.32387496531009674, "rewards/accuracy_reward": 0.7159863710403442, "rewards/format_reward": 0.9795918464660645, "step": 3431 }, { "completion_length": 263.4285583496094, "epoch": 0.34535849056603773, "grad_norm": 0.7748928070068359, "kl": 0.0733642578125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6796910166740417, "reward_std": 0.27205103635787964, "rewards/accuracy_reward": 0.7000993490219116, "rewards/format_reward": 0.9795918166637421, "step": 3432 }, { "completion_length": 281.93878173828125, "epoch": 0.34545911949685537, "grad_norm": 1.0063092708587646, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5546302795410156, "reward_std": 0.2836729511618614, "rewards/accuracy_reward": 0.57503841817379, "rewards/format_reward": 0.9795918166637421, "step": 3433 }, { "completion_length": 256.2550964355469, "epoch": 0.34555974842767295, "grad_norm": 0.9732651114463806, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5313828587532043, "reward_std": 0.2500050216913223, "rewards/accuracy_reward": 0.5313829183578491, "rewards/format_reward": 1.0, "step": 3434 }, { "completion_length": 225.86734771728516, "epoch": 0.3456603773584906, "grad_norm": 1.317177176475525, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7653060555458069, "reward_std": 0.23158938437700272, "rewards/accuracy_reward": 0.7857142686843872, "rewards/format_reward": 0.9795918464660645, "step": 3435 }, { "completion_length": 221.81631469726562, "epoch": 0.34576100628930817, "grad_norm": 0.7511321306228638, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.762332022190094, "reward_std": 0.22871366888284683, "rewards/accuracy_reward": 0.772536039352417, "rewards/format_reward": 0.9897959232330322, "step": 3436 }, { "completion_length": 259.5408020019531, "epoch": 0.3458616352201258, "grad_norm": 0.8703606724739075, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7813552021980286, "reward_std": 0.19009451568126678, "rewards/accuracy_reward": 0.8119674623012543, "rewards/format_reward": 0.9693877398967743, "step": 3437 }, { "completion_length": 184.2142791748047, "epoch": 0.3459622641509434, "grad_norm": 0.8656253218650818, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.780156672000885, "reward_std": 0.2022230327129364, "rewards/accuracy_reward": 0.7801567018032074, "rewards/format_reward": 1.0, "step": 3438 }, { "completion_length": 222.66326141357422, "epoch": 0.346062893081761, "grad_norm": 0.6883618831634521, "kl": 0.0589599609375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7693877220153809, "reward_std": 0.24578044563531876, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 0.9693877398967743, "step": 3439 }, { "completion_length": 198.1530532836914, "epoch": 0.3461635220125786, "grad_norm": 0.5241592526435852, "kl": 0.09716796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8231859803199768, "reward_std": 0.07804930210113525, "rewards/accuracy_reward": 0.8333900570869446, "rewards/format_reward": 0.9897959232330322, "step": 3440 }, { "completion_length": 220.1938705444336, "epoch": 0.34626415094339624, "grad_norm": 0.9776548743247986, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7571427822113037, "reward_std": 0.12495577707886696, "rewards/accuracy_reward": 0.7571428418159485, "rewards/format_reward": 1.0, "step": 3441 }, { "completion_length": 210.74488830566406, "epoch": 0.3463647798742138, "grad_norm": 0.9618251919746399, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8248299360275269, "reward_std": 0.1858047991991043, "rewards/accuracy_reward": 0.8452381193637848, "rewards/format_reward": 0.9795918166637421, "step": 3442 }, { "completion_length": 194.1938705444336, "epoch": 0.34646540880503146, "grad_norm": 1.030949354171753, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6670417785644531, "reward_std": 0.14211811870336533, "rewards/accuracy_reward": 0.6670417934656143, "rewards/format_reward": 1.0, "step": 3443 }, { "completion_length": 169.84693145751953, "epoch": 0.34656603773584904, "grad_norm": 0.4766715466976166, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.9126983880996704, "reward_std": 0.08711815625429153, "rewards/accuracy_reward": 0.9229024946689606, "rewards/format_reward": 0.9897959232330322, "step": 3444 }, { "completion_length": 207.1938705444336, "epoch": 0.3466666666666667, "grad_norm": 0.8272756338119507, "kl": 0.102783203125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8282357454299927, "reward_std": 0.2241300791501999, "rewards/accuracy_reward": 0.8588480055332184, "rewards/format_reward": 0.9693877398967743, "step": 3445 }, { "completion_length": 229.99999237060547, "epoch": 0.34676729559748426, "grad_norm": 1.8366941213607788, "kl": 0.0587158203125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7423888444900513, "reward_std": 0.20671049505472183, "rewards/accuracy_reward": 0.7423888742923737, "rewards/format_reward": 1.0, "step": 3446 }, { "completion_length": 213.67346954345703, "epoch": 0.3468679245283019, "grad_norm": 1.4569458961486816, "kl": 0.107421875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.6205097436904907, "reward_std": 0.12306717783212662, "rewards/accuracy_reward": 0.6307138502597809, "rewards/format_reward": 0.9897959232330322, "step": 3447 }, { "completion_length": 249.11223602294922, "epoch": 0.3469685534591195, "grad_norm": 1.259848713874817, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.5876349806785583, "reward_std": 0.1080559566617012, "rewards/accuracy_reward": 0.5876350402832031, "rewards/format_reward": 1.0, "step": 3448 }, { "completion_length": 215.1938705444336, "epoch": 0.3470691823899371, "grad_norm": 1.0451639890670776, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6958211660385132, "reward_std": 0.20983592420816422, "rewards/accuracy_reward": 0.706025242805481, "rewards/format_reward": 0.9897959232330322, "step": 3449 }, { "completion_length": 218.16326141357422, "epoch": 0.3471698113207547, "grad_norm": 0.25657007098197937, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6248785257339478, "reward_std": 0.060689281672239304, "rewards/accuracy_reward": 0.6248784959316254, "rewards/format_reward": 1.0, "step": 3450 }, { "completion_length": 219.64285278320312, "epoch": 0.34727044025157233, "grad_norm": 0.797308087348938, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8031939268112183, "reward_std": 0.139359749853611, "rewards/accuracy_reward": 0.833806186914444, "rewards/format_reward": 0.9693877398967743, "step": 3451 }, { "completion_length": 268.07142639160156, "epoch": 0.3473710691823899, "grad_norm": 1.0186951160430908, "kl": 0.0596923828125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5502755641937256, "reward_std": 0.23730704933404922, "rewards/accuracy_reward": 0.5502755790948868, "rewards/format_reward": 1.0, "step": 3452 }, { "completion_length": 165.31632232666016, "epoch": 0.34747169811320755, "grad_norm": 0.5675173401832581, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8352537155151367, "reward_std": 0.10115591436624527, "rewards/accuracy_reward": 0.8352537155151367, "rewards/format_reward": 1.0, "step": 3453 }, { "completion_length": 247.29591369628906, "epoch": 0.34757232704402513, "grad_norm": 0.9685073494911194, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6565072536468506, "reward_std": 0.2124355286359787, "rewards/accuracy_reward": 0.6871195733547211, "rewards/format_reward": 0.9693877398967743, "step": 3454 }, { "completion_length": 214.57141876220703, "epoch": 0.34767295597484277, "grad_norm": 0.6442014575004578, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.705997347831726, "reward_std": 0.10719738341867924, "rewards/accuracy_reward": 0.726405531167984, "rewards/format_reward": 0.9795918464660645, "step": 3455 }, { "completion_length": 230.62244415283203, "epoch": 0.3477735849056604, "grad_norm": 0.9605000019073486, "kl": 0.0562744140625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5796889662742615, "reward_std": 0.20328158140182495, "rewards/accuracy_reward": 0.5796889960765839, "rewards/format_reward": 1.0, "step": 3456 }, { "completion_length": 248.04080963134766, "epoch": 0.347874213836478, "grad_norm": 0.6552270650863647, "kl": 0.05859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7653061151504517, "reward_std": 0.2125505581498146, "rewards/accuracy_reward": 0.7857142686843872, "rewards/format_reward": 0.9795918166637421, "step": 3457 }, { "completion_length": 188.4897918701172, "epoch": 0.3479748427672956, "grad_norm": 1.0195058584213257, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6462034583091736, "reward_std": 0.22770194709300995, "rewards/accuracy_reward": 0.646203488111496, "rewards/format_reward": 1.0, "step": 3458 }, { "completion_length": 179.38775634765625, "epoch": 0.3480754716981132, "grad_norm": 0.7700862288475037, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6886296272277832, "reward_std": 0.08986103907227516, "rewards/accuracy_reward": 0.6886297166347504, "rewards/format_reward": 1.0, "step": 3459 }, { "completion_length": 185.7448959350586, "epoch": 0.34817610062893084, "grad_norm": 1.6841440200805664, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7244973182678223, "reward_std": 0.1953432820737362, "rewards/accuracy_reward": 0.7347014248371124, "rewards/format_reward": 0.9897959232330322, "step": 3460 }, { "completion_length": 300.28570556640625, "epoch": 0.3482767295597484, "grad_norm": 0.8173446655273438, "kl": 0.0762939453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6559591889381409, "reward_std": 0.1837110072374344, "rewards/accuracy_reward": 0.6559592485427856, "rewards/format_reward": 1.0, "step": 3461 }, { "completion_length": 234.74488830566406, "epoch": 0.34837735849056606, "grad_norm": 0.5405444502830505, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6988487243652344, "reward_std": 0.10457316040992737, "rewards/accuracy_reward": 0.6988487541675568, "rewards/format_reward": 1.0, "step": 3462 }, { "completion_length": 262.34693908691406, "epoch": 0.34847798742138364, "grad_norm": 0.5741257071495056, "kl": 0.0860595703125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6929498314857483, "reward_std": 0.22254931181669235, "rewards/accuracy_reward": 0.7235621511936188, "rewards/format_reward": 0.9693877398967743, "step": 3463 }, { "completion_length": 230.30611419677734, "epoch": 0.3485786163522013, "grad_norm": 0.5315035581588745, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.815981924533844, "reward_std": 0.10425174981355667, "rewards/accuracy_reward": 0.815981924533844, "rewards/format_reward": 1.0, "step": 3464 }, { "completion_length": 221.75509643554688, "epoch": 0.34867924528301886, "grad_norm": 0.5084035396575928, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.795918345451355, "reward_std": 0.09670459479093552, "rewards/accuracy_reward": 0.7959183752536774, "rewards/format_reward": 1.0, "step": 3465 }, { "completion_length": 269.1632537841797, "epoch": 0.3487798742138365, "grad_norm": 1.198302984237671, "kl": 0.0537109375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5686500072479248, "reward_std": 0.21956075727939606, "rewards/accuracy_reward": 0.5788541436195374, "rewards/format_reward": 0.9897959232330322, "step": 3466 }, { "completion_length": 230.4285659790039, "epoch": 0.3488805031446541, "grad_norm": 0.43822041153907776, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7689988613128662, "reward_std": 0.08352332003414631, "rewards/accuracy_reward": 0.7689988613128662, "rewards/format_reward": 1.0, "step": 3467 }, { "completion_length": 250.23468017578125, "epoch": 0.3489811320754717, "grad_norm": 1.243770956993103, "kl": 0.0814208984375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7746875882148743, "reward_std": 0.125696225091815, "rewards/accuracy_reward": 0.7746876180171967, "rewards/format_reward": 1.0, "step": 3468 }, { "completion_length": 205.4693832397461, "epoch": 0.3490817610062893, "grad_norm": 0.35971421003341675, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6845237612724304, "reward_std": 0.1595427207648754, "rewards/accuracy_reward": 0.7253401577472687, "rewards/format_reward": 0.9591836631298065, "step": 3469 }, { "completion_length": 221.77550506591797, "epoch": 0.34918238993710693, "grad_norm": 1.1969271898269653, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7603914141654968, "reward_std": 0.2400975003838539, "rewards/accuracy_reward": 0.7807996571063995, "rewards/format_reward": 0.9795918464660645, "step": 3470 }, { "completion_length": 277.8673400878906, "epoch": 0.3492830188679245, "grad_norm": 0.8144837617874146, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6802608370780945, "reward_std": 0.17589139193296432, "rewards/accuracy_reward": 0.700669065117836, "rewards/format_reward": 0.9795918464660645, "step": 3471 }, { "completion_length": 248.83673095703125, "epoch": 0.34938364779874215, "grad_norm": 0.6348220705986023, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8301537036895752, "reward_std": 0.04904902074486017, "rewards/accuracy_reward": 0.8301538228988647, "rewards/format_reward": 1.0, "step": 3472 }, { "completion_length": 244.9897918701172, "epoch": 0.34948427672955973, "grad_norm": 0.7901131510734558, "kl": 0.09814453125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.805013656616211, "reward_std": 0.18089089542627335, "rewards/accuracy_reward": 0.8254218399524689, "rewards/format_reward": 0.9795918166637421, "step": 3473 }, { "completion_length": 297.65306091308594, "epoch": 0.34958490566037737, "grad_norm": 0.7895720601081848, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6791714429855347, "reward_std": 0.31072525680065155, "rewards/accuracy_reward": 0.7199878096580505, "rewards/format_reward": 0.9591836631298065, "step": 3474 }, { "completion_length": 336.92857360839844, "epoch": 0.34968553459119495, "grad_norm": 1.3681141138076782, "kl": 0.062255859375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6320964694023132, "reward_std": 0.16305994614958763, "rewards/accuracy_reward": 0.6320964992046356, "rewards/format_reward": 1.0, "step": 3475 }, { "completion_length": 276.551025390625, "epoch": 0.3497861635220126, "grad_norm": 0.6436260938644409, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7260446548461914, "reward_std": 0.13973674923181534, "rewards/accuracy_reward": 0.7260447144508362, "rewards/format_reward": 1.0, "step": 3476 }, { "completion_length": 291.37754821777344, "epoch": 0.34988679245283016, "grad_norm": 0.5708976984024048, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7038121819496155, "reward_std": 0.1600613370537758, "rewards/accuracy_reward": 0.7140162587165833, "rewards/format_reward": 0.9897959232330322, "step": 3477 }, { "completion_length": 195.93877410888672, "epoch": 0.3499874213836478, "grad_norm": 1.4396179914474487, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8474236130714417, "reward_std": 0.18779906630516052, "rewards/accuracy_reward": 0.8576277196407318, "rewards/format_reward": 0.9897959232330322, "step": 3478 }, { "completion_length": 250.63265228271484, "epoch": 0.3500880503144654, "grad_norm": 0.5218043923377991, "kl": 0.102294921875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8193793892860413, "reward_std": 0.1530098393559456, "rewards/accuracy_reward": 0.8295835554599762, "rewards/format_reward": 0.9897959232330322, "step": 3479 }, { "completion_length": 184.01020050048828, "epoch": 0.350188679245283, "grad_norm": 0.9452343583106995, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.71300607919693, "reward_std": 0.1436128318309784, "rewards/accuracy_reward": 0.7232102155685425, "rewards/format_reward": 0.9897959232330322, "step": 3480 }, { "completion_length": 253.4897918701172, "epoch": 0.35028930817610066, "grad_norm": 0.5424906015396118, "kl": 0.0672607421875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8495846390724182, "reward_std": 0.11539776250720024, "rewards/accuracy_reward": 0.8495847284793854, "rewards/format_reward": 1.0, "step": 3481 }, { "completion_length": 243.75509643554688, "epoch": 0.35038993710691824, "grad_norm": 0.5033093094825745, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8068512678146362, "reward_std": 0.1297103501856327, "rewards/accuracy_reward": 0.8170553743839264, "rewards/format_reward": 0.9897959232330322, "step": 3482 }, { "completion_length": 255.9183578491211, "epoch": 0.3504905660377359, "grad_norm": 0.8089259266853333, "kl": 0.0496826171875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6636402606964111, "reward_std": 0.16598333418369293, "rewards/accuracy_reward": 0.6738444268703461, "rewards/format_reward": 0.9897959232330322, "step": 3483 }, { "completion_length": 253.08163452148438, "epoch": 0.35059119496855345, "grad_norm": 1.1521077156066895, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.666572391986847, "reward_std": 0.21444278955459595, "rewards/accuracy_reward": 0.6767764389514923, "rewards/format_reward": 0.9897959232330322, "step": 3484 }, { "completion_length": 215.82652282714844, "epoch": 0.3506918238993711, "grad_norm": 0.6136530041694641, "kl": 0.066162109375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.698321282863617, "reward_std": 0.10364529117941856, "rewards/accuracy_reward": 0.7085253298282623, "rewards/format_reward": 0.9897959232330322, "step": 3485 }, { "completion_length": 299.52040100097656, "epoch": 0.3507924528301887, "grad_norm": 1.1978251934051514, "kl": 0.108154296875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.6355105638504028, "reward_std": 0.14685367047786713, "rewards/accuracy_reward": 0.6457146406173706, "rewards/format_reward": 0.9897959232330322, "step": 3486 }, { "completion_length": 280.32652282714844, "epoch": 0.3508930817610063, "grad_norm": 0.7193450331687927, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6172919869422913, "reward_std": 0.2022847980260849, "rewards/accuracy_reward": 0.6274959743022919, "rewards/format_reward": 0.9897959232330322, "step": 3487 }, { "completion_length": 188.51020050048828, "epoch": 0.3509937106918239, "grad_norm": 1.0217949151992798, "kl": 0.1044921875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8834550976753235, "reward_std": 0.1231720894575119, "rewards/accuracy_reward": 0.8936591744422913, "rewards/format_reward": 0.9897959232330322, "step": 3488 }, { "completion_length": 245.67345428466797, "epoch": 0.3510943396226415, "grad_norm": 1.0629743337631226, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6107054352760315, "reward_std": 0.19038064777851105, "rewards/accuracy_reward": 0.6209094822406769, "rewards/format_reward": 0.9897959232330322, "step": 3489 }, { "completion_length": 315.1428527832031, "epoch": 0.3511949685534591, "grad_norm": 0.4839588701725006, "kl": 0.0428466796875, "learning_rate": 1e-06, "loss": 0.0017, "reward": 1.5816326141357422, "reward_std": 0.16188224405050278, "rewards/accuracy_reward": 0.5918367356061935, "rewards/format_reward": 0.9897959232330322, "step": 3490 }, { "completion_length": 292.33673095703125, "epoch": 0.35129559748427674, "grad_norm": 0.6683568358421326, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6784903407096863, "reward_std": 0.217966727912426, "rewards/accuracy_reward": 0.6886944770812988, "rewards/format_reward": 0.9897959232330322, "step": 3491 }, { "completion_length": 239.9897918701172, "epoch": 0.3513962264150943, "grad_norm": 0.9313271045684814, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7453538179397583, "reward_std": 0.18573950976133347, "rewards/accuracy_reward": 0.7453538477420807, "rewards/format_reward": 1.0, "step": 3492 }, { "completion_length": 223.0408172607422, "epoch": 0.35149685534591196, "grad_norm": 0.5172983407974243, "kl": 0.0626220703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8013604879379272, "reward_std": 0.13481848314404488, "rewards/accuracy_reward": 0.8013605177402496, "rewards/format_reward": 1.0, "step": 3493 }, { "completion_length": 211.25509643554688, "epoch": 0.35159748427672954, "grad_norm": 0.5149495005607605, "kl": 0.050537109375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7865418195724487, "reward_std": 0.12155195325613022, "rewards/accuracy_reward": 0.7967459261417389, "rewards/format_reward": 0.9897959232330322, "step": 3494 }, { "completion_length": 242.23468780517578, "epoch": 0.3516981132075472, "grad_norm": 0.7030629515647888, "kl": 0.093994140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7815054059028625, "reward_std": 0.18607166409492493, "rewards/accuracy_reward": 0.8019136190414429, "rewards/format_reward": 0.9795918464660645, "step": 3495 }, { "completion_length": 222.1326446533203, "epoch": 0.35179874213836476, "grad_norm": 1.2330615520477295, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7413894534111023, "reward_std": 0.12015441432595253, "rewards/accuracy_reward": 0.7413895130157471, "rewards/format_reward": 1.0, "step": 3496 }, { "completion_length": 349.5102005004883, "epoch": 0.3518993710691824, "grad_norm": 0.6881365180015564, "kl": 0.0562744140625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6680271625518799, "reward_std": 0.25315041095018387, "rewards/accuracy_reward": 0.7190476059913635, "rewards/format_reward": 0.9489795565605164, "step": 3497 }, { "completion_length": 255.55101013183594, "epoch": 0.352, "grad_norm": 0.7287086248397827, "kl": 0.0665283203125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.714285671710968, "reward_std": 0.21450605243444443, "rewards/accuracy_reward": 0.7244897782802582, "rewards/format_reward": 0.9897959232330322, "step": 3498 }, { "completion_length": 271.39795684814453, "epoch": 0.3521006289308176, "grad_norm": 1.202545166015625, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6877832412719727, "reward_std": 0.25958674401044846, "rewards/accuracy_reward": 0.7081914246082306, "rewards/format_reward": 0.9795918464660645, "step": 3499 }, { "completion_length": 269.08162689208984, "epoch": 0.3522012578616352, "grad_norm": 0.7353271842002869, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7904895544052124, "reward_std": 0.16626091673970222, "rewards/accuracy_reward": 0.8108977675437927, "rewards/format_reward": 0.9795918166637421, "step": 3500 }, { "completion_length": 177.31632232666016, "epoch": 0.35230188679245283, "grad_norm": 0.6588127017021179, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8448979258537292, "reward_std": 0.1209433488547802, "rewards/accuracy_reward": 0.8448979556560516, "rewards/format_reward": 1.0, "step": 3501 }, { "completion_length": 219.61223602294922, "epoch": 0.3524025157232704, "grad_norm": 0.9986560940742493, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6439411044120789, "reward_std": 0.3190174624323845, "rewards/accuracy_reward": 0.68475741147995, "rewards/format_reward": 0.9591836333274841, "step": 3502 }, { "completion_length": 254.34693908691406, "epoch": 0.35250314465408805, "grad_norm": 0.5448716282844543, "kl": 0.051025390625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7635567784309387, "reward_std": 0.16783398762345314, "rewards/accuracy_reward": 0.7737609148025513, "rewards/format_reward": 0.9897959232330322, "step": 3503 }, { "completion_length": 262.4897918701172, "epoch": 0.35260377358490563, "grad_norm": 0.5823696851730347, "kl": 0.0848388671875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7080047726631165, "reward_std": 0.17783300578594208, "rewards/accuracy_reward": 0.728412926197052, "rewards/format_reward": 0.9795918464660645, "step": 3504 }, { "completion_length": 208.52040100097656, "epoch": 0.35270440251572327, "grad_norm": 0.6985712647438049, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.725690245628357, "reward_std": 0.1667308211326599, "rewards/accuracy_reward": 0.7460984587669373, "rewards/format_reward": 0.9795918166637421, "step": 3505 }, { "completion_length": 366.1122283935547, "epoch": 0.3528050314465409, "grad_norm": 0.6383957266807556, "kl": 0.052001953125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6868480443954468, "reward_std": 0.311056911945343, "rewards/accuracy_reward": 0.7378684878349304, "rewards/format_reward": 0.9489795565605164, "step": 3506 }, { "completion_length": 180.2142791748047, "epoch": 0.3529056603773585, "grad_norm": 1.7458670139312744, "kl": 0.0885009765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6702948212623596, "reward_std": 0.19661186635494232, "rewards/accuracy_reward": 0.680498868227005, "rewards/format_reward": 0.9897959232330322, "step": 3507 }, { "completion_length": 250.69387817382812, "epoch": 0.3530062893081761, "grad_norm": 0.7827959656715393, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.753555953502655, "reward_std": 0.16752002760767937, "rewards/accuracy_reward": 0.7535559237003326, "rewards/format_reward": 1.0, "step": 3508 }, { "completion_length": 228.57141876220703, "epoch": 0.3531069182389937, "grad_norm": 0.8697925209999084, "kl": 0.0919189453125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.743490755558014, "reward_std": 0.2765252888202667, "rewards/accuracy_reward": 0.7741030156612396, "rewards/format_reward": 0.9693877398967743, "step": 3509 }, { "completion_length": 289.18365478515625, "epoch": 0.35320754716981134, "grad_norm": 0.5580165982246399, "kl": 0.0584716796875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7278035283088684, "reward_std": 0.18608776479959488, "rewards/accuracy_reward": 0.748211681842804, "rewards/format_reward": 0.9795918464660645, "step": 3510 }, { "completion_length": 236.1428451538086, "epoch": 0.3533081761006289, "grad_norm": 1.9167219400405884, "kl": 0.091064453125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.795606553554535, "reward_std": 0.12878796830773354, "rewards/accuracy_reward": 0.8058106601238251, "rewards/format_reward": 0.9897959232330322, "step": 3511 }, { "completion_length": 204.61223602294922, "epoch": 0.35340880503144656, "grad_norm": 1.22751784324646, "kl": 0.071533203125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.726717233657837, "reward_std": 0.1656918078660965, "rewards/accuracy_reward": 0.7369213104248047, "rewards/format_reward": 0.9897959232330322, "step": 3512 }, { "completion_length": 297.6428451538086, "epoch": 0.35350943396226414, "grad_norm": 0.9268792271614075, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6194897294044495, "reward_std": 0.322603702545166, "rewards/accuracy_reward": 0.6501020342111588, "rewards/format_reward": 0.9693877398967743, "step": 3513 }, { "completion_length": 221.9387664794922, "epoch": 0.3536100628930818, "grad_norm": 0.3935980796813965, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8287981748580933, "reward_std": 0.07335802167654037, "rewards/accuracy_reward": 0.8287981748580933, "rewards/format_reward": 1.0, "step": 3514 }, { "completion_length": 282.2448959350586, "epoch": 0.35371069182389936, "grad_norm": 0.5801362991333008, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7246443629264832, "reward_std": 0.12618720531463623, "rewards/accuracy_reward": 0.7552566230297089, "rewards/format_reward": 0.9693877398967743, "step": 3515 }, { "completion_length": 198.1836700439453, "epoch": 0.353811320754717, "grad_norm": 1.2466464042663574, "kl": 0.064697265625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6190620064735413, "reward_std": 0.164039745926857, "rewards/accuracy_reward": 0.6292661428451538, "rewards/format_reward": 0.9897959232330322, "step": 3516 }, { "completion_length": 180.57142639160156, "epoch": 0.3539119496855346, "grad_norm": 1.164454460144043, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8115010857582092, "reward_std": 0.12949015572667122, "rewards/accuracy_reward": 0.8115011155605316, "rewards/format_reward": 1.0, "step": 3517 }, { "completion_length": 209.9081573486328, "epoch": 0.3540125786163522, "grad_norm": 0.6163532733917236, "kl": 0.047607421875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.8631194829940796, "reward_std": 0.12917421385645866, "rewards/accuracy_reward": 0.8631195425987244, "rewards/format_reward": 1.0, "step": 3518 }, { "completion_length": 218.27550506591797, "epoch": 0.3541132075471698, "grad_norm": 0.9545528888702393, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7598742246627808, "reward_std": 0.20195535570383072, "rewards/accuracy_reward": 0.7700783014297485, "rewards/format_reward": 0.9897959232330322, "step": 3519 }, { "completion_length": 223.4081573486328, "epoch": 0.35421383647798743, "grad_norm": 1.1059314012527466, "kl": 0.09326171875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6868667602539062, "reward_std": 0.22009439766407013, "rewards/accuracy_reward": 0.7072749137878418, "rewards/format_reward": 0.9795918166637421, "step": 3520 }, { "completion_length": 223.64285278320312, "epoch": 0.354314465408805, "grad_norm": 0.6088231205940247, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7744169235229492, "reward_std": 0.10117471590638161, "rewards/accuracy_reward": 0.7744168639183044, "rewards/format_reward": 1.0, "step": 3521 }, { "completion_length": 215.4285659790039, "epoch": 0.35441509433962265, "grad_norm": 0.8994325399398804, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7428571581840515, "reward_std": 0.2067156285047531, "rewards/accuracy_reward": 0.7428570687770844, "rewards/format_reward": 1.0, "step": 3522 }, { "completion_length": 202.9081573486328, "epoch": 0.35451572327044023, "grad_norm": 1.1096677780151367, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.784985363483429, "reward_std": 0.19285381585359573, "rewards/accuracy_reward": 0.7849854826927185, "rewards/format_reward": 1.0, "step": 3523 }, { "completion_length": 293.2142791748047, "epoch": 0.35461635220125787, "grad_norm": 0.5492268800735474, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.767417848110199, "reward_std": 0.21095222979784012, "rewards/accuracy_reward": 0.7776220738887787, "rewards/format_reward": 0.9897959232330322, "step": 3524 }, { "completion_length": 311.07142639160156, "epoch": 0.35471698113207545, "grad_norm": 1.1206141710281372, "kl": 0.044921875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6122448444366455, "reward_std": 0.237223282456398, "rewards/accuracy_reward": 0.6530612111091614, "rewards/format_reward": 0.9591836631298065, "step": 3525 }, { "completion_length": 254.88774871826172, "epoch": 0.3548176100628931, "grad_norm": 0.543860137462616, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8778749108314514, "reward_std": 0.21865678578615189, "rewards/accuracy_reward": 0.8982831239700317, "rewards/format_reward": 0.9795918464660645, "step": 3526 }, { "completion_length": 201.87754821777344, "epoch": 0.35491823899371067, "grad_norm": 0.8172701597213745, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.765480637550354, "reward_std": 0.1603841595351696, "rewards/accuracy_reward": 0.7654807269573212, "rewards/format_reward": 1.0, "step": 3527 }, { "completion_length": 231.67346954345703, "epoch": 0.3550188679245283, "grad_norm": 1.7841782569885254, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6373582482337952, "reward_std": 0.22907964885234833, "rewards/accuracy_reward": 0.6679705083370209, "rewards/format_reward": 0.9693877398967743, "step": 3528 }, { "completion_length": 245.81632232666016, "epoch": 0.35511949685534594, "grad_norm": 1.2870230674743652, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7527135014533997, "reward_std": 0.1947420984506607, "rewards/accuracy_reward": 0.7629176676273346, "rewards/format_reward": 0.9897959232330322, "step": 3529 }, { "completion_length": 229.1836700439453, "epoch": 0.3552201257861635, "grad_norm": 2.169715642929077, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7336846590042114, "reward_std": 0.2415539175271988, "rewards/accuracy_reward": 0.7438887655735016, "rewards/format_reward": 0.9897959232330322, "step": 3530 }, { "completion_length": 263.39796447753906, "epoch": 0.35532075471698116, "grad_norm": 0.6920924782752991, "kl": 0.0986328125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6397928595542908, "reward_std": 0.1421484649181366, "rewards/accuracy_reward": 0.6602009832859039, "rewards/format_reward": 0.9795918166637421, "step": 3531 }, { "completion_length": 194.83673095703125, "epoch": 0.35542138364779874, "grad_norm": 1.3370329141616821, "kl": 0.11572265625, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.7315354943275452, "reward_std": 0.23987676203250885, "rewards/accuracy_reward": 0.7417395412921906, "rewards/format_reward": 0.9897959232330322, "step": 3532 }, { "completion_length": 324.77549743652344, "epoch": 0.3555220125786164, "grad_norm": 0.7137077450752258, "kl": 0.05029296875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6640850901603699, "reward_std": 0.22974933683872223, "rewards/accuracy_reward": 0.694697380065918, "rewards/format_reward": 0.9693877398967743, "step": 3533 }, { "completion_length": 194.9897918701172, "epoch": 0.35562264150943396, "grad_norm": 1.050158143043518, "kl": 0.09814453125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7350073456764221, "reward_std": 0.24175165593624115, "rewards/accuracy_reward": 0.7452113926410675, "rewards/format_reward": 0.9897959232330322, "step": 3534 }, { "completion_length": 225.75509643554688, "epoch": 0.3557232704402516, "grad_norm": 0.7364485859870911, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.5954627990722656, "reward_std": 0.14919191226363182, "rewards/accuracy_reward": 0.6056668758392334, "rewards/format_reward": 0.9897959232330322, "step": 3535 }, { "completion_length": 286.40816497802734, "epoch": 0.3558238993710692, "grad_norm": 0.6859821081161499, "kl": 0.056396484375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6570404171943665, "reward_std": 0.16663459688425064, "rewards/accuracy_reward": 0.6774486005306244, "rewards/format_reward": 0.9795918166637421, "step": 3536 }, { "completion_length": 332.70408630371094, "epoch": 0.3559245283018868, "grad_norm": 0.8220475316047668, "kl": 0.0733642578125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.3505544662475586, "reward_std": 0.22911617159843445, "rewards/accuracy_reward": 0.3913707435131073, "rewards/format_reward": 0.9591836631298065, "step": 3537 }, { "completion_length": 208.9285659790039, "epoch": 0.3560251572327044, "grad_norm": 1.1690354347229004, "kl": 0.103271484375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6977495551109314, "reward_std": 0.17894130945205688, "rewards/accuracy_reward": 0.7079536020755768, "rewards/format_reward": 0.9897959232330322, "step": 3538 }, { "completion_length": 257.86734771728516, "epoch": 0.35612578616352203, "grad_norm": 2.688380479812622, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.641728162765503, "reward_std": 0.2648253068327904, "rewards/accuracy_reward": 0.6621363461017609, "rewards/format_reward": 0.9795918166637421, "step": 3539 }, { "completion_length": 221.29591369628906, "epoch": 0.3562264150943396, "grad_norm": 0.5175644755363464, "kl": 0.0516357421875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7893540859222412, "reward_std": 0.13559884205460548, "rewards/accuracy_reward": 0.789354145526886, "rewards/format_reward": 1.0, "step": 3540 }, { "completion_length": 223.18367767333984, "epoch": 0.35632704402515725, "grad_norm": 0.9288031458854675, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.72934889793396, "reward_std": 0.20312350988388062, "rewards/accuracy_reward": 0.7395529747009277, "rewards/format_reward": 0.9897959232330322, "step": 3541 }, { "completion_length": 254.82652282714844, "epoch": 0.35642767295597483, "grad_norm": 1.0414204597473145, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6024669408798218, "reward_std": 0.22016263753175735, "rewards/accuracy_reward": 0.6228752136230469, "rewards/format_reward": 0.9795918166637421, "step": 3542 }, { "completion_length": 321.3367156982422, "epoch": 0.35652830188679246, "grad_norm": 0.9655390381813049, "kl": 0.0523681640625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5762277245521545, "reward_std": 0.33386411517858505, "rewards/accuracy_reward": 0.5966358631849289, "rewards/format_reward": 0.9795918166637421, "step": 3543 }, { "completion_length": 207.83673095703125, "epoch": 0.35662893081761005, "grad_norm": 0.8752913475036621, "kl": 0.097900390625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8625850081443787, "reward_std": 0.11882129684090614, "rewards/accuracy_reward": 0.8625850081443787, "rewards/format_reward": 1.0, "step": 3544 }, { "completion_length": 300.8367156982422, "epoch": 0.3567295597484277, "grad_norm": 1.1098166704177856, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.5949037671089172, "reward_std": 0.18653664737939835, "rewards/accuracy_reward": 0.6153119802474976, "rewards/format_reward": 0.9795918464660645, "step": 3545 }, { "completion_length": 314.3571319580078, "epoch": 0.35683018867924526, "grad_norm": 1.2565606832504272, "kl": 0.053955078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.684750497341156, "reward_std": 0.14558999985456467, "rewards/accuracy_reward": 0.6847505569458008, "rewards/format_reward": 1.0, "step": 3546 }, { "completion_length": 239.83673095703125, "epoch": 0.3569308176100629, "grad_norm": 0.7709594964981079, "kl": 0.05224609375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6469193696975708, "reward_std": 0.20858216285705566, "rewards/accuracy_reward": 0.6469193696975708, "rewards/format_reward": 1.0, "step": 3547 }, { "completion_length": 293.79591369628906, "epoch": 0.3570314465408805, "grad_norm": 0.5181354284286499, "kl": 0.0496826171875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.8775509595870972, "reward_std": 0.1270286664366722, "rewards/accuracy_reward": 0.8775509893894196, "rewards/format_reward": 1.0, "step": 3548 }, { "completion_length": 224.35713958740234, "epoch": 0.3571320754716981, "grad_norm": 1.0318666696548462, "kl": 0.10595703125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8558551669120789, "reward_std": 0.06909763719886541, "rewards/accuracy_reward": 0.8558551371097565, "rewards/format_reward": 1.0, "step": 3549 }, { "completion_length": 193.2346954345703, "epoch": 0.3572327044025157, "grad_norm": 1.554993987083435, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8698979020118713, "reward_std": 0.17848117649555206, "rewards/accuracy_reward": 0.8801020383834839, "rewards/format_reward": 0.9897959232330322, "step": 3550 }, { "completion_length": 273.79591369628906, "epoch": 0.35733333333333334, "grad_norm": 0.7785702347755432, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7789115905761719, "reward_std": 0.22668515145778656, "rewards/accuracy_reward": 0.819727897644043, "rewards/format_reward": 0.9591836631298065, "step": 3551 }, { "completion_length": 385.0918273925781, "epoch": 0.3574339622641509, "grad_norm": 0.5454128980636597, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5473021864891052, "reward_std": 0.24174072593450546, "rewards/accuracy_reward": 0.5677103400230408, "rewards/format_reward": 0.9795918166637421, "step": 3552 }, { "completion_length": 284.79590606689453, "epoch": 0.35753459119496855, "grad_norm": 0.9566147327423096, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7143564224243164, "reward_std": 0.22335734963417053, "rewards/accuracy_reward": 0.7347645461559296, "rewards/format_reward": 0.9795918166637421, "step": 3553 }, { "completion_length": 224.4795799255371, "epoch": 0.3576352201257862, "grad_norm": 0.6449536085128784, "kl": 0.1162109375, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7432487607002258, "reward_std": 0.15808358788490295, "rewards/accuracy_reward": 0.7738610506057739, "rewards/format_reward": 0.9693877398967743, "step": 3554 }, { "completion_length": 180.35713958740234, "epoch": 0.35773584905660377, "grad_norm": 0.34897172451019287, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8573371767997742, "reward_std": 0.06328720971941948, "rewards/accuracy_reward": 0.8573371767997742, "rewards/format_reward": 1.0, "step": 3555 }, { "completion_length": 316.2040710449219, "epoch": 0.3578364779874214, "grad_norm": 0.6526514291763306, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5540477633476257, "reward_std": 0.19958074018359184, "rewards/accuracy_reward": 0.5540477335453033, "rewards/format_reward": 1.0, "step": 3556 }, { "completion_length": 261.60203552246094, "epoch": 0.357937106918239, "grad_norm": 0.7446193099021912, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6880002617835999, "reward_std": 0.24197886884212494, "rewards/accuracy_reward": 0.718612551689148, "rewards/format_reward": 0.9693877398967743, "step": 3557 }, { "completion_length": 228.4081573486328, "epoch": 0.3580377358490566, "grad_norm": 0.7321778535842896, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6640534400939941, "reward_std": 0.1416991800069809, "rewards/accuracy_reward": 0.6640534847974777, "rewards/format_reward": 1.0, "step": 3558 }, { "completion_length": 266.5306167602539, "epoch": 0.3581383647798742, "grad_norm": 0.927789032459259, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7460037469863892, "reward_std": 0.20480649173259735, "rewards/accuracy_reward": 0.7562077939510345, "rewards/format_reward": 0.9897959232330322, "step": 3559 }, { "completion_length": 256.5918273925781, "epoch": 0.35823899371069184, "grad_norm": 1.1164456605911255, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8007774353027344, "reward_std": 0.22767274081707, "rewards/accuracy_reward": 0.800777405500412, "rewards/format_reward": 1.0, "step": 3560 }, { "completion_length": 301.2755126953125, "epoch": 0.3583396226415094, "grad_norm": 0.5762072205543518, "kl": 0.0704345703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6122448444366455, "reward_std": 0.20908795297145844, "rewards/accuracy_reward": 0.6224489361047745, "rewards/format_reward": 0.9897959232330322, "step": 3561 }, { "completion_length": 221.56122589111328, "epoch": 0.35844025157232706, "grad_norm": 0.35385075211524963, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7703929543495178, "reward_std": 0.046204062178730965, "rewards/accuracy_reward": 0.7703929841518402, "rewards/format_reward": 1.0, "step": 3562 }, { "completion_length": 214.02040100097656, "epoch": 0.35854088050314464, "grad_norm": 1.9726989269256592, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6979520320892334, "reward_std": 0.08143319096416235, "rewards/accuracy_reward": 0.6979520320892334, "rewards/format_reward": 1.0, "step": 3563 }, { "completion_length": 259.6632614135742, "epoch": 0.3586415094339623, "grad_norm": 0.4015101492404938, "kl": 0.0562744140625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7345542907714844, "reward_std": 0.13488266244530678, "rewards/accuracy_reward": 0.7345543503761292, "rewards/format_reward": 1.0, "step": 3564 }, { "completion_length": 234.1836700439453, "epoch": 0.35874213836477986, "grad_norm": 1.366690993309021, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7463877201080322, "reward_std": 0.11354698613286018, "rewards/accuracy_reward": 0.7463877201080322, "rewards/format_reward": 1.0, "step": 3565 }, { "completion_length": 308.56121826171875, "epoch": 0.3588427672955975, "grad_norm": 0.6985089778900146, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6158721446990967, "reward_std": 0.1134025976061821, "rewards/accuracy_reward": 0.6260763257741928, "rewards/format_reward": 0.9897959232330322, "step": 3566 }, { "completion_length": 241.61223602294922, "epoch": 0.3589433962264151, "grad_norm": 0.5008682608604431, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6481280326843262, "reward_std": 0.1172591783106327, "rewards/accuracy_reward": 0.648128017783165, "rewards/format_reward": 1.0, "step": 3567 }, { "completion_length": 228.26529693603516, "epoch": 0.3590440251572327, "grad_norm": 0.6192969679832458, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.836309552192688, "reward_std": 0.11396532133221626, "rewards/accuracy_reward": 0.8363094925880432, "rewards/format_reward": 1.0, "step": 3568 }, { "completion_length": 263.0816345214844, "epoch": 0.3591446540880503, "grad_norm": 0.9147610068321228, "kl": 0.122802734375, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.6886693239212036, "reward_std": 0.1439054012298584, "rewards/accuracy_reward": 0.709077388048172, "rewards/format_reward": 0.9795918166637421, "step": 3569 }, { "completion_length": 289.15306091308594, "epoch": 0.35924528301886793, "grad_norm": 0.5968934893608093, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6427318453788757, "reward_std": 0.197550181299448, "rewards/accuracy_reward": 0.663140058517456, "rewards/format_reward": 0.9795918166637421, "step": 3570 }, { "completion_length": 238.3571319580078, "epoch": 0.3593459119496855, "grad_norm": 1.2129771709442139, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6691737174987793, "reward_std": 0.18994755297899246, "rewards/accuracy_reward": 0.6793777346611023, "rewards/format_reward": 0.9897959232330322, "step": 3571 }, { "completion_length": 241.90814971923828, "epoch": 0.35944654088050315, "grad_norm": 0.721747875213623, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7115736603736877, "reward_std": 0.20043832808732986, "rewards/accuracy_reward": 0.7217777073383331, "rewards/format_reward": 0.9897959232330322, "step": 3572 }, { "completion_length": 245.12244415283203, "epoch": 0.35954716981132073, "grad_norm": 0.5750643014907837, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6834158301353455, "reward_std": 0.2272597700357437, "rewards/accuracy_reward": 0.7038241028785706, "rewards/format_reward": 0.9795918464660645, "step": 3573 }, { "completion_length": 254.80611419677734, "epoch": 0.35964779874213837, "grad_norm": 1.3685883283615112, "kl": 0.0665283203125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5642996430397034, "reward_std": 0.3154746741056442, "rewards/accuracy_reward": 0.5949119031429291, "rewards/format_reward": 0.9693877398967743, "step": 3574 }, { "completion_length": 257.8673400878906, "epoch": 0.35974842767295595, "grad_norm": 0.5687031745910645, "kl": 0.0582275390625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5484936237335205, "reward_std": 0.13839387893676758, "rewards/accuracy_reward": 0.5484936684370041, "rewards/format_reward": 1.0, "step": 3575 }, { "completion_length": 225.12244415283203, "epoch": 0.3598490566037736, "grad_norm": 0.5049816966056824, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.937311828136444, "reward_std": 0.13686332292854786, "rewards/accuracy_reward": 0.9475159645080566, "rewards/format_reward": 0.9897959232330322, "step": 3576 }, { "completion_length": 211.04080963134766, "epoch": 0.35994968553459117, "grad_norm": 0.5232968926429749, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.842671513557434, "reward_std": 0.14632577449083328, "rewards/accuracy_reward": 0.8426715731620789, "rewards/format_reward": 1.0, "step": 3577 }, { "completion_length": 269.2550964355469, "epoch": 0.3600503144654088, "grad_norm": 0.6233444213867188, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7337470054626465, "reward_std": 0.22806058824062347, "rewards/accuracy_reward": 0.764359325170517, "rewards/format_reward": 0.9693877398967743, "step": 3578 }, { "completion_length": 242.65306091308594, "epoch": 0.36015094339622644, "grad_norm": 2.0985593795776367, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6329180598258972, "reward_std": 0.2542840912938118, "rewards/accuracy_reward": 0.6533262133598328, "rewards/format_reward": 0.9795918166637421, "step": 3579 }, { "completion_length": 287.82652282714844, "epoch": 0.360251572327044, "grad_norm": 1.2438888549804688, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.5669408440589905, "reward_std": 0.1759592965245247, "rewards/accuracy_reward": 0.5771450102329254, "rewards/format_reward": 0.9897959232330322, "step": 3580 }, { "completion_length": 248.4795913696289, "epoch": 0.36035220125786166, "grad_norm": 0.3701714277267456, "kl": 0.0572509765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8775509595870972, "reward_std": 0.11584595590829849, "rewards/accuracy_reward": 0.8877550661563873, "rewards/format_reward": 0.9897959232330322, "step": 3581 }, { "completion_length": 250.02039337158203, "epoch": 0.36045283018867924, "grad_norm": 1.4804086685180664, "kl": 0.111083984375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8064031600952148, "reward_std": 0.1882563829421997, "rewards/accuracy_reward": 0.8268112242221832, "rewards/format_reward": 0.9795918166637421, "step": 3582 }, { "completion_length": 239.7040786743164, "epoch": 0.3605534591194969, "grad_norm": 1.0346660614013672, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7156705260276794, "reward_std": 0.17804372310638428, "rewards/accuracy_reward": 0.7258745729923248, "rewards/format_reward": 0.9897959232330322, "step": 3583 }, { "completion_length": 226.23468780517578, "epoch": 0.36065408805031446, "grad_norm": 1.1850268840789795, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6709908843040466, "reward_std": 0.09338458999991417, "rewards/accuracy_reward": 0.6709909737110138, "rewards/format_reward": 1.0, "step": 3584 }, { "completion_length": 280.4183654785156, "epoch": 0.3607547169811321, "grad_norm": 1.2908357381820679, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.833951711654663, "reward_std": 0.2827663943171501, "rewards/accuracy_reward": 0.854359894990921, "rewards/format_reward": 0.9795918166637421, "step": 3585 }, { "completion_length": 231.4693832397461, "epoch": 0.3608553459119497, "grad_norm": 0.8132057189941406, "kl": 0.1162109375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.755516767501831, "reward_std": 0.23500092327594757, "rewards/accuracy_reward": 0.786128968000412, "rewards/format_reward": 0.9693877398967743, "step": 3586 }, { "completion_length": 278.948974609375, "epoch": 0.3609559748427673, "grad_norm": 0.9924121499061584, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.727078914642334, "reward_std": 0.20487649738788605, "rewards/accuracy_reward": 0.7780993282794952, "rewards/format_reward": 0.9489795565605164, "step": 3587 }, { "completion_length": 203.38775634765625, "epoch": 0.3610566037735849, "grad_norm": 0.6149024367332458, "kl": 0.095458984375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8129751086235046, "reward_std": 0.11799773201346397, "rewards/accuracy_reward": 0.82317915558815, "rewards/format_reward": 0.9897959232330322, "step": 3588 }, { "completion_length": 222.82652282714844, "epoch": 0.36115723270440253, "grad_norm": 0.3761747181415558, "kl": 0.097900390625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8469387292861938, "reward_std": 0.08099238574504852, "rewards/accuracy_reward": 0.8673469424247742, "rewards/format_reward": 0.9795918464660645, "step": 3589 }, { "completion_length": 251.71428680419922, "epoch": 0.3612578616352201, "grad_norm": 0.6641865968704224, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7755101323127747, "reward_std": 0.23446819931268692, "rewards/accuracy_reward": 0.7959183156490326, "rewards/format_reward": 0.9795918166637421, "step": 3590 }, { "completion_length": 259.6836700439453, "epoch": 0.36135849056603775, "grad_norm": 0.4746130406856537, "kl": 0.0650634765625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7788358330726624, "reward_std": 0.173354834318161, "rewards/accuracy_reward": 0.8094481825828552, "rewards/format_reward": 0.9693877398967743, "step": 3591 }, { "completion_length": 272.60203552246094, "epoch": 0.36145911949685533, "grad_norm": 0.5969839692115784, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7147870659828186, "reward_std": 0.26304951310157776, "rewards/accuracy_reward": 0.7556034028530121, "rewards/format_reward": 0.9591836333274841, "step": 3592 }, { "completion_length": 285.3775329589844, "epoch": 0.36155974842767297, "grad_norm": 0.7215924263000488, "kl": 0.056640625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6055259108543396, "reward_std": 0.19579489529132843, "rewards/accuracy_reward": 0.676954597234726, "rewards/format_reward": 0.9285714030265808, "step": 3593 }, { "completion_length": 207.07141876220703, "epoch": 0.36166037735849055, "grad_norm": 0.9807374477386475, "kl": 0.118896484375, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.8113554120063782, "reward_std": 0.16339374333620071, "rewards/accuracy_reward": 0.8317637145519257, "rewards/format_reward": 0.9795918166637421, "step": 3594 }, { "completion_length": 295.1326446533203, "epoch": 0.3617610062893082, "grad_norm": 0.5453758835792542, "kl": 0.0594482421875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5797773003578186, "reward_std": 0.24791742861270905, "rewards/accuracy_reward": 0.6205936670303345, "rewards/format_reward": 0.9591836631298065, "step": 3595 }, { "completion_length": 208.2755012512207, "epoch": 0.36186163522012577, "grad_norm": 0.9628947377204895, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.69016695022583, "reward_std": 0.14069291949272156, "rewards/accuracy_reward": 0.7105751037597656, "rewards/format_reward": 0.9795918166637421, "step": 3596 }, { "completion_length": 258.07141876220703, "epoch": 0.3619622641509434, "grad_norm": 0.5356568098068237, "kl": 0.0750732421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6779150366783142, "reward_std": 0.1839727684855461, "rewards/accuracy_reward": 0.688119113445282, "rewards/format_reward": 0.9897959232330322, "step": 3597 }, { "completion_length": 308.948974609375, "epoch": 0.362062893081761, "grad_norm": 1.094186544418335, "kl": 0.0662841796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.702363669872284, "reward_std": 0.3020816743373871, "rewards/accuracy_reward": 0.7227718830108643, "rewards/format_reward": 0.9795918464660645, "step": 3598 }, { "completion_length": 234.46937561035156, "epoch": 0.3621635220125786, "grad_norm": 1.2029778957366943, "kl": 0.09765625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7998859882354736, "reward_std": 0.17872720211744308, "rewards/accuracy_reward": 0.8202941715717316, "rewards/format_reward": 0.9795918464660645, "step": 3599 }, { "completion_length": 263.5918273925781, "epoch": 0.3622641509433962, "grad_norm": 0.35263532400131226, "kl": 0.0546875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6122934222221375, "reward_std": 0.07959635555744171, "rewards/accuracy_reward": 0.6224975287914276, "rewards/format_reward": 0.9897959232330322, "step": 3600 }, { "completion_length": 258.6938781738281, "epoch": 0.36236477987421384, "grad_norm": 0.6638080477714539, "kl": 0.0682373046875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6353541016578674, "reward_std": 0.18539083376526833, "rewards/accuracy_reward": 0.6659664064645767, "rewards/format_reward": 0.9693877398967743, "step": 3601 }, { "completion_length": 279.55101013183594, "epoch": 0.3624654088050314, "grad_norm": 0.7549987435340881, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5837751030921936, "reward_std": 0.22578030824661255, "rewards/accuracy_reward": 0.6143873333930969, "rewards/format_reward": 0.9693877398967743, "step": 3602 }, { "completion_length": 240.56121063232422, "epoch": 0.36256603773584906, "grad_norm": 0.9383224844932556, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.733369767665863, "reward_std": 0.14917638525366783, "rewards/accuracy_reward": 0.7333698272705078, "rewards/format_reward": 1.0, "step": 3603 }, { "completion_length": 293.3061218261719, "epoch": 0.3626666666666667, "grad_norm": 1.3144545555114746, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6617792844772339, "reward_std": 0.19906110689044, "rewards/accuracy_reward": 0.6719833314418793, "rewards/format_reward": 0.9897959232330322, "step": 3604 }, { "completion_length": 232.7551040649414, "epoch": 0.3627672955974843, "grad_norm": 0.6820589303970337, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.838754117488861, "reward_std": 0.15955977141857147, "rewards/accuracy_reward": 0.8387540876865387, "rewards/format_reward": 1.0, "step": 3605 }, { "completion_length": 268.42857360839844, "epoch": 0.3628679245283019, "grad_norm": 0.8545317649841309, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.648250699043274, "reward_std": 0.20914063602685928, "rewards/accuracy_reward": 0.6890670657157898, "rewards/format_reward": 0.9591836631298065, "step": 3606 }, { "completion_length": 249.75509643554688, "epoch": 0.3629685534591195, "grad_norm": 0.8842860460281372, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.779220700263977, "reward_std": 0.1698284074664116, "rewards/accuracy_reward": 0.7894248366355896, "rewards/format_reward": 0.9897959232330322, "step": 3607 }, { "completion_length": 268.0408020019531, "epoch": 0.36306918238993713, "grad_norm": 0.7931446433067322, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7847043871879578, "reward_std": 0.1706242896616459, "rewards/accuracy_reward": 0.7847043871879578, "rewards/format_reward": 1.0, "step": 3608 }, { "completion_length": 223.76529693603516, "epoch": 0.3631698113207547, "grad_norm": 1.0477948188781738, "kl": 0.0631103515625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.851020336151123, "reward_std": 0.16482511907815933, "rewards/accuracy_reward": 0.8816326260566711, "rewards/format_reward": 0.9693877398967743, "step": 3609 }, { "completion_length": 215.4285659790039, "epoch": 0.36327044025157235, "grad_norm": 0.7139859199523926, "kl": 0.11474609375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.8537018299102783, "reward_std": 0.08008625358343124, "rewards/accuracy_reward": 0.8639059066772461, "rewards/format_reward": 0.9897959232330322, "step": 3610 }, { "completion_length": 231.81632232666016, "epoch": 0.3633710691823899, "grad_norm": 0.9230009317398071, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7335771918296814, "reward_std": 0.2782597169280052, "rewards/accuracy_reward": 0.7641895413398743, "rewards/format_reward": 0.9693877398967743, "step": 3611 }, { "completion_length": 283.82652282714844, "epoch": 0.36347169811320756, "grad_norm": 0.8001709580421448, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8673468828201294, "reward_std": 0.20628086477518082, "rewards/accuracy_reward": 0.8877550661563873, "rewards/format_reward": 0.9795918166637421, "step": 3612 }, { "completion_length": 209.37754821777344, "epoch": 0.36357232704402515, "grad_norm": 0.5395763516426086, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8302054405212402, "reward_std": 0.19587301462888718, "rewards/accuracy_reward": 0.8506136536598206, "rewards/format_reward": 0.9795918166637421, "step": 3613 }, { "completion_length": 246.9081573486328, "epoch": 0.3636729559748428, "grad_norm": 1.2510288953781128, "kl": 0.0938720703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6966060996055603, "reward_std": 0.2514200061559677, "rewards/accuracy_reward": 0.7068101167678833, "rewards/format_reward": 0.9897959232330322, "step": 3614 }, { "completion_length": 273.7142868041992, "epoch": 0.36377358490566036, "grad_norm": 0.5361382365226746, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.5131725668907166, "reward_std": 0.21557174250483513, "rewards/accuracy_reward": 0.5233766138553619, "rewards/format_reward": 0.9897959232330322, "step": 3615 }, { "completion_length": 209.78570556640625, "epoch": 0.363874213836478, "grad_norm": 1.1750619411468506, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7848700284957886, "reward_std": 0.213826984167099, "rewards/accuracy_reward": 0.7950741350650787, "rewards/format_reward": 0.9897959232330322, "step": 3616 }, { "completion_length": 266.04080963134766, "epoch": 0.3639748427672956, "grad_norm": 0.3615163266658783, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6929665803909302, "reward_std": 0.10933054611086845, "rewards/accuracy_reward": 0.7235788404941559, "rewards/format_reward": 0.9693877398967743, "step": 3617 }, { "completion_length": 286.3673400878906, "epoch": 0.3640754716981132, "grad_norm": 0.6544961929321289, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6020407676696777, "reward_std": 0.22716239839792252, "rewards/accuracy_reward": 0.6326530575752258, "rewards/format_reward": 0.9693877398967743, "step": 3618 }, { "completion_length": 220.14285278320312, "epoch": 0.3641761006289308, "grad_norm": 0.5790286064147949, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.9110413789749146, "reward_std": 0.07862937077879906, "rewards/accuracy_reward": 0.9212454557418823, "rewards/format_reward": 0.9897959232330322, "step": 3619 }, { "completion_length": 222.02040100097656, "epoch": 0.36427672955974844, "grad_norm": 0.7449403405189514, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.77273029088974, "reward_std": 0.07426693104207516, "rewards/accuracy_reward": 0.7727302312850952, "rewards/format_reward": 1.0, "step": 3620 }, { "completion_length": 243.9897918701172, "epoch": 0.364377358490566, "grad_norm": 3.611638069152832, "kl": 0.0689697265625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7311996817588806, "reward_std": 0.1574590504169464, "rewards/accuracy_reward": 0.7414038181304932, "rewards/format_reward": 0.9897959232330322, "step": 3621 }, { "completion_length": 216.58162689208984, "epoch": 0.36447798742138365, "grad_norm": 2.2454171180725098, "kl": 0.0531005859375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7613217234611511, "reward_std": 0.135530486702919, "rewards/accuracy_reward": 0.7715257406234741, "rewards/format_reward": 0.9897959232330322, "step": 3622 }, { "completion_length": 267.09183502197266, "epoch": 0.36457861635220123, "grad_norm": 0.5143541097640991, "kl": 0.0657958984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6184841990470886, "reward_std": 0.16730325669050217, "rewards/accuracy_reward": 0.6388923525810242, "rewards/format_reward": 0.9795918464660645, "step": 3623 }, { "completion_length": 214.6224365234375, "epoch": 0.36467924528301887, "grad_norm": 0.8302682638168335, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7241734266281128, "reward_std": 0.1559256836771965, "rewards/accuracy_reward": 0.734377533197403, "rewards/format_reward": 0.9897959232330322, "step": 3624 }, { "completion_length": 197.85713958740234, "epoch": 0.36477987421383645, "grad_norm": 1.1052354574203491, "kl": 0.0716552734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7175824046134949, "reward_std": 0.14027458429336548, "rewards/accuracy_reward": 0.737990602850914, "rewards/format_reward": 0.9795918166637421, "step": 3625 }, { "completion_length": 348.19386291503906, "epoch": 0.3648805031446541, "grad_norm": 0.846576988697052, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.793002963066101, "reward_std": 0.19845294952392578, "rewards/accuracy_reward": 0.7930029630661011, "rewards/format_reward": 1.0, "step": 3626 }, { "completion_length": 201.88774871826172, "epoch": 0.36498113207547167, "grad_norm": 0.9922435879707336, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8650410771369934, "reward_std": 0.16719838231801987, "rewards/accuracy_reward": 0.8752451837062836, "rewards/format_reward": 0.9897959232330322, "step": 3627 }, { "completion_length": 198.55101776123047, "epoch": 0.3650817610062893, "grad_norm": 1.0201431512832642, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8240404725074768, "reward_std": 0.1747956946492195, "rewards/accuracy_reward": 0.8342445492744446, "rewards/format_reward": 0.9897959232330322, "step": 3628 }, { "completion_length": 269.7550964355469, "epoch": 0.36518238993710694, "grad_norm": 0.8400273323059082, "kl": 0.0477294921875, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7093530893325806, "reward_std": 0.1887613758444786, "rewards/accuracy_reward": 0.7195570468902588, "rewards/format_reward": 0.9897959232330322, "step": 3629 }, { "completion_length": 204.4693832397461, "epoch": 0.3652830188679245, "grad_norm": 0.8480929732322693, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7184908986091614, "reward_std": 0.13725130259990692, "rewards/accuracy_reward": 0.7184910476207733, "rewards/format_reward": 1.0, "step": 3630 }, { "completion_length": 164.63264846801758, "epoch": 0.36538364779874216, "grad_norm": 0.8774202466011047, "kl": 0.0733642578125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.928571343421936, "reward_std": 0.13155817985534668, "rewards/accuracy_reward": 0.9285714030265808, "rewards/format_reward": 1.0, "step": 3631 }, { "completion_length": 202.57142639160156, "epoch": 0.36548427672955974, "grad_norm": 0.9294201731681824, "kl": 0.112548828125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.8488449454307556, "reward_std": 0.16605830192565918, "rewards/accuracy_reward": 0.848844975233078, "rewards/format_reward": 1.0, "step": 3632 }, { "completion_length": 229.57142639160156, "epoch": 0.3655849056603774, "grad_norm": 0.563233494758606, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.699934959411621, "reward_std": 0.15921884216368198, "rewards/accuracy_reward": 0.7101390659809113, "rewards/format_reward": 0.9897959232330322, "step": 3633 }, { "completion_length": 275.9183654785156, "epoch": 0.36568553459119496, "grad_norm": 0.8042015433311462, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6310473084449768, "reward_std": 0.13762886449694633, "rewards/accuracy_reward": 0.6310473382472992, "rewards/format_reward": 1.0, "step": 3634 }, { "completion_length": 196.17346954345703, "epoch": 0.3657861635220126, "grad_norm": 1.7993093729019165, "kl": 0.0684814453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.760555386543274, "reward_std": 0.2024402990937233, "rewards/accuracy_reward": 0.7605554163455963, "rewards/format_reward": 1.0, "step": 3635 }, { "completion_length": 288.9081497192383, "epoch": 0.3658867924528302, "grad_norm": 0.5679379105567932, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7259017825126648, "reward_std": 0.10293415002524853, "rewards/accuracy_reward": 0.7361058592796326, "rewards/format_reward": 0.9897959232330322, "step": 3636 }, { "completion_length": 246.74490356445312, "epoch": 0.3659874213836478, "grad_norm": 0.5422914028167725, "kl": 0.0709228515625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5698825120925903, "reward_std": 0.12796031683683395, "rewards/accuracy_reward": 0.5800865739583969, "rewards/format_reward": 0.9897959232330322, "step": 3637 }, { "completion_length": 276.9183578491211, "epoch": 0.3660880503144654, "grad_norm": 0.42893868684768677, "kl": 0.0594482421875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.596792995929718, "reward_std": 0.16077914834022522, "rewards/accuracy_reward": 0.6274052262306213, "rewards/format_reward": 0.9693877398967743, "step": 3638 }, { "completion_length": 252.58163452148438, "epoch": 0.36618867924528303, "grad_norm": 1.150038480758667, "kl": 0.0615234375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6530612111091614, "reward_std": 0.1955329179763794, "rewards/accuracy_reward": 0.6530612111091614, "rewards/format_reward": 1.0, "step": 3639 }, { "completion_length": 253.7244873046875, "epoch": 0.3662893081761006, "grad_norm": 0.8247852921485901, "kl": 0.0635986328125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7172067761421204, "reward_std": 0.1442454755306244, "rewards/accuracy_reward": 0.7274108529090881, "rewards/format_reward": 0.9897959232330322, "step": 3640 }, { "completion_length": 184.43877029418945, "epoch": 0.36638993710691825, "grad_norm": 0.9498916268348694, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7806527018547058, "reward_std": 0.0567235741764307, "rewards/accuracy_reward": 0.7806527614593506, "rewards/format_reward": 1.0, "step": 3641 }, { "completion_length": 148.7040786743164, "epoch": 0.36649056603773583, "grad_norm": 0.8726946711540222, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8209182620048523, "reward_std": 0.21096039935946465, "rewards/accuracy_reward": 0.8311223983764648, "rewards/format_reward": 0.9897959232330322, "step": 3642 }, { "completion_length": 230.4285659790039, "epoch": 0.36659119496855347, "grad_norm": 1.3723117113113403, "kl": 0.0570068359375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5918367505073547, "reward_std": 0.21369359642267227, "rewards/accuracy_reward": 0.6020407974720001, "rewards/format_reward": 0.9897959232330322, "step": 3643 }, { "completion_length": 206.4285659790039, "epoch": 0.36669182389937105, "grad_norm": 0.5730459690093994, "kl": 0.0986328125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7708516716957092, "reward_std": 0.16197090595960617, "rewards/accuracy_reward": 0.7810557782649994, "rewards/format_reward": 0.9897959232330322, "step": 3644 }, { "completion_length": 240.56121063232422, "epoch": 0.3667924528301887, "grad_norm": 0.4777957797050476, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6839053630828857, "reward_std": 0.11556938290596008, "rewards/accuracy_reward": 0.6941094398498535, "rewards/format_reward": 0.9897959232330322, "step": 3645 }, { "completion_length": 219.40816116333008, "epoch": 0.36689308176100627, "grad_norm": 0.6789426803588867, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7329715490341187, "reward_std": 0.1691611409187317, "rewards/accuracy_reward": 0.743175595998764, "rewards/format_reward": 0.9897959232330322, "step": 3646 }, { "completion_length": 217.07142639160156, "epoch": 0.3669937106918239, "grad_norm": 0.6493328809738159, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6684513688087463, "reward_std": 0.12708625197410583, "rewards/accuracy_reward": 0.688859611749649, "rewards/format_reward": 0.9795918464660645, "step": 3647 }, { "completion_length": 216.2551040649414, "epoch": 0.3670943396226415, "grad_norm": 0.7453833818435669, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7640427350997925, "reward_std": 0.13083549495786428, "rewards/accuracy_reward": 0.7640427350997925, "rewards/format_reward": 1.0, "step": 3648 }, { "completion_length": 271.39794921875, "epoch": 0.3671949685534591, "grad_norm": 0.4566764235496521, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7907047867774963, "reward_std": 0.1744922325015068, "rewards/accuracy_reward": 0.821317046880722, "rewards/format_reward": 0.9693877398967743, "step": 3649 }, { "completion_length": 190.13265228271484, "epoch": 0.3672955974842767, "grad_norm": 1.0280025005340576, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7138147354125977, "reward_std": 0.18064410611987114, "rewards/accuracy_reward": 0.734222948551178, "rewards/format_reward": 0.9795918166637421, "step": 3650 }, { "completion_length": 261.7755126953125, "epoch": 0.36739622641509434, "grad_norm": 0.6456841826438904, "kl": 0.091552734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6879324316978455, "reward_std": 0.08367217145860195, "rewards/accuracy_reward": 0.7083406150341034, "rewards/format_reward": 0.9795918464660645, "step": 3651 }, { "completion_length": 284.5918273925781, "epoch": 0.367496855345912, "grad_norm": 2.177114486694336, "kl": 0.10546875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.6182677745819092, "reward_std": 0.22182727605104446, "rewards/accuracy_reward": 0.6386758387088776, "rewards/format_reward": 0.9795918166637421, "step": 3652 }, { "completion_length": 244.6836700439453, "epoch": 0.36759748427672956, "grad_norm": 0.6769066452980042, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8522701263427734, "reward_std": 0.12904749438166618, "rewards/accuracy_reward": 0.862474262714386, "rewards/format_reward": 0.9897959232330322, "step": 3653 }, { "completion_length": 158.31632232666016, "epoch": 0.3676981132075472, "grad_norm": 0.7547321915626526, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8132970333099365, "reward_std": 0.12101452425122261, "rewards/accuracy_reward": 0.8337052166461945, "rewards/format_reward": 0.9795918464660645, "step": 3654 }, { "completion_length": 232.1836700439453, "epoch": 0.3677987421383648, "grad_norm": 0.8950271010398865, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6922144889831543, "reward_std": 0.1767328418791294, "rewards/accuracy_reward": 0.702418640255928, "rewards/format_reward": 0.9897959232330322, "step": 3655 }, { "completion_length": 257.6428527832031, "epoch": 0.3678993710691824, "grad_norm": 0.5830795168876648, "kl": 0.05029296875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.74559885263443, "reward_std": 0.16986758261919022, "rewards/accuracy_reward": 0.7558029294013977, "rewards/format_reward": 0.9897959232330322, "step": 3656 }, { "completion_length": 196.65306091308594, "epoch": 0.368, "grad_norm": 0.9116196632385254, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8114674091339111, "reward_std": 0.17830698564648628, "rewards/accuracy_reward": 0.8216714859008789, "rewards/format_reward": 0.9897959232330322, "step": 3657 }, { "completion_length": 228.0408172607422, "epoch": 0.36810062893081763, "grad_norm": 0.693875253200531, "kl": 0.0594482421875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7898443937301636, "reward_std": 0.21641229093074799, "rewards/accuracy_reward": 0.8000485599040985, "rewards/format_reward": 0.9897959232330322, "step": 3658 }, { "completion_length": 168.57142639160156, "epoch": 0.3682012578616352, "grad_norm": 0.6950671076774597, "kl": 0.09033203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8646650910377502, "reward_std": 0.12934733554720879, "rewards/accuracy_reward": 0.874869167804718, "rewards/format_reward": 0.9897959232330322, "step": 3659 }, { "completion_length": 287.89794921875, "epoch": 0.36830188679245285, "grad_norm": 0.4000280201435089, "kl": 0.05224609375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8177430033683777, "reward_std": 0.06646335124969482, "rewards/accuracy_reward": 0.8177430629730225, "rewards/format_reward": 1.0, "step": 3660 }, { "completion_length": 240.85713958740234, "epoch": 0.36840251572327043, "grad_norm": 0.9482919573783875, "kl": 0.0682373046875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.710934340953827, "reward_std": 0.1327260285615921, "rewards/accuracy_reward": 0.7109343111515045, "rewards/format_reward": 1.0, "step": 3661 }, { "completion_length": 265.53060150146484, "epoch": 0.36850314465408807, "grad_norm": 0.541335940361023, "kl": 0.0711669921875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7875525951385498, "reward_std": 0.16448788158595562, "rewards/accuracy_reward": 0.79775670170784, "rewards/format_reward": 0.9897959232330322, "step": 3662 }, { "completion_length": 242.6836700439453, "epoch": 0.36860377358490565, "grad_norm": 1.197002649307251, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7188208103179932, "reward_std": 0.17698334902524948, "rewards/accuracy_reward": 0.7392289936542511, "rewards/format_reward": 0.9795918464660645, "step": 3663 }, { "completion_length": 241.31632232666016, "epoch": 0.3687044025157233, "grad_norm": 1.025498628616333, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.459433674812317, "reward_std": 0.24932505190372467, "rewards/accuracy_reward": 0.4798419177532196, "rewards/format_reward": 0.9795918166637421, "step": 3664 }, { "completion_length": 192.4693832397461, "epoch": 0.36880503144654087, "grad_norm": 0.4637872874736786, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8259741067886353, "reward_std": 0.1360761635005474, "rewards/accuracy_reward": 0.8361780941486359, "rewards/format_reward": 0.9897959232330322, "step": 3665 }, { "completion_length": 255.1734619140625, "epoch": 0.3689056603773585, "grad_norm": 0.8395757079124451, "kl": 0.0667724609375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6955229043960571, "reward_std": 0.19062649831175804, "rewards/accuracy_reward": 0.6955229043960571, "rewards/format_reward": 1.0, "step": 3666 }, { "completion_length": 209.33673095703125, "epoch": 0.3690062893081761, "grad_norm": 0.8676621317863464, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7951164841651917, "reward_std": 0.2023714929819107, "rewards/accuracy_reward": 0.8053206503391266, "rewards/format_reward": 0.9897959232330322, "step": 3667 }, { "completion_length": 197.67346954345703, "epoch": 0.3691069182389937, "grad_norm": 1.5347926616668701, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.746938705444336, "reward_std": 0.24228408187627792, "rewards/accuracy_reward": 0.7469387650489807, "rewards/format_reward": 1.0, "step": 3668 }, { "completion_length": 278.07142639160156, "epoch": 0.3692075471698113, "grad_norm": 0.5006800293922424, "kl": 0.0491943359375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6210573315620422, "reward_std": 0.08545020967721939, "rewards/accuracy_reward": 0.6312614679336548, "rewards/format_reward": 0.9897959232330322, "step": 3669 }, { "completion_length": 269.2448959350586, "epoch": 0.36930817610062894, "grad_norm": 0.6854444742202759, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6380603313446045, "reward_std": 0.23455046117305756, "rewards/accuracy_reward": 0.6482644081115723, "rewards/format_reward": 0.9897959232330322, "step": 3670 }, { "completion_length": 196.2244873046875, "epoch": 0.3694088050314465, "grad_norm": 0.9542466402053833, "kl": 0.0675048828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8033283948898315, "reward_std": 0.213180273771286, "rewards/accuracy_reward": 0.8339406847953796, "rewards/format_reward": 0.9693877398967743, "step": 3671 }, { "completion_length": 267.9387741088867, "epoch": 0.36950943396226416, "grad_norm": 1.018666386604309, "kl": 0.05859375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5983868837356567, "reward_std": 0.21627021580934525, "rewards/accuracy_reward": 0.6187950670719147, "rewards/format_reward": 0.9795918464660645, "step": 3672 }, { "completion_length": 320.56121826171875, "epoch": 0.36961006289308174, "grad_norm": 0.7934388518333435, "kl": 0.1337890625, "learning_rate": 1e-06, "loss": 0.0054, "reward": 1.6605867743492126, "reward_std": 0.24379529803991318, "rewards/accuracy_reward": 0.7218113541603088, "rewards/format_reward": 0.9387754797935486, "step": 3673 }, { "completion_length": 275.05101776123047, "epoch": 0.3697106918238994, "grad_norm": 0.9012636542320251, "kl": 0.0533447265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.5953838229179382, "reward_std": 0.1838945858180523, "rewards/accuracy_reward": 0.6157920062541962, "rewards/format_reward": 0.9795918464660645, "step": 3674 }, { "completion_length": 303.8061218261719, "epoch": 0.36981132075471695, "grad_norm": 0.8280725479125977, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5424058437347412, "reward_std": 0.2580903023481369, "rewards/accuracy_reward": 0.6036304235458374, "rewards/format_reward": 0.938775509595871, "step": 3675 }, { "completion_length": 216.71428680419922, "epoch": 0.3699119496855346, "grad_norm": 0.9532142877578735, "kl": 0.0567626953125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5947036147117615, "reward_std": 0.19550003111362457, "rewards/accuracy_reward": 0.615111768245697, "rewards/format_reward": 0.9795918166637421, "step": 3676 }, { "completion_length": 230.91836547851562, "epoch": 0.37001257861635223, "grad_norm": 0.7396866679191589, "kl": 0.0594482421875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8512795567512512, "reward_std": 0.11042087897658348, "rewards/accuracy_reward": 0.8614836037158966, "rewards/format_reward": 0.9897959232330322, "step": 3677 }, { "completion_length": 261.42857360839844, "epoch": 0.3701132075471698, "grad_norm": 0.5829975605010986, "kl": 0.0531005859375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7609722018241882, "reward_std": 0.19952752441167831, "rewards/accuracy_reward": 0.8017884492874146, "rewards/format_reward": 0.9591836631298065, "step": 3678 }, { "completion_length": 260.9897918701172, "epoch": 0.37021383647798745, "grad_norm": 0.8335294723510742, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7047261595726013, "reward_std": 0.22453342378139496, "rewards/accuracy_reward": 0.7251343131065369, "rewards/format_reward": 0.9795918166637421, "step": 3679 }, { "completion_length": 188.07142639160156, "epoch": 0.370314465408805, "grad_norm": 1.468080759048462, "kl": 0.0892333984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6671909093856812, "reward_std": 0.17520319297909737, "rewards/accuracy_reward": 0.6773949861526489, "rewards/format_reward": 0.9897959232330322, "step": 3680 }, { "completion_length": 219.9285659790039, "epoch": 0.37041509433962266, "grad_norm": 1.1173830032348633, "kl": 0.1329345703125, "learning_rate": 1e-06, "loss": 0.0053, "reward": 1.6536369323730469, "reward_std": 0.2319907695055008, "rewards/accuracy_reward": 0.6842491626739502, "rewards/format_reward": 0.9693877398967743, "step": 3681 }, { "completion_length": 243.57142639160156, "epoch": 0.37051572327044024, "grad_norm": 0.6023637056350708, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.642191767692566, "reward_std": 0.09872639179229736, "rewards/accuracy_reward": 0.6523958742618561, "rewards/format_reward": 0.9897959232330322, "step": 3682 }, { "completion_length": 183.2244873046875, "epoch": 0.3706163522012579, "grad_norm": 0.9442669153213501, "kl": 0.0687255859375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7552142143249512, "reward_std": 0.23619528859853745, "rewards/accuracy_reward": 0.755214124917984, "rewards/format_reward": 1.0, "step": 3683 }, { "completion_length": 188.1530532836914, "epoch": 0.37071698113207546, "grad_norm": 1.074218511581421, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.742332100868225, "reward_std": 0.09960873611271381, "rewards/accuracy_reward": 0.7423321306705475, "rewards/format_reward": 1.0, "step": 3684 }, { "completion_length": 172.80611419677734, "epoch": 0.3708176100628931, "grad_norm": 0.9239740967750549, "kl": 0.115478515625, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.7817851901054382, "reward_std": 0.23205094784498215, "rewards/accuracy_reward": 0.8123973906040192, "rewards/format_reward": 0.9693877398967743, "step": 3685 }, { "completion_length": 227.69387817382812, "epoch": 0.3709182389937107, "grad_norm": 1.0308561325073242, "kl": 0.106689453125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7026503086090088, "reward_std": 0.24603140354156494, "rewards/accuracy_reward": 0.7536706924438477, "rewards/format_reward": 0.9489795565605164, "step": 3686 }, { "completion_length": 206.81632232666016, "epoch": 0.3710188679245283, "grad_norm": 0.5599657893180847, "kl": 0.09130859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.662420630455017, "reward_std": 0.13681137561798096, "rewards/accuracy_reward": 0.6726246774196625, "rewards/format_reward": 0.9897959232330322, "step": 3687 }, { "completion_length": 179.45917510986328, "epoch": 0.3711194968553459, "grad_norm": 0.7218210697174072, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.580926239490509, "reward_std": 0.12844693660736084, "rewards/accuracy_reward": 0.5911303460597992, "rewards/format_reward": 0.9897959232330322, "step": 3688 }, { "completion_length": 248.6326446533203, "epoch": 0.37122012578616354, "grad_norm": 0.9442651271820068, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7760215997695923, "reward_std": 0.28244340419769287, "rewards/accuracy_reward": 0.806633859872818, "rewards/format_reward": 0.9693877398967743, "step": 3689 }, { "completion_length": 278.9387664794922, "epoch": 0.3713207547169811, "grad_norm": 0.469682902097702, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7661910653114319, "reward_std": 0.16869398951530457, "rewards/accuracy_reward": 0.7865992784500122, "rewards/format_reward": 0.9795918464660645, "step": 3690 }, { "completion_length": 140.22449111938477, "epoch": 0.37142138364779875, "grad_norm": 0.622473418712616, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.9387754797935486, "reward_std": 0.09217509627342224, "rewards/accuracy_reward": 0.938775509595871, "rewards/format_reward": 1.0, "step": 3691 }, { "completion_length": 187.448974609375, "epoch": 0.37152201257861633, "grad_norm": 0.751374363899231, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.9195261001586914, "reward_std": 0.14110318571329117, "rewards/accuracy_reward": 0.9297301173210144, "rewards/format_reward": 0.9897959232330322, "step": 3692 }, { "completion_length": 250.83673095703125, "epoch": 0.37162264150943397, "grad_norm": 0.48466336727142334, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6592565178871155, "reward_std": 0.09276188537478447, "rewards/accuracy_reward": 0.6796647012233734, "rewards/format_reward": 0.9795918464660645, "step": 3693 }, { "completion_length": 209.32652282714844, "epoch": 0.37172327044025155, "grad_norm": 1.2476099729537964, "kl": 0.099365234375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7533652186393738, "reward_std": 0.3250660002231598, "rewards/accuracy_reward": 0.7737734019756317, "rewards/format_reward": 0.9795918464660645, "step": 3694 }, { "completion_length": 294.7040710449219, "epoch": 0.3718238993710692, "grad_norm": 0.7144805192947388, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6711050271987915, "reward_std": 0.3425716161727905, "rewards/accuracy_reward": 0.711921364068985, "rewards/format_reward": 0.9591836333274841, "step": 3695 }, { "completion_length": 229.05101013183594, "epoch": 0.37192452830188677, "grad_norm": 0.7170383334159851, "kl": 0.0531005859375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7546582221984863, "reward_std": 0.1579597145318985, "rewards/accuracy_reward": 0.7546583712100983, "rewards/format_reward": 1.0, "step": 3696 }, { "completion_length": 204.58163452148438, "epoch": 0.3720251572327044, "grad_norm": 1.6907145977020264, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7730364799499512, "reward_std": 0.21693947911262512, "rewards/accuracy_reward": 0.783240556716919, "rewards/format_reward": 0.9897959232330322, "step": 3697 }, { "completion_length": 281.0306167602539, "epoch": 0.372125786163522, "grad_norm": 1.279634714126587, "kl": 0.089111328125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.5990375876426697, "reward_std": 0.3026982694864273, "rewards/accuracy_reward": 0.6500580310821533, "rewards/format_reward": 0.9489795565605164, "step": 3698 }, { "completion_length": 282.8163299560547, "epoch": 0.3722264150943396, "grad_norm": 0.9220958352088928, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7671623826026917, "reward_std": 0.11923185735940933, "rewards/accuracy_reward": 0.777366429567337, "rewards/format_reward": 0.9897959232330322, "step": 3699 }, { "completion_length": 258.5306091308594, "epoch": 0.3723270440251572, "grad_norm": 1.0551520586013794, "kl": 0.0546875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7699222564697266, "reward_std": 0.3601732552051544, "rewards/accuracy_reward": 0.7903303802013397, "rewards/format_reward": 0.9795918464660645, "step": 3700 }, { "completion_length": 234.4591827392578, "epoch": 0.37242767295597484, "grad_norm": 0.595241129398346, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.584902286529541, "reward_std": 0.19246669486165047, "rewards/accuracy_reward": 0.6053104549646378, "rewards/format_reward": 0.9795918166637421, "step": 3701 }, { "completion_length": 182.2448959350586, "epoch": 0.3725283018867925, "grad_norm": 1.3280537128448486, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8658891916275024, "reward_std": 0.18764280527830124, "rewards/accuracy_reward": 0.8760932981967926, "rewards/format_reward": 0.9897959232330322, "step": 3702 }, { "completion_length": 220.29591369628906, "epoch": 0.37262893081761006, "grad_norm": 0.3745839595794678, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.771136999130249, "reward_std": 0.1039032731205225, "rewards/accuracy_reward": 0.7711369693279266, "rewards/format_reward": 1.0, "step": 3703 }, { "completion_length": 173.77550506591797, "epoch": 0.3727295597484277, "grad_norm": 1.525331974029541, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7750021815299988, "reward_std": 0.086643535643816, "rewards/accuracy_reward": 0.7750021815299988, "rewards/format_reward": 1.0, "step": 3704 }, { "completion_length": 296.91835021972656, "epoch": 0.3728301886792453, "grad_norm": 0.770474374294281, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5209300518035889, "reward_std": 0.18316209688782692, "rewards/accuracy_reward": 0.5311340838670731, "rewards/format_reward": 0.9897959232330322, "step": 3705 }, { "completion_length": 264.9285659790039, "epoch": 0.3729308176100629, "grad_norm": 0.7715761661529541, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.504340648651123, "reward_std": 0.23585723340511322, "rewards/accuracy_reward": 0.5349529981613159, "rewards/format_reward": 0.9693877398967743, "step": 3706 }, { "completion_length": 248.70407104492188, "epoch": 0.3730314465408805, "grad_norm": 0.7029613256454468, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.797342598438263, "reward_std": 0.21345195174217224, "rewards/accuracy_reward": 0.838158905506134, "rewards/format_reward": 0.9591836333274841, "step": 3707 }, { "completion_length": 217.1836700439453, "epoch": 0.37313207547169813, "grad_norm": 0.690239667892456, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7235003113746643, "reward_std": 0.15877415984869003, "rewards/accuracy_reward": 0.7337043881416321, "rewards/format_reward": 0.9897959232330322, "step": 3708 }, { "completion_length": 221.1326446533203, "epoch": 0.3732327044025157, "grad_norm": 1.0750130414962769, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7145735025405884, "reward_std": 0.3227595388889313, "rewards/accuracy_reward": 0.7349816858768463, "rewards/format_reward": 0.9795918166637421, "step": 3709 }, { "completion_length": 289.19386291503906, "epoch": 0.37333333333333335, "grad_norm": 0.9637110829353333, "kl": 0.0538330078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6970196962356567, "reward_std": 0.324063703417778, "rewards/accuracy_reward": 0.7174278795719147, "rewards/format_reward": 0.9795918464660645, "step": 3710 }, { "completion_length": 245.5204086303711, "epoch": 0.37343396226415093, "grad_norm": 0.491940975189209, "kl": 0.05712890625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6760203838348389, "reward_std": 0.1251949481666088, "rewards/accuracy_reward": 0.6964285373687744, "rewards/format_reward": 0.9795918464660645, "step": 3711 }, { "completion_length": 174.2448959350586, "epoch": 0.37353459119496857, "grad_norm": 1.0132882595062256, "kl": 0.0701904296875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.838304579257965, "reward_std": 0.15032347291707993, "rewards/accuracy_reward": 0.8485086262226105, "rewards/format_reward": 0.9897959232330322, "step": 3712 }, { "completion_length": 239.42855834960938, "epoch": 0.37363522012578615, "grad_norm": 0.6175498366355896, "kl": 0.0689697265625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6490363478660583, "reward_std": 0.16005230322480202, "rewards/accuracy_reward": 0.6796486526727676, "rewards/format_reward": 0.9693877398967743, "step": 3713 }, { "completion_length": 197.53060913085938, "epoch": 0.3737358490566038, "grad_norm": 0.47830188274383545, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8469792604446411, "reward_std": 0.04879495454952121, "rewards/accuracy_reward": 0.8469792604446411, "rewards/format_reward": 1.0, "step": 3714 }, { "completion_length": 277.27550506591797, "epoch": 0.37383647798742137, "grad_norm": 0.5829153060913086, "kl": 0.091552734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8482112884521484, "reward_std": 0.16639446467161179, "rewards/accuracy_reward": 0.8788235187530518, "rewards/format_reward": 0.9693877398967743, "step": 3715 }, { "completion_length": 229.41836547851562, "epoch": 0.373937106918239, "grad_norm": 0.7375503778457642, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.65080988407135, "reward_std": 0.13260192796587944, "rewards/accuracy_reward": 0.6508098840713501, "rewards/format_reward": 1.0, "step": 3716 }, { "completion_length": 266.6836700439453, "epoch": 0.3740377358490566, "grad_norm": 0.6617140769958496, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7114914059638977, "reward_std": 0.2819664925336838, "rewards/accuracy_reward": 0.7523078322410583, "rewards/format_reward": 0.9591836333274841, "step": 3717 }, { "completion_length": 299.6938781738281, "epoch": 0.3741383647798742, "grad_norm": 0.44417229294776917, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6976505517959595, "reward_std": 0.0672613587230444, "rewards/accuracy_reward": 0.6976505219936371, "rewards/format_reward": 1.0, "step": 3718 }, { "completion_length": 215.78571319580078, "epoch": 0.3742389937106918, "grad_norm": 0.8665960431098938, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6583537459373474, "reward_std": 0.18214235454797745, "rewards/accuracy_reward": 0.6685578227043152, "rewards/format_reward": 0.9897959232330322, "step": 3719 }, { "completion_length": 299.1122283935547, "epoch": 0.37433962264150944, "grad_norm": 0.8914442658424377, "kl": 0.0640869140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7813944220542908, "reward_std": 0.23633447289466858, "rewards/accuracy_reward": 0.8018026053905487, "rewards/format_reward": 0.9795918166637421, "step": 3720 }, { "completion_length": 274.87754821777344, "epoch": 0.374440251572327, "grad_norm": 0.8931838870048523, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5975866317749023, "reward_std": 0.21356233954429626, "rewards/accuracy_reward": 0.6077907383441925, "rewards/format_reward": 0.9897959232330322, "step": 3721 }, { "completion_length": 201.448974609375, "epoch": 0.37454088050314466, "grad_norm": 0.8721048831939697, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7318904995918274, "reward_std": 0.11867824196815491, "rewards/accuracy_reward": 0.7318905293941498, "rewards/format_reward": 1.0, "step": 3722 }, { "completion_length": 224.6836700439453, "epoch": 0.37464150943396224, "grad_norm": 0.764519453048706, "kl": 0.0625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7381377816200256, "reward_std": 0.11019887775182724, "rewards/accuracy_reward": 0.7381377518177032, "rewards/format_reward": 1.0, "step": 3723 }, { "completion_length": 248.6428451538086, "epoch": 0.3747421383647799, "grad_norm": 0.6329447031021118, "kl": 0.109619140625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7571086287498474, "reward_std": 0.15216808021068573, "rewards/accuracy_reward": 0.7673126459121704, "rewards/format_reward": 0.9897959232330322, "step": 3724 }, { "completion_length": 315.5408172607422, "epoch": 0.37484276729559746, "grad_norm": 0.6147875189781189, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7030452489852905, "reward_std": 0.23043163120746613, "rewards/accuracy_reward": 0.7336574792861938, "rewards/format_reward": 0.9693877398967743, "step": 3725 }, { "completion_length": 200.2040786743164, "epoch": 0.3749433962264151, "grad_norm": 1.470246434211731, "kl": 0.105712890625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7640765309333801, "reward_std": 0.1716998592019081, "rewards/accuracy_reward": 0.7844845652580261, "rewards/format_reward": 0.9795918464660645, "step": 3726 }, { "completion_length": 243.76529693603516, "epoch": 0.37504402515723273, "grad_norm": 0.6004742980003357, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7877177596092224, "reward_std": 0.12080949731171131, "rewards/accuracy_reward": 0.7979218363761902, "rewards/format_reward": 0.9897959232330322, "step": 3727 }, { "completion_length": 216.30611419677734, "epoch": 0.3751446540880503, "grad_norm": 0.43084490299224854, "kl": 0.0504150390625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.8469387292861938, "reward_std": 0.026997461915016174, "rewards/accuracy_reward": 0.8469387590885162, "rewards/format_reward": 1.0, "step": 3728 }, { "completion_length": 164.47958374023438, "epoch": 0.37524528301886795, "grad_norm": 0.5399868488311768, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7755101919174194, "reward_std": 0.06517763808369637, "rewards/accuracy_reward": 0.7755101919174194, "rewards/format_reward": 1.0, "step": 3729 }, { "completion_length": 230.49999237060547, "epoch": 0.37534591194968553, "grad_norm": 0.5596557855606079, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.9368318319320679, "reward_std": 0.06321016326546669, "rewards/accuracy_reward": 0.9368318617343903, "rewards/format_reward": 1.0, "step": 3730 }, { "completion_length": 249.78570556640625, "epoch": 0.37544654088050317, "grad_norm": 0.3991447687149048, "kl": 0.0736083984375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.839259922504425, "reward_std": 0.06995372474193573, "rewards/accuracy_reward": 0.8392599523067474, "rewards/format_reward": 1.0, "step": 3731 }, { "completion_length": 228.63265228271484, "epoch": 0.37554716981132075, "grad_norm": 0.809602677822113, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8222016096115112, "reward_std": 0.1499922201037407, "rewards/accuracy_reward": 0.8324057161808014, "rewards/format_reward": 0.9897959232330322, "step": 3732 }, { "completion_length": 253.58162689208984, "epoch": 0.3756477987421384, "grad_norm": 1.3662729263305664, "kl": 0.10009765625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8396482467651367, "reward_std": 0.16907519847154617, "rewards/accuracy_reward": 0.8396482169628143, "rewards/format_reward": 1.0, "step": 3733 }, { "completion_length": 216.10204315185547, "epoch": 0.37574842767295596, "grad_norm": 0.7849447131156921, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7928261756896973, "reward_std": 0.11942082643508911, "rewards/accuracy_reward": 0.7928261756896973, "rewards/format_reward": 1.0, "step": 3734 }, { "completion_length": 252.1632537841797, "epoch": 0.3758490566037736, "grad_norm": 0.4274495244026184, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7799628376960754, "reward_std": 0.10078969970345497, "rewards/accuracy_reward": 0.7901669442653656, "rewards/format_reward": 0.9897959232330322, "step": 3735 }, { "completion_length": 289.5102081298828, "epoch": 0.3759496855345912, "grad_norm": 1.1788792610168457, "kl": 0.0804443359375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.631292462348938, "reward_std": 0.32802191376686096, "rewards/accuracy_reward": 0.6517006754875183, "rewards/format_reward": 0.9795918464660645, "step": 3736 }, { "completion_length": 203.1326446533203, "epoch": 0.3760503144654088, "grad_norm": 0.835875391960144, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8299319744110107, "reward_std": 0.07199322804808617, "rewards/accuracy_reward": 0.8299319744110107, "rewards/format_reward": 1.0, "step": 3737 }, { "completion_length": 241.11224365234375, "epoch": 0.3761509433962264, "grad_norm": 0.8167966604232788, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7244897484779358, "reward_std": 0.19220631569623947, "rewards/accuracy_reward": 0.7244897782802582, "rewards/format_reward": 1.0, "step": 3738 }, { "completion_length": 231.87754821777344, "epoch": 0.37625157232704404, "grad_norm": 0.8851573467254639, "kl": 0.11181640625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.8886483311653137, "reward_std": 0.09329163655638695, "rewards/accuracy_reward": 0.8988524377346039, "rewards/format_reward": 0.9897959232330322, "step": 3739 }, { "completion_length": 210.33672332763672, "epoch": 0.3763522012578616, "grad_norm": 0.7852834463119507, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7825857400894165, "reward_std": 0.138448353856802, "rewards/accuracy_reward": 0.7825857400894165, "rewards/format_reward": 1.0, "step": 3740 }, { "completion_length": 268.2550964355469, "epoch": 0.37645283018867925, "grad_norm": 0.7902425527572632, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6511377096176147, "reward_std": 0.11691546812653542, "rewards/accuracy_reward": 0.6511377394199371, "rewards/format_reward": 1.0, "step": 3741 }, { "completion_length": 197.78570556640625, "epoch": 0.37655345911949684, "grad_norm": 0.9494227170944214, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.9480077028274536, "reward_std": 0.1375584900379181, "rewards/accuracy_reward": 0.9582118391990662, "rewards/format_reward": 0.9897959232330322, "step": 3742 }, { "completion_length": 172.01020050048828, "epoch": 0.3766540880503145, "grad_norm": 1.5716255903244019, "kl": 0.1162109375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.7752419710159302, "reward_std": 0.07855316065251827, "rewards/accuracy_reward": 0.7854461073875427, "rewards/format_reward": 0.9897959232330322, "step": 3743 }, { "completion_length": 187.77550506591797, "epoch": 0.37675471698113205, "grad_norm": 1.4087706804275513, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.844897985458374, "reward_std": 0.1958378553390503, "rewards/accuracy_reward": 0.8653060793876648, "rewards/format_reward": 0.9795918166637421, "step": 3744 }, { "completion_length": 251.44896697998047, "epoch": 0.3768553459119497, "grad_norm": 1.2779401540756226, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.741977870464325, "reward_std": 0.15358631312847137, "rewards/accuracy_reward": 0.7419779002666473, "rewards/format_reward": 1.0, "step": 3745 }, { "completion_length": 181.30612182617188, "epoch": 0.37695597484276727, "grad_norm": 0.33658239245414734, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8109814524650574, "reward_std": 0.056899599730968475, "rewards/accuracy_reward": 0.8211855888366699, "rewards/format_reward": 0.9897959232330322, "step": 3746 }, { "completion_length": 212.88774871826172, "epoch": 0.3770566037735849, "grad_norm": 9.262727737426758, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7448979020118713, "reward_std": 0.1123781893402338, "rewards/accuracy_reward": 0.7551020383834839, "rewards/format_reward": 0.9897959232330322, "step": 3747 }, { "completion_length": 222.16326141357422, "epoch": 0.3771572327044025, "grad_norm": 0.6399028301239014, "kl": 0.0689697265625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.72119140625, "reward_std": 0.18915381282567978, "rewards/accuracy_reward": 0.7415996491909027, "rewards/format_reward": 0.9795918464660645, "step": 3748 }, { "completion_length": 229.33673095703125, "epoch": 0.3772578616352201, "grad_norm": 0.8451594114303589, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8211631774902344, "reward_std": 0.17258768528699875, "rewards/accuracy_reward": 0.821163147687912, "rewards/format_reward": 1.0, "step": 3749 }, { "completion_length": 213.58162689208984, "epoch": 0.37735849056603776, "grad_norm": 0.9188305139541626, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7201366424560547, "reward_std": 0.12870438396930695, "rewards/accuracy_reward": 0.7201366722583771, "rewards/format_reward": 1.0, "step": 3750 }, { "completion_length": 164.62244415283203, "epoch": 0.37745911949685534, "grad_norm": 0.6400062441825867, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8825479745864868, "reward_std": 0.15942658483982086, "rewards/accuracy_reward": 0.892752081155777, "rewards/format_reward": 0.9897959232330322, "step": 3751 }, { "completion_length": 213.82653045654297, "epoch": 0.377559748427673, "grad_norm": 0.554692268371582, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5881438851356506, "reward_std": 0.15548309683799744, "rewards/accuracy_reward": 0.598347932100296, "rewards/format_reward": 0.9897959232330322, "step": 3752 }, { "completion_length": 253.94896697998047, "epoch": 0.37766037735849056, "grad_norm": 0.8521057367324829, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.5921061038970947, "reward_std": 0.11504145339131355, "rewards/accuracy_reward": 0.5921061933040619, "rewards/format_reward": 1.0, "step": 3753 }, { "completion_length": 258.9897918701172, "epoch": 0.3777610062893082, "grad_norm": 1.6143566370010376, "kl": 0.091552734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7813971638679504, "reward_std": 0.1775973066687584, "rewards/accuracy_reward": 0.7916012108325958, "rewards/format_reward": 0.9897959232330322, "step": 3754 }, { "completion_length": 250.3673324584961, "epoch": 0.3778616352201258, "grad_norm": 1.1449859142303467, "kl": 0.103271484375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6118448972702026, "reward_std": 0.27735358476638794, "rewards/accuracy_reward": 0.6220490336418152, "rewards/format_reward": 0.9897959232330322, "step": 3755 }, { "completion_length": 236.16326141357422, "epoch": 0.3779622641509434, "grad_norm": 0.3738233745098114, "kl": 0.06689453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7683717012405396, "reward_std": 0.07320886105298996, "rewards/accuracy_reward": 0.7683717906475067, "rewards/format_reward": 1.0, "step": 3756 }, { "completion_length": 234.20408630371094, "epoch": 0.378062893081761, "grad_norm": 1.817933201789856, "kl": 0.071533203125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8228458166122437, "reward_std": 0.20274916291236877, "rewards/accuracy_reward": 0.8330498337745667, "rewards/format_reward": 0.9897959232330322, "step": 3757 }, { "completion_length": 272.37754821777344, "epoch": 0.37816352201257863, "grad_norm": 0.7009832262992859, "kl": 0.0853271484375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.732003092765808, "reward_std": 0.20370261371135712, "rewards/accuracy_reward": 0.7524113059043884, "rewards/format_reward": 0.9795918166637421, "step": 3758 }, { "completion_length": 280.9285583496094, "epoch": 0.3782641509433962, "grad_norm": 0.8919194936752319, "kl": 0.064697265625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7827563881874084, "reward_std": 0.20876868814229965, "rewards/accuracy_reward": 0.7929604053497314, "rewards/format_reward": 0.9897959232330322, "step": 3759 }, { "completion_length": 241.35713958740234, "epoch": 0.37836477987421385, "grad_norm": 0.582848072052002, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8563896417617798, "reward_std": 0.12970667332410812, "rewards/accuracy_reward": 0.8563896119594574, "rewards/format_reward": 1.0, "step": 3760 }, { "completion_length": 214.6938705444336, "epoch": 0.37846540880503143, "grad_norm": 0.7288119792938232, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8436112999916077, "reward_std": 0.1476340964436531, "rewards/accuracy_reward": 0.8640194833278656, "rewards/format_reward": 0.9795918166637421, "step": 3761 }, { "completion_length": 270.70408630371094, "epoch": 0.37856603773584907, "grad_norm": 0.5001665353775024, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6779838800430298, "reward_std": 0.21340929716825485, "rewards/accuracy_reward": 0.6983920633792877, "rewards/format_reward": 0.9795918166637421, "step": 3762 }, { "completion_length": 221.551025390625, "epoch": 0.37866666666666665, "grad_norm": 0.8782468438148499, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8787710666656494, "reward_std": 0.09655445069074631, "rewards/accuracy_reward": 0.878771036863327, "rewards/format_reward": 1.0, "step": 3763 }, { "completion_length": 222.8571319580078, "epoch": 0.3787672955974843, "grad_norm": 0.48842155933380127, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.724291205406189, "reward_std": 0.07576341554522514, "rewards/accuracy_reward": 0.7242911458015442, "rewards/format_reward": 1.0, "step": 3764 }, { "completion_length": 217.6734619140625, "epoch": 0.37886792452830187, "grad_norm": 0.819523811340332, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.903002917766571, "reward_std": 0.14926785230636597, "rewards/accuracy_reward": 0.903002917766571, "rewards/format_reward": 1.0, "step": 3765 }, { "completion_length": 208.54080963134766, "epoch": 0.3789685534591195, "grad_norm": 0.5071551203727722, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.9441205263137817, "reward_std": 0.08108741976320744, "rewards/accuracy_reward": 0.9543246030807495, "rewards/format_reward": 0.9897959232330322, "step": 3766 }, { "completion_length": 227.49999237060547, "epoch": 0.3790691823899371, "grad_norm": 0.6461684703826904, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7591401934623718, "reward_std": 0.11656594462692738, "rewards/accuracy_reward": 0.7693442404270172, "rewards/format_reward": 0.9897959232330322, "step": 3767 }, { "completion_length": 260.57141876220703, "epoch": 0.3791698113207547, "grad_norm": 0.3780255913734436, "kl": 0.0740966796875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7359893321990967, "reward_std": 0.054233990609645844, "rewards/accuracy_reward": 0.7359893620014191, "rewards/format_reward": 1.0, "step": 3768 }, { "completion_length": 207.89795684814453, "epoch": 0.3792704402515723, "grad_norm": 0.8715969324111938, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6448007822036743, "reward_std": 0.168905608355999, "rewards/accuracy_reward": 0.6448007524013519, "rewards/format_reward": 1.0, "step": 3769 }, { "completion_length": 230.86734008789062, "epoch": 0.37937106918238994, "grad_norm": 0.6335284113883972, "kl": 0.095458984375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8209141492843628, "reward_std": 0.11675978824496269, "rewards/accuracy_reward": 0.8209141790866852, "rewards/format_reward": 1.0, "step": 3770 }, { "completion_length": 288.39795684814453, "epoch": 0.3794716981132075, "grad_norm": 0.8007790446281433, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7019885182380676, "reward_std": 0.2027604766190052, "rewards/accuracy_reward": 0.7223966121673584, "rewards/format_reward": 0.9795918166637421, "step": 3771 }, { "completion_length": 156.05101776123047, "epoch": 0.37957232704402516, "grad_norm": 0.7581789493560791, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8595634698867798, "reward_std": 0.10632287710905075, "rewards/accuracy_reward": 0.86976757645607, "rewards/format_reward": 0.9897959232330322, "step": 3772 }, { "completion_length": 236.01020050048828, "epoch": 0.37967295597484274, "grad_norm": 2.3074119091033936, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8739228248596191, "reward_std": 0.213870607316494, "rewards/accuracy_reward": 0.8841269612312317, "rewards/format_reward": 0.9897959232330322, "step": 3773 }, { "completion_length": 198.67346954345703, "epoch": 0.3797735849056604, "grad_norm": 1.1547300815582275, "kl": 0.09375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7323152422904968, "reward_std": 0.17253362387418747, "rewards/accuracy_reward": 0.7425194084644318, "rewards/format_reward": 0.9897959232330322, "step": 3774 }, { "completion_length": 232.1938705444336, "epoch": 0.379874213836478, "grad_norm": 0.6385747790336609, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6899637579917908, "reward_std": 0.08887330442667007, "rewards/accuracy_reward": 0.7001678645610809, "rewards/format_reward": 0.9897959232330322, "step": 3775 }, { "completion_length": 241.5102081298828, "epoch": 0.3799748427672956, "grad_norm": 0.6946775317192078, "kl": 0.0784912109375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7523509860038757, "reward_std": 0.13266951590776443, "rewards/accuracy_reward": 0.7523510456085205, "rewards/format_reward": 1.0, "step": 3776 }, { "completion_length": 257.4387741088867, "epoch": 0.38007547169811323, "grad_norm": 0.42245396971702576, "kl": 0.053955078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7596371173858643, "reward_std": 0.12471431121230125, "rewards/accuracy_reward": 0.7596371471881866, "rewards/format_reward": 1.0, "step": 3777 }, { "completion_length": 210.62244415283203, "epoch": 0.3801761006289308, "grad_norm": 0.5082048177719116, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8176870346069336, "reward_std": 0.05866309627890587, "rewards/accuracy_reward": 0.8176870346069336, "rewards/format_reward": 1.0, "step": 3778 }, { "completion_length": 201.5408172607422, "epoch": 0.38027672955974845, "grad_norm": 1.1812363862991333, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7549198269844055, "reward_std": 0.05689811520278454, "rewards/accuracy_reward": 0.7549198269844055, "rewards/format_reward": 1.0, "step": 3779 }, { "completion_length": 203.95917510986328, "epoch": 0.38037735849056603, "grad_norm": 1.328962802886963, "kl": 0.05126953125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8223981857299805, "reward_std": 0.1295805722475052, "rewards/accuracy_reward": 0.8223981857299805, "rewards/format_reward": 1.0, "step": 3780 }, { "completion_length": 265.84693145751953, "epoch": 0.38047798742138367, "grad_norm": 0.49323800206184387, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6913951635360718, "reward_std": 0.13712865114212036, "rewards/accuracy_reward": 0.7118033170700073, "rewards/format_reward": 0.9795918166637421, "step": 3781 }, { "completion_length": 211.44898223876953, "epoch": 0.38057861635220125, "grad_norm": 0.8189756274223328, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.846764862537384, "reward_std": 0.1875399500131607, "rewards/accuracy_reward": 0.8671730160713196, "rewards/format_reward": 0.9795918464660645, "step": 3782 }, { "completion_length": 263.4081573486328, "epoch": 0.3806792452830189, "grad_norm": 0.6117920875549316, "kl": 0.0626220703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7213153839111328, "reward_std": 0.13936736807227135, "rewards/accuracy_reward": 0.7213155031204224, "rewards/format_reward": 1.0, "step": 3783 }, { "completion_length": 227.948974609375, "epoch": 0.38077987421383647, "grad_norm": 1.420033574104309, "kl": 0.105712890625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.6286612749099731, "reward_std": 0.20171374082565308, "rewards/accuracy_reward": 0.6592735648155212, "rewards/format_reward": 0.9693877398967743, "step": 3784 }, { "completion_length": 271.0306091308594, "epoch": 0.3808805031446541, "grad_norm": 0.6911710500717163, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.760754644870758, "reward_std": 0.23118441551923752, "rewards/accuracy_reward": 0.7709587514400482, "rewards/format_reward": 0.9897959232330322, "step": 3785 }, { "completion_length": 265.82653045654297, "epoch": 0.3809811320754717, "grad_norm": 0.666885256767273, "kl": 0.0799560546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6516578793525696, "reward_std": 0.17366435378789902, "rewards/accuracy_reward": 0.6822700798511505, "rewards/format_reward": 0.9693877398967743, "step": 3786 }, { "completion_length": 265.66326904296875, "epoch": 0.3810817610062893, "grad_norm": 0.601172685623169, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6905434727668762, "reward_std": 0.12006735801696777, "rewards/accuracy_reward": 0.700747549533844, "rewards/format_reward": 0.9897959232330322, "step": 3787 }, { "completion_length": 220.9795913696289, "epoch": 0.3811823899371069, "grad_norm": 0.6229717135429382, "kl": 0.110107421875, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7946549654006958, "reward_std": 0.125251654535532, "rewards/accuracy_reward": 0.8150631487369537, "rewards/format_reward": 0.9795918166637421, "step": 3788 }, { "completion_length": 179.7448959350586, "epoch": 0.38128301886792454, "grad_norm": 1.192196249961853, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.895043671131134, "reward_std": 0.15016483515501022, "rewards/accuracy_reward": 0.9052478075027466, "rewards/format_reward": 0.9897959232330322, "step": 3789 }, { "completion_length": 228.7040786743164, "epoch": 0.3813836477987421, "grad_norm": 0.43053653836250305, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.81516695022583, "reward_std": 0.06491892784833908, "rewards/accuracy_reward": 0.8151669502258301, "rewards/format_reward": 1.0, "step": 3790 }, { "completion_length": 270.7244873046875, "epoch": 0.38148427672955976, "grad_norm": 0.6344072818756104, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.4752185940742493, "reward_std": 0.15878088772296906, "rewards/accuracy_reward": 0.4854227155447006, "rewards/format_reward": 0.9897959232330322, "step": 3791 }, { "completion_length": 254.2448959350586, "epoch": 0.38158490566037734, "grad_norm": 0.6270182728767395, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7507482767105103, "reward_std": 0.11808884143829346, "rewards/accuracy_reward": 0.7507482767105103, "rewards/format_reward": 1.0, "step": 3792 }, { "completion_length": 245.05101776123047, "epoch": 0.381685534591195, "grad_norm": 0.6811021566390991, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7406463027000427, "reward_std": 0.15244214236736298, "rewards/accuracy_reward": 0.7508503198623657, "rewards/format_reward": 0.9897959232330322, "step": 3793 }, { "completion_length": 192.23468780517578, "epoch": 0.38178616352201256, "grad_norm": 285.74298095703125, "kl": 0.1134033203125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7694810032844543, "reward_std": 0.11179951578378677, "rewards/accuracy_reward": 0.7694810330867767, "rewards/format_reward": 1.0, "step": 3794 }, { "completion_length": 230.1938705444336, "epoch": 0.3818867924528302, "grad_norm": 0.8143447637557983, "kl": 0.10302734375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.705996036529541, "reward_std": 0.16335231065750122, "rewards/accuracy_reward": 0.7162001132965088, "rewards/format_reward": 0.9897959232330322, "step": 3795 }, { "completion_length": 212.2959213256836, "epoch": 0.3819874213836478, "grad_norm": 0.5612183213233948, "kl": 0.09521484375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7301533818244934, "reward_std": 0.11843843013048172, "rewards/accuracy_reward": 0.740357518196106, "rewards/format_reward": 0.9897959232330322, "step": 3796 }, { "completion_length": 156.51020050048828, "epoch": 0.3820880503144654, "grad_norm": 1.0858770608901978, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.928571343421936, "reward_std": 0.10335781052708626, "rewards/accuracy_reward": 0.9285714030265808, "rewards/format_reward": 1.0, "step": 3797 }, { "completion_length": 283.79591369628906, "epoch": 0.382188679245283, "grad_norm": 0.5598835349082947, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6530134081840515, "reward_std": 0.1297578364610672, "rewards/accuracy_reward": 0.6530133485794067, "rewards/format_reward": 1.0, "step": 3798 }, { "completion_length": 142.62244415283203, "epoch": 0.38228930817610063, "grad_norm": 5.554194927215576, "kl": 0.103515625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.885204017162323, "reward_std": 0.09559785947203636, "rewards/accuracy_reward": 0.8852040469646454, "rewards/format_reward": 1.0, "step": 3799 }, { "completion_length": 170.08162689208984, "epoch": 0.38238993710691827, "grad_norm": 2.0849037170410156, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8061224222183228, "reward_std": 0.12388552352786064, "rewards/accuracy_reward": 0.8163264989852905, "rewards/format_reward": 0.9897959232330322, "step": 3800 }, { "completion_length": 246.64285278320312, "epoch": 0.38249056603773585, "grad_norm": 0.769481897354126, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7654806971549988, "reward_std": 0.16955658793449402, "rewards/accuracy_reward": 0.7858888506889343, "rewards/format_reward": 0.9795918464660645, "step": 3801 }, { "completion_length": 236.88775634765625, "epoch": 0.3825911949685535, "grad_norm": 0.9398831129074097, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7445356845855713, "reward_std": 0.24190133064985275, "rewards/accuracy_reward": 0.7751479148864746, "rewards/format_reward": 0.9693877398967743, "step": 3802 }, { "completion_length": 218.28570556640625, "epoch": 0.38269182389937106, "grad_norm": 0.6201139688491821, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.856215238571167, "reward_std": 0.09991716220974922, "rewards/accuracy_reward": 0.86641925573349, "rewards/format_reward": 0.9897959232330322, "step": 3803 }, { "completion_length": 237.49999237060547, "epoch": 0.3827924528301887, "grad_norm": 0.5208386778831482, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6975268721580505, "reward_std": 0.1222793348133564, "rewards/accuracy_reward": 0.6975268423557281, "rewards/format_reward": 1.0, "step": 3804 }, { "completion_length": 253.96937561035156, "epoch": 0.3828930817610063, "grad_norm": 0.599158763885498, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7649477124214172, "reward_std": 0.11153952218592167, "rewards/accuracy_reward": 0.764947772026062, "rewards/format_reward": 1.0, "step": 3805 }, { "completion_length": 170.9897918701172, "epoch": 0.3829937106918239, "grad_norm": 0.5999860167503357, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8571428656578064, "reward_std": 0.13506823033094406, "rewards/accuracy_reward": 0.8673469424247742, "rewards/format_reward": 0.9897959232330322, "step": 3806 }, { "completion_length": 241.4285659790039, "epoch": 0.3830943396226415, "grad_norm": 0.6818563938140869, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5497449040412903, "reward_std": 0.18510308116674423, "rewards/accuracy_reward": 0.5599489659070969, "rewards/format_reward": 0.9897959232330322, "step": 3807 }, { "completion_length": 240.49999237060547, "epoch": 0.38319496855345914, "grad_norm": 0.8366340398788452, "kl": 0.0543212890625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7044432163238525, "reward_std": 0.22157857567071915, "rewards/accuracy_reward": 0.7044432163238525, "rewards/format_reward": 1.0, "step": 3808 }, { "completion_length": 198.33673095703125, "epoch": 0.3832955974842767, "grad_norm": 0.7359010577201843, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8921282887458801, "reward_std": 0.1615971326828003, "rewards/accuracy_reward": 0.9023323655128479, "rewards/format_reward": 0.9897959232330322, "step": 3809 }, { "completion_length": 275.3877410888672, "epoch": 0.38339622641509435, "grad_norm": 0.8708093166351318, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.531462550163269, "reward_std": 0.22595979645848274, "rewards/accuracy_reward": 0.5416666567325592, "rewards/format_reward": 0.9897959232330322, "step": 3810 }, { "completion_length": 289.8163146972656, "epoch": 0.38349685534591194, "grad_norm": 1.529794454574585, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7081632018089294, "reward_std": 0.23532789945602417, "rewards/accuracy_reward": 0.718367338180542, "rewards/format_reward": 0.9897959232330322, "step": 3811 }, { "completion_length": 229.44896697998047, "epoch": 0.3835974842767296, "grad_norm": 0.9620353579521179, "kl": 0.103515625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7230918407440186, "reward_std": 0.148193821310997, "rewards/accuracy_reward": 0.7230918705463409, "rewards/format_reward": 1.0, "step": 3812 }, { "completion_length": 213.37754821777344, "epoch": 0.38369811320754715, "grad_norm": 0.758647084236145, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8918789625167847, "reward_std": 0.13735884428024292, "rewards/accuracy_reward": 0.902083158493042, "rewards/format_reward": 0.9897959232330322, "step": 3813 }, { "completion_length": 277.346923828125, "epoch": 0.3837987421383648, "grad_norm": 0.8393958210945129, "kl": 0.0625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7531583309173584, "reward_std": 0.19079089537262917, "rewards/accuracy_reward": 0.763362467288971, "rewards/format_reward": 0.9897959232330322, "step": 3814 }, { "completion_length": 189.1734619140625, "epoch": 0.38389937106918237, "grad_norm": 0.870138943195343, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6458565592765808, "reward_std": 0.11861025914549828, "rewards/accuracy_reward": 0.6458565294742584, "rewards/format_reward": 1.0, "step": 3815 }, { "completion_length": 190.4591827392578, "epoch": 0.384, "grad_norm": 0.763573408126831, "kl": 0.10400390625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.866562008857727, "reward_std": 0.07884978502988815, "rewards/accuracy_reward": 0.8665620386600494, "rewards/format_reward": 1.0, "step": 3816 }, { "completion_length": 278.88775634765625, "epoch": 0.3841006289308176, "grad_norm": 0.9842130541801453, "kl": 0.111328125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.6249676942825317, "reward_std": 0.22783705592155457, "rewards/accuracy_reward": 0.6351718306541443, "rewards/format_reward": 0.9897959232330322, "step": 3817 }, { "completion_length": 356.1326446533203, "epoch": 0.3842012578616352, "grad_norm": 0.7295455932617188, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6006379127502441, "reward_std": 0.2595982998609543, "rewards/accuracy_reward": 0.6108420491218567, "rewards/format_reward": 0.9897959232330322, "step": 3818 }, { "completion_length": 220.08163452148438, "epoch": 0.3843018867924528, "grad_norm": 0.4961760938167572, "kl": 0.0667724609375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8140395283699036, "reward_std": 0.1158565804362297, "rewards/accuracy_reward": 0.8140394985675812, "rewards/format_reward": 1.0, "step": 3819 }, { "completion_length": 222.04080963134766, "epoch": 0.38440251572327044, "grad_norm": 0.4979591369628906, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.788882315158844, "reward_std": 0.10139832273125648, "rewards/accuracy_reward": 0.8092904984951019, "rewards/format_reward": 0.9795918464660645, "step": 3820 }, { "completion_length": 278.2755126953125, "epoch": 0.384503144654088, "grad_norm": 0.5612809062004089, "kl": 0.0537109375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7788548469543457, "reward_std": 0.14931000024080276, "rewards/accuracy_reward": 0.799263060092926, "rewards/format_reward": 0.9795918464660645, "step": 3821 }, { "completion_length": 231.31632232666016, "epoch": 0.38460377358490566, "grad_norm": 0.5841994881629944, "kl": 0.0584716796875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.829931914806366, "reward_std": 0.10684679821133614, "rewards/accuracy_reward": 0.8299319744110107, "rewards/format_reward": 1.0, "step": 3822 }, { "completion_length": 252.4183578491211, "epoch": 0.38470440251572324, "grad_norm": 15.028814315795898, "kl": 0.089111328125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7307440042495728, "reward_std": 0.13183720409870148, "rewards/accuracy_reward": 0.7409480810165405, "rewards/format_reward": 0.9897959232330322, "step": 3823 }, { "completion_length": 260.59183502197266, "epoch": 0.3848050314465409, "grad_norm": 0.7900965213775635, "kl": 0.0653076171875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.834424078464508, "reward_std": 0.15103374794125557, "rewards/accuracy_reward": 0.8344240784645081, "rewards/format_reward": 1.0, "step": 3824 }, { "completion_length": 243.6734619140625, "epoch": 0.3849056603773585, "grad_norm": 0.7735999226570129, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6450150609016418, "reward_std": 0.2011568620800972, "rewards/accuracy_reward": 0.6552191972732544, "rewards/format_reward": 0.9897959232330322, "step": 3825 }, { "completion_length": 195.62244415283203, "epoch": 0.3850062893081761, "grad_norm": 0.754438042640686, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7618787288665771, "reward_std": 0.1518978737294674, "rewards/accuracy_reward": 0.7924910187721252, "rewards/format_reward": 0.9693877398967743, "step": 3826 }, { "completion_length": 172.07142639160156, "epoch": 0.38510691823899373, "grad_norm": 1.19745671749115, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8280288577079773, "reward_std": 0.1098196692764759, "rewards/accuracy_reward": 0.8280289173126221, "rewards/format_reward": 1.0, "step": 3827 }, { "completion_length": 199.35713958740234, "epoch": 0.3852075471698113, "grad_norm": 1.669440746307373, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7879273295402527, "reward_std": 0.07617994770407677, "rewards/accuracy_reward": 0.7879273593425751, "rewards/format_reward": 1.0, "step": 3828 }, { "completion_length": 225.4285659790039, "epoch": 0.38530817610062895, "grad_norm": 1.4698832035064697, "kl": 0.0604248046875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.770764410495758, "reward_std": 0.16291158646345139, "rewards/accuracy_reward": 0.7707644999027252, "rewards/format_reward": 1.0, "step": 3829 }, { "completion_length": 267.77550506591797, "epoch": 0.38540880503144653, "grad_norm": 0.9757474660873413, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.642380714416504, "reward_std": 0.2210388332605362, "rewards/accuracy_reward": 0.6627889275550842, "rewards/format_reward": 0.9795918166637421, "step": 3830 }, { "completion_length": 209.22447967529297, "epoch": 0.38550943396226417, "grad_norm": 1.8658227920532227, "kl": 0.099365234375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7529761791229248, "reward_std": 0.1187153086066246, "rewards/accuracy_reward": 0.763180285692215, "rewards/format_reward": 0.9897959232330322, "step": 3831 }, { "completion_length": 191.4591827392578, "epoch": 0.38561006289308175, "grad_norm": 0.5631555914878845, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.9067054986953735, "reward_std": 0.1108936071395874, "rewards/accuracy_reward": 0.9169095754623413, "rewards/format_reward": 0.9897959232330322, "step": 3832 }, { "completion_length": 196.12244415283203, "epoch": 0.3857106918238994, "grad_norm": 0.541668713092804, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7912310361862183, "reward_std": 0.16095135360956192, "rewards/accuracy_reward": 0.8014350533485413, "rewards/format_reward": 0.9897959232330322, "step": 3833 }, { "completion_length": 199.2755126953125, "epoch": 0.38581132075471697, "grad_norm": 1.1088237762451172, "kl": 0.123291015625, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.683302104473114, "reward_std": 0.2719837799668312, "rewards/accuracy_reward": 0.7037102282047272, "rewards/format_reward": 0.9795918166637421, "step": 3834 }, { "completion_length": 193.28571319580078, "epoch": 0.3859119496855346, "grad_norm": 0.8616171479225159, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8958847522735596, "reward_std": 0.10653125867247581, "rewards/accuracy_reward": 0.8958846926689148, "rewards/format_reward": 1.0, "step": 3835 }, { "completion_length": 224.05101013183594, "epoch": 0.3860125786163522, "grad_norm": 0.5811265707015991, "kl": 0.055908203125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6632652878761292, "reward_std": 0.1461700201034546, "rewards/accuracy_reward": 0.6836734712123871, "rewards/format_reward": 0.9795918464660645, "step": 3836 }, { "completion_length": 216.25509643554688, "epoch": 0.3861132075471698, "grad_norm": 0.5601514577865601, "kl": 0.055419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6427918076515198, "reward_std": 0.14483032748103142, "rewards/accuracy_reward": 0.6529958844184875, "rewards/format_reward": 0.9897959232330322, "step": 3837 }, { "completion_length": 230.09182739257812, "epoch": 0.3862138364779874, "grad_norm": 0.5298914909362793, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.846789300441742, "reward_std": 0.1372704915702343, "rewards/accuracy_reward": 0.8569933176040649, "rewards/format_reward": 0.9897959232330322, "step": 3838 }, { "completion_length": 211.31632232666016, "epoch": 0.38631446540880504, "grad_norm": 0.722131073474884, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6983102560043335, "reward_std": 0.11619622632861137, "rewards/accuracy_reward": 0.698310375213623, "rewards/format_reward": 1.0, "step": 3839 }, { "completion_length": 256.9183654785156, "epoch": 0.3864150943396226, "grad_norm": 1.6506941318511963, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7164833545684814, "reward_std": 0.19887464493513107, "rewards/accuracy_reward": 0.7368914783000946, "rewards/format_reward": 0.9795918166637421, "step": 3840 }, { "completion_length": 194.2448959350586, "epoch": 0.38651572327044026, "grad_norm": 0.4606841206550598, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8415937423706055, "reward_std": 0.0784364677965641, "rewards/accuracy_reward": 0.8415937423706055, "rewards/format_reward": 1.0, "step": 3841 }, { "completion_length": 179.24488830566406, "epoch": 0.38661635220125784, "grad_norm": 0.6963791251182556, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.810620367527008, "reward_std": 0.1099783144891262, "rewards/accuracy_reward": 0.8208244144916534, "rewards/format_reward": 0.9897959232330322, "step": 3842 }, { "completion_length": 298.5918273925781, "epoch": 0.3867169811320755, "grad_norm": 0.9129866361618042, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5920292735099792, "reward_std": 0.2676699608564377, "rewards/accuracy_reward": 0.6124375462532043, "rewards/format_reward": 0.9795918464660645, "step": 3843 }, { "completion_length": 224.16326141357422, "epoch": 0.38681761006289306, "grad_norm": 0.7705075740814209, "kl": 0.0537109375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8594104051589966, "reward_std": 0.12349307909607887, "rewards/accuracy_reward": 0.8594104051589966, "rewards/format_reward": 1.0, "step": 3844 }, { "completion_length": 263.18367767333984, "epoch": 0.3869182389937107, "grad_norm": 0.7438871264457703, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6629974246025085, "reward_std": 0.22532284259796143, "rewards/accuracy_reward": 0.6834056675434113, "rewards/format_reward": 0.9795918464660645, "step": 3845 }, { "completion_length": 276.2959213256836, "epoch": 0.3870188679245283, "grad_norm": 0.6462225317955017, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7300776243209839, "reward_std": 0.1286982074379921, "rewards/accuracy_reward": 0.7300777435302734, "rewards/format_reward": 1.0, "step": 3846 }, { "completion_length": 207.53060150146484, "epoch": 0.3871194968553459, "grad_norm": 0.4877423048019409, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.732653021812439, "reward_std": 0.08099238574504852, "rewards/accuracy_reward": 0.7428571283817291, "rewards/format_reward": 0.9897959232330322, "step": 3847 }, { "completion_length": 188.2653045654297, "epoch": 0.38722012578616355, "grad_norm": 0.8886816501617432, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8886165022850037, "reward_std": 0.09473618399351835, "rewards/accuracy_reward": 0.888616532087326, "rewards/format_reward": 1.0, "step": 3848 }, { "completion_length": 263.92857360839844, "epoch": 0.38732075471698113, "grad_norm": 0.5251642465591431, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8259596228599548, "reward_std": 0.1406181901693344, "rewards/accuracy_reward": 0.8259596526622772, "rewards/format_reward": 1.0, "step": 3849 }, { "completion_length": 310.1836700439453, "epoch": 0.38742138364779877, "grad_norm": 0.634734570980072, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7861029505729675, "reward_std": 0.2222580388188362, "rewards/accuracy_reward": 0.8167152106761932, "rewards/format_reward": 0.9693877398967743, "step": 3850 }, { "completion_length": 194.36734008789062, "epoch": 0.38752201257861635, "grad_norm": 0.6220117211341858, "kl": 0.09716796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6530045866966248, "reward_std": 0.04947652108967304, "rewards/accuracy_reward": 0.6530046612024307, "rewards/format_reward": 1.0, "step": 3851 }, { "completion_length": 240.63265228271484, "epoch": 0.387622641509434, "grad_norm": 0.8458830714225769, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7875933647155762, "reward_std": 0.08790137991309166, "rewards/accuracy_reward": 0.797797441482544, "rewards/format_reward": 0.9897959232330322, "step": 3852 }, { "completion_length": 234.34693145751953, "epoch": 0.38772327044025157, "grad_norm": 1.3168692588806152, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6525378823280334, "reward_std": 0.22721760720014572, "rewards/accuracy_reward": 0.6627419739961624, "rewards/format_reward": 0.9897959232330322, "step": 3853 }, { "completion_length": 183.56121826171875, "epoch": 0.3878238993710692, "grad_norm": 0.5116778612136841, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.9333557486534119, "reward_std": 0.046536578447557986, "rewards/accuracy_reward": 0.9435598254203796, "rewards/format_reward": 0.9897959232330322, "step": 3854 }, { "completion_length": 197.4693832397461, "epoch": 0.3879245283018868, "grad_norm": 0.9413464069366455, "kl": 0.1064453125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.6874071955680847, "reward_std": 0.08466620557010174, "rewards/accuracy_reward": 0.6874071061611176, "rewards/format_reward": 1.0, "step": 3855 }, { "completion_length": 210.60203552246094, "epoch": 0.3880251572327044, "grad_norm": 0.5559149980545044, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7576530575752258, "reward_std": 0.1518062800168991, "rewards/accuracy_reward": 0.7576530277729034, "rewards/format_reward": 1.0, "step": 3856 }, { "completion_length": 224.75509643554688, "epoch": 0.388125786163522, "grad_norm": 0.5832604765892029, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7403215169906616, "reward_std": 0.10592355206608772, "rewards/accuracy_reward": 0.740321546792984, "rewards/format_reward": 1.0, "step": 3857 }, { "completion_length": 235.27550506591797, "epoch": 0.38822641509433964, "grad_norm": 1.748255729675293, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.4955529570579529, "reward_std": 0.19809406250715256, "rewards/accuracy_reward": 0.5057569891214371, "rewards/format_reward": 0.9897959232330322, "step": 3858 }, { "completion_length": 229.62244415283203, "epoch": 0.3883270440251572, "grad_norm": 1.1143759489059448, "kl": 0.11962890625, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.7364177703857422, "reward_std": 0.1675354167819023, "rewards/accuracy_reward": 0.74662184715271, "rewards/format_reward": 0.9897959232330322, "step": 3859 }, { "completion_length": 279.15306091308594, "epoch": 0.38842767295597486, "grad_norm": 0.882300615310669, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6707482933998108, "reward_std": 0.25289086997509, "rewards/accuracy_reward": 0.7013605833053589, "rewards/format_reward": 0.9693877398967743, "step": 3860 }, { "completion_length": 222.32652282714844, "epoch": 0.38852830188679244, "grad_norm": 0.41069313883781433, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6074211597442627, "reward_std": 0.08339127898216248, "rewards/accuracy_reward": 0.6176252067089081, "rewards/format_reward": 0.9897959232330322, "step": 3861 }, { "completion_length": 200.32653045654297, "epoch": 0.3886289308176101, "grad_norm": 0.5105489492416382, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8095237612724304, "reward_std": 0.11950517818331718, "rewards/accuracy_reward": 0.8197278082370758, "rewards/format_reward": 0.9897959232330322, "step": 3862 }, { "completion_length": 257.76529693603516, "epoch": 0.38872955974842766, "grad_norm": 0.7451268434524536, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7127671837806702, "reward_std": 0.1959337368607521, "rewards/accuracy_reward": 0.7331754267215729, "rewards/format_reward": 0.9795918166637421, "step": 3863 }, { "completion_length": 242.81632232666016, "epoch": 0.3888301886792453, "grad_norm": 0.9778797030448914, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6364880204200745, "reward_std": 0.14261960983276367, "rewards/accuracy_reward": 0.6568961441516876, "rewards/format_reward": 0.9795918464660645, "step": 3864 }, { "completion_length": 296.37754821777344, "epoch": 0.3889308176100629, "grad_norm": 0.5239843726158142, "kl": 0.093994140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.757996678352356, "reward_std": 0.10497426427900791, "rewards/accuracy_reward": 0.7579967677593231, "rewards/format_reward": 1.0, "step": 3865 }, { "completion_length": 297.45916748046875, "epoch": 0.3890314465408805, "grad_norm": 1.0116300582885742, "kl": 0.0687255859375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7018378376960754, "reward_std": 0.1948264166712761, "rewards/accuracy_reward": 0.712041974067688, "rewards/format_reward": 0.9897959232330322, "step": 3866 }, { "completion_length": 236.08162689208984, "epoch": 0.3891320754716981, "grad_norm": 0.6225327253341675, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8196508884429932, "reward_std": 0.1326608806848526, "rewards/accuracy_reward": 0.8196509182453156, "rewards/format_reward": 1.0, "step": 3867 }, { "completion_length": 260.8061065673828, "epoch": 0.38923270440251573, "grad_norm": 0.8482920527458191, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.630838930606842, "reward_std": 0.10305102914571762, "rewards/accuracy_reward": 0.641043096780777, "rewards/format_reward": 0.9897959232330322, "step": 3868 }, { "completion_length": 290.4183654785156, "epoch": 0.3893333333333333, "grad_norm": 0.6846246123313904, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7271946668624878, "reward_std": 0.17048878222703934, "rewards/accuracy_reward": 0.737398773431778, "rewards/format_reward": 0.9897959232330322, "step": 3869 }, { "completion_length": 216.1938705444336, "epoch": 0.38943396226415095, "grad_norm": 0.5457384586334229, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8571428060531616, "reward_std": 0.1461700201034546, "rewards/accuracy_reward": 0.8775510191917419, "rewards/format_reward": 0.9795918464660645, "step": 3870 }, { "completion_length": 213.55101776123047, "epoch": 0.3895345911949685, "grad_norm": 0.4704809784889221, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.874343991279602, "reward_std": 0.09688142314553261, "rewards/accuracy_reward": 0.89475217461586, "rewards/format_reward": 0.9795918464660645, "step": 3871 }, { "completion_length": 195.69387817382812, "epoch": 0.38963522012578616, "grad_norm": 0.7841703295707703, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8227161765098572, "reward_std": 0.14053845778107643, "rewards/accuracy_reward": 0.8227162063121796, "rewards/format_reward": 1.0, "step": 3872 }, { "completion_length": 244.63265228271484, "epoch": 0.3897358490566038, "grad_norm": 0.7606310248374939, "kl": 0.054931640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.672497570514679, "reward_std": 0.2317659631371498, "rewards/accuracy_reward": 0.7235179543495178, "rewards/format_reward": 0.9489795863628387, "step": 3873 }, { "completion_length": 242.08162689208984, "epoch": 0.3898364779874214, "grad_norm": 0.9804813861846924, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8012633323669434, "reward_std": 0.17212018743157387, "rewards/accuracy_reward": 0.8114674091339111, "rewards/format_reward": 0.9897959232330322, "step": 3874 }, { "completion_length": 265.78570556640625, "epoch": 0.389937106918239, "grad_norm": 1.1600366830825806, "kl": 0.089111328125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8178571462631226, "reward_std": 0.17631282657384872, "rewards/accuracy_reward": 0.8280612230300903, "rewards/format_reward": 0.9897959232330322, "step": 3875 }, { "completion_length": 237.7244873046875, "epoch": 0.3900377358490566, "grad_norm": 0.8908131718635559, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5862256288528442, "reward_std": 0.24000661075115204, "rewards/accuracy_reward": 0.596429705619812, "rewards/format_reward": 0.9897959232330322, "step": 3876 }, { "completion_length": 168.99999618530273, "epoch": 0.39013836477987424, "grad_norm": 1.229148268699646, "kl": 0.1026611328125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.9199133515357971, "reward_std": 0.14574912935495377, "rewards/accuracy_reward": 0.9301174581050873, "rewards/format_reward": 0.9897959232330322, "step": 3877 }, { "completion_length": 234.1428451538086, "epoch": 0.3902389937106918, "grad_norm": 0.6100449562072754, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7245144844055176, "reward_std": 0.06713535264134407, "rewards/accuracy_reward": 0.7449225783348083, "rewards/format_reward": 0.9795918166637421, "step": 3878 }, { "completion_length": 180.01020050048828, "epoch": 0.39033962264150945, "grad_norm": 4.953997611999512, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.677298665046692, "reward_std": 0.18376688659191132, "rewards/accuracy_reward": 0.6977068781852722, "rewards/format_reward": 0.9795918166637421, "step": 3879 }, { "completion_length": 184.1836700439453, "epoch": 0.39044025157232704, "grad_norm": 6.376617431640625, "kl": 0.097900390625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7968918681144714, "reward_std": 0.15606936067342758, "rewards/accuracy_reward": 0.796891838312149, "rewards/format_reward": 1.0, "step": 3880 }, { "completion_length": 194.55101013183594, "epoch": 0.39054088050314467, "grad_norm": 4.5432915687561035, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7693877220153809, "reward_std": 0.29205475747585297, "rewards/accuracy_reward": 0.7897959053516388, "rewards/format_reward": 0.9795918166637421, "step": 3881 }, { "completion_length": 232.2142791748047, "epoch": 0.39064150943396225, "grad_norm": 0.5064576268196106, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7262883186340332, "reward_std": 0.17718200385570526, "rewards/accuracy_reward": 0.7671047151088715, "rewards/format_reward": 0.9591836333274841, "step": 3882 }, { "completion_length": 259.04080963134766, "epoch": 0.3907421383647799, "grad_norm": 0.735198438167572, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7390552759170532, "reward_std": 0.15640220418572426, "rewards/accuracy_reward": 0.7390554249286652, "rewards/format_reward": 1.0, "step": 3883 }, { "completion_length": 200.448974609375, "epoch": 0.39084276729559747, "grad_norm": 1.0274547338485718, "kl": 0.0772705078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8469387292861938, "reward_std": 0.20016493648290634, "rewards/accuracy_reward": 0.8775510191917419, "rewards/format_reward": 0.9693877398967743, "step": 3884 }, { "completion_length": 282.7142791748047, "epoch": 0.3909433962264151, "grad_norm": 0.8952163457870483, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7022840976715088, "reward_std": 0.23283790424466133, "rewards/accuracy_reward": 0.7328963577747345, "rewards/format_reward": 0.9693877398967743, "step": 3885 }, { "completion_length": 218.08163452148438, "epoch": 0.3910440251572327, "grad_norm": 0.888084352016449, "kl": 0.094970703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6848638653755188, "reward_std": 0.20284977555274963, "rewards/accuracy_reward": 0.7052721083164215, "rewards/format_reward": 0.9795918166637421, "step": 3886 }, { "completion_length": 185.54080963134766, "epoch": 0.3911446540880503, "grad_norm": 0.6008368134498596, "kl": 0.0635986328125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.834402322769165, "reward_std": 0.14586299657821655, "rewards/accuracy_reward": 0.8344022929668427, "rewards/format_reward": 1.0, "step": 3887 }, { "completion_length": 196.28570556640625, "epoch": 0.3912452830188679, "grad_norm": 0.5045564770698547, "kl": 0.104248046875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8503401279449463, "reward_std": 0.15288307890295982, "rewards/accuracy_reward": 0.8605442345142365, "rewards/format_reward": 0.9897959232330322, "step": 3888 }, { "completion_length": 251.24490356445312, "epoch": 0.39134591194968554, "grad_norm": 0.9774131178855896, "kl": 0.064697265625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6193384528160095, "reward_std": 0.2376660853624344, "rewards/accuracy_reward": 0.6397466063499451, "rewards/format_reward": 0.9795918166637421, "step": 3889 }, { "completion_length": 213.2448959350586, "epoch": 0.3914465408805031, "grad_norm": 0.7720361351966858, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8371056914329529, "reward_std": 0.1418730951845646, "rewards/accuracy_reward": 0.8371057212352753, "rewards/format_reward": 1.0, "step": 3890 }, { "completion_length": 249.06121826171875, "epoch": 0.39154716981132076, "grad_norm": 0.8237950205802917, "kl": 0.119140625, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.6049827933311462, "reward_std": 0.20751168578863144, "rewards/accuracy_reward": 0.6253910064697266, "rewards/format_reward": 0.9795918464660645, "step": 3891 }, { "completion_length": 227.64285278320312, "epoch": 0.39164779874213834, "grad_norm": 2.344895124435425, "kl": 0.09130859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6123195886611938, "reward_std": 0.17733369767665863, "rewards/accuracy_reward": 0.6327278017997742, "rewards/format_reward": 0.9795918166637421, "step": 3892 }, { "completion_length": 239.87755584716797, "epoch": 0.391748427672956, "grad_norm": 10.027229309082031, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.668948233127594, "reward_std": 0.20007285475730896, "rewards/accuracy_reward": 0.6791523098945618, "rewards/format_reward": 0.9897959232330322, "step": 3893 }, { "completion_length": 267.5612106323242, "epoch": 0.39184905660377356, "grad_norm": 0.6220915913581848, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6479125022888184, "reward_std": 0.1426123082637787, "rewards/accuracy_reward": 0.6683206558227539, "rewards/format_reward": 0.9795918166637421, "step": 3894 }, { "completion_length": 251.9285659790039, "epoch": 0.3919496855345912, "grad_norm": 0.8450244665145874, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6770188808441162, "reward_std": 0.22009935230016708, "rewards/accuracy_reward": 0.6770188808441162, "rewards/format_reward": 1.0, "step": 3895 }, { "completion_length": 247.83673095703125, "epoch": 0.3920503144654088, "grad_norm": 0.9389752149581909, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7930798530578613, "reward_std": 0.1905909925699234, "rewards/accuracy_reward": 0.7930797636508942, "rewards/format_reward": 1.0, "step": 3896 }, { "completion_length": 240.31632232666016, "epoch": 0.3921509433962264, "grad_norm": 5.737931728363037, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7104278802871704, "reward_std": 0.28272853046655655, "rewards/accuracy_reward": 0.7104278802871704, "rewards/format_reward": 1.0, "step": 3897 }, { "completion_length": 330.8673400878906, "epoch": 0.39225157232704405, "grad_norm": 0.6105962991714478, "kl": 0.0738525390625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.784580409526825, "reward_std": 0.1941867247223854, "rewards/accuracy_reward": 0.7947845757007599, "rewards/format_reward": 0.9897959232330322, "step": 3898 }, { "completion_length": 286.6734619140625, "epoch": 0.39235220125786163, "grad_norm": 0.5131685733795166, "kl": 0.0684814453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7319727540016174, "reward_std": 0.15924794971942902, "rewards/accuracy_reward": 0.7319727838039398, "rewards/format_reward": 1.0, "step": 3899 }, { "completion_length": 234.47958374023438, "epoch": 0.39245283018867927, "grad_norm": 0.5003334879875183, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7547749876976013, "reward_std": 0.08302491065114737, "rewards/accuracy_reward": 0.7547749876976013, "rewards/format_reward": 1.0, "step": 3900 }, { "completion_length": 321.06121826171875, "epoch": 0.39255345911949685, "grad_norm": 0.8373193144798279, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6162676215171814, "reward_std": 0.15151729062199593, "rewards/accuracy_reward": 0.6162676513195038, "rewards/format_reward": 1.0, "step": 3901 }, { "completion_length": 200.94898223876953, "epoch": 0.3926540880503145, "grad_norm": 0.9831578731536865, "kl": 0.103271484375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6591836214065552, "reward_std": 0.14091406762599945, "rewards/accuracy_reward": 0.6693877577781677, "rewards/format_reward": 0.9897959232330322, "step": 3902 }, { "completion_length": 148.17346954345703, "epoch": 0.39275471698113207, "grad_norm": 0.5075992941856384, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.868088960647583, "reward_std": 0.08364217728376389, "rewards/accuracy_reward": 0.8680889904499054, "rewards/format_reward": 1.0, "step": 3903 }, { "completion_length": 249.39794921875, "epoch": 0.3928553459119497, "grad_norm": 0.8100882768630981, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.802197813987732, "reward_std": 0.1654737889766693, "rewards/accuracy_reward": 0.8124018907546997, "rewards/format_reward": 0.9897959232330322, "step": 3904 }, { "completion_length": 232.88774871826172, "epoch": 0.3929559748427673, "grad_norm": 0.7423397898674011, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.738984227180481, "reward_std": 0.21271391212940216, "rewards/accuracy_reward": 0.7593923807144165, "rewards/format_reward": 0.9795918166637421, "step": 3905 }, { "completion_length": 189.2040786743164, "epoch": 0.3930566037735849, "grad_norm": 1.0416427850723267, "kl": 0.0728759765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8044564127922058, "reward_std": 0.15585633367300034, "rewards/accuracy_reward": 0.824864536523819, "rewards/format_reward": 0.9795918166637421, "step": 3906 }, { "completion_length": 242.7040786743164, "epoch": 0.3931572327044025, "grad_norm": 0.668290376663208, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6945586800575256, "reward_std": 0.15442029386758804, "rewards/accuracy_reward": 0.6945587694644928, "rewards/format_reward": 1.0, "step": 3907 }, { "completion_length": 214.6530532836914, "epoch": 0.39325786163522014, "grad_norm": 0.919751763343811, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5766487121582031, "reward_std": 0.19235488027334213, "rewards/accuracy_reward": 0.5868529379367828, "rewards/format_reward": 0.9897959232330322, "step": 3908 }, { "completion_length": 199.89795684814453, "epoch": 0.3933584905660377, "grad_norm": 0.8631817698478699, "kl": 0.115234375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.776773989200592, "reward_std": 0.15454541146755219, "rewards/accuracy_reward": 0.7869780659675598, "rewards/format_reward": 0.9897959232330322, "step": 3909 }, { "completion_length": 220.07142639160156, "epoch": 0.39345911949685536, "grad_norm": 1.223617672920227, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7375364303588867, "reward_std": 0.17654820904135704, "rewards/accuracy_reward": 0.7375363707542419, "rewards/format_reward": 1.0, "step": 3910 }, { "completion_length": 184.4795913696289, "epoch": 0.39355974842767294, "grad_norm": 1.198587417602539, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7877715229988098, "reward_std": 0.11442694067955017, "rewards/accuracy_reward": 0.7979756891727448, "rewards/format_reward": 0.9897959232330322, "step": 3911 }, { "completion_length": 241.2244873046875, "epoch": 0.3936603773584906, "grad_norm": 0.7190102934837341, "kl": 0.09521484375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.5776591897010803, "reward_std": 0.11014333367347717, "rewards/accuracy_reward": 0.5776592195034027, "rewards/format_reward": 1.0, "step": 3912 }, { "completion_length": 229.11223602294922, "epoch": 0.39376100628930816, "grad_norm": 0.9517884850502014, "kl": 0.06689453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.713710904121399, "reward_std": 0.19184017181396484, "rewards/accuracy_reward": 0.7137110233306885, "rewards/format_reward": 1.0, "step": 3913 }, { "completion_length": 216.9591827392578, "epoch": 0.3938616352201258, "grad_norm": 0.5605078339576721, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.772232472896576, "reward_std": 0.16596746817231178, "rewards/accuracy_reward": 0.7926406562328339, "rewards/format_reward": 0.9795918464660645, "step": 3914 }, { "completion_length": 208.0204086303711, "epoch": 0.3939622641509434, "grad_norm": 2.1174373626708984, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6972702145576477, "reward_std": 0.16466569900512695, "rewards/accuracy_reward": 0.7074743211269379, "rewards/format_reward": 0.9897959232330322, "step": 3915 }, { "completion_length": 199.2653045654297, "epoch": 0.394062893081761, "grad_norm": 0.7618703246116638, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.743733286857605, "reward_std": 0.11088811978697777, "rewards/accuracy_reward": 0.7437332570552826, "rewards/format_reward": 1.0, "step": 3916 }, { "completion_length": 214.97958374023438, "epoch": 0.3941635220125786, "grad_norm": 0.8066134452819824, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7453240156173706, "reward_std": 0.17074401676654816, "rewards/accuracy_reward": 0.7657321989536285, "rewards/format_reward": 0.9795918464660645, "step": 3917 }, { "completion_length": 274.9387664794922, "epoch": 0.39426415094339623, "grad_norm": 0.7767655253410339, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.778473138809204, "reward_std": 0.18884602934122086, "rewards/accuracy_reward": 0.7784731984138489, "rewards/format_reward": 1.0, "step": 3918 }, { "completion_length": 278.79591369628906, "epoch": 0.3943647798742138, "grad_norm": 0.9996238946914673, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7700448632240295, "reward_std": 0.20730045437812805, "rewards/accuracy_reward": 0.8006570935249329, "rewards/format_reward": 0.9693877398967743, "step": 3919 }, { "completion_length": 183.62245178222656, "epoch": 0.39446540880503145, "grad_norm": 0.3788783550262451, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.9319728016853333, "reward_std": 0.05399492383003235, "rewards/accuracy_reward": 0.9319728016853333, "rewards/format_reward": 1.0, "step": 3920 }, { "completion_length": 213.10203552246094, "epoch": 0.39456603773584903, "grad_norm": 0.7354552149772644, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8095985651016235, "reward_std": 0.1416328325867653, "rewards/accuracy_reward": 0.8198026418685913, "rewards/format_reward": 0.9897959232330322, "step": 3921 }, { "completion_length": 230.73468780517578, "epoch": 0.39466666666666667, "grad_norm": 0.7979711890220642, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.9358974695205688, "reward_std": 0.08067097514867783, "rewards/accuracy_reward": 0.9358974397182465, "rewards/format_reward": 1.0, "step": 3922 }, { "completion_length": 189.17346954345703, "epoch": 0.3947672955974843, "grad_norm": 0.766998827457428, "kl": 0.121826171875, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.7511461973190308, "reward_std": 0.10983525961637497, "rewards/accuracy_reward": 0.7511461675167084, "rewards/format_reward": 1.0, "step": 3923 }, { "completion_length": 193.8163299560547, "epoch": 0.3948679245283019, "grad_norm": 0.862913191318512, "kl": 0.098876953125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.615646243095398, "reward_std": 0.20884182304143906, "rewards/accuracy_reward": 0.6360543966293335, "rewards/format_reward": 0.9795918464660645, "step": 3924 }, { "completion_length": 222.2653045654297, "epoch": 0.3949685534591195, "grad_norm": 0.7677781581878662, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8499999642372131, "reward_std": 0.08970578014850616, "rewards/accuracy_reward": 0.8602040410041809, "rewards/format_reward": 0.9897959232330322, "step": 3925 }, { "completion_length": 225.7653045654297, "epoch": 0.3950691823899371, "grad_norm": 0.3822724223136902, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.740616798400879, "reward_std": 0.05738130956888199, "rewards/accuracy_reward": 0.7406168282032013, "rewards/format_reward": 1.0, "step": 3926 }, { "completion_length": 216.34693908691406, "epoch": 0.39516981132075474, "grad_norm": 0.5049576163291931, "kl": 0.0721435546875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8448456525802612, "reward_std": 0.1272418275475502, "rewards/accuracy_reward": 0.8652537763118744, "rewards/format_reward": 0.9795918464660645, "step": 3927 }, { "completion_length": 246.3775405883789, "epoch": 0.3952704402515723, "grad_norm": 0.982963502407074, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.745382010936737, "reward_std": 0.2664090618491173, "rewards/accuracy_reward": 0.7657902240753174, "rewards/format_reward": 0.9795918166637421, "step": 3928 }, { "completion_length": 269.0714340209961, "epoch": 0.39537106918238996, "grad_norm": 0.6729795932769775, "kl": 0.064697265625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7165915369987488, "reward_std": 0.20780353248119354, "rewards/accuracy_reward": 0.7369997203350067, "rewards/format_reward": 0.9795918464660645, "step": 3929 }, { "completion_length": 204.05101776123047, "epoch": 0.39547169811320754, "grad_norm": 1.0146616697311401, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.740879237651825, "reward_std": 0.1481434591114521, "rewards/accuracy_reward": 0.7510833740234375, "rewards/format_reward": 0.9897959232330322, "step": 3930 }, { "completion_length": 208.37754821777344, "epoch": 0.3955723270440252, "grad_norm": 0.6240067481994629, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8309038281440735, "reward_std": 0.10041883960366249, "rewards/accuracy_reward": 0.8411079049110413, "rewards/format_reward": 0.9897959232330322, "step": 3931 }, { "completion_length": 217.6938705444336, "epoch": 0.39567295597484275, "grad_norm": 0.8313925266265869, "kl": 0.065185546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7166642546653748, "reward_std": 0.12568877264857292, "rewards/accuracy_reward": 0.7166643440723419, "rewards/format_reward": 1.0, "step": 3932 }, { "completion_length": 265.10204315185547, "epoch": 0.3957735849056604, "grad_norm": 0.5336279273033142, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6141321063041687, "reward_std": 0.16893095523118973, "rewards/accuracy_reward": 0.6243362724781036, "rewards/format_reward": 0.9897959232330322, "step": 3933 }, { "completion_length": 314.44895935058594, "epoch": 0.395874213836478, "grad_norm": 0.8501619696617126, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7682000398635864, "reward_std": 0.1593937873840332, "rewards/accuracy_reward": 0.768200010061264, "rewards/format_reward": 1.0, "step": 3934 }, { "completion_length": 200.14285278320312, "epoch": 0.3959748427672956, "grad_norm": 0.7078911066055298, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8469387292861938, "reward_std": 0.15402613580226898, "rewards/accuracy_reward": 0.857142835855484, "rewards/format_reward": 0.9897959232330322, "step": 3935 }, { "completion_length": 211.12244415283203, "epoch": 0.3960754716981132, "grad_norm": 0.7148005366325378, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7288330793380737, "reward_std": 0.18732402846217155, "rewards/accuracy_reward": 0.7594453394412994, "rewards/format_reward": 0.9693877398967743, "step": 3936 }, { "completion_length": 245.60203552246094, "epoch": 0.3961761006289308, "grad_norm": 1.0105482339859009, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8505494594573975, "reward_std": 0.10635908227413893, "rewards/accuracy_reward": 0.8505494594573975, "rewards/format_reward": 1.0, "step": 3937 }, { "completion_length": 203.14285278320312, "epoch": 0.3962767295597484, "grad_norm": 1.536623477935791, "kl": 0.09375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7518576383590698, "reward_std": 0.16666298359632492, "rewards/accuracy_reward": 0.7620617747306824, "rewards/format_reward": 0.9897959232330322, "step": 3938 }, { "completion_length": 168.73468780517578, "epoch": 0.39637735849056605, "grad_norm": 0.5527622699737549, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8483148217201233, "reward_std": 0.09839783608913422, "rewards/accuracy_reward": 0.8585188686847687, "rewards/format_reward": 0.9897959232330322, "step": 3939 }, { "completion_length": 249.74488830566406, "epoch": 0.3964779874213836, "grad_norm": 2.980628252029419, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8249635100364685, "reward_std": 0.21885421872138977, "rewards/accuracy_reward": 0.8249635398387909, "rewards/format_reward": 1.0, "step": 3940 }, { "completion_length": 283.0408172607422, "epoch": 0.39657861635220126, "grad_norm": 1.105745553970337, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6478847861289978, "reward_std": 0.21365846693515778, "rewards/accuracy_reward": 0.6478848159313202, "rewards/format_reward": 1.0, "step": 3941 }, { "completion_length": 247.01020050048828, "epoch": 0.39667924528301884, "grad_norm": 0.5046871900558472, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8513866662979126, "reward_std": 0.06814221478998661, "rewards/accuracy_reward": 0.851386696100235, "rewards/format_reward": 1.0, "step": 3942 }, { "completion_length": 286.9081573486328, "epoch": 0.3967798742138365, "grad_norm": 0.9919641613960266, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.4870328307151794, "reward_std": 0.2187763899564743, "rewards/accuracy_reward": 0.5176450461149216, "rewards/format_reward": 0.9693877398967743, "step": 3943 }, { "completion_length": 230.9897918701172, "epoch": 0.39688050314465406, "grad_norm": 0.8931418657302856, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7386820912361145, "reward_std": 0.16685720533132553, "rewards/accuracy_reward": 0.7386820018291473, "rewards/format_reward": 1.0, "step": 3944 }, { "completion_length": 203.92857360839844, "epoch": 0.3969811320754717, "grad_norm": 0.8462009429931641, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7210509777069092, "reward_std": 0.11483738198876381, "rewards/accuracy_reward": 0.7312551140785217, "rewards/format_reward": 0.9897959232330322, "step": 3945 }, { "completion_length": 143.89795684814453, "epoch": 0.39708176100628934, "grad_norm": 0.9092337489128113, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7842565178871155, "reward_std": 0.06354967504739761, "rewards/accuracy_reward": 0.7842565476894379, "rewards/format_reward": 1.0, "step": 3946 }, { "completion_length": 262.2142791748047, "epoch": 0.3971823899371069, "grad_norm": 0.7204684615135193, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6386613845825195, "reward_std": 0.2617637440562248, "rewards/accuracy_reward": 0.6386613696813583, "rewards/format_reward": 1.0, "step": 3947 }, { "completion_length": 199.10203552246094, "epoch": 0.39728301886792455, "grad_norm": 1.1683684587478638, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.817946195602417, "reward_std": 0.1343136988580227, "rewards/accuracy_reward": 0.817946195602417, "rewards/format_reward": 1.0, "step": 3948 }, { "completion_length": 281.8571472167969, "epoch": 0.39738364779874213, "grad_norm": 1.0213751792907715, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.629807949066162, "reward_std": 0.1966973841190338, "rewards/accuracy_reward": 0.6298080384731293, "rewards/format_reward": 1.0, "step": 3949 }, { "completion_length": 243.3571319580078, "epoch": 0.39748427672955977, "grad_norm": 6.657944679260254, "kl": 0.095458984375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.4943310022354126, "reward_std": 0.20946760475635529, "rewards/accuracy_reward": 0.504535123705864, "rewards/format_reward": 0.9897959232330322, "step": 3950 }, { "completion_length": 216.1734619140625, "epoch": 0.39758490566037735, "grad_norm": 0.800290048122406, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7695878148078918, "reward_std": 0.11408127844333649, "rewards/accuracy_reward": 0.7695878148078918, "rewards/format_reward": 1.0, "step": 3951 }, { "completion_length": 274.89794921875, "epoch": 0.397685534591195, "grad_norm": 1.003415584564209, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.440298318862915, "reward_std": 0.18656779825687408, "rewards/accuracy_reward": 0.46070654690265656, "rewards/format_reward": 0.9795918464660645, "step": 3952 }, { "completion_length": 290.4081573486328, "epoch": 0.39778616352201257, "grad_norm": 0.5663608908653259, "kl": 0.0545654296875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.634904384613037, "reward_std": 0.22931531071662903, "rewards/accuracy_reward": 0.6655166447162628, "rewards/format_reward": 0.9693877398967743, "step": 3953 }, { "completion_length": 274.09183502197266, "epoch": 0.3978867924528302, "grad_norm": 0.6223661303520203, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8177841901779175, "reward_std": 0.12302933633327484, "rewards/accuracy_reward": 0.8279882967472076, "rewards/format_reward": 0.9897959232330322, "step": 3954 }, { "completion_length": 302.4897918701172, "epoch": 0.3979874213836478, "grad_norm": 0.534231960773468, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6397068500518799, "reward_std": 0.13455720990896225, "rewards/accuracy_reward": 0.6499109268188477, "rewards/format_reward": 0.9897959232330322, "step": 3955 }, { "completion_length": 234.53060913085938, "epoch": 0.3980880503144654, "grad_norm": 0.8705397248268127, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.732899010181427, "reward_std": 0.22192802280187607, "rewards/accuracy_reward": 0.7431030571460724, "rewards/format_reward": 0.9897959232330322, "step": 3956 }, { "completion_length": 253.51019287109375, "epoch": 0.398188679245283, "grad_norm": 0.9835338592529297, "kl": 0.095458984375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6650108695030212, "reward_std": 0.17435608059167862, "rewards/accuracy_reward": 0.6854190528392792, "rewards/format_reward": 0.9795918166637421, "step": 3957 }, { "completion_length": 258.9897918701172, "epoch": 0.39828930817610064, "grad_norm": 0.8904538750648499, "kl": 0.0765380859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7440881133079529, "reward_std": 0.15204988792538643, "rewards/accuracy_reward": 0.7542921602725983, "rewards/format_reward": 0.9897959232330322, "step": 3958 }, { "completion_length": 256.82652282714844, "epoch": 0.3983899371069182, "grad_norm": 0.5110400915145874, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7180343866348267, "reward_std": 0.17182719707489014, "rewards/accuracy_reward": 0.728238433599472, "rewards/format_reward": 0.9897959232330322, "step": 3959 }, { "completion_length": 211.35713958740234, "epoch": 0.39849056603773586, "grad_norm": 0.8618495464324951, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7693796753883362, "reward_std": 0.14332473278045654, "rewards/accuracy_reward": 0.7693796455860138, "rewards/format_reward": 1.0, "step": 3960 }, { "completion_length": 254.28570556640625, "epoch": 0.39859119496855344, "grad_norm": 0.6984726190567017, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7695415019989014, "reward_std": 0.1491372138261795, "rewards/accuracy_reward": 0.7695415914058685, "rewards/format_reward": 1.0, "step": 3961 }, { "completion_length": 279.8163146972656, "epoch": 0.3986918238993711, "grad_norm": 0.9053042531013489, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.750838279724121, "reward_std": 0.30184420943260193, "rewards/accuracy_reward": 0.771246463060379, "rewards/format_reward": 0.9795918464660645, "step": 3962 }, { "completion_length": 256.86734771728516, "epoch": 0.39879245283018866, "grad_norm": 1.1075758934020996, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7069507837295532, "reward_std": 0.27933235466480255, "rewards/accuracy_reward": 0.7477670609951019, "rewards/format_reward": 0.9591836631298065, "step": 3963 }, { "completion_length": 261.44898223876953, "epoch": 0.3988930817610063, "grad_norm": 0.48111554980278015, "kl": 0.111572265625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.5982707142829895, "reward_std": 0.09761619055643678, "rewards/accuracy_reward": 0.6186789125204086, "rewards/format_reward": 0.9795918166637421, "step": 3964 }, { "completion_length": 233.14285278320312, "epoch": 0.3989937106918239, "grad_norm": 0.722888708114624, "kl": 0.054443359375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.795918345451355, "reward_std": 0.1256423071026802, "rewards/accuracy_reward": 0.7959183752536774, "rewards/format_reward": 1.0, "step": 3965 }, { "completion_length": 198.32653045654297, "epoch": 0.3990943396226415, "grad_norm": 0.7263057827949524, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7956458926200867, "reward_std": 0.1327177658677101, "rewards/accuracy_reward": 0.7956459522247314, "rewards/format_reward": 1.0, "step": 3966 }, { "completion_length": 241.38774871826172, "epoch": 0.3991949685534591, "grad_norm": 0.5319359302520752, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.9387754797935486, "reward_std": 0.11917255818843842, "rewards/accuracy_reward": 0.9591836631298065, "rewards/format_reward": 0.9795918464660645, "step": 3967 }, { "completion_length": 278.24488830566406, "epoch": 0.39929559748427673, "grad_norm": 1.211581826210022, "kl": 0.062255859375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.674441158771515, "reward_std": 0.2368442267179489, "rewards/accuracy_reward": 0.6846452951431274, "rewards/format_reward": 0.9897959232330322, "step": 3968 }, { "completion_length": 143.65306091308594, "epoch": 0.3993962264150943, "grad_norm": 0.6163914799690247, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.9265305995941162, "reward_std": 0.0713108740746975, "rewards/accuracy_reward": 0.9265305697917938, "rewards/format_reward": 1.0, "step": 3969 }, { "completion_length": 222.07141876220703, "epoch": 0.39949685534591195, "grad_norm": 0.5831423997879028, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8165881633758545, "reward_std": 0.1411740556359291, "rewards/accuracy_reward": 0.8267922699451447, "rewards/format_reward": 0.9897959232330322, "step": 3970 }, { "completion_length": 270.16326904296875, "epoch": 0.3995974842767296, "grad_norm": 0.7516262531280518, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7551019787788391, "reward_std": 0.24054989591240883, "rewards/accuracy_reward": 0.795918345451355, "rewards/format_reward": 0.9591836631298065, "step": 3971 }, { "completion_length": 259.63265228271484, "epoch": 0.39969811320754717, "grad_norm": 0.5950855016708374, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6776156425476074, "reward_std": 0.16594994813203812, "rewards/accuracy_reward": 0.7082278430461884, "rewards/format_reward": 0.9693877398967743, "step": 3972 }, { "completion_length": 284.99999237060547, "epoch": 0.3997987421383648, "grad_norm": 0.7328643798828125, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7447704076766968, "reward_std": 0.12004069238901138, "rewards/accuracy_reward": 0.7447704374790192, "rewards/format_reward": 1.0, "step": 3973 }, { "completion_length": 232.24488830566406, "epoch": 0.3998993710691824, "grad_norm": 1.3706787824630737, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6964954733848572, "reward_std": 0.21620047837495804, "rewards/accuracy_reward": 0.7066996395587921, "rewards/format_reward": 0.9897959232330322, "step": 3974 }, { "completion_length": 289.18365478515625, "epoch": 0.4, "grad_norm": 0.855620801448822, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7060060501098633, "reward_std": 0.17511076480150223, "rewards/accuracy_reward": 0.7264142632484436, "rewards/format_reward": 0.9795918166637421, "step": 3975 }, { "completion_length": 205.75509643554688, "epoch": 0.4001006289308176, "grad_norm": 0.5252542495727539, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7188541889190674, "reward_std": 0.17011915147304535, "rewards/accuracy_reward": 0.7596705555915833, "rewards/format_reward": 0.9591836333274841, "step": 3976 }, { "completion_length": 269.32652282714844, "epoch": 0.40020125786163524, "grad_norm": 1.1082676649093628, "kl": 0.051513671875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7492489218711853, "reward_std": 0.1984406430274248, "rewards/accuracy_reward": 0.7594530284404755, "rewards/format_reward": 0.9897959232330322, "step": 3977 }, { "completion_length": 217.97958374023438, "epoch": 0.4003018867924528, "grad_norm": 0.7309991717338562, "kl": 0.105712890625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8561710119247437, "reward_std": 0.239231638610363, "rewards/accuracy_reward": 0.8867832720279694, "rewards/format_reward": 0.9693877398967743, "step": 3978 }, { "completion_length": 309.4387664794922, "epoch": 0.40040251572327046, "grad_norm": 0.8349888920783997, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.5383503437042236, "reward_std": 0.21783586591482162, "rewards/accuracy_reward": 0.5791666507720947, "rewards/format_reward": 0.9591836631298065, "step": 3979 }, { "completion_length": 270.2040710449219, "epoch": 0.40050314465408804, "grad_norm": 0.672010600566864, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7452380657196045, "reward_std": 0.1640334278345108, "rewards/accuracy_reward": 0.7656462788581848, "rewards/format_reward": 0.9795918464660645, "step": 3980 }, { "completion_length": 209.7448959350586, "epoch": 0.4006037735849057, "grad_norm": 0.73163241147995, "kl": 0.0604248046875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.87497478723526, "reward_std": 0.11061271652579308, "rewards/accuracy_reward": 0.8851788938045502, "rewards/format_reward": 0.9897959232330322, "step": 3981 }, { "completion_length": 214.4081573486328, "epoch": 0.40070440251572326, "grad_norm": 0.9971668124198914, "kl": 0.1103515625, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.753276288509369, "reward_std": 0.19786608219146729, "rewards/accuracy_reward": 0.7634803354740143, "rewards/format_reward": 0.9897959232330322, "step": 3982 }, { "completion_length": 215.57142639160156, "epoch": 0.4008050314465409, "grad_norm": 0.8823988437652588, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6735087633132935, "reward_std": 0.16442649066448212, "rewards/accuracy_reward": 0.7041210234165192, "rewards/format_reward": 0.9693877398967743, "step": 3983 }, { "completion_length": 275.29591369628906, "epoch": 0.4009056603773585, "grad_norm": 0.44827359914779663, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7968769073486328, "reward_std": 0.10327053815126419, "rewards/accuracy_reward": 0.7968769371509552, "rewards/format_reward": 1.0, "step": 3984 }, { "completion_length": 218.7142791748047, "epoch": 0.4010062893081761, "grad_norm": 0.6624168157577515, "kl": 0.116943359375, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.761904776096344, "reward_std": 0.18102358281612396, "rewards/accuracy_reward": 0.7721088528633118, "rewards/format_reward": 0.9897959232330322, "step": 3985 }, { "completion_length": 256.3163146972656, "epoch": 0.4011069182389937, "grad_norm": 1.636818766593933, "kl": 0.0631103515625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8673468828201294, "reward_std": 0.18933051079511642, "rewards/accuracy_reward": 0.8673469126224518, "rewards/format_reward": 1.0, "step": 3986 }, { "completion_length": 182.21428680419922, "epoch": 0.40120754716981133, "grad_norm": 0.872788667678833, "kl": 0.0758056640625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.775753140449524, "reward_std": 0.12485972419381142, "rewards/accuracy_reward": 0.7859572470188141, "rewards/format_reward": 0.9897959232330322, "step": 3987 }, { "completion_length": 146.53060913085938, "epoch": 0.4013081761006289, "grad_norm": 4.410463809967041, "kl": 0.104248046875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8704866766929626, "reward_std": 0.134536512196064, "rewards/accuracy_reward": 0.8806907534599304, "rewards/format_reward": 0.9897959232330322, "step": 3988 }, { "completion_length": 252.0, "epoch": 0.40140880503144655, "grad_norm": 1.2138322591781616, "kl": 0.0986328125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7244897484779358, "reward_std": 0.130389004945755, "rewards/accuracy_reward": 0.7346938848495483, "rewards/format_reward": 0.9897959232330322, "step": 3989 }, { "completion_length": 252.7244873046875, "epoch": 0.40150943396226413, "grad_norm": 0.646375834941864, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.700908899307251, "reward_std": 0.17163237929344177, "rewards/accuracy_reward": 0.7213170826435089, "rewards/format_reward": 0.9795918166637421, "step": 3990 }, { "completion_length": 191.25509643554688, "epoch": 0.40161006289308177, "grad_norm": 1.6380490064620972, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7827523350715637, "reward_std": 0.11760241910815239, "rewards/accuracy_reward": 0.7827523648738861, "rewards/format_reward": 1.0, "step": 3991 }, { "completion_length": 215.1530532836914, "epoch": 0.40171069182389935, "grad_norm": 0.6955118179321289, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6510204076766968, "reward_std": 0.12370206415653229, "rewards/accuracy_reward": 0.6612244844436646, "rewards/format_reward": 0.9897959232330322, "step": 3992 }, { "completion_length": 229.7244873046875, "epoch": 0.401811320754717, "grad_norm": 1.4167579412460327, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8095238208770752, "reward_std": 0.13552315160632133, "rewards/accuracy_reward": 0.8095237910747528, "rewards/format_reward": 1.0, "step": 3993 }, { "completion_length": 184.37754821777344, "epoch": 0.40191194968553456, "grad_norm": 0.718207836151123, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8306218981742859, "reward_std": 0.05686481576412916, "rewards/accuracy_reward": 0.8306219577789307, "rewards/format_reward": 1.0, "step": 3994 }, { "completion_length": 205.51020050048828, "epoch": 0.4020125786163522, "grad_norm": 0.4256708323955536, "kl": 0.058837890625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8450342416763306, "reward_std": 0.12835338339209557, "rewards/accuracy_reward": 0.8552383780479431, "rewards/format_reward": 0.9897959232330322, "step": 3995 }, { "completion_length": 219.7040786743164, "epoch": 0.40211320754716984, "grad_norm": 0.6580592393875122, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.769841194152832, "reward_std": 0.19813326001167297, "rewards/accuracy_reward": 0.79024937748909, "rewards/format_reward": 0.9795918464660645, "step": 3996 }, { "completion_length": 235.01020050048828, "epoch": 0.4022138364779874, "grad_norm": 0.40540483593940735, "kl": 0.093017578125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7323129177093506, "reward_std": 0.11547679454088211, "rewards/accuracy_reward": 0.7425170540809631, "rewards/format_reward": 0.9897959232330322, "step": 3997 }, { "completion_length": 273.5, "epoch": 0.40231446540880506, "grad_norm": 1.7523744106292725, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.5349401831626892, "reward_std": 0.21295346319675446, "rewards/accuracy_reward": 0.5451442301273346, "rewards/format_reward": 0.9897959232330322, "step": 3998 }, { "completion_length": 158.7040786743164, "epoch": 0.40241509433962264, "grad_norm": 0.6575663685798645, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.9028174877166748, "reward_std": 0.09367911517620087, "rewards/accuracy_reward": 0.9232256412506104, "rewards/format_reward": 0.9795918464660645, "step": 3999 }, { "completion_length": 230.20407104492188, "epoch": 0.4025157232704403, "grad_norm": 1.0302844047546387, "kl": 0.0595703125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7596938014030457, "reward_std": 0.1820548139512539, "rewards/accuracy_reward": 0.7698979377746582, "rewards/format_reward": 0.9897959232330322, "step": 4000 }, { "completion_length": 225.36734771728516, "epoch": 0.40261635220125785, "grad_norm": 0.9372726082801819, "kl": 0.0816650390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8064919710159302, "reward_std": 0.22113674134016037, "rewards/accuracy_reward": 0.8064919412136078, "rewards/format_reward": 1.0, "step": 4001 }, { "completion_length": 227.7448959350586, "epoch": 0.4027169811320755, "grad_norm": 1.3399930000305176, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7459583282470703, "reward_std": 0.13114285841584206, "rewards/accuracy_reward": 0.7459583580493927, "rewards/format_reward": 1.0, "step": 4002 }, { "completion_length": 200.36734008789062, "epoch": 0.4028176100628931, "grad_norm": 1.033217430114746, "kl": 0.0625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7428571581840515, "reward_std": 0.18844159692525864, "rewards/accuracy_reward": 0.7428570985794067, "rewards/format_reward": 1.0, "step": 4003 }, { "completion_length": 229.54080963134766, "epoch": 0.4029182389937107, "grad_norm": 0.4262644946575165, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8610786199569702, "reward_std": 0.05998511868529022, "rewards/accuracy_reward": 0.8610787093639374, "rewards/format_reward": 1.0, "step": 4004 }, { "completion_length": 236.82652282714844, "epoch": 0.4030188679245283, "grad_norm": 0.8629129528999329, "kl": 0.0792236328125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8164202570915222, "reward_std": 0.21485860645771027, "rewards/accuracy_reward": 0.8266243636608124, "rewards/format_reward": 0.9897959232330322, "step": 4005 }, { "completion_length": 235.72447967529297, "epoch": 0.4031194968553459, "grad_norm": 0.5284227728843689, "kl": 0.08740234375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.779793918132782, "reward_std": 0.1612158752977848, "rewards/accuracy_reward": 0.8002021014690399, "rewards/format_reward": 0.9795918166637421, "step": 4006 }, { "completion_length": 262.9081573486328, "epoch": 0.4032201257861635, "grad_norm": 0.6853431463241577, "kl": 0.050048828125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.8097196221351624, "reward_std": 0.10862277448177338, "rewards/accuracy_reward": 0.8097196519374847, "rewards/format_reward": 1.0, "step": 4007 }, { "completion_length": 280.12245178222656, "epoch": 0.40332075471698114, "grad_norm": 0.8499264717102051, "kl": 0.0745849609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7925170063972473, "reward_std": 0.13270122185349464, "rewards/accuracy_reward": 0.8027210533618927, "rewards/format_reward": 0.9897959232330322, "step": 4008 }, { "completion_length": 230.45917510986328, "epoch": 0.4034213836477987, "grad_norm": 2.157339334487915, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6410648822784424, "reward_std": 0.11724605411291122, "rewards/accuracy_reward": 0.6512689739465714, "rewards/format_reward": 0.9897959232330322, "step": 4009 }, { "completion_length": 243.9183578491211, "epoch": 0.40352201257861636, "grad_norm": 1.5355781316757202, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7837215065956116, "reward_std": 0.19395069777965546, "rewards/accuracy_reward": 0.7939255237579346, "rewards/format_reward": 0.9897959232330322, "step": 4010 }, { "completion_length": 160.09183502197266, "epoch": 0.40362264150943394, "grad_norm": 1.0347208976745605, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7365472912788391, "reward_std": 0.1265188232064247, "rewards/accuracy_reward": 0.7467514276504517, "rewards/format_reward": 0.9897959232330322, "step": 4011 }, { "completion_length": 269.82652282714844, "epoch": 0.4037232704402516, "grad_norm": 1.7352275848388672, "kl": 0.099853515625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7073128819465637, "reward_std": 0.13961808383464813, "rewards/accuracy_reward": 0.7073129117488861, "rewards/format_reward": 1.0, "step": 4012 }, { "completion_length": 218.60203552246094, "epoch": 0.40382389937106916, "grad_norm": 1.055011510848999, "kl": 0.1142578125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.6240187883377075, "reward_std": 0.24594911187887192, "rewards/accuracy_reward": 0.6546310782432556, "rewards/format_reward": 0.9693877398967743, "step": 4013 }, { "completion_length": 180.15306091308594, "epoch": 0.4039245283018868, "grad_norm": 0.7380872964859009, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.9206348657608032, "reward_std": 0.04490640014410019, "rewards/accuracy_reward": 0.9206348955631256, "rewards/format_reward": 1.0, "step": 4014 }, { "completion_length": 200.83673095703125, "epoch": 0.4040251572327044, "grad_norm": 0.7053493857383728, "kl": 0.0640869140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8029397130012512, "reward_std": 0.21525736153125763, "rewards/accuracy_reward": 0.8029397130012512, "rewards/format_reward": 1.0, "step": 4015 }, { "completion_length": 255.55101013183594, "epoch": 0.404125786163522, "grad_norm": 0.5389996767044067, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7652687430381775, "reward_std": 0.13699424639344215, "rewards/accuracy_reward": 0.7754727900028229, "rewards/format_reward": 0.9897959232330322, "step": 4016 }, { "completion_length": 273.49998474121094, "epoch": 0.4042264150943396, "grad_norm": 0.5613970160484314, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7189542055130005, "reward_std": 0.08837034180760384, "rewards/accuracy_reward": 0.7189542651176453, "rewards/format_reward": 1.0, "step": 4017 }, { "completion_length": 319.2550964355469, "epoch": 0.40432704402515723, "grad_norm": 0.8486363887786865, "kl": 0.0574951171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.724314272403717, "reward_std": 0.1424456164240837, "rewards/accuracy_reward": 0.73451828956604, "rewards/format_reward": 0.9897959232330322, "step": 4018 }, { "completion_length": 269.64286041259766, "epoch": 0.4044276729559748, "grad_norm": 0.9052799344062805, "kl": 0.0640869140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5083521008491516, "reward_std": 0.27157267183065414, "rewards/accuracy_reward": 0.5287603437900543, "rewards/format_reward": 0.9795918464660645, "step": 4019 }, { "completion_length": 211.2551040649414, "epoch": 0.40452830188679245, "grad_norm": 0.5081414580345154, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8523101806640625, "reward_std": 0.1160467192530632, "rewards/accuracy_reward": 0.8625142872333527, "rewards/format_reward": 0.9897959232330322, "step": 4020 }, { "completion_length": 224.77550506591797, "epoch": 0.4046289308176101, "grad_norm": 0.40290892124176025, "kl": 0.071533203125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6654967069625854, "reward_std": 0.08893489837646484, "rewards/accuracy_reward": 0.6757008135318756, "rewards/format_reward": 0.9897959232330322, "step": 4021 }, { "completion_length": 274.35713958740234, "epoch": 0.40472955974842767, "grad_norm": 0.6378856897354126, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7679643034934998, "reward_std": 0.1611875221133232, "rewards/accuracy_reward": 0.7781684100627899, "rewards/format_reward": 0.9897959232330322, "step": 4022 }, { "completion_length": 260.2244873046875, "epoch": 0.4048301886792453, "grad_norm": 0.8116259574890137, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7943500876426697, "reward_std": 0.22304116189479828, "rewards/accuracy_reward": 0.804554283618927, "rewards/format_reward": 0.9897959232330322, "step": 4023 }, { "completion_length": 220.32653045654297, "epoch": 0.4049308176100629, "grad_norm": 0.9981334805488586, "kl": 0.0535888671875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7420726418495178, "reward_std": 0.1095447950065136, "rewards/accuracy_reward": 0.7420726120471954, "rewards/format_reward": 1.0, "step": 4024 }, { "completion_length": 216.52040100097656, "epoch": 0.4050314465408805, "grad_norm": 1.3342891931533813, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8590864539146423, "reward_std": 0.13820418156683445, "rewards/accuracy_reward": 0.8590864539146423, "rewards/format_reward": 1.0, "step": 4025 }, { "completion_length": 237.89794921875, "epoch": 0.4051320754716981, "grad_norm": 1.0973974466323853, "kl": 0.0946044921875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6344752311706543, "reward_std": 0.2659117989242077, "rewards/accuracy_reward": 0.6548833549022675, "rewards/format_reward": 0.9795918166637421, "step": 4026 }, { "completion_length": 200.58162689208984, "epoch": 0.40523270440251574, "grad_norm": 0.7575535774230957, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.801500916481018, "reward_std": 0.1306748278439045, "rewards/accuracy_reward": 0.8015008866786957, "rewards/format_reward": 1.0, "step": 4027 }, { "completion_length": 265.84693145751953, "epoch": 0.4053333333333333, "grad_norm": 1.075010061264038, "kl": 0.0860595703125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.490933120250702, "reward_std": 0.2656994163990021, "rewards/accuracy_reward": 0.5317495316267014, "rewards/format_reward": 0.9591836333274841, "step": 4028 }, { "completion_length": 246.74488830566406, "epoch": 0.40543396226415096, "grad_norm": 0.9725423455238342, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7356550097465515, "reward_std": 0.17410079017281532, "rewards/accuracy_reward": 0.7560631930828094, "rewards/format_reward": 0.9795918166637421, "step": 4029 }, { "completion_length": 203.93877410888672, "epoch": 0.40553459119496854, "grad_norm": 1.0736865997314453, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.820165753364563, "reward_std": 0.21966632455587387, "rewards/accuracy_reward": 0.8303698301315308, "rewards/format_reward": 0.9897959232330322, "step": 4030 }, { "completion_length": 211.05101776123047, "epoch": 0.4056352201257862, "grad_norm": 1.1528249979019165, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7015365362167358, "reward_std": 0.11067233607172966, "rewards/accuracy_reward": 0.711740642786026, "rewards/format_reward": 0.9897959232330322, "step": 4031 }, { "completion_length": 214.59183502197266, "epoch": 0.40573584905660376, "grad_norm": 0.6644588708877563, "kl": 0.109619140625, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7339999079704285, "reward_std": 0.11048809811472893, "rewards/accuracy_reward": 0.7442040145397186, "rewards/format_reward": 0.9897959232330322, "step": 4032 }, { "completion_length": 189.4897918701172, "epoch": 0.4058364779874214, "grad_norm": 0.4703061580657959, "kl": 0.0601806640625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8636220693588257, "reward_std": 0.0796540416777134, "rewards/accuracy_reward": 0.8738261461257935, "rewards/format_reward": 0.9897959232330322, "step": 4033 }, { "completion_length": 225.2142791748047, "epoch": 0.405937106918239, "grad_norm": 1.6590232849121094, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.835491955280304, "reward_std": 0.19241966307163239, "rewards/accuracy_reward": 0.8456960320472717, "rewards/format_reward": 0.9897959232330322, "step": 4034 }, { "completion_length": 243.08162689208984, "epoch": 0.4060377358490566, "grad_norm": 0.6143873929977417, "kl": 0.0726318359375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7222899198532104, "reward_std": 0.1432412751019001, "rewards/accuracy_reward": 0.7222899198532104, "rewards/format_reward": 1.0, "step": 4035 }, { "completion_length": 243.1836700439453, "epoch": 0.4061383647798742, "grad_norm": 0.6325328946113586, "kl": 0.0631103515625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.632138729095459, "reward_std": 0.09721533209085464, "rewards/accuracy_reward": 0.6321387887001038, "rewards/format_reward": 1.0, "step": 4036 }, { "completion_length": 270.0918273925781, "epoch": 0.40623899371069183, "grad_norm": 0.9328485131263733, "kl": 0.0892333984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6159577369689941, "reward_std": 0.22113683819770813, "rewards/accuracy_reward": 0.6465699672698975, "rewards/format_reward": 0.9693877398967743, "step": 4037 }, { "completion_length": 278.6428451538086, "epoch": 0.4063396226415094, "grad_norm": 0.5624096393585205, "kl": 0.05517578125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7721088528633118, "reward_std": 0.1528547927737236, "rewards/accuracy_reward": 0.7823128998279572, "rewards/format_reward": 0.9897959232330322, "step": 4038 }, { "completion_length": 273.99999237060547, "epoch": 0.40644025157232705, "grad_norm": 0.546883761882782, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8231292366981506, "reward_std": 0.15516917407512665, "rewards/accuracy_reward": 0.8333333432674408, "rewards/format_reward": 0.9897959232330322, "step": 4039 }, { "completion_length": 229.1530532836914, "epoch": 0.40654088050314463, "grad_norm": 0.2606992721557617, "kl": 0.0687255859375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8202511072158813, "reward_std": 0.016086263582110405, "rewards/accuracy_reward": 0.8202511966228485, "rewards/format_reward": 1.0, "step": 4040 }, { "completion_length": 258.67346954345703, "epoch": 0.40664150943396227, "grad_norm": 0.6179063320159912, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6994003653526306, "reward_std": 0.28696128726005554, "rewards/accuracy_reward": 0.7504207789897919, "rewards/format_reward": 0.9489795565605164, "step": 4041 }, { "completion_length": 190.42857360839844, "epoch": 0.40674213836477985, "grad_norm": 0.7827262282371521, "kl": 0.0604248046875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8453927040100098, "reward_std": 0.18617400527000427, "rewards/accuracy_reward": 0.8453927338123322, "rewards/format_reward": 1.0, "step": 4042 }, { "completion_length": 340.8673400878906, "epoch": 0.4068427672955975, "grad_norm": 0.6261295676231384, "kl": 0.0625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7045152187347412, "reward_std": 0.19376541953533888, "rewards/accuracy_reward": 0.714719295501709, "rewards/format_reward": 0.9897959232330322, "step": 4043 }, { "completion_length": 289.8163299560547, "epoch": 0.4069433962264151, "grad_norm": 0.812619149684906, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6793129444122314, "reward_std": 0.17832306772470474, "rewards/accuracy_reward": 0.6895170509815216, "rewards/format_reward": 0.9897959232330322, "step": 4044 }, { "completion_length": 177.71428680419922, "epoch": 0.4070440251572327, "grad_norm": 0.6921588182449341, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8271700739860535, "reward_std": 0.10700039938092232, "rewards/accuracy_reward": 0.8373742401599884, "rewards/format_reward": 0.9897959232330322, "step": 4045 }, { "completion_length": 279.78570556640625, "epoch": 0.40714465408805034, "grad_norm": 1.2996799945831299, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6791383028030396, "reward_std": 0.2500215694308281, "rewards/accuracy_reward": 0.7301587462425232, "rewards/format_reward": 0.9489795565605164, "step": 4046 }, { "completion_length": 183.13265228271484, "epoch": 0.4072452830188679, "grad_norm": 0.8327673673629761, "kl": 0.10009765625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6938775181770325, "reward_std": 0.20637566596269608, "rewards/accuracy_reward": 0.734693855047226, "rewards/format_reward": 0.9591836333274841, "step": 4047 }, { "completion_length": 231.55101013183594, "epoch": 0.40734591194968556, "grad_norm": 0.5252875685691833, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.684523105621338, "reward_std": 0.09139671921730042, "rewards/accuracy_reward": 0.7049313187599182, "rewards/format_reward": 0.9795918166637421, "step": 4048 }, { "completion_length": 235.05101776123047, "epoch": 0.40744654088050314, "grad_norm": 0.5502169132232666, "kl": 0.0704345703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8848395943641663, "reward_std": 0.10690551623702049, "rewards/accuracy_reward": 0.884839653968811, "rewards/format_reward": 1.0, "step": 4049 }, { "completion_length": 241.9897918701172, "epoch": 0.4075471698113208, "grad_norm": 0.41060033440589905, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.804532766342163, "reward_std": 0.0704541951417923, "rewards/accuracy_reward": 0.8045327067375183, "rewards/format_reward": 1.0, "step": 4050 }, { "completion_length": 233.97958374023438, "epoch": 0.40764779874213836, "grad_norm": 6.478658199310303, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.795918345451355, "reward_std": 0.17769698798656464, "rewards/accuracy_reward": 0.8061224222183228, "rewards/format_reward": 0.9897959232330322, "step": 4051 }, { "completion_length": 148.06122589111328, "epoch": 0.407748427672956, "grad_norm": 1.3923221826553345, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7757344841957092, "reward_std": 0.14571357518434525, "rewards/accuracy_reward": 0.7859385013580322, "rewards/format_reward": 0.9897959232330322, "step": 4052 }, { "completion_length": 311.2142791748047, "epoch": 0.4078490566037736, "grad_norm": 0.7152457237243652, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7965986132621765, "reward_std": 0.18245331943035126, "rewards/accuracy_reward": 0.83741495013237, "rewards/format_reward": 0.9591836631298065, "step": 4053 }, { "completion_length": 218.06121063232422, "epoch": 0.4079496855345912, "grad_norm": 0.8699944019317627, "kl": 0.115478515625, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.7129858136177063, "reward_std": 0.12933296337723732, "rewards/accuracy_reward": 0.7333940267562866, "rewards/format_reward": 0.9795918166637421, "step": 4054 }, { "completion_length": 194.20407104492188, "epoch": 0.4080503144654088, "grad_norm": 0.3686065971851349, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7242280840873718, "reward_std": 0.03825039882212877, "rewards/accuracy_reward": 0.7242281138896942, "rewards/format_reward": 1.0, "step": 4055 }, { "completion_length": 251.74488830566406, "epoch": 0.40815094339622643, "grad_norm": 0.6356488466262817, "kl": 0.0546875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6809523701667786, "reward_std": 0.14849036559462547, "rewards/accuracy_reward": 0.6911564469337463, "rewards/format_reward": 0.9897959232330322, "step": 4056 }, { "completion_length": 261.9285583496094, "epoch": 0.408251572327044, "grad_norm": 1.5776515007019043, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.718713104724884, "reward_std": 0.2816624790430069, "rewards/accuracy_reward": 0.7391213476657867, "rewards/format_reward": 0.9795918166637421, "step": 4057 }, { "completion_length": 245.34693145751953, "epoch": 0.40835220125786165, "grad_norm": 0.6488103866577148, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7101649045944214, "reward_std": 0.150412917137146, "rewards/accuracy_reward": 0.7305729985237122, "rewards/format_reward": 0.9795918464660645, "step": 4058 }, { "completion_length": 294.9693908691406, "epoch": 0.40845283018867923, "grad_norm": 22.980682373046875, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.5100136995315552, "reward_std": 0.2540271654725075, "rewards/accuracy_reward": 0.540626049041748, "rewards/format_reward": 0.9693877398967743, "step": 4059 }, { "completion_length": 270.06121826171875, "epoch": 0.40855345911949686, "grad_norm": 0.6188498735427856, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6609410047531128, "reward_std": 0.21622303873300552, "rewards/accuracy_reward": 0.6915532648563385, "rewards/format_reward": 0.9693877398967743, "step": 4060 }, { "completion_length": 262.1836700439453, "epoch": 0.40865408805031445, "grad_norm": 2.2411282062530518, "kl": 0.097900390625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.5779263377189636, "reward_std": 0.36792898178100586, "rewards/accuracy_reward": 0.639150857925415, "rewards/format_reward": 0.938775509595871, "step": 4061 }, { "completion_length": 289.72447204589844, "epoch": 0.4087547169811321, "grad_norm": 0.8434415459632874, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7656299471855164, "reward_std": 0.20104879140853882, "rewards/accuracy_reward": 0.7962422966957092, "rewards/format_reward": 0.9693877398967743, "step": 4062 }, { "completion_length": 255.36734771728516, "epoch": 0.40885534591194966, "grad_norm": 0.5732443332672119, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8163264989852905, "reward_std": 0.16513635218143463, "rewards/accuracy_reward": 0.857142835855484, "rewards/format_reward": 0.9591836631298065, "step": 4063 }, { "completion_length": 278.84693908691406, "epoch": 0.4089559748427673, "grad_norm": 0.5510483384132385, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6985397934913635, "reward_std": 0.1991049014031887, "rewards/accuracy_reward": 0.7087438404560089, "rewards/format_reward": 0.9897959232330322, "step": 4064 }, { "completion_length": 290.02040100097656, "epoch": 0.4090566037735849, "grad_norm": 0.34753453731536865, "kl": 0.05322265625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8279542326927185, "reward_std": 0.08758836146444082, "rewards/accuracy_reward": 0.848362386226654, "rewards/format_reward": 0.9795918166637421, "step": 4065 }, { "completion_length": 282.79590606689453, "epoch": 0.4091572327044025, "grad_norm": 0.4880070090293884, "kl": 0.0400390625, "learning_rate": 1e-06, "loss": 0.0016, "reward": 1.7326530814170837, "reward_std": 0.1297873891890049, "rewards/accuracy_reward": 0.7428571283817291, "rewards/format_reward": 0.9897959232330322, "step": 4066 }, { "completion_length": 269.40816497802734, "epoch": 0.4092578616352201, "grad_norm": 1.0664225816726685, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7662019729614258, "reward_std": 0.20281197130680084, "rewards/accuracy_reward": 0.8070183098316193, "rewards/format_reward": 0.9591836333274841, "step": 4067 }, { "completion_length": 264.52040100097656, "epoch": 0.40935849056603774, "grad_norm": 0.5788770914077759, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6880764961242676, "reward_std": 0.1806502491235733, "rewards/accuracy_reward": 0.7084847688674927, "rewards/format_reward": 0.9795918166637421, "step": 4068 }, { "completion_length": 356.7550811767578, "epoch": 0.4094591194968554, "grad_norm": 0.5347527265548706, "kl": 0.04541015625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7746134996414185, "reward_std": 0.19038600474596024, "rewards/accuracy_reward": 0.7848175466060638, "rewards/format_reward": 0.9897959232330322, "step": 4069 }, { "completion_length": 254.75509643554688, "epoch": 0.40955974842767295, "grad_norm": 0.5945121049880981, "kl": 0.0714111328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8994168043136597, "reward_std": 0.12691237032413483, "rewards/accuracy_reward": 0.8994168937206268, "rewards/format_reward": 1.0, "step": 4070 }, { "completion_length": 298.2244873046875, "epoch": 0.4096603773584906, "grad_norm": 0.44822269678115845, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6037121415138245, "reward_std": 0.106067412532866, "rewards/accuracy_reward": 0.6139162480831146, "rewards/format_reward": 0.9897959232330322, "step": 4071 }, { "completion_length": 306.7346878051758, "epoch": 0.40976100628930817, "grad_norm": 0.6082136034965515, "kl": 0.0770263671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7585033774375916, "reward_std": 0.19902190566062927, "rewards/accuracy_reward": 0.7789115905761719, "rewards/format_reward": 0.9795918464660645, "step": 4072 }, { "completion_length": 281.4285659790039, "epoch": 0.4098616352201258, "grad_norm": 0.921680212020874, "kl": 0.101806640625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.721469759941101, "reward_std": 0.22080448269844055, "rewards/accuracy_reward": 0.7520820200443268, "rewards/format_reward": 0.9693877398967743, "step": 4073 }, { "completion_length": 216.6734619140625, "epoch": 0.4099622641509434, "grad_norm": 1.1844298839569092, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7505232691764832, "reward_std": 0.1652248576283455, "rewards/accuracy_reward": 0.7607273757457733, "rewards/format_reward": 0.9897959232330322, "step": 4074 }, { "completion_length": 266.61224365234375, "epoch": 0.410062893081761, "grad_norm": 1.0679271221160889, "kl": 0.0604248046875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7623281478881836, "reward_std": 0.10318576917052269, "rewards/accuracy_reward": 0.7725322544574738, "rewards/format_reward": 0.9897959232330322, "step": 4075 }, { "completion_length": 231.82653045654297, "epoch": 0.4101635220125786, "grad_norm": 0.8212795257568359, "kl": 0.05517578125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8663750886917114, "reward_std": 0.15998714789748192, "rewards/accuracy_reward": 0.8663750886917114, "rewards/format_reward": 1.0, "step": 4076 }, { "completion_length": 324.1428527832031, "epoch": 0.41026415094339624, "grad_norm": 0.9641093611717224, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.633116900920868, "reward_std": 0.22084270790219307, "rewards/accuracy_reward": 0.6433209627866745, "rewards/format_reward": 0.9897959232330322, "step": 4077 }, { "completion_length": 208.4081573486328, "epoch": 0.4103647798742138, "grad_norm": 15.102198600769043, "kl": 0.436767578125, "learning_rate": 1e-06, "loss": 0.0175, "reward": 1.7971146702766418, "reward_std": 0.19071266055107117, "rewards/accuracy_reward": 0.817522794008255, "rewards/format_reward": 0.9795918464660645, "step": 4078 }, { "completion_length": 176.12244415283203, "epoch": 0.41046540880503146, "grad_norm": 1.0285974740982056, "kl": 0.0860595703125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7847424149513245, "reward_std": 0.14423340559005737, "rewards/accuracy_reward": 0.7949465215206146, "rewards/format_reward": 0.9897959232330322, "step": 4079 }, { "completion_length": 162.60203552246094, "epoch": 0.41056603773584904, "grad_norm": 1.2031947374343872, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.844928801059723, "reward_std": 0.1276557855308056, "rewards/accuracy_reward": 0.8449288308620453, "rewards/format_reward": 1.0, "step": 4080 }, { "completion_length": 276.0, "epoch": 0.4106666666666667, "grad_norm": 0.6177939772605896, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5799444317817688, "reward_std": 0.1494830884039402, "rewards/accuracy_reward": 0.5901485085487366, "rewards/format_reward": 0.9897959232330322, "step": 4081 }, { "completion_length": 243.7959213256836, "epoch": 0.41076729559748426, "grad_norm": 0.9230484366416931, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8078230619430542, "reward_std": 0.14149803295731544, "rewards/accuracy_reward": 0.8486394286155701, "rewards/format_reward": 0.9591836333274841, "step": 4082 }, { "completion_length": 209.9795913696289, "epoch": 0.4108679245283019, "grad_norm": 0.4819997549057007, "kl": 0.0440673828125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.9019672274589539, "reward_std": 0.14016015827655792, "rewards/accuracy_reward": 0.9121714532375336, "rewards/format_reward": 0.9897959232330322, "step": 4083 }, { "completion_length": 249.47958374023438, "epoch": 0.4109685534591195, "grad_norm": 0.5966246724128723, "kl": 0.0582275390625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8392857313156128, "reward_std": 0.11988542601466179, "rewards/accuracy_reward": 0.839285671710968, "rewards/format_reward": 1.0, "step": 4084 }, { "completion_length": 200.27550506591797, "epoch": 0.4110691823899371, "grad_norm": 2.298297882080078, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7962761521339417, "reward_std": 0.18491317331790924, "rewards/accuracy_reward": 0.806480199098587, "rewards/format_reward": 0.9897959232330322, "step": 4085 }, { "completion_length": 263.55101776123047, "epoch": 0.4111698113207547, "grad_norm": 0.7345283627510071, "kl": 0.055908203125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.727060317993164, "reward_std": 0.2216697782278061, "rewards/accuracy_reward": 0.747468501329422, "rewards/format_reward": 0.9795918464660645, "step": 4086 }, { "completion_length": 316.9693908691406, "epoch": 0.41127044025157233, "grad_norm": 0.5342912077903748, "kl": 0.0718994140625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.515816330909729, "reward_std": 0.10096174106001854, "rewards/accuracy_reward": 0.5158163011074066, "rewards/format_reward": 1.0, "step": 4087 }, { "completion_length": 247.56122589111328, "epoch": 0.4113710691823899, "grad_norm": 0.9107992649078369, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6371005773544312, "reward_std": 0.283274844288826, "rewards/accuracy_reward": 0.6575087606906891, "rewards/format_reward": 0.9795918464660645, "step": 4088 }, { "completion_length": 262.5918273925781, "epoch": 0.41147169811320755, "grad_norm": 0.9594276547431946, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.605409324169159, "reward_std": 0.14543483778834343, "rewards/accuracy_reward": 0.6054093986749649, "rewards/format_reward": 1.0, "step": 4089 }, { "completion_length": 236.05101776123047, "epoch": 0.41157232704402513, "grad_norm": 0.8861820101737976, "kl": 0.12158203125, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.724361002445221, "reward_std": 0.18741464614868164, "rewards/accuracy_reward": 0.724361002445221, "rewards/format_reward": 1.0, "step": 4090 }, { "completion_length": 327.8163146972656, "epoch": 0.41167295597484277, "grad_norm": 0.6241971254348755, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7008282542228699, "reward_std": 0.1734335720539093, "rewards/accuracy_reward": 0.7314404547214508, "rewards/format_reward": 0.9693877398967743, "step": 4091 }, { "completion_length": 310.23468017578125, "epoch": 0.41177358490566035, "grad_norm": 1.4698405265808105, "kl": 0.0802001953125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.5923225283622742, "reward_std": 0.2216896489262581, "rewards/accuracy_reward": 0.5923226475715637, "rewards/format_reward": 1.0, "step": 4092 }, { "completion_length": 203.28571319580078, "epoch": 0.411874213836478, "grad_norm": 0.4219312071800232, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.758905291557312, "reward_std": 0.11202865839004517, "rewards/accuracy_reward": 0.769109457731247, "rewards/format_reward": 0.9897959232330322, "step": 4093 }, { "completion_length": 227.14285278320312, "epoch": 0.4119748427672956, "grad_norm": 0.7679807543754578, "kl": 0.0479736328125, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.8421767950057983, "reward_std": 0.13707783818244934, "rewards/accuracy_reward": 0.8523809313774109, "rewards/format_reward": 0.9897959232330322, "step": 4094 }, { "completion_length": 285.7857131958008, "epoch": 0.4120754716981132, "grad_norm": 2.2191884517669678, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7824152112007141, "reward_std": 0.08901529759168625, "rewards/accuracy_reward": 0.7824151813983917, "rewards/format_reward": 1.0, "step": 4095 }, { "completion_length": 222.82653045654297, "epoch": 0.41217610062893084, "grad_norm": 1.1482300758361816, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.727401316165924, "reward_std": 0.103268563747406, "rewards/accuracy_reward": 0.7274012565612793, "rewards/format_reward": 1.0, "step": 4096 }, { "completion_length": 300.3061218261719, "epoch": 0.4122767295597484, "grad_norm": 0.8836933970451355, "kl": 0.10107421875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6688576340675354, "reward_std": 0.19317659735679626, "rewards/accuracy_reward": 0.6790616810321808, "rewards/format_reward": 0.9897959232330322, "step": 4097 }, { "completion_length": 252.4591827392578, "epoch": 0.41237735849056606, "grad_norm": 0.4626370370388031, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6371881365776062, "reward_std": 0.12459426373243332, "rewards/accuracy_reward": 0.6575963497161865, "rewards/format_reward": 0.9795918166637421, "step": 4098 }, { "completion_length": 250.27550506591797, "epoch": 0.41247798742138364, "grad_norm": 0.5968019366264343, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.696255624294281, "reward_std": 0.15280230157077312, "rewards/accuracy_reward": 0.7064597010612488, "rewards/format_reward": 0.9897959232330322, "step": 4099 }, { "completion_length": 251.54080200195312, "epoch": 0.4125786163522013, "grad_norm": 0.5994654893875122, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.762172818183899, "reward_std": 0.1697799302637577, "rewards/accuracy_reward": 0.7621728479862213, "rewards/format_reward": 1.0, "step": 4100 }, { "completion_length": 312.83673095703125, "epoch": 0.41267924528301886, "grad_norm": 0.5809749960899353, "kl": 0.0699462890625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.688205063343048, "reward_std": 0.19078632444143295, "rewards/accuracy_reward": 0.6984091401100159, "rewards/format_reward": 0.9897959232330322, "step": 4101 }, { "completion_length": 160.9591827392578, "epoch": 0.4127798742138365, "grad_norm": 1.0634759664535522, "kl": 0.12890625, "learning_rate": 1e-06, "loss": 0.0052, "reward": 1.7566590905189514, "reward_std": 0.09555353224277496, "rewards/accuracy_reward": 0.7566591203212738, "rewards/format_reward": 1.0, "step": 4102 }, { "completion_length": 291.4081573486328, "epoch": 0.4128805031446541, "grad_norm": 0.9433063268661499, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8284786343574524, "reward_std": 0.20973005145788193, "rewards/accuracy_reward": 0.8386827409267426, "rewards/format_reward": 0.9897959232330322, "step": 4103 }, { "completion_length": 192.61224365234375, "epoch": 0.4129811320754717, "grad_norm": 0.9688272476196289, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6968080401420593, "reward_std": 0.15854205936193466, "rewards/accuracy_reward": 0.7070119976997375, "rewards/format_reward": 0.9897959232330322, "step": 4104 }, { "completion_length": 307.3673400878906, "epoch": 0.4130817610062893, "grad_norm": 1.1700763702392578, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.673698365688324, "reward_std": 0.1968246027827263, "rewards/accuracy_reward": 0.6941064894199371, "rewards/format_reward": 0.9795918166637421, "step": 4105 }, { "completion_length": 227.6326446533203, "epoch": 0.41318238993710693, "grad_norm": 1.0819965600967407, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.866627037525177, "reward_std": 0.15724001824855804, "rewards/accuracy_reward": 0.866627037525177, "rewards/format_reward": 1.0, "step": 4106 }, { "completion_length": 188.62244415283203, "epoch": 0.4132830188679245, "grad_norm": 3.9337410926818848, "kl": 0.0709228515625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6738094687461853, "reward_std": 0.14319278672337532, "rewards/accuracy_reward": 0.6840136051177979, "rewards/format_reward": 0.9897959232330322, "step": 4107 }, { "completion_length": 243.54080963134766, "epoch": 0.41338364779874215, "grad_norm": 1.2253485918045044, "kl": 0.0670166015625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7146549820899963, "reward_std": 0.22754377126693726, "rewards/accuracy_reward": 0.7554714381694794, "rewards/format_reward": 0.9591836333274841, "step": 4108 }, { "completion_length": 254.16326141357422, "epoch": 0.41348427672955973, "grad_norm": 0.5450845956802368, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6893325448036194, "reward_std": 0.12500333413481712, "rewards/accuracy_reward": 0.6995366811752319, "rewards/format_reward": 0.9897959232330322, "step": 4109 }, { "completion_length": 258.72447967529297, "epoch": 0.41358490566037737, "grad_norm": 0.5190072059631348, "kl": 0.0552978515625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8461127281188965, "reward_std": 0.12788113951683044, "rewards/accuracy_reward": 0.8563168048858643, "rewards/format_reward": 0.9897959232330322, "step": 4110 }, { "completion_length": 212.9285659790039, "epoch": 0.41368553459119495, "grad_norm": 1.2016934156417847, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6850014328956604, "reward_std": 0.23135097324848175, "rewards/accuracy_reward": 0.7054096162319183, "rewards/format_reward": 0.9795918166637421, "step": 4111 }, { "completion_length": 249.49999237060547, "epoch": 0.4137861635220126, "grad_norm": 1.3913687467575073, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.719851553440094, "reward_std": 0.21755436807870865, "rewards/accuracy_reward": 0.7198515832424164, "rewards/format_reward": 1.0, "step": 4112 }, { "completion_length": 253.6632537841797, "epoch": 0.41388679245283017, "grad_norm": 0.626398503780365, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.685403287410736, "reward_std": 0.16207893192768097, "rewards/accuracy_reward": 0.6956074237823486, "rewards/format_reward": 0.9897959232330322, "step": 4113 }, { "completion_length": 310.7652893066406, "epoch": 0.4139874213836478, "grad_norm": 1.2213680744171143, "kl": 0.0604248046875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.57459956407547, "reward_std": 0.25988155603408813, "rewards/accuracy_reward": 0.5848037153482437, "rewards/format_reward": 0.9897959232330322, "step": 4114 }, { "completion_length": 202.66326141357422, "epoch": 0.4140880503144654, "grad_norm": 1.261989712715149, "kl": 0.102783203125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.784962236881256, "reward_std": 0.1942100152373314, "rewards/accuracy_reward": 0.8053703308105469, "rewards/format_reward": 0.9795918166637421, "step": 4115 }, { "completion_length": 209.9897918701172, "epoch": 0.414188679245283, "grad_norm": 3.9313619136810303, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8183363676071167, "reward_std": 0.10806340351700783, "rewards/accuracy_reward": 0.8285404741764069, "rewards/format_reward": 0.9897959232330322, "step": 4116 }, { "completion_length": 285.0816345214844, "epoch": 0.4142893081761006, "grad_norm": 0.8944186568260193, "kl": 0.0684814453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6156461834907532, "reward_std": 0.35042814910411835, "rewards/accuracy_reward": 0.6462584733963013, "rewards/format_reward": 0.9693877398967743, "step": 4117 }, { "completion_length": 260.6836700439453, "epoch": 0.41438993710691824, "grad_norm": 0.968323290348053, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6280604600906372, "reward_std": 0.1388237476348877, "rewards/accuracy_reward": 0.6382644474506378, "rewards/format_reward": 0.9897959232330322, "step": 4118 }, { "completion_length": 257.24488830566406, "epoch": 0.4144905660377359, "grad_norm": 0.7746416330337524, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6062737703323364, "reward_std": 0.20034302026033401, "rewards/accuracy_reward": 0.6266819834709167, "rewards/format_reward": 0.9795918166637421, "step": 4119 }, { "completion_length": 192.93877410888672, "epoch": 0.41459119496855346, "grad_norm": 0.40551242232322693, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.836734652519226, "reward_std": 0.07229221146553755, "rewards/accuracy_reward": 0.8367346823215485, "rewards/format_reward": 1.0, "step": 4120 }, { "completion_length": 223.56121826171875, "epoch": 0.4146918238993711, "grad_norm": 0.5126062631607056, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8068045377731323, "reward_std": 0.1357661597430706, "rewards/accuracy_reward": 0.8272127509117126, "rewards/format_reward": 0.9795918464660645, "step": 4121 }, { "completion_length": 241.51019287109375, "epoch": 0.4147924528301887, "grad_norm": 0.9144115447998047, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7618444561958313, "reward_std": 0.15196561813354492, "rewards/accuracy_reward": 0.792456716299057, "rewards/format_reward": 0.9693877398967743, "step": 4122 }, { "completion_length": 255.39794921875, "epoch": 0.4148930817610063, "grad_norm": 0.8374547958374023, "kl": 0.0882568359375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.55346018075943, "reward_std": 0.15235518664121628, "rewards/accuracy_reward": 0.5738683938980103, "rewards/format_reward": 0.9795918464660645, "step": 4123 }, { "completion_length": 251.97958374023438, "epoch": 0.4149937106918239, "grad_norm": 0.545317530632019, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7323786616325378, "reward_std": 0.134540855884552, "rewards/accuracy_reward": 0.7527868747711182, "rewards/format_reward": 0.9795918166637421, "step": 4124 }, { "completion_length": 198.67346954345703, "epoch": 0.41509433962264153, "grad_norm": 0.453873872756958, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7735786437988281, "reward_std": 0.11702624708414078, "rewards/accuracy_reward": 0.7837828099727631, "rewards/format_reward": 0.9897959232330322, "step": 4125 }, { "completion_length": 204.2448959350586, "epoch": 0.4151949685534591, "grad_norm": 0.7363651990890503, "kl": 0.0762939453125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.637497365474701, "reward_std": 0.26070357114076614, "rewards/accuracy_reward": 0.6579055488109589, "rewards/format_reward": 0.9795918166637421, "step": 4126 }, { "completion_length": 222.9183578491211, "epoch": 0.41529559748427675, "grad_norm": 0.8632636070251465, "kl": 0.0880126953125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8161957263946533, "reward_std": 0.1611289344727993, "rewards/accuracy_reward": 0.8263998031616211, "rewards/format_reward": 0.9897959232330322, "step": 4127 }, { "completion_length": 240.04080963134766, "epoch": 0.4153962264150943, "grad_norm": 0.6128599047660828, "kl": 0.0819091796875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7426415085792542, "reward_std": 0.06533597409725189, "rewards/accuracy_reward": 0.7426415681838989, "rewards/format_reward": 1.0, "step": 4128 }, { "completion_length": 261.2959213256836, "epoch": 0.41549685534591196, "grad_norm": 0.5995287895202637, "kl": 0.0726318359375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7407638430595398, "reward_std": 0.1476970911026001, "rewards/accuracy_reward": 0.7407639622688293, "rewards/format_reward": 1.0, "step": 4129 }, { "completion_length": 213.85713958740234, "epoch": 0.41559748427672955, "grad_norm": 0.7347270250320435, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8349214792251587, "reward_std": 0.11181267723441124, "rewards/accuracy_reward": 0.8451256453990936, "rewards/format_reward": 0.9897959232330322, "step": 4130 }, { "completion_length": 256.89794921875, "epoch": 0.4156981132075472, "grad_norm": 0.49017763137817383, "kl": 0.0555419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6956710815429688, "reward_std": 0.08829772472381592, "rewards/accuracy_reward": 0.7058750689029694, "rewards/format_reward": 0.9897959232330322, "step": 4131 }, { "completion_length": 193.7551040649414, "epoch": 0.41579874213836476, "grad_norm": 0.6169251203536987, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7356656789779663, "reward_std": 0.12003034725785255, "rewards/accuracy_reward": 0.7458697557449341, "rewards/format_reward": 0.9897959232330322, "step": 4132 }, { "completion_length": 208.2653045654297, "epoch": 0.4158993710691824, "grad_norm": 0.9260281920433044, "kl": 0.048583984375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7469740509986877, "reward_std": 0.15858767926692963, "rewards/accuracy_reward": 0.7571782171726227, "rewards/format_reward": 0.9897959232330322, "step": 4133 }, { "completion_length": 221.55101776123047, "epoch": 0.416, "grad_norm": 0.8327661156654358, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8600582480430603, "reward_std": 0.1289966180920601, "rewards/accuracy_reward": 0.8804664611816406, "rewards/format_reward": 0.9795918464660645, "step": 4134 }, { "completion_length": 224.81632232666016, "epoch": 0.4161006289308176, "grad_norm": 0.7430873513221741, "kl": 0.1015625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8140941858291626, "reward_std": 0.13836215436458588, "rewards/accuracy_reward": 0.8447063565254211, "rewards/format_reward": 0.9693877398967743, "step": 4135 }, { "completion_length": 268.0408020019531, "epoch": 0.4162012578616352, "grad_norm": 0.5966795682907104, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7369510531425476, "reward_std": 0.22858883440494537, "rewards/accuracy_reward": 0.7777673900127411, "rewards/format_reward": 0.9591836333274841, "step": 4136 }, { "completion_length": 277.68367767333984, "epoch": 0.41630188679245284, "grad_norm": 1.2282917499542236, "kl": 0.09033203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6894440650939941, "reward_std": 0.20001962035894394, "rewards/accuracy_reward": 0.6996481120586395, "rewards/format_reward": 0.9897959232330322, "step": 4137 }, { "completion_length": 190.2244873046875, "epoch": 0.4164025157232704, "grad_norm": 1.31692373752594, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6963306069374084, "reward_std": 0.14861110225319862, "rewards/accuracy_reward": 0.7167387902736664, "rewards/format_reward": 0.9795918464660645, "step": 4138 }, { "completion_length": 199.10203552246094, "epoch": 0.41650314465408805, "grad_norm": 0.740403413772583, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8173468708992004, "reward_std": 0.1749972477555275, "rewards/accuracy_reward": 0.8377550542354584, "rewards/format_reward": 0.9795918464660645, "step": 4139 }, { "completion_length": 260.0306091308594, "epoch": 0.41660377358490563, "grad_norm": 1.0396538972854614, "kl": 0.0697021484375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.698013186454773, "reward_std": 0.20091036707162857, "rewards/accuracy_reward": 0.698013186454773, "rewards/format_reward": 1.0, "step": 4140 }, { "completion_length": 168.41836547851562, "epoch": 0.41670440251572327, "grad_norm": 1.8119142055511475, "kl": 0.14111328125, "learning_rate": 1e-06, "loss": 0.0057, "reward": 1.8439868092536926, "reward_std": 0.14120373874902725, "rewards/accuracy_reward": 0.8541909456253052, "rewards/format_reward": 0.9897959232330322, "step": 4141 }, { "completion_length": 276.1326370239258, "epoch": 0.4168050314465409, "grad_norm": 1.0121287107467651, "kl": 0.055419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.761287808418274, "reward_std": 0.2520046532154083, "rewards/accuracy_reward": 0.7816959917545319, "rewards/format_reward": 0.9795918464660645, "step": 4142 }, { "completion_length": 214.2551040649414, "epoch": 0.4169056603773585, "grad_norm": 2.7858376502990723, "kl": 0.104248046875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.725882887840271, "reward_std": 0.1656082719564438, "rewards/accuracy_reward": 0.7360870242118835, "rewards/format_reward": 0.9897959232330322, "step": 4143 }, { "completion_length": 156.0714225769043, "epoch": 0.4170062893081761, "grad_norm": 0.7992751598358154, "kl": 0.0911865234375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8835034370422363, "reward_std": 0.08292338252067566, "rewards/accuracy_reward": 0.8937075138092041, "rewards/format_reward": 0.9897959232330322, "step": 4144 }, { "completion_length": 234.1530532836914, "epoch": 0.4171069182389937, "grad_norm": 0.39841559529304504, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8047841787338257, "reward_std": 0.0680423779413104, "rewards/accuracy_reward": 0.8047841787338257, "rewards/format_reward": 1.0, "step": 4145 }, { "completion_length": 277.89794921875, "epoch": 0.41720754716981134, "grad_norm": 0.6954271793365479, "kl": 0.05224609375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.764820158481598, "reward_std": 0.18382546305656433, "rewards/accuracy_reward": 0.7852283418178558, "rewards/format_reward": 0.9795918166637421, "step": 4146 }, { "completion_length": 200.78570556640625, "epoch": 0.4173081761006289, "grad_norm": 0.6246887445449829, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8095238208770752, "reward_std": 0.1011742502450943, "rewards/accuracy_reward": 0.8095237910747528, "rewards/format_reward": 1.0, "step": 4147 }, { "completion_length": 235.73468017578125, "epoch": 0.41740880503144656, "grad_norm": 0.6654558777809143, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7266543507575989, "reward_std": 0.15008116886019707, "rewards/accuracy_reward": 0.7368583977222443, "rewards/format_reward": 0.9897959232330322, "step": 4148 }, { "completion_length": 193.29591369628906, "epoch": 0.41750943396226414, "grad_norm": 0.8512043952941895, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7780361771583557, "reward_std": 0.10520253702998161, "rewards/accuracy_reward": 0.7780362069606781, "rewards/format_reward": 1.0, "step": 4149 }, { "completion_length": 252.06121826171875, "epoch": 0.4176100628930818, "grad_norm": 1.3883402347564697, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.660656750202179, "reward_std": 0.2155015468597412, "rewards/accuracy_reward": 0.6810649335384369, "rewards/format_reward": 0.9795918464660645, "step": 4150 }, { "completion_length": 273.5816345214844, "epoch": 0.41771069182389936, "grad_norm": 0.7044515609741211, "kl": 0.096923828125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6771737337112427, "reward_std": 0.20960885286331177, "rewards/accuracy_reward": 0.7179900109767914, "rewards/format_reward": 0.9591836333274841, "step": 4151 }, { "completion_length": 177.60203552246094, "epoch": 0.417811320754717, "grad_norm": 0.9889403581619263, "kl": 0.0966796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7924385070800781, "reward_std": 0.14460739120841026, "rewards/accuracy_reward": 0.8230507373809814, "rewards/format_reward": 0.9693877398967743, "step": 4152 }, { "completion_length": 181.11224365234375, "epoch": 0.4179119496855346, "grad_norm": 0.6747463941574097, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8968353867530823, "reward_std": 0.10103998333215714, "rewards/accuracy_reward": 0.90703946352005, "rewards/format_reward": 0.9897959232330322, "step": 4153 }, { "completion_length": 185.85713958740234, "epoch": 0.4180125786163522, "grad_norm": 0.465928852558136, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8292734026908875, "reward_std": 0.13324489444494247, "rewards/accuracy_reward": 0.8394775688648224, "rewards/format_reward": 0.9897959232330322, "step": 4154 }, { "completion_length": 185.6530532836914, "epoch": 0.4181132075471698, "grad_norm": 0.6816798448562622, "kl": 0.0687255859375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8627569675445557, "reward_std": 0.0546703040599823, "rewards/accuracy_reward": 0.8627569675445557, "rewards/format_reward": 1.0, "step": 4155 }, { "completion_length": 240.77549743652344, "epoch": 0.41821383647798743, "grad_norm": 0.8315460681915283, "kl": 0.0738525390625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6522756814956665, "reward_std": 0.1718544065952301, "rewards/accuracy_reward": 0.6726838648319244, "rewards/format_reward": 0.9795918166637421, "step": 4156 }, { "completion_length": 231.73468780517578, "epoch": 0.418314465408805, "grad_norm": 1.2186909914016724, "kl": 0.095458984375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.5936053395271301, "reward_std": 0.24062296003103256, "rewards/accuracy_reward": 0.6140134632587433, "rewards/format_reward": 0.9795918166637421, "step": 4157 }, { "completion_length": 227.55101013183594, "epoch": 0.41841509433962265, "grad_norm": 1.5282049179077148, "kl": 0.12353515625, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.6341122388839722, "reward_std": 0.1696111187338829, "rewards/accuracy_reward": 0.6545203924179077, "rewards/format_reward": 0.9795918464660645, "step": 4158 }, { "completion_length": 203.91836547851562, "epoch": 0.41851572327044023, "grad_norm": 0.6136844754219055, "kl": 0.102783203125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.776098906993866, "reward_std": 0.1103656142950058, "rewards/accuracy_reward": 0.7863029837608337, "rewards/format_reward": 0.9897959232330322, "step": 4159 }, { "completion_length": 188.30612182617188, "epoch": 0.41861635220125787, "grad_norm": 1.1552625894546509, "kl": 0.13818359375, "learning_rate": 1e-06, "loss": 0.0055, "reward": 1.6876206994056702, "reward_std": 0.14882681891322136, "rewards/accuracy_reward": 0.6978247761726379, "rewards/format_reward": 0.9897959232330322, "step": 4160 }, { "completion_length": 220.8673324584961, "epoch": 0.41871698113207545, "grad_norm": 0.647375762462616, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7818593978881836, "reward_std": 0.23506460338830948, "rewards/accuracy_reward": 0.7920635044574738, "rewards/format_reward": 0.9897959232330322, "step": 4161 }, { "completion_length": 193.27550506591797, "epoch": 0.4188176100628931, "grad_norm": 1.0257409811019897, "kl": 0.0582275390625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8673468828201294, "reward_std": 0.1652088388800621, "rewards/accuracy_reward": 0.8673469424247742, "rewards/format_reward": 1.0, "step": 4162 }, { "completion_length": 180.30612182617188, "epoch": 0.41891823899371067, "grad_norm": 4.7015790939331055, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8549553751945496, "reward_std": 0.19977069646120071, "rewards/accuracy_reward": 0.8855676352977753, "rewards/format_reward": 0.9693877398967743, "step": 4163 }, { "completion_length": 181.06122589111328, "epoch": 0.4190188679245283, "grad_norm": 0.37571266293525696, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8367347121238708, "reward_std": 0.05399492383003235, "rewards/accuracy_reward": 0.8469387590885162, "rewards/format_reward": 0.9897959232330322, "step": 4164 }, { "completion_length": 265.08162689208984, "epoch": 0.4191194968553459, "grad_norm": 0.49710381031036377, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6082125902175903, "reward_std": 0.04129035584628582, "rewards/accuracy_reward": 0.6082125902175903, "rewards/format_reward": 1.0, "step": 4165 }, { "completion_length": 171.62245178222656, "epoch": 0.4192201257861635, "grad_norm": 1.1376667022705078, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8925238251686096, "reward_std": 0.1273123435676098, "rewards/accuracy_reward": 0.9027279019355774, "rewards/format_reward": 0.9897959232330322, "step": 4166 }, { "completion_length": 168.4795913696289, "epoch": 0.41932075471698116, "grad_norm": 1.4469869136810303, "kl": 0.11328125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7719644904136658, "reward_std": 0.11157885566353798, "rewards/accuracy_reward": 0.7821685373783112, "rewards/format_reward": 0.9897959232330322, "step": 4167 }, { "completion_length": 263.8061218261719, "epoch": 0.41942138364779874, "grad_norm": 0.5913186073303223, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6971438527107239, "reward_std": 0.19535557180643082, "rewards/accuracy_reward": 0.707347959280014, "rewards/format_reward": 0.9897959232330322, "step": 4168 }, { "completion_length": 198.41836166381836, "epoch": 0.4195220125786164, "grad_norm": 0.39536893367767334, "kl": 0.108154296875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7387766242027283, "reward_std": 0.09579049982130527, "rewards/accuracy_reward": 0.7693889439105988, "rewards/format_reward": 0.9693877398967743, "step": 4169 }, { "completion_length": 237.45917510986328, "epoch": 0.41962264150943396, "grad_norm": 1.0704978704452515, "kl": 0.15771484375, "learning_rate": 1e-06, "loss": 0.0065, "reward": 1.7098206281661987, "reward_std": 0.17584046721458435, "rewards/accuracy_reward": 0.7302287518978119, "rewards/format_reward": 0.9795918464660645, "step": 4170 }, { "completion_length": 231.77550506591797, "epoch": 0.4197232704402516, "grad_norm": 1.7081663608551025, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7173835635185242, "reward_std": 0.22582890093326569, "rewards/accuracy_reward": 0.7275876998901367, "rewards/format_reward": 0.9897959232330322, "step": 4171 }, { "completion_length": 233.2346954345703, "epoch": 0.4198238993710692, "grad_norm": 0.8338609933853149, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7512595653533936, "reward_std": 0.12928404659032822, "rewards/accuracy_reward": 0.7512595951557159, "rewards/format_reward": 1.0, "step": 4172 }, { "completion_length": 240.1836700439453, "epoch": 0.4199245283018868, "grad_norm": 1.074385166168213, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.606716513633728, "reward_std": 0.3201955705881119, "rewards/accuracy_reward": 0.6271248161792755, "rewards/format_reward": 0.9795918166637421, "step": 4173 }, { "completion_length": 214.67346954345703, "epoch": 0.4200251572327044, "grad_norm": 1.1572202444076538, "kl": 0.09814453125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6395264267921448, "reward_std": 0.15327218174934387, "rewards/accuracy_reward": 0.6497305333614349, "rewards/format_reward": 0.9897959232330322, "step": 4174 }, { "completion_length": 272.57142639160156, "epoch": 0.42012578616352203, "grad_norm": 0.5703980922698975, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.64046311378479, "reward_std": 0.192367285490036, "rewards/accuracy_reward": 0.660871297121048, "rewards/format_reward": 0.9795918166637421, "step": 4175 }, { "completion_length": 246.9285659790039, "epoch": 0.4202264150943396, "grad_norm": 0.7153924107551575, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.662495493888855, "reward_std": 0.16887033730745316, "rewards/accuracy_reward": 0.6726995706558228, "rewards/format_reward": 0.9897959232330322, "step": 4176 }, { "completion_length": 273.55101013183594, "epoch": 0.42032704402515725, "grad_norm": 1.3740845918655396, "kl": 0.0538330078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7158085107803345, "reward_std": 0.20259174704551697, "rewards/accuracy_reward": 0.7260126173496246, "rewards/format_reward": 0.9897959232330322, "step": 4177 }, { "completion_length": 181.948974609375, "epoch": 0.42042767295597483, "grad_norm": 0.6969058513641357, "kl": 0.108154296875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.882167100906372, "reward_std": 0.13655303791165352, "rewards/accuracy_reward": 0.8821671009063721, "rewards/format_reward": 1.0, "step": 4178 }, { "completion_length": 245.61223602294922, "epoch": 0.42052830188679247, "grad_norm": 0.8764533400535583, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7616347670555115, "reward_std": 0.2072557583451271, "rewards/accuracy_reward": 0.7922471463680267, "rewards/format_reward": 0.9693877398967743, "step": 4179 }, { "completion_length": 193.2448959350586, "epoch": 0.42062893081761005, "grad_norm": 0.4998859763145447, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8612934350967407, "reward_std": 0.09648935124278069, "rewards/accuracy_reward": 0.8714974820613861, "rewards/format_reward": 0.9897959232330322, "step": 4180 }, { "completion_length": 208.67346954345703, "epoch": 0.4207295597484277, "grad_norm": 0.7775350213050842, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7604368329048157, "reward_std": 0.15425334125757217, "rewards/accuracy_reward": 0.7706409096717834, "rewards/format_reward": 0.9897959232330322, "step": 4181 }, { "completion_length": 207.49999237060547, "epoch": 0.42083018867924527, "grad_norm": 0.5408610701560974, "kl": 0.09326171875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7290573120117188, "reward_std": 0.09000144153833389, "rewards/accuracy_reward": 0.7290572822093964, "rewards/format_reward": 1.0, "step": 4182 }, { "completion_length": 188.85713958740234, "epoch": 0.4209308176100629, "grad_norm": 0.47916340827941895, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8117913603782654, "reward_std": 0.1461700201034546, "rewards/accuracy_reward": 0.8219954669475555, "rewards/format_reward": 0.9897959232330322, "step": 4183 }, { "completion_length": 258.0102005004883, "epoch": 0.4210314465408805, "grad_norm": 1.0716782808303833, "kl": 0.0740966796875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7374286651611328, "reward_std": 0.17574575543403625, "rewards/accuracy_reward": 0.7374286651611328, "rewards/format_reward": 1.0, "step": 4184 }, { "completion_length": 286.16326904296875, "epoch": 0.4211320754716981, "grad_norm": 1.1354763507843018, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.735312283039093, "reward_std": 0.2204357609152794, "rewards/accuracy_reward": 0.7353123128414154, "rewards/format_reward": 1.0, "step": 4185 }, { "completion_length": 308.99998474121094, "epoch": 0.4212327044025157, "grad_norm": 0.6369189620018005, "kl": 0.0643310546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6005991697311401, "reward_std": 0.17368783801794052, "rewards/accuracy_reward": 0.6108032763004303, "rewards/format_reward": 0.9897959232330322, "step": 4186 }, { "completion_length": 244.01019287109375, "epoch": 0.42133333333333334, "grad_norm": 0.7915284633636475, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8075079321861267, "reward_std": 0.13122053816914558, "rewards/accuracy_reward": 0.8075079619884491, "rewards/format_reward": 1.0, "step": 4187 }, { "completion_length": 288.1020278930664, "epoch": 0.4214339622641509, "grad_norm": 0.5215792059898376, "kl": 0.04541015625, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6550047993659973, "reward_std": 0.15198694542050362, "rewards/accuracy_reward": 0.6652089059352875, "rewards/format_reward": 0.9897959232330322, "step": 4188 }, { "completion_length": 211.3775405883789, "epoch": 0.42153459119496856, "grad_norm": 2.817347764968872, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6937485933303833, "reward_std": 0.206974059343338, "rewards/accuracy_reward": 0.7141566872596741, "rewards/format_reward": 0.9795918464660645, "step": 4189 }, { "completion_length": 259.2448959350586, "epoch": 0.42163522012578614, "grad_norm": 0.614644467830658, "kl": 0.05810546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.690606951713562, "reward_std": 0.1736692599952221, "rewards/accuracy_reward": 0.7008110880851746, "rewards/format_reward": 0.9897959232330322, "step": 4190 }, { "completion_length": 275.67346954345703, "epoch": 0.4217358490566038, "grad_norm": 0.9426761269569397, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.819174885749817, "reward_std": 0.29280272126197815, "rewards/accuracy_reward": 0.839583158493042, "rewards/format_reward": 0.9795918464660645, "step": 4191 }, { "completion_length": 326.3673400878906, "epoch": 0.4218364779874214, "grad_norm": 0.9140722751617432, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5154709815979004, "reward_std": 0.37183573842048645, "rewards/accuracy_reward": 0.5460831522941589, "rewards/format_reward": 0.9693877398967743, "step": 4192 }, { "completion_length": 282.3061218261719, "epoch": 0.421937106918239, "grad_norm": 0.9473293423652649, "kl": 0.0816650390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.734670341014862, "reward_std": 0.1074051670730114, "rewards/accuracy_reward": 0.7448743581771851, "rewards/format_reward": 0.9897959232330322, "step": 4193 }, { "completion_length": 233.83673095703125, "epoch": 0.4220377358490566, "grad_norm": 0.9212222695350647, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6411606073379517, "reward_std": 0.23330659046769142, "rewards/accuracy_reward": 0.6615687757730484, "rewards/format_reward": 0.9795918464660645, "step": 4194 }, { "completion_length": 198.1836700439453, "epoch": 0.4221383647798742, "grad_norm": 0.7746742367744446, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8571428656578064, "reward_std": 0.14284341037273407, "rewards/accuracy_reward": 0.8877550959587097, "rewards/format_reward": 0.9693877398967743, "step": 4195 }, { "completion_length": 267.79591369628906, "epoch": 0.42223899371069185, "grad_norm": 0.6686585545539856, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.817913830280304, "reward_std": 0.12250443547964096, "rewards/accuracy_reward": 0.8383219838142395, "rewards/format_reward": 0.9795918166637421, "step": 4196 }, { "completion_length": 203.4285659790039, "epoch": 0.4223396226415094, "grad_norm": 1.4701926708221436, "kl": 0.0953369140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.787569522857666, "reward_std": 0.25232211500406265, "rewards/accuracy_reward": 0.787569522857666, "rewards/format_reward": 1.0, "step": 4197 }, { "completion_length": 224.11224365234375, "epoch": 0.42244025157232706, "grad_norm": 1.0079277753829956, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6710310578346252, "reward_std": 0.2952592968940735, "rewards/accuracy_reward": 0.7118473649024963, "rewards/format_reward": 0.9591836333274841, "step": 4198 }, { "completion_length": 334.448974609375, "epoch": 0.42254088050314464, "grad_norm": 0.6996421217918396, "kl": 0.0772705078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.4760227799415588, "reward_std": 0.18800364434719086, "rewards/accuracy_reward": 0.4862269461154938, "rewards/format_reward": 0.9897959232330322, "step": 4199 }, { "completion_length": 270.4591751098633, "epoch": 0.4226415094339623, "grad_norm": 0.7716877460479736, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.498453974723816, "reward_std": 0.2243000566959381, "rewards/accuracy_reward": 0.5188620835542679, "rewards/format_reward": 0.9795918166637421, "step": 4200 }, { "completion_length": 216.24488830566406, "epoch": 0.42274213836477986, "grad_norm": 0.5122873187065125, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.726963460445404, "reward_std": 0.1422477327287197, "rewards/accuracy_reward": 0.7269634902477264, "rewards/format_reward": 1.0, "step": 4201 }, { "completion_length": 275.79591369628906, "epoch": 0.4228427672955975, "grad_norm": 0.5029256343841553, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.73031085729599, "reward_std": 0.16426322609186172, "rewards/accuracy_reward": 0.7507190108299255, "rewards/format_reward": 0.9795918166637421, "step": 4202 }, { "completion_length": 175.13265228271484, "epoch": 0.4229433962264151, "grad_norm": 7.5397162437438965, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8905490636825562, "reward_std": 0.04906613705679774, "rewards/accuracy_reward": 0.9007531106472015, "rewards/format_reward": 0.9897959232330322, "step": 4203 }, { "completion_length": 246.1836700439453, "epoch": 0.4230440251572327, "grad_norm": 0.6317194700241089, "kl": 0.0977783203125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6820226907730103, "reward_std": 0.14481331035494804, "rewards/accuracy_reward": 0.7024308741092682, "rewards/format_reward": 0.9795918166637421, "step": 4204 }, { "completion_length": 207.7448959350586, "epoch": 0.4231446540880503, "grad_norm": 2.447680711746216, "kl": 0.105224609375, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.6244410872459412, "reward_std": 0.09823275357484818, "rewards/accuracy_reward": 0.6346451938152313, "rewards/format_reward": 0.9897959232330322, "step": 4205 }, { "completion_length": 169.51020050048828, "epoch": 0.42324528301886793, "grad_norm": 1.1998358964920044, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.674669861793518, "reward_std": 0.21981992572546005, "rewards/accuracy_reward": 0.6848739385604858, "rewards/format_reward": 0.9897959232330322, "step": 4206 }, { "completion_length": 305.07142639160156, "epoch": 0.4233459119496855, "grad_norm": 0.877172589302063, "kl": 0.0626220703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7176929116249084, "reward_std": 0.23122600466012955, "rewards/accuracy_reward": 0.7483052313327789, "rewards/format_reward": 0.9693877398967743, "step": 4207 }, { "completion_length": 280.7551040649414, "epoch": 0.42344654088050315, "grad_norm": 1.0009993314743042, "kl": 0.0672607421875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7066326141357422, "reward_std": 0.29370658099651337, "rewards/accuracy_reward": 0.7372449040412903, "rewards/format_reward": 0.9693877398967743, "step": 4208 }, { "completion_length": 232.42857360839844, "epoch": 0.42354716981132073, "grad_norm": 1.2533202171325684, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7806122303009033, "reward_std": 0.22459400445222855, "rewards/accuracy_reward": 0.8010203838348389, "rewards/format_reward": 0.9795918464660645, "step": 4209 }, { "completion_length": 175.1836700439453, "epoch": 0.42364779874213837, "grad_norm": 0.6419668793678284, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.816671073436737, "reward_std": 0.12024538964033127, "rewards/accuracy_reward": 0.837079256772995, "rewards/format_reward": 0.9795918166637421, "step": 4210 }, { "completion_length": 279.01019287109375, "epoch": 0.42374842767295595, "grad_norm": 1.743756890296936, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.800682246685028, "reward_std": 0.27048465609550476, "rewards/accuracy_reward": 0.821090430021286, "rewards/format_reward": 0.9795918464660645, "step": 4211 }, { "completion_length": 289.57142639160156, "epoch": 0.4238490566037736, "grad_norm": 0.6451298594474792, "kl": 0.0726318359375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7185134291648865, "reward_std": 0.16650831699371338, "rewards/accuracy_reward": 0.7287175059318542, "rewards/format_reward": 0.9897959232330322, "step": 4212 }, { "completion_length": 326.33673095703125, "epoch": 0.42394968553459117, "grad_norm": 0.7107820510864258, "kl": 0.0694580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7885112762451172, "reward_std": 0.21983880549669266, "rewards/accuracy_reward": 0.7987154126167297, "rewards/format_reward": 0.9897959232330322, "step": 4213 }, { "completion_length": 261.9897994995117, "epoch": 0.4240503144654088, "grad_norm": 0.6791097521781921, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.856559693813324, "reward_std": 0.2034367099404335, "rewards/accuracy_reward": 0.8769679069519043, "rewards/format_reward": 0.9795918166637421, "step": 4214 }, { "completion_length": 212.7244873046875, "epoch": 0.4241509433962264, "grad_norm": 0.7992509603500366, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7697577476501465, "reward_std": 0.15398072451353073, "rewards/accuracy_reward": 0.7799618244171143, "rewards/format_reward": 0.9897959232330322, "step": 4215 }, { "completion_length": 234.72447967529297, "epoch": 0.424251572327044, "grad_norm": 0.7983288764953613, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7653061151504517, "reward_std": 0.1652088463306427, "rewards/accuracy_reward": 0.7653060853481293, "rewards/format_reward": 1.0, "step": 4216 }, { "completion_length": 231.09182739257812, "epoch": 0.42435220125786166, "grad_norm": 0.5470424294471741, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7549562454223633, "reward_std": 0.09531647711992264, "rewards/accuracy_reward": 0.7651603519916534, "rewards/format_reward": 0.9897959232330322, "step": 4217 }, { "completion_length": 250.8775405883789, "epoch": 0.42445283018867924, "grad_norm": 0.5991119146347046, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7141411304473877, "reward_std": 0.1873490810394287, "rewards/accuracy_reward": 0.7243451774120331, "rewards/format_reward": 0.9897959232330322, "step": 4218 }, { "completion_length": 280.88775634765625, "epoch": 0.4245534591194969, "grad_norm": 0.45322996377944946, "kl": 0.0460205078125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7700392007827759, "reward_std": 0.1422267183661461, "rewards/accuracy_reward": 0.7904473543167114, "rewards/format_reward": 0.9795918166637421, "step": 4219 }, { "completion_length": 220.10203552246094, "epoch": 0.42465408805031446, "grad_norm": 1.2013877630233765, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8469387292861938, "reward_std": 0.18279438838362694, "rewards/accuracy_reward": 0.8469387590885162, "rewards/format_reward": 1.0, "step": 4220 }, { "completion_length": 256.79591369628906, "epoch": 0.4247547169811321, "grad_norm": 0.7426003813743591, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6725847721099854, "reward_std": 0.17978333681821823, "rewards/accuracy_reward": 0.6725847572088242, "rewards/format_reward": 1.0, "step": 4221 }, { "completion_length": 237.27550506591797, "epoch": 0.4248553459119497, "grad_norm": 0.6547278761863708, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6920044422149658, "reward_std": 0.1583659127354622, "rewards/accuracy_reward": 0.7022084891796112, "rewards/format_reward": 0.9897959232330322, "step": 4222 }, { "completion_length": 212.2142791748047, "epoch": 0.4249559748427673, "grad_norm": 0.9015583992004395, "kl": 0.049072265625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.8274257183074951, "reward_std": 0.20038547739386559, "rewards/accuracy_reward": 0.8376297652721405, "rewards/format_reward": 0.9897959232330322, "step": 4223 }, { "completion_length": 218.59182739257812, "epoch": 0.4250566037735849, "grad_norm": 1.0220946073532104, "kl": 0.050537109375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.750566840171814, "reward_std": 0.2428746148943901, "rewards/accuracy_reward": 0.7607709765434265, "rewards/format_reward": 0.9897959232330322, "step": 4224 }, { "completion_length": 288.26529693603516, "epoch": 0.42515723270440253, "grad_norm": 0.5008363127708435, "kl": 0.0709228515625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.780362606048584, "reward_std": 0.09991743043065071, "rewards/accuracy_reward": 0.7803626358509064, "rewards/format_reward": 1.0, "step": 4225 }, { "completion_length": 217.55101776123047, "epoch": 0.4252578616352201, "grad_norm": 0.6814363598823547, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.720186710357666, "reward_std": 0.12528188899159431, "rewards/accuracy_reward": 0.7201866805553436, "rewards/format_reward": 1.0, "step": 4226 }, { "completion_length": 243.0408172607422, "epoch": 0.42535849056603775, "grad_norm": 1.1158397197723389, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7008432745933533, "reward_std": 0.21052364259958267, "rewards/accuracy_reward": 0.7314554750919342, "rewards/format_reward": 0.9693877398967743, "step": 4227 }, { "completion_length": 283.2040710449219, "epoch": 0.42545911949685533, "grad_norm": 0.7196944355964661, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7039169669151306, "reward_std": 0.2353805936872959, "rewards/accuracy_reward": 0.7345293760299683, "rewards/format_reward": 0.9693877398967743, "step": 4228 }, { "completion_length": 286.9081497192383, "epoch": 0.42555974842767297, "grad_norm": 2.0051491260528564, "kl": 0.0589599609375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6069992184638977, "reward_std": 0.26113753020763397, "rewards/accuracy_reward": 0.6172033548355103, "rewards/format_reward": 0.9897959232330322, "step": 4229 }, { "completion_length": 325.2040710449219, "epoch": 0.42566037735849055, "grad_norm": 0.6305873990058899, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5665618777275085, "reward_std": 0.1832265667617321, "rewards/accuracy_reward": 0.5767660737037659, "rewards/format_reward": 0.9897959232330322, "step": 4230 }, { "completion_length": 158.2040786743164, "epoch": 0.4257610062893082, "grad_norm": 1.1137515306472778, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7653060555458069, "reward_std": 0.11114142835140228, "rewards/accuracy_reward": 0.775510162115097, "rewards/format_reward": 0.9897959232330322, "step": 4231 }, { "completion_length": 204.14285278320312, "epoch": 0.42586163522012577, "grad_norm": 0.9764642119407654, "kl": 0.10400390625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7948346734046936, "reward_std": 0.10633355379104614, "rewards/accuracy_reward": 0.7948346734046936, "rewards/format_reward": 1.0, "step": 4232 }, { "completion_length": 299.4387741088867, "epoch": 0.4259622641509434, "grad_norm": 0.6076422333717346, "kl": 0.0899658203125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.514459252357483, "reward_std": 0.1995495744049549, "rewards/accuracy_reward": 0.5450714230537415, "rewards/format_reward": 0.9693877398967743, "step": 4233 }, { "completion_length": 209.9183578491211, "epoch": 0.426062893081761, "grad_norm": 0.5172280073165894, "kl": 0.0543212890625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8847641348838806, "reward_std": 0.08675535395741463, "rewards/accuracy_reward": 0.8847642242908478, "rewards/format_reward": 1.0, "step": 4234 }, { "completion_length": 221.81632232666016, "epoch": 0.4261635220125786, "grad_norm": 1.2346612215042114, "kl": 0.097412109375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.677491307258606, "reward_std": 0.2790987566113472, "rewards/accuracy_reward": 0.6978995203971863, "rewards/format_reward": 0.9795918166637421, "step": 4235 }, { "completion_length": 235.49999237060547, "epoch": 0.4262641509433962, "grad_norm": 1.187153935432434, "kl": 0.108642578125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7155889868736267, "reward_std": 0.2001645565032959, "rewards/accuracy_reward": 0.7359971106052399, "rewards/format_reward": 0.9795918166637421, "step": 4236 }, { "completion_length": 254.3877410888672, "epoch": 0.42636477987421384, "grad_norm": 0.9220999479293823, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.765023410320282, "reward_std": 0.1519663855433464, "rewards/accuracy_reward": 0.7854315638542175, "rewards/format_reward": 0.9795918464660645, "step": 4237 }, { "completion_length": 241.6734619140625, "epoch": 0.4264654088050314, "grad_norm": 0.7006030678749084, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8325763940811157, "reward_std": 0.11394812166690826, "rewards/accuracy_reward": 0.8325763940811157, "rewards/format_reward": 1.0, "step": 4238 }, { "completion_length": 290.82652282714844, "epoch": 0.42656603773584906, "grad_norm": 0.7829302549362183, "kl": 0.10546875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.5676720142364502, "reward_std": 0.2266477569937706, "rewards/accuracy_reward": 0.5778760612010956, "rewards/format_reward": 0.9897959232330322, "step": 4239 }, { "completion_length": 239.56121826171875, "epoch": 0.4266666666666667, "grad_norm": 1.2015646696090698, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8144295811653137, "reward_std": 0.19224916398525238, "rewards/accuracy_reward": 0.8348377645015717, "rewards/format_reward": 0.9795918464660645, "step": 4240 }, { "completion_length": 222.4897918701172, "epoch": 0.4267672955974843, "grad_norm": 1.0496315956115723, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8216714262962341, "reward_std": 0.19459382444620132, "rewards/accuracy_reward": 0.8318756222724915, "rewards/format_reward": 0.9897959232330322, "step": 4241 }, { "completion_length": 224.91836547851562, "epoch": 0.4268679245283019, "grad_norm": 2.165905475616455, "kl": 0.05517578125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8388876914978027, "reward_std": 0.05963789485394955, "rewards/accuracy_reward": 0.838887631893158, "rewards/format_reward": 1.0, "step": 4242 }, { "completion_length": 342.0, "epoch": 0.4269685534591195, "grad_norm": 0.8618969321250916, "kl": 0.047119140625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.6913924813270569, "reward_std": 0.11861025542020798, "rewards/accuracy_reward": 0.6913925409317017, "rewards/format_reward": 1.0, "step": 4243 }, { "completion_length": 300.1428527832031, "epoch": 0.42706918238993713, "grad_norm": 0.7124151587486267, "kl": 0.0601806640625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6891852021217346, "reward_std": 0.23301726579666138, "rewards/accuracy_reward": 0.7095933854579926, "rewards/format_reward": 0.9795918464660645, "step": 4244 }, { "completion_length": 253.69387817382812, "epoch": 0.4271698113207547, "grad_norm": 0.8252783417701721, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7740564346313477, "reward_std": 0.15219169482588768, "rewards/accuracy_reward": 0.7944645881652832, "rewards/format_reward": 0.9795918464660645, "step": 4245 }, { "completion_length": 235.33673095703125, "epoch": 0.42727044025157235, "grad_norm": 0.7848260998725891, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6763140559196472, "reward_std": 0.15828847885131836, "rewards/accuracy_reward": 0.676314115524292, "rewards/format_reward": 1.0, "step": 4246 }, { "completion_length": 213.37754821777344, "epoch": 0.42737106918238993, "grad_norm": 1.0400738716125488, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8618163466453552, "reward_std": 0.08762698993086815, "rewards/accuracy_reward": 0.8618163764476776, "rewards/format_reward": 1.0, "step": 4247 }, { "completion_length": 260.65306091308594, "epoch": 0.42747169811320757, "grad_norm": 0.7433923482894897, "kl": 0.11669921875, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.572108805179596, "reward_std": 0.1631675511598587, "rewards/accuracy_reward": 0.5823129117488861, "rewards/format_reward": 0.9897959232330322, "step": 4248 }, { "completion_length": 283.1632537841797, "epoch": 0.42757232704402515, "grad_norm": 2.191739797592163, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6517006754875183, "reward_std": 0.29440218210220337, "rewards/accuracy_reward": 0.6619047522544861, "rewards/format_reward": 0.9897959232330322, "step": 4249 }, { "completion_length": 293.4897994995117, "epoch": 0.4276729559748428, "grad_norm": 2.1400609016418457, "kl": 0.0460205078125, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7595238089561462, "reward_std": 0.14325707778334618, "rewards/accuracy_reward": 0.7595238089561462, "rewards/format_reward": 1.0, "step": 4250 }, { "completion_length": 273.29590606689453, "epoch": 0.42777358490566036, "grad_norm": 0.5180535912513733, "kl": 0.051513671875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.872278869152069, "reward_std": 0.09041845425963402, "rewards/accuracy_reward": 0.8824829757213593, "rewards/format_reward": 0.9897959232330322, "step": 4251 }, { "completion_length": 295.86734771728516, "epoch": 0.427874213836478, "grad_norm": 0.47229427099227905, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6760644316673279, "reward_std": 0.17409682646393776, "rewards/accuracy_reward": 0.6760644614696503, "rewards/format_reward": 1.0, "step": 4252 }, { "completion_length": 202.49999237060547, "epoch": 0.4279748427672956, "grad_norm": 0.7102140784263611, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7825708985328674, "reward_std": 0.1040121540427208, "rewards/accuracy_reward": 0.7825708985328674, "rewards/format_reward": 1.0, "step": 4253 }, { "completion_length": 295.52040100097656, "epoch": 0.4280754716981132, "grad_norm": 0.8039922714233398, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8085379004478455, "reward_std": 0.2728066220879555, "rewards/accuracy_reward": 0.8289460837841034, "rewards/format_reward": 0.9795918166637421, "step": 4254 }, { "completion_length": 306.41835021972656, "epoch": 0.4281761006289308, "grad_norm": 3.8238987922668457, "kl": 0.100830078125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6094959378242493, "reward_std": 0.25320786237716675, "rewards/accuracy_reward": 0.6401082277297974, "rewards/format_reward": 0.9693877398967743, "step": 4255 }, { "completion_length": 265.82652282714844, "epoch": 0.42827672955974844, "grad_norm": 0.958011269569397, "kl": 0.0511474609375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7618803977966309, "reward_std": 0.1826382651925087, "rewards/accuracy_reward": 0.7618804275989532, "rewards/format_reward": 1.0, "step": 4256 }, { "completion_length": 283.93878173828125, "epoch": 0.428377358490566, "grad_norm": 0.6241685152053833, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7462517619132996, "reward_std": 0.15661398321390152, "rewards/accuracy_reward": 0.756455808877945, "rewards/format_reward": 0.9897959232330322, "step": 4257 }, { "completion_length": 207.88775634765625, "epoch": 0.42847798742138365, "grad_norm": 0.9287964701652527, "kl": 0.053466796875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7603541612625122, "reward_std": 0.19408465921878815, "rewards/accuracy_reward": 0.760354071855545, "rewards/format_reward": 1.0, "step": 4258 }, { "completion_length": 367.8061218261719, "epoch": 0.42857861635220124, "grad_norm": 0.6729754209518433, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7061745524406433, "reward_std": 0.23777564615011215, "rewards/accuracy_reward": 0.7163786590099335, "rewards/format_reward": 0.9897959232330322, "step": 4259 }, { "completion_length": 280.07142639160156, "epoch": 0.4286792452830189, "grad_norm": 0.9943805932998657, "kl": 0.058837890625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.863378643989563, "reward_std": 0.13845471292734146, "rewards/accuracy_reward": 0.863378643989563, "rewards/format_reward": 1.0, "step": 4260 }, { "completion_length": 267.74488830566406, "epoch": 0.42877987421383645, "grad_norm": 0.6136424541473389, "kl": 0.053466796875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7743764519691467, "reward_std": 0.09208921529352665, "rewards/accuracy_reward": 0.7743763625621796, "rewards/format_reward": 1.0, "step": 4261 }, { "completion_length": 226.77550506591797, "epoch": 0.4288805031446541, "grad_norm": 0.5543892979621887, "kl": 0.0625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7675491571426392, "reward_std": 0.13120894134044647, "rewards/accuracy_reward": 0.7777532935142517, "rewards/format_reward": 0.9897959232330322, "step": 4262 }, { "completion_length": 279.5918273925781, "epoch": 0.42898113207547167, "grad_norm": 0.8657248020172119, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7804814577102661, "reward_std": 0.21990295499563217, "rewards/accuracy_reward": 0.8110936880111694, "rewards/format_reward": 0.9693877398967743, "step": 4263 }, { "completion_length": 241.80612182617188, "epoch": 0.4290817610062893, "grad_norm": 1.2635154724121094, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8012702465057373, "reward_std": 0.09208398312330246, "rewards/accuracy_reward": 0.8012702465057373, "rewards/format_reward": 1.0, "step": 4264 }, { "completion_length": 197.06121826171875, "epoch": 0.42918238993710695, "grad_norm": 0.7395122051239014, "kl": 0.11328125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.8279649019241333, "reward_std": 0.10748771950602531, "rewards/accuracy_reward": 0.8381689488887787, "rewards/format_reward": 0.9897959232330322, "step": 4265 }, { "completion_length": 272.6428527832031, "epoch": 0.4292830188679245, "grad_norm": 0.4296056032180786, "kl": 0.0662841796875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7278911471366882, "reward_std": 0.15969868749380112, "rewards/accuracy_reward": 0.7380952388048172, "rewards/format_reward": 0.9897959232330322, "step": 4266 }, { "completion_length": 258.1632537841797, "epoch": 0.42938364779874216, "grad_norm": 0.6711570620536804, "kl": 0.05908203125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8453190922737122, "reward_std": 0.219838485121727, "rewards/accuracy_reward": 0.8657272160053253, "rewards/format_reward": 0.9795918464660645, "step": 4267 }, { "completion_length": 303.1836700439453, "epoch": 0.42948427672955974, "grad_norm": 0.6179366111755371, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8694937229156494, "reward_std": 0.19236662238836288, "rewards/accuracy_reward": 0.8796978294849396, "rewards/format_reward": 0.9897959232330322, "step": 4268 }, { "completion_length": 261.1632614135742, "epoch": 0.4295849056603774, "grad_norm": 0.7984612584114075, "kl": 0.0899658203125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.733560025691986, "reward_std": 0.17791873961687088, "rewards/accuracy_reward": 0.7335600554943085, "rewards/format_reward": 1.0, "step": 4269 }, { "completion_length": 254.77550506591797, "epoch": 0.42968553459119496, "grad_norm": 0.3374984860420227, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8210850954055786, "reward_std": 0.05079168081283569, "rewards/accuracy_reward": 0.8312891721725464, "rewards/format_reward": 0.9897959232330322, "step": 4270 }, { "completion_length": 159.26529693603516, "epoch": 0.4297861635220126, "grad_norm": 0.8798404932022095, "kl": 0.1005859375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8831168413162231, "reward_std": 0.07997560128569603, "rewards/accuracy_reward": 0.8831168711185455, "rewards/format_reward": 1.0, "step": 4271 }, { "completion_length": 245.1938705444336, "epoch": 0.4298867924528302, "grad_norm": 0.3689518868923187, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8287545442581177, "reward_std": 0.07930074073374271, "rewards/accuracy_reward": 0.8389586806297302, "rewards/format_reward": 0.9897959232330322, "step": 4272 }, { "completion_length": 211.1938705444336, "epoch": 0.4299874213836478, "grad_norm": 3.157409191131592, "kl": 0.09912109375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7367346286773682, "reward_std": 0.174481563270092, "rewards/accuracy_reward": 0.7571428418159485, "rewards/format_reward": 0.9795918464660645, "step": 4273 }, { "completion_length": 198.4795913696289, "epoch": 0.4300880503144654, "grad_norm": 0.6960923075675964, "kl": 0.11328125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7941694855690002, "reward_std": 0.16682352125644684, "rewards/accuracy_reward": 0.8043736219406128, "rewards/format_reward": 0.9897959232330322, "step": 4274 }, { "completion_length": 242.06122589111328, "epoch": 0.43018867924528303, "grad_norm": 0.6355876326560974, "kl": 0.0728759765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7829632759094238, "reward_std": 0.0904037170112133, "rewards/accuracy_reward": 0.7829633057117462, "rewards/format_reward": 1.0, "step": 4275 }, { "completion_length": 305.4693908691406, "epoch": 0.4302893081761006, "grad_norm": 1.0209184885025024, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.705320656299591, "reward_std": 0.2369810715317726, "rewards/accuracy_reward": 0.7155247926712036, "rewards/format_reward": 0.9897959232330322, "step": 4276 }, { "completion_length": 263.7550964355469, "epoch": 0.43038993710691825, "grad_norm": 0.7719906568527222, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7427356839179993, "reward_std": 0.16096723917871714, "rewards/accuracy_reward": 0.7427356541156769, "rewards/format_reward": 1.0, "step": 4277 }, { "completion_length": 262.9387664794922, "epoch": 0.43049056603773583, "grad_norm": 0.5866401195526123, "kl": 0.05859375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.782766342163086, "reward_std": 0.16573414951562881, "rewards/accuracy_reward": 0.7929704189300537, "rewards/format_reward": 0.9897959232330322, "step": 4278 }, { "completion_length": 236.83673095703125, "epoch": 0.43059119496855347, "grad_norm": 0.7100639343261719, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8979591727256775, "reward_std": 0.1158459484577179, "rewards/accuracy_reward": 0.8979591727256775, "rewards/format_reward": 1.0, "step": 4279 }, { "completion_length": 281.9183578491211, "epoch": 0.43069182389937105, "grad_norm": 0.7449011206626892, "kl": 0.0570068359375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8140275478363037, "reward_std": 0.12069898471236229, "rewards/accuracy_reward": 0.8140275776386261, "rewards/format_reward": 1.0, "step": 4280 }, { "completion_length": 215.07141876220703, "epoch": 0.4307924528301887, "grad_norm": 0.7386614084243774, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8287981152534485, "reward_std": 0.14965105801820755, "rewards/accuracy_reward": 0.8287981450557709, "rewards/format_reward": 1.0, "step": 4281 }, { "completion_length": 287.4081573486328, "epoch": 0.43089308176100627, "grad_norm": 0.7375903725624084, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.800907015800476, "reward_std": 0.17089346051216125, "rewards/accuracy_reward": 0.8111110627651215, "rewards/format_reward": 0.9897959232330322, "step": 4282 }, { "completion_length": 213.26529693603516, "epoch": 0.4309937106918239, "grad_norm": 1.7604798078536987, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7827457785606384, "reward_std": 0.0474417582154274, "rewards/accuracy_reward": 0.7827458083629608, "rewards/format_reward": 1.0, "step": 4283 }, { "completion_length": 253.4693832397461, "epoch": 0.4310943396226415, "grad_norm": 1.0136826038360596, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7663457989692688, "reward_std": 0.33680371195077896, "rewards/accuracy_reward": 0.8173662424087524, "rewards/format_reward": 0.9489795565605164, "step": 4284 }, { "completion_length": 250.60203552246094, "epoch": 0.4311949685534591, "grad_norm": 0.42811402678489685, "kl": 0.08740234375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.855965495109558, "reward_std": 0.1188499815762043, "rewards/accuracy_reward": 0.8559654355049133, "rewards/format_reward": 1.0, "step": 4285 }, { "completion_length": 168.87754821777344, "epoch": 0.4312955974842767, "grad_norm": 0.6973148584365845, "kl": 0.1357421875, "learning_rate": 1e-06, "loss": 0.0054, "reward": 1.767408847808838, "reward_std": 0.0703073488548398, "rewards/accuracy_reward": 0.7674087584018707, "rewards/format_reward": 1.0, "step": 4286 }, { "completion_length": 231.1326446533203, "epoch": 0.43139622641509434, "grad_norm": 1.7290178537368774, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.771720051765442, "reward_std": 0.20357180386781693, "rewards/accuracy_reward": 0.7819241881370544, "rewards/format_reward": 0.9897959232330322, "step": 4287 }, { "completion_length": 221.2755126953125, "epoch": 0.4314968553459119, "grad_norm": 0.2696108818054199, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.894819438457489, "reward_std": 0.03738110512495041, "rewards/accuracy_reward": 0.894819438457489, "rewards/format_reward": 1.0, "step": 4288 }, { "completion_length": 300.60203552246094, "epoch": 0.43159748427672956, "grad_norm": 0.6607798933982849, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6890090107917786, "reward_std": 0.17179801687598228, "rewards/accuracy_reward": 0.699213057756424, "rewards/format_reward": 0.9897959232330322, "step": 4289 }, { "completion_length": 344.3877410888672, "epoch": 0.4316981132075472, "grad_norm": 1.0500508546829224, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5627226829528809, "reward_std": 0.21328400075435638, "rewards/accuracy_reward": 0.5831308513879776, "rewards/format_reward": 0.9795918464660645, "step": 4290 }, { "completion_length": 265.7551040649414, "epoch": 0.4317987421383648, "grad_norm": 0.540928065776825, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7861780524253845, "reward_std": 0.12058108299970627, "rewards/accuracy_reward": 0.7861781120300293, "rewards/format_reward": 1.0, "step": 4291 }, { "completion_length": 225.33673095703125, "epoch": 0.4318993710691824, "grad_norm": 1.9607807397842407, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7236562967300415, "reward_std": 0.19777490943670273, "rewards/accuracy_reward": 0.764472633600235, "rewards/format_reward": 0.9591836333274841, "step": 4292 }, { "completion_length": 208.57142639160156, "epoch": 0.432, "grad_norm": 0.9689529538154602, "kl": 0.104248046875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8007230758666992, "reward_std": 0.15420504845678806, "rewards/accuracy_reward": 0.8109272122383118, "rewards/format_reward": 0.9897959232330322, "step": 4293 }, { "completion_length": 266.8061218261719, "epoch": 0.43210062893081763, "grad_norm": 1.1119214296340942, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.70355886220932, "reward_std": 0.2588331252336502, "rewards/accuracy_reward": 0.7137629389762878, "rewards/format_reward": 0.9897959232330322, "step": 4294 }, { "completion_length": 256.6938781738281, "epoch": 0.4322012578616352, "grad_norm": 5.461298942565918, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.9016595482826233, "reward_std": 0.11382223293185234, "rewards/accuracy_reward": 0.9016595482826233, "rewards/format_reward": 1.0, "step": 4295 }, { "completion_length": 261.5102005004883, "epoch": 0.43230188679245285, "grad_norm": 1.0063326358795166, "kl": 0.091552734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6720781922340393, "reward_std": 0.17904353886842728, "rewards/accuracy_reward": 0.6822823584079742, "rewards/format_reward": 0.9897959232330322, "step": 4296 }, { "completion_length": 217.2653045654297, "epoch": 0.43240251572327043, "grad_norm": 1.295697808265686, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7968095541000366, "reward_std": 0.20983312278985977, "rewards/accuracy_reward": 0.8172177970409393, "rewards/format_reward": 0.9795918464660645, "step": 4297 }, { "completion_length": 171.5, "epoch": 0.43250314465408807, "grad_norm": 1.8400946855545044, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6506483554840088, "reward_std": 0.13947682455182076, "rewards/accuracy_reward": 0.6710565388202667, "rewards/format_reward": 0.9795918166637421, "step": 4298 }, { "completion_length": 229.06121826171875, "epoch": 0.43260377358490565, "grad_norm": 1.123592734336853, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7300556302070618, "reward_std": 0.29332638531923294, "rewards/accuracy_reward": 0.7504638135433197, "rewards/format_reward": 0.9795918464660645, "step": 4299 }, { "completion_length": 235.43877410888672, "epoch": 0.4327044025157233, "grad_norm": 0.9254050850868225, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.763140857219696, "reward_std": 0.19950789958238602, "rewards/accuracy_reward": 0.7733449041843414, "rewards/format_reward": 0.9897959232330322, "step": 4300 }, { "completion_length": 227.38774871826172, "epoch": 0.43280503144654087, "grad_norm": 0.7355244159698486, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.712806522846222, "reward_std": 0.12206986546516418, "rewards/accuracy_reward": 0.7332146465778351, "rewards/format_reward": 0.9795918464660645, "step": 4301 }, { "completion_length": 228.30612182617188, "epoch": 0.4329056603773585, "grad_norm": 0.5389595627784729, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.9132652878761292, "reward_std": 0.04049619287252426, "rewards/accuracy_reward": 0.9132652878761292, "rewards/format_reward": 1.0, "step": 4302 }, { "completion_length": 202.2040786743164, "epoch": 0.4330062893081761, "grad_norm": 1.3075670003890991, "kl": 0.124267578125, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.8291993141174316, "reward_std": 0.21455511450767517, "rewards/accuracy_reward": 0.8394034802913666, "rewards/format_reward": 0.9897959232330322, "step": 4303 }, { "completion_length": 286.1326446533203, "epoch": 0.4331069182389937, "grad_norm": 0.7130185961723328, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7294305562973022, "reward_std": 0.2030099630355835, "rewards/accuracy_reward": 0.760042816400528, "rewards/format_reward": 0.9693877398967743, "step": 4304 }, { "completion_length": 244.12244415283203, "epoch": 0.4332075471698113, "grad_norm": 0.8307574391365051, "kl": 0.103515625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.750364363193512, "reward_std": 0.17512596398591995, "rewards/accuracy_reward": 0.7605684697628021, "rewards/format_reward": 0.9897959232330322, "step": 4305 }, { "completion_length": 203.87754821777344, "epoch": 0.43330817610062894, "grad_norm": 0.4373164176940918, "kl": 0.0533447265625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8136621713638306, "reward_std": 0.10550111159682274, "rewards/accuracy_reward": 0.8136621415615082, "rewards/format_reward": 1.0, "step": 4306 }, { "completion_length": 237.1734619140625, "epoch": 0.4334088050314465, "grad_norm": 0.6216320395469666, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6509623527526855, "reward_std": 0.12941491790115833, "rewards/accuracy_reward": 0.671370655298233, "rewards/format_reward": 0.9795918166637421, "step": 4307 }, { "completion_length": 234.30611419677734, "epoch": 0.43350943396226416, "grad_norm": 1.945330023765564, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8447932004928589, "reward_std": 0.12264556810259819, "rewards/accuracy_reward": 0.8447932600975037, "rewards/format_reward": 1.0, "step": 4308 }, { "completion_length": 295.91835021972656, "epoch": 0.43361006289308174, "grad_norm": 0.48039236664772034, "kl": 0.0587158203125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8658892512321472, "reward_std": 0.11661470495164394, "rewards/accuracy_reward": 0.8658892214298248, "rewards/format_reward": 1.0, "step": 4309 }, { "completion_length": 282.79591369628906, "epoch": 0.4337106918238994, "grad_norm": 1.7397089004516602, "kl": 0.0909423828125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7449763417243958, "reward_std": 0.17431359365582466, "rewards/accuracy_reward": 0.7551805377006531, "rewards/format_reward": 0.9897959232330322, "step": 4310 }, { "completion_length": 284.6428527832031, "epoch": 0.43381132075471696, "grad_norm": 0.8197579383850098, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6261616349220276, "reward_std": 0.1561238169670105, "rewards/accuracy_reward": 0.6261616349220276, "rewards/format_reward": 1.0, "step": 4311 }, { "completion_length": 272.29590606689453, "epoch": 0.4339119496855346, "grad_norm": 0.6580641865730286, "kl": 0.0606689453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8699383735656738, "reward_std": 0.17327818274497986, "rewards/accuracy_reward": 0.8903465867042542, "rewards/format_reward": 0.9795918464660645, "step": 4312 }, { "completion_length": 184.34693145751953, "epoch": 0.4340125786163522, "grad_norm": 0.49574705958366394, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8586004972457886, "reward_std": 0.12254881113767624, "rewards/accuracy_reward": 0.8688046038150787, "rewards/format_reward": 0.9897959232330322, "step": 4313 }, { "completion_length": 178.52040100097656, "epoch": 0.4341132075471698, "grad_norm": 2.1049818992614746, "kl": 0.09423828125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6256338357925415, "reward_std": 0.10400255396962166, "rewards/accuracy_reward": 0.6256338655948639, "rewards/format_reward": 1.0, "step": 4314 }, { "completion_length": 346.78570556640625, "epoch": 0.43421383647798745, "grad_norm": 0.40852299332618713, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6011606454849243, "reward_std": 0.0889003723859787, "rewards/accuracy_reward": 0.6113647967576981, "rewards/format_reward": 0.9897959232330322, "step": 4315 }, { "completion_length": 250.6938705444336, "epoch": 0.43431446540880503, "grad_norm": 0.3800893723964691, "kl": 0.08740234375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7979591488838196, "reward_std": 0.08475881442427635, "rewards/accuracy_reward": 0.797959178686142, "rewards/format_reward": 1.0, "step": 4316 }, { "completion_length": 224.02039337158203, "epoch": 0.43441509433962266, "grad_norm": 0.6381915807723999, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8352813124656677, "reward_std": 0.06328899040818214, "rewards/accuracy_reward": 0.8352814018726349, "rewards/format_reward": 1.0, "step": 4317 }, { "completion_length": 266.66326904296875, "epoch": 0.43451572327044025, "grad_norm": 0.6178603768348694, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7745825052261353, "reward_std": 0.16340237855911255, "rewards/accuracy_reward": 0.8051948249340057, "rewards/format_reward": 0.9693877398967743, "step": 4318 }, { "completion_length": 281.1938781738281, "epoch": 0.4346163522012579, "grad_norm": 1.6398004293441772, "kl": 0.0631103515625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.657057762145996, "reward_std": 0.1692628711462021, "rewards/accuracy_reward": 0.6672618985176086, "rewards/format_reward": 0.9897959232330322, "step": 4319 }, { "completion_length": 243.59182739257812, "epoch": 0.43471698113207546, "grad_norm": 0.7597018480300903, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.692176878452301, "reward_std": 0.1284236665815115, "rewards/accuracy_reward": 0.7023809254169464, "rewards/format_reward": 0.9897959232330322, "step": 4320 }, { "completion_length": 277.28570556640625, "epoch": 0.4348176100628931, "grad_norm": 0.7965632081031799, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.5939599871635437, "reward_std": 0.24848995357751846, "rewards/accuracy_reward": 0.6347763240337372, "rewards/format_reward": 0.9591836333274841, "step": 4321 }, { "completion_length": 270.8673400878906, "epoch": 0.4349182389937107, "grad_norm": 0.756050169467926, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6313223838806152, "reward_std": 0.24184076488018036, "rewards/accuracy_reward": 0.6619347035884857, "rewards/format_reward": 0.9693877398967743, "step": 4322 }, { "completion_length": 258.0816345214844, "epoch": 0.4350188679245283, "grad_norm": 2.963151216506958, "kl": 0.10791015625, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.6261274814605713, "reward_std": 0.22888803482055664, "rewards/accuracy_reward": 0.6567397713661194, "rewards/format_reward": 0.9693877398967743, "step": 4323 }, { "completion_length": 270.7040786743164, "epoch": 0.4351194968553459, "grad_norm": 0.732598066329956, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7469730377197266, "reward_std": 0.2452981099486351, "rewards/accuracy_reward": 0.7571771442890167, "rewards/format_reward": 0.9897959232330322, "step": 4324 }, { "completion_length": 284.1734619140625, "epoch": 0.43522012578616354, "grad_norm": 0.5212090015411377, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.771582841873169, "reward_std": 0.14044861309230328, "rewards/accuracy_reward": 0.7817868590354919, "rewards/format_reward": 0.9897959232330322, "step": 4325 }, { "completion_length": 303.70408630371094, "epoch": 0.4353207547169811, "grad_norm": 0.882868230342865, "kl": 0.09130859375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.538772165775299, "reward_std": 0.1963544487953186, "rewards/accuracy_reward": 0.5591803193092346, "rewards/format_reward": 0.9795918464660645, "step": 4326 }, { "completion_length": 238.78571319580078, "epoch": 0.43542138364779875, "grad_norm": 0.5068002343177795, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7389699220657349, "reward_std": 0.10126394033432007, "rewards/accuracy_reward": 0.7389698624610901, "rewards/format_reward": 1.0, "step": 4327 }, { "completion_length": 228.77550506591797, "epoch": 0.43552201257861634, "grad_norm": 0.4221540093421936, "kl": 0.09326171875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8227367401123047, "reward_std": 0.09328997880220413, "rewards/accuracy_reward": 0.8227367997169495, "rewards/format_reward": 1.0, "step": 4328 }, { "completion_length": 298.88775634765625, "epoch": 0.43562264150943397, "grad_norm": 0.30950599908828735, "kl": 0.0704345703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8011013865470886, "reward_std": 0.05937071517109871, "rewards/accuracy_reward": 0.8011013567447662, "rewards/format_reward": 1.0, "step": 4329 }, { "completion_length": 271.33673095703125, "epoch": 0.43572327044025155, "grad_norm": 0.9449074268341064, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.647314429283142, "reward_std": 0.1657441332936287, "rewards/accuracy_reward": 0.6779265999794006, "rewards/format_reward": 0.9693877398967743, "step": 4330 }, { "completion_length": 210.48979949951172, "epoch": 0.4358238993710692, "grad_norm": 0.5665500164031982, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6491476893424988, "reward_std": 0.09038076177239418, "rewards/accuracy_reward": 0.6491478085517883, "rewards/format_reward": 1.0, "step": 4331 }, { "completion_length": 238.62244415283203, "epoch": 0.43592452830188677, "grad_norm": 0.7164891362190247, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6860588788986206, "reward_std": 0.12478233501315117, "rewards/accuracy_reward": 0.6860588192939758, "rewards/format_reward": 1.0, "step": 4332 }, { "completion_length": 273.5918273925781, "epoch": 0.4360251572327044, "grad_norm": 0.5060385465621948, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8458255529403687, "reward_std": 0.1431061513721943, "rewards/accuracy_reward": 0.8560296595096588, "rewards/format_reward": 0.9897959232330322, "step": 4333 }, { "completion_length": 307.5306091308594, "epoch": 0.436125786163522, "grad_norm": 0.7848539352416992, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7169986367225647, "reward_std": 0.17517820745706558, "rewards/accuracy_reward": 0.7272027730941772, "rewards/format_reward": 0.9897959232330322, "step": 4334 }, { "completion_length": 279.06121826171875, "epoch": 0.4362264150943396, "grad_norm": 1.0148670673370361, "kl": 0.09033203125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6568756103515625, "reward_std": 0.29771167039871216, "rewards/accuracy_reward": 0.6874878406524658, "rewards/format_reward": 0.9693877398967743, "step": 4335 }, { "completion_length": 335.60203552246094, "epoch": 0.4363270440251572, "grad_norm": 0.6428422331809998, "kl": 0.09375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.662807583808899, "reward_std": 0.20837963372468948, "rewards/accuracy_reward": 0.6628076136112213, "rewards/format_reward": 1.0, "step": 4336 }, { "completion_length": 275.89794921875, "epoch": 0.43642767295597484, "grad_norm": 0.8854551911354065, "kl": 0.09326171875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.693259060382843, "reward_std": 0.26155540347099304, "rewards/accuracy_reward": 0.7034631967544556, "rewards/format_reward": 0.9897959232330322, "step": 4337 }, { "completion_length": 269.26529693603516, "epoch": 0.4365283018867925, "grad_norm": 0.6624236106872559, "kl": 0.05712890625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8608226776123047, "reward_std": 0.19313011318445206, "rewards/accuracy_reward": 0.8608227968215942, "rewards/format_reward": 1.0, "step": 4338 }, { "completion_length": 222.17345428466797, "epoch": 0.43662893081761006, "grad_norm": 0.7584801912307739, "kl": 0.11376953125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.9002267122268677, "reward_std": 0.15135329961776733, "rewards/accuracy_reward": 0.9002267420291901, "rewards/format_reward": 1.0, "step": 4339 }, { "completion_length": 268.2040786743164, "epoch": 0.4367295597484277, "grad_norm": 0.4593481719493866, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.9060776233673096, "reward_std": 0.08556800847873092, "rewards/accuracy_reward": 0.916281670331955, "rewards/format_reward": 0.9897959232330322, "step": 4340 }, { "completion_length": 256.79590606689453, "epoch": 0.4368301886792453, "grad_norm": 1.9910638332366943, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7775306701660156, "reward_std": 0.1467573568224907, "rewards/accuracy_reward": 0.787734717130661, "rewards/format_reward": 0.9897959232330322, "step": 4341 }, { "completion_length": 226.16326141357422, "epoch": 0.4369308176100629, "grad_norm": 0.5023593902587891, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.9025260210037231, "reward_std": 0.16180569678544998, "rewards/accuracy_reward": 0.9331383109092712, "rewards/format_reward": 0.9693877398967743, "step": 4342 }, { "completion_length": 264.87754821777344, "epoch": 0.4370314465408805, "grad_norm": 0.3632310926914215, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7896103858947754, "reward_std": 0.1092020533978939, "rewards/accuracy_reward": 0.7998144328594208, "rewards/format_reward": 0.9897959232330322, "step": 4343 }, { "completion_length": 241.35713958740234, "epoch": 0.43713207547169813, "grad_norm": 0.642959713935852, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7176870703697205, "reward_std": 0.16541338711977005, "rewards/accuracy_reward": 0.7176871001720428, "rewards/format_reward": 1.0, "step": 4344 }, { "completion_length": 264.551025390625, "epoch": 0.4372327044025157, "grad_norm": 1.4754180908203125, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8636308908462524, "reward_std": 0.11093717440962791, "rewards/accuracy_reward": 0.884039044380188, "rewards/format_reward": 0.9795918166637421, "step": 4345 }, { "completion_length": 211.9285659790039, "epoch": 0.43733333333333335, "grad_norm": 0.9567930102348328, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7235621213912964, "reward_std": 0.1867402046918869, "rewards/accuracy_reward": 0.7541743814945221, "rewards/format_reward": 0.9693877398967743, "step": 4346 }, { "completion_length": 197.22447967529297, "epoch": 0.43743396226415093, "grad_norm": 0.9850455522537231, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8594104051589966, "reward_std": 0.19869501888751984, "rewards/accuracy_reward": 0.8696144819259644, "rewards/format_reward": 0.9897959232330322, "step": 4347 }, { "completion_length": 257.08162689208984, "epoch": 0.43753459119496857, "grad_norm": 0.9758877158164978, "kl": 0.111328125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.714870572090149, "reward_std": 0.22170042246580124, "rewards/accuracy_reward": 0.7352787554264069, "rewards/format_reward": 0.9795918166637421, "step": 4348 }, { "completion_length": 215.4387664794922, "epoch": 0.43763522012578615, "grad_norm": 0.6077538728713989, "kl": 0.099609375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7361515164375305, "reward_std": 0.09178527072072029, "rewards/accuracy_reward": 0.7361515760421753, "rewards/format_reward": 1.0, "step": 4349 }, { "completion_length": 248.93877410888672, "epoch": 0.4377358490566038, "grad_norm": 0.6376871466636658, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6866286396980286, "reward_std": 0.15172511711716652, "rewards/accuracy_reward": 0.7070367932319641, "rewards/format_reward": 0.9795918166637421, "step": 4350 }, { "completion_length": 252.06121826171875, "epoch": 0.43783647798742137, "grad_norm": 0.7841488122940063, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7008503079414368, "reward_std": 0.24493402242660522, "rewards/accuracy_reward": 0.7212584614753723, "rewards/format_reward": 0.9795918166637421, "step": 4351 }, { "completion_length": 342.82652282714844, "epoch": 0.437937106918239, "grad_norm": 0.9813742637634277, "kl": 0.0770263671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7462422847747803, "reward_std": 0.1432170569896698, "rewards/accuracy_reward": 0.7564463913440704, "rewards/format_reward": 0.9897959232330322, "step": 4352 }, { "completion_length": 213.58162689208984, "epoch": 0.4380377358490566, "grad_norm": 0.62404865026474, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7764555215835571, "reward_std": 0.1616314873099327, "rewards/accuracy_reward": 0.7764555811882019, "rewards/format_reward": 1.0, "step": 4353 }, { "completion_length": 256.0612258911133, "epoch": 0.4381383647798742, "grad_norm": 0.915860116481781, "kl": 0.10546875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7311276197433472, "reward_std": 0.16510115563869476, "rewards/accuracy_reward": 0.7515358626842499, "rewards/format_reward": 0.9795918464660645, "step": 4354 }, { "completion_length": 289.10203552246094, "epoch": 0.4382389937106918, "grad_norm": 1.3730323314666748, "kl": 0.103271484375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7669301629066467, "reward_std": 0.21848740428686142, "rewards/accuracy_reward": 0.7975424230098724, "rewards/format_reward": 0.9693877398967743, "step": 4355 }, { "completion_length": 234.29591369628906, "epoch": 0.43833962264150944, "grad_norm": 0.45558542013168335, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8750360608100891, "reward_std": 0.09868072345852852, "rewards/accuracy_reward": 0.8750360310077667, "rewards/format_reward": 1.0, "step": 4356 }, { "completion_length": 243.21428680419922, "epoch": 0.438440251572327, "grad_norm": 1.0656592845916748, "kl": 0.1015625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6170020699501038, "reward_std": 0.20327768474817276, "rewards/accuracy_reward": 0.6578184962272644, "rewards/format_reward": 0.9591836631298065, "step": 4357 }, { "completion_length": 242.23468780517578, "epoch": 0.43854088050314466, "grad_norm": 1.0886605978012085, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6469979286193848, "reward_std": 0.06258084625005722, "rewards/accuracy_reward": 0.6469978988170624, "rewards/format_reward": 1.0, "step": 4358 }, { "completion_length": 189.4897918701172, "epoch": 0.43864150943396224, "grad_norm": 0.5525232553482056, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.94735985994339, "reward_std": 0.10641100257635117, "rewards/accuracy_reward": 0.9473598301410675, "rewards/format_reward": 1.0, "step": 4359 }, { "completion_length": 258.948974609375, "epoch": 0.4387421383647799, "grad_norm": 2.4727985858917236, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6702895760536194, "reward_std": 0.22199562191963196, "rewards/accuracy_reward": 0.6702896058559418, "rewards/format_reward": 1.0, "step": 4360 }, { "completion_length": 188.63265228271484, "epoch": 0.43884276729559746, "grad_norm": 0.3265584707260132, "kl": 0.100341796875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.904818594455719, "reward_std": 0.11423131078481674, "rewards/accuracy_reward": 0.9150226414203644, "rewards/format_reward": 0.9897959232330322, "step": 4361 }, { "completion_length": 266.2142868041992, "epoch": 0.4389433962264151, "grad_norm": 0.5200884342193604, "kl": 0.0706787109375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7542517185211182, "reward_std": 0.12655708938837051, "rewards/accuracy_reward": 0.7644557654857635, "rewards/format_reward": 0.9897959232330322, "step": 4362 }, { "completion_length": 205.10203552246094, "epoch": 0.43904402515723273, "grad_norm": 0.8224497437477112, "kl": 0.122314453125, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.7896912097930908, "reward_std": 0.1389179825782776, "rewards/accuracy_reward": 0.7896912395954132, "rewards/format_reward": 1.0, "step": 4363 }, { "completion_length": 243.52039337158203, "epoch": 0.4391446540880503, "grad_norm": 1.934408187866211, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.5329436659812927, "reward_std": 0.28446705639362335, "rewards/accuracy_reward": 0.5431478172540665, "rewards/format_reward": 0.9897959232330322, "step": 4364 }, { "completion_length": 183.1530532836914, "epoch": 0.43924528301886795, "grad_norm": 0.8003379702568054, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7916136980056763, "reward_std": 0.12311100028455257, "rewards/accuracy_reward": 0.7916136980056763, "rewards/format_reward": 1.0, "step": 4365 }, { "completion_length": 211.4897918701172, "epoch": 0.43934591194968553, "grad_norm": 0.7540038228034973, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6224489212036133, "reward_std": 0.1730649620294571, "rewards/accuracy_reward": 0.6224489808082581, "rewards/format_reward": 1.0, "step": 4366 }, { "completion_length": 288.52040100097656, "epoch": 0.43944654088050317, "grad_norm": 0.8430668711662292, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6250653266906738, "reward_std": 0.2246527597308159, "rewards/accuracy_reward": 0.6454735547304153, "rewards/format_reward": 0.9795918464660645, "step": 4367 }, { "completion_length": 175.61224365234375, "epoch": 0.43954716981132075, "grad_norm": 0.7227577567100525, "kl": 0.10009765625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.9061224460601807, "reward_std": 0.059394415467977524, "rewards/accuracy_reward": 0.9163265228271484, "rewards/format_reward": 0.9897959232330322, "step": 4368 }, { "completion_length": 270.2142791748047, "epoch": 0.4396477987421384, "grad_norm": 7.765801429748535, "kl": 0.812255859375, "learning_rate": 1e-06, "loss": 0.0324, "reward": 1.7658555507659912, "reward_std": 0.18674026429653168, "rewards/accuracy_reward": 0.7964678108692169, "rewards/format_reward": 0.9693877398967743, "step": 4369 }, { "completion_length": 226.91836547851562, "epoch": 0.43974842767295597, "grad_norm": 0.7154572606086731, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7617192268371582, "reward_std": 0.15132689103484154, "rewards/accuracy_reward": 0.8025355637073517, "rewards/format_reward": 0.9591836333274841, "step": 4370 }, { "completion_length": 191.37754821777344, "epoch": 0.4398490566037736, "grad_norm": 0.680747926235199, "kl": 0.10546875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7243589162826538, "reward_std": 0.0971192717552185, "rewards/accuracy_reward": 0.7243589758872986, "rewards/format_reward": 1.0, "step": 4371 }, { "completion_length": 237.59182739257812, "epoch": 0.4399496855345912, "grad_norm": 0.5194264650344849, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8865697979927063, "reward_std": 0.1293656937777996, "rewards/accuracy_reward": 0.8967738449573517, "rewards/format_reward": 0.9897959232330322, "step": 4372 }, { "completion_length": 254.03060913085938, "epoch": 0.4400503144654088, "grad_norm": 0.9080484509468079, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7136067748069763, "reward_std": 0.18979913741350174, "rewards/accuracy_reward": 0.7340149879455566, "rewards/format_reward": 0.9795918166637421, "step": 4373 }, { "completion_length": 188.85713958740234, "epoch": 0.4401509433962264, "grad_norm": 1.034501314163208, "kl": 0.107666015625, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.6461613178253174, "reward_std": 0.19847175478935242, "rewards/accuracy_reward": 0.6563653945922852, "rewards/format_reward": 0.9897959232330322, "step": 4374 }, { "completion_length": 205.57142639160156, "epoch": 0.44025157232704404, "grad_norm": 13.077573776245117, "kl": 0.10986328125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7104116678237915, "reward_std": 0.15716403350234032, "rewards/accuracy_reward": 0.7104116380214691, "rewards/format_reward": 1.0, "step": 4375 }, { "completion_length": 188.20407104492188, "epoch": 0.4403522012578616, "grad_norm": 0.8086493015289307, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8201651573181152, "reward_std": 0.14127376675605774, "rewards/accuracy_reward": 0.8303692936897278, "rewards/format_reward": 0.9897959232330322, "step": 4376 }, { "completion_length": 230.4285659790039, "epoch": 0.44045283018867926, "grad_norm": 2.9341824054718018, "kl": 0.111083984375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8911409378051758, "reward_std": 0.15171778574585915, "rewards/accuracy_reward": 0.9013450443744659, "rewards/format_reward": 0.9897959232330322, "step": 4377 }, { "completion_length": 172.2040786743164, "epoch": 0.44055345911949684, "grad_norm": 0.6788367033004761, "kl": 0.11279296875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7178725600242615, "reward_std": 0.0869308803230524, "rewards/accuracy_reward": 0.7178725600242615, "rewards/format_reward": 1.0, "step": 4378 }, { "completion_length": 219.9183578491211, "epoch": 0.4406540880503145, "grad_norm": 0.723608136177063, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8593461513519287, "reward_std": 0.09149011969566345, "rewards/accuracy_reward": 0.8695502877235413, "rewards/format_reward": 0.9897959232330322, "step": 4379 }, { "completion_length": 355.51019287109375, "epoch": 0.44075471698113206, "grad_norm": 1.1744786500930786, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5091739296913147, "reward_std": 0.3095333054661751, "rewards/accuracy_reward": 0.5397861897945404, "rewards/format_reward": 0.9693877398967743, "step": 4380 }, { "completion_length": 224.4591827392578, "epoch": 0.4408553459119497, "grad_norm": 0.755255937576294, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8733086585998535, "reward_std": 0.13821056857705116, "rewards/accuracy_reward": 0.8733086287975311, "rewards/format_reward": 1.0, "step": 4381 }, { "completion_length": 197.97958374023438, "epoch": 0.4409559748427673, "grad_norm": 0.9137549996376038, "kl": 0.10009765625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8798185586929321, "reward_std": 0.12102923542261124, "rewards/accuracy_reward": 0.8798185884952545, "rewards/format_reward": 1.0, "step": 4382 }, { "completion_length": 247.84693145751953, "epoch": 0.4410566037735849, "grad_norm": 0.7392094135284424, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7463220357894897, "reward_std": 0.1758839339017868, "rewards/accuracy_reward": 0.7463220357894897, "rewards/format_reward": 1.0, "step": 4383 }, { "completion_length": 219.38774871826172, "epoch": 0.4411572327044025, "grad_norm": 1.0453317165374756, "kl": 0.095703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8639455437660217, "reward_std": 0.18775787949562073, "rewards/accuracy_reward": 0.90476194024086, "rewards/format_reward": 0.9591836631298065, "step": 4384 }, { "completion_length": 225.91836547851562, "epoch": 0.4412578616352201, "grad_norm": 0.6912844181060791, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7959182858467102, "reward_std": 0.14951694756746292, "rewards/accuracy_reward": 0.795918345451355, "rewards/format_reward": 1.0, "step": 4385 }, { "completion_length": 214.78570556640625, "epoch": 0.4413584905660377, "grad_norm": 0.936845600605011, "kl": 0.114501953125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.817232072353363, "reward_std": 0.16542012989521027, "rewards/accuracy_reward": 0.8274362683296204, "rewards/format_reward": 0.9897959232330322, "step": 4386 }, { "completion_length": 164.91836547851562, "epoch": 0.44145911949685535, "grad_norm": 2.704515218734741, "kl": 0.2509765625, "learning_rate": 1e-06, "loss": 0.0101, "reward": 1.6788809299468994, "reward_std": 0.22076024115085602, "rewards/accuracy_reward": 0.7196972370147705, "rewards/format_reward": 0.9591836631298065, "step": 4387 }, { "completion_length": 221.9693832397461, "epoch": 0.441559748427673, "grad_norm": 0.7943315505981445, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7063698172569275, "reward_std": 0.16208330169320107, "rewards/accuracy_reward": 0.7063698172569275, "rewards/format_reward": 1.0, "step": 4388 }, { "completion_length": 219.59183502197266, "epoch": 0.44166037735849056, "grad_norm": 0.6196601986885071, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.855494499206543, "reward_std": 0.11216778308153152, "rewards/accuracy_reward": 0.8656985759735107, "rewards/format_reward": 0.9897959232330322, "step": 4389 }, { "completion_length": 228.14285278320312, "epoch": 0.4417610062893082, "grad_norm": 1.2420852184295654, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6763848066329956, "reward_std": 0.226917315274477, "rewards/accuracy_reward": 0.6763848066329956, "rewards/format_reward": 1.0, "step": 4390 }, { "completion_length": 163.88774871826172, "epoch": 0.4418616352201258, "grad_norm": 0.5342159867286682, "kl": 0.1005859375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8590038418769836, "reward_std": 0.07798146456480026, "rewards/accuracy_reward": 0.8692079484462738, "rewards/format_reward": 0.9897959232330322, "step": 4391 }, { "completion_length": 238.54080963134766, "epoch": 0.4419622641509434, "grad_norm": 20.477880477905273, "kl": 0.828857421875, "learning_rate": 1e-06, "loss": 0.0331, "reward": 1.6515483260154724, "reward_std": 0.1809949427843094, "rewards/accuracy_reward": 0.6923646628856659, "rewards/format_reward": 0.9591836631298065, "step": 4392 }, { "completion_length": 259.8571472167969, "epoch": 0.442062893081761, "grad_norm": 1.028097152709961, "kl": 0.095703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7650253772735596, "reward_std": 0.1588454395532608, "rewards/accuracy_reward": 0.7854335606098175, "rewards/format_reward": 0.9795918464660645, "step": 4393 }, { "completion_length": 187.7551040649414, "epoch": 0.44216352201257864, "grad_norm": 0.6944170594215393, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.9149659872055054, "reward_std": 0.15402612835168839, "rewards/accuracy_reward": 0.9251700639724731, "rewards/format_reward": 0.9897959232330322, "step": 4394 }, { "completion_length": 198.23468780517578, "epoch": 0.4422641509433962, "grad_norm": 0.669362485408783, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.887269139289856, "reward_std": 0.23417914658784866, "rewards/accuracy_reward": 0.9076773524284363, "rewards/format_reward": 0.9795918166637421, "step": 4395 }, { "completion_length": 287.54080963134766, "epoch": 0.44236477987421385, "grad_norm": 1.0598225593566895, "kl": 0.097900390625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.5753957033157349, "reward_std": 0.2747483029961586, "rewards/accuracy_reward": 0.646824300289154, "rewards/format_reward": 0.9285714030265808, "step": 4396 }, { "completion_length": 276.8775405883789, "epoch": 0.44246540880503143, "grad_norm": 0.6433655619621277, "kl": 0.09033203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.677309811115265, "reward_std": 0.2889929488301277, "rewards/accuracy_reward": 0.769146591424942, "rewards/format_reward": 0.9081632494926453, "step": 4397 }, { "completion_length": 283.78570556640625, "epoch": 0.44256603773584907, "grad_norm": 0.9282804131507874, "kl": 0.102783203125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.5573559403419495, "reward_std": 0.19633226841688156, "rewards/accuracy_reward": 0.5879682302474976, "rewards/format_reward": 0.9693877398967743, "step": 4398 }, { "completion_length": 226.57141876220703, "epoch": 0.44266666666666665, "grad_norm": 0.7323634624481201, "kl": 0.099609375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.593664526939392, "reward_std": 0.23335883021354675, "rewards/accuracy_reward": 0.6446849405765533, "rewards/format_reward": 0.9489795565605164, "step": 4399 }, { "completion_length": 253.2448959350586, "epoch": 0.4427672955974843, "grad_norm": 0.823150098323822, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.75776207447052, "reward_std": 0.21262230724096298, "rewards/accuracy_reward": 0.7985784411430359, "rewards/format_reward": 0.9591836631298065, "step": 4400 }, { "completion_length": 274.1632537841797, "epoch": 0.44286792452830187, "grad_norm": 0.6330299973487854, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6529598236083984, "reward_std": 0.22471670806407928, "rewards/accuracy_reward": 0.6937762200832367, "rewards/format_reward": 0.9591836333274841, "step": 4401 }, { "completion_length": 241.62245178222656, "epoch": 0.4429685534591195, "grad_norm": 0.6950730085372925, "kl": 0.064697265625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6125688552856445, "reward_std": 0.22409608587622643, "rewards/accuracy_reward": 0.6737933307886124, "rewards/format_reward": 0.9387754797935486, "step": 4402 }, { "completion_length": 198.5, "epoch": 0.4430691823899371, "grad_norm": 1.169311761856079, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7482993006706238, "reward_std": 0.23904811590909958, "rewards/accuracy_reward": 0.7891156375408173, "rewards/format_reward": 0.9591836631298065, "step": 4403 }, { "completion_length": 251.26529693603516, "epoch": 0.4431698113207547, "grad_norm": 0.6388559341430664, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6147541403770447, "reward_std": 0.19924505054950714, "rewards/accuracy_reward": 0.6453664004802704, "rewards/format_reward": 0.9693877398967743, "step": 4404 }, { "completion_length": 253.9591827392578, "epoch": 0.4432704402515723, "grad_norm": 0.570953905582428, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6549164652824402, "reward_std": 0.12753082811832428, "rewards/accuracy_reward": 0.6855287551879883, "rewards/format_reward": 0.9693877398967743, "step": 4405 }, { "completion_length": 229.32653045654297, "epoch": 0.44337106918238994, "grad_norm": 0.4916739761829376, "kl": 0.09375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8247371315956116, "reward_std": 0.09473006427288055, "rewards/accuracy_reward": 0.8553494215011597, "rewards/format_reward": 0.9693877398967743, "step": 4406 }, { "completion_length": 197.59183502197266, "epoch": 0.4434716981132075, "grad_norm": 0.821051299571991, "kl": 0.12255859375, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.725000023841858, "reward_std": 0.19073734432458878, "rewards/accuracy_reward": 0.7760204076766968, "rewards/format_reward": 0.9489795565605164, "step": 4407 }, { "completion_length": 180.04080963134766, "epoch": 0.44357232704402516, "grad_norm": 0.6683246493339539, "kl": 0.129638671875, "learning_rate": 1e-06, "loss": 0.0052, "reward": 1.973922848701477, "reward_std": 0.06899351254105568, "rewards/accuracy_reward": 0.9841269552707672, "rewards/format_reward": 0.9897959232330322, "step": 4408 }, { "completion_length": 236.1836700439453, "epoch": 0.44367295597484274, "grad_norm": 0.90610671043396, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7795919179916382, "reward_std": 0.17431041598320007, "rewards/accuracy_reward": 0.8204081356525421, "rewards/format_reward": 0.9591836631298065, "step": 4409 }, { "completion_length": 231.32652282714844, "epoch": 0.4437735849056604, "grad_norm": 0.7010757327079773, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7428525686264038, "reward_std": 0.19663777947425842, "rewards/accuracy_reward": 0.7734648585319519, "rewards/format_reward": 0.9693877398967743, "step": 4410 }, { "completion_length": 254.19387817382812, "epoch": 0.44387421383647796, "grad_norm": 0.7919001579284668, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7108562588691711, "reward_std": 0.12503774091601372, "rewards/accuracy_reward": 0.7210603654384613, "rewards/format_reward": 0.9897959232330322, "step": 4411 }, { "completion_length": 174.38775634765625, "epoch": 0.4439748427672956, "grad_norm": 0.9211832284927368, "kl": 0.11865234375, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.8232600688934326, "reward_std": 0.11549583077430725, "rewards/accuracy_reward": 0.833464115858078, "rewards/format_reward": 0.9897959232330322, "step": 4412 }, { "completion_length": 247.15306091308594, "epoch": 0.44407547169811323, "grad_norm": 117.1272201538086, "kl": 3.1728515625, "learning_rate": 1e-06, "loss": 0.127, "reward": 1.561636745929718, "reward_std": 0.3009343892335892, "rewards/accuracy_reward": 0.612657219171524, "rewards/format_reward": 0.9489795565605164, "step": 4413 }, { "completion_length": 319.79591369628906, "epoch": 0.4441761006289308, "grad_norm": 1.0635874271392822, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6510204076766968, "reward_std": 0.20715214684605598, "rewards/accuracy_reward": 0.6816326677799225, "rewards/format_reward": 0.9693877398967743, "step": 4414 }, { "completion_length": 185.90816497802734, "epoch": 0.44427672955974845, "grad_norm": 1.1520562171936035, "kl": 0.09423828125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6961365342140198, "reward_std": 0.16243527084589005, "rewards/accuracy_reward": 0.7063405811786652, "rewards/format_reward": 0.9897959232330322, "step": 4415 }, { "completion_length": 234.80611419677734, "epoch": 0.44437735849056603, "grad_norm": 0.9373937249183655, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6065862774848938, "reward_std": 0.15666358172893524, "rewards/accuracy_reward": 0.6269944310188293, "rewards/format_reward": 0.9795918464660645, "step": 4416 }, { "completion_length": 256.17345428466797, "epoch": 0.44447798742138367, "grad_norm": 2.141815662384033, "kl": 0.05224609375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7687074542045593, "reward_std": 0.12857268750667572, "rewards/accuracy_reward": 0.7687074840068817, "rewards/format_reward": 1.0, "step": 4417 }, { "completion_length": 235.38774871826172, "epoch": 0.44457861635220125, "grad_norm": 0.7240145802497864, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7857142686843872, "reward_std": 0.11820796877145767, "rewards/accuracy_reward": 0.8061224222183228, "rewards/format_reward": 0.9795918464660645, "step": 4418 }, { "completion_length": 208.89794921875, "epoch": 0.4446792452830189, "grad_norm": 1.202244520187378, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.752999722957611, "reward_std": 0.21408917754888535, "rewards/accuracy_reward": 0.7632037401199341, "rewards/format_reward": 0.9897959232330322, "step": 4419 }, { "completion_length": 225.58162689208984, "epoch": 0.44477987421383647, "grad_norm": 0.9093366265296936, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8374635577201843, "reward_std": 0.22357584536075592, "rewards/accuracy_reward": 0.8476676046848297, "rewards/format_reward": 0.9897959232330322, "step": 4420 }, { "completion_length": 305.61224365234375, "epoch": 0.4448805031446541, "grad_norm": 0.5909328460693359, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7011870741844177, "reward_std": 0.1913682483136654, "rewards/accuracy_reward": 0.7215953171253204, "rewards/format_reward": 0.9795918166637421, "step": 4421 }, { "completion_length": 209.448974609375, "epoch": 0.4449811320754717, "grad_norm": 0.3634580373764038, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8270164728164673, "reward_std": 0.08474217541515827, "rewards/accuracy_reward": 0.8474246859550476, "rewards/format_reward": 0.9795918166637421, "step": 4422 }, { "completion_length": 211.81632232666016, "epoch": 0.4450817610062893, "grad_norm": 1.6083967685699463, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.803287923336029, "reward_std": 0.15102646872401237, "rewards/accuracy_reward": 0.8134920299053192, "rewards/format_reward": 0.9897959232330322, "step": 4423 }, { "completion_length": 242.6734619140625, "epoch": 0.4451823899371069, "grad_norm": 0.977323591709137, "kl": 0.059814453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6305341720581055, "reward_std": 0.2595958784222603, "rewards/accuracy_reward": 0.6611463725566864, "rewards/format_reward": 0.9693877398967743, "step": 4424 }, { "completion_length": 211.12244415283203, "epoch": 0.44528301886792454, "grad_norm": 0.4177750051021576, "kl": 0.0701904296875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7800453305244446, "reward_std": 0.059789412654936314, "rewards/accuracy_reward": 0.7902494370937347, "rewards/format_reward": 0.9897959232330322, "step": 4425 }, { "completion_length": 223.84693145751953, "epoch": 0.4453836477987421, "grad_norm": 0.6472622752189636, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7023847699165344, "reward_std": 0.07140962406992912, "rewards/accuracy_reward": 0.7023848593235016, "rewards/format_reward": 1.0, "step": 4426 }, { "completion_length": 217.72449493408203, "epoch": 0.44548427672955976, "grad_norm": 0.767295777797699, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7430874109268188, "reward_std": 0.184585839509964, "rewards/accuracy_reward": 0.753291517496109, "rewards/format_reward": 0.9897959232330322, "step": 4427 }, { "completion_length": 274.61224365234375, "epoch": 0.44558490566037734, "grad_norm": 0.8149682283401489, "kl": 0.11181640625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.71632719039917, "reward_std": 0.18855801224708557, "rewards/accuracy_reward": 0.7367353439331055, "rewards/format_reward": 0.9795918464660645, "step": 4428 }, { "completion_length": 266.3877487182617, "epoch": 0.445685534591195, "grad_norm": 0.6123437285423279, "kl": 0.10302734375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7396413087844849, "reward_std": 0.15466924011707306, "rewards/accuracy_reward": 0.7600494027137756, "rewards/format_reward": 0.9795918166637421, "step": 4429 }, { "completion_length": 200.12244415283203, "epoch": 0.44578616352201256, "grad_norm": 0.6161934733390808, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7842344045639038, "reward_std": 0.13325171917676926, "rewards/accuracy_reward": 0.8046425879001617, "rewards/format_reward": 0.9795918464660645, "step": 4430 }, { "completion_length": 245.4897918701172, "epoch": 0.4458867924528302, "grad_norm": 2.8403022289276123, "kl": 0.09912109375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7434980869293213, "reward_std": 0.19429098069667816, "rewards/accuracy_reward": 0.7741104066371918, "rewards/format_reward": 0.9693877398967743, "step": 4431 }, { "completion_length": 253.82652282714844, "epoch": 0.4459874213836478, "grad_norm": 0.6561470627784729, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6248299479484558, "reward_std": 0.13687507808208466, "rewards/accuracy_reward": 0.645238071680069, "rewards/format_reward": 0.9795918464660645, "step": 4432 }, { "completion_length": 275.8061218261719, "epoch": 0.4460880503144654, "grad_norm": 0.8281733989715576, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7987225651741028, "reward_std": 0.24296164512634277, "rewards/accuracy_reward": 0.8191307187080383, "rewards/format_reward": 0.9795918464660645, "step": 4433 }, { "completion_length": 269.89794921875, "epoch": 0.446188679245283, "grad_norm": 0.7321357131004333, "kl": 0.0697021484375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7122448682785034, "reward_std": 0.26176413893699646, "rewards/accuracy_reward": 0.7530612051486969, "rewards/format_reward": 0.9591836631298065, "step": 4434 }, { "completion_length": 213.9897918701172, "epoch": 0.44628930817610063, "grad_norm": 0.5181815028190613, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7946402430534363, "reward_std": 0.14267997071146965, "rewards/accuracy_reward": 0.804844319820404, "rewards/format_reward": 0.9897959232330322, "step": 4435 }, { "completion_length": 225.82652282714844, "epoch": 0.44638993710691827, "grad_norm": 0.9003838896751404, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8398451209068298, "reward_std": 0.1873232126235962, "rewards/accuracy_reward": 0.85004922747612, "rewards/format_reward": 0.9897959232330322, "step": 4436 }, { "completion_length": 204.2040786743164, "epoch": 0.44649056603773585, "grad_norm": 2.0185210704803467, "kl": 0.09375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8801829814910889, "reward_std": 0.20862843096256256, "rewards/accuracy_reward": 0.9107952415943146, "rewards/format_reward": 0.9693877398967743, "step": 4437 }, { "completion_length": 253.4285659790039, "epoch": 0.4465911949685535, "grad_norm": 0.9783087968826294, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6566588282585144, "reward_std": 0.24674276262521744, "rewards/accuracy_reward": 0.6872710883617401, "rewards/format_reward": 0.9693877398967743, "step": 4438 }, { "completion_length": 257.94898223876953, "epoch": 0.44669182389937107, "grad_norm": 0.8262836337089539, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.5657642483711243, "reward_std": 0.1507967859506607, "rewards/accuracy_reward": 0.6065806150436401, "rewards/format_reward": 0.9591836631298065, "step": 4439 }, { "completion_length": 237.09182739257812, "epoch": 0.4467924528301887, "grad_norm": 0.5006949305534363, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.689504325389862, "reward_std": 0.10602135956287384, "rewards/accuracy_reward": 0.6997084617614746, "rewards/format_reward": 0.9897959232330322, "step": 4440 }, { "completion_length": 273.9897918701172, "epoch": 0.4468930817610063, "grad_norm": 0.7411093711853027, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7186588644981384, "reward_std": 0.12303733080625534, "rewards/accuracy_reward": 0.7390670478343964, "rewards/format_reward": 0.9795918464660645, "step": 4441 }, { "completion_length": 203.55101776123047, "epoch": 0.4469937106918239, "grad_norm": 1.184147834777832, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8385900259017944, "reward_std": 0.18634572625160217, "rewards/accuracy_reward": 0.8385899662971497, "rewards/format_reward": 1.0, "step": 4442 }, { "completion_length": 258.79591369628906, "epoch": 0.4470943396226415, "grad_norm": 0.9361963868141174, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6914854049682617, "reward_std": 0.22885610163211823, "rewards/accuracy_reward": 0.7016894519329071, "rewards/format_reward": 0.9897959232330322, "step": 4443 }, { "completion_length": 185.5408172607422, "epoch": 0.44719496855345914, "grad_norm": 1.6579053401947021, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.715743362903595, "reward_std": 0.13248217850923538, "rewards/accuracy_reward": 0.7157434225082397, "rewards/format_reward": 1.0, "step": 4444 }, { "completion_length": 191.64285278320312, "epoch": 0.4472955974842767, "grad_norm": 0.9184241890907288, "kl": 0.125732421875, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.8715689182281494, "reward_std": 0.15029148757457733, "rewards/accuracy_reward": 0.8817729651927948, "rewards/format_reward": 0.9897959232330322, "step": 4445 }, { "completion_length": 234.07141876220703, "epoch": 0.44739622641509436, "grad_norm": 0.981157124042511, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8109086155891418, "reward_std": 0.14818403497338295, "rewards/accuracy_reward": 0.8109086453914642, "rewards/format_reward": 1.0, "step": 4446 }, { "completion_length": 250.97958374023438, "epoch": 0.44749685534591194, "grad_norm": 0.8138037919998169, "kl": 0.101806640625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6344650387763977, "reward_std": 0.19230107963085175, "rewards/accuracy_reward": 0.6548732221126556, "rewards/format_reward": 0.9795918464660645, "step": 4447 }, { "completion_length": 189.6938705444336, "epoch": 0.4475974842767296, "grad_norm": 4.065664768218994, "kl": 0.095703125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.5459656715393066, "reward_std": 0.12986060604453087, "rewards/accuracy_reward": 0.5663738548755646, "rewards/format_reward": 0.9795918166637421, "step": 4448 }, { "completion_length": 237.82652282714844, "epoch": 0.44769811320754715, "grad_norm": 0.8472381234169006, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7986397743225098, "reward_std": 0.1475423201918602, "rewards/accuracy_reward": 0.8088438510894775, "rewards/format_reward": 0.9897959232330322, "step": 4449 }, { "completion_length": 211.33673095703125, "epoch": 0.4477987421383648, "grad_norm": 0.780164897441864, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7894557118415833, "reward_std": 0.15350157022476196, "rewards/accuracy_reward": 0.7996598780155182, "rewards/format_reward": 0.9897959232330322, "step": 4450 }, { "completion_length": 181.39794921875, "epoch": 0.4478993710691824, "grad_norm": 0.9682658314704895, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6963555812835693, "reward_std": 0.19145400077104568, "rewards/accuracy_reward": 0.726967841386795, "rewards/format_reward": 0.9693877398967743, "step": 4451 }, { "completion_length": 179.32653045654297, "epoch": 0.448, "grad_norm": 1.1645410060882568, "kl": 0.0760498046875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.775883972644806, "reward_std": 0.17410703748464584, "rewards/accuracy_reward": 0.7962920963764191, "rewards/format_reward": 0.9795918166637421, "step": 4452 }, { "completion_length": 210.13265228271484, "epoch": 0.4481006289308176, "grad_norm": 0.5623675584793091, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8059524297714233, "reward_std": 0.1470521241426468, "rewards/accuracy_reward": 0.8263605237007141, "rewards/format_reward": 0.9795918166637421, "step": 4453 }, { "completion_length": 216.52039337158203, "epoch": 0.4482012578616352, "grad_norm": 0.7244739532470703, "kl": 0.062744140625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7943485379219055, "reward_std": 0.15163946896791458, "rewards/accuracy_reward": 0.7943485081195831, "rewards/format_reward": 1.0, "step": 4454 }, { "completion_length": 182.30612182617188, "epoch": 0.4483018867924528, "grad_norm": 0.9059143662452698, "kl": 0.1044921875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8683257699012756, "reward_std": 0.12796767055988312, "rewards/accuracy_reward": 0.8683258593082428, "rewards/format_reward": 1.0, "step": 4455 }, { "completion_length": 231.2653045654297, "epoch": 0.44840251572327045, "grad_norm": 0.4900464117527008, "kl": 0.0672607421875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.767512023448944, "reward_std": 0.17029058188199997, "rewards/accuracy_reward": 0.7879201769828796, "rewards/format_reward": 0.9795918166637421, "step": 4456 }, { "completion_length": 210.2040786743164, "epoch": 0.448503144654088, "grad_norm": 0.9656317830085754, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7091681957244873, "reward_std": 0.20612459257245064, "rewards/accuracy_reward": 0.7295763492584229, "rewards/format_reward": 0.9795918166637421, "step": 4457 }, { "completion_length": 207.7040786743164, "epoch": 0.44860377358490566, "grad_norm": 0.561484694480896, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7538152933120728, "reward_std": 0.17326710373163223, "rewards/accuracy_reward": 0.7742234766483307, "rewards/format_reward": 0.9795918464660645, "step": 4458 }, { "completion_length": 242.14286041259766, "epoch": 0.44870440251572324, "grad_norm": 0.7011138200759888, "kl": 0.0665283203125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.66700679063797, "reward_std": 0.1580263078212738, "rewards/accuracy_reward": 0.6670067608356476, "rewards/format_reward": 1.0, "step": 4459 }, { "completion_length": 146.15306091308594, "epoch": 0.4488050314465409, "grad_norm": 1.1513391733169556, "kl": 0.113037109375, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.8571428060531616, "reward_std": 0.0863918773829937, "rewards/accuracy_reward": 0.857142835855484, "rewards/format_reward": 1.0, "step": 4460 }, { "completion_length": 179.27550506591797, "epoch": 0.4489056603773585, "grad_norm": 1.1154911518096924, "kl": 0.103759765625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.724100947380066, "reward_std": 0.10609699599444866, "rewards/accuracy_reward": 0.7241010963916779, "rewards/format_reward": 1.0, "step": 4461 }, { "completion_length": 190.33673095703125, "epoch": 0.4490062893081761, "grad_norm": 1.3212772607803345, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8244983553886414, "reward_std": 0.1762421503663063, "rewards/accuracy_reward": 0.8347024619579315, "rewards/format_reward": 0.9897959232330322, "step": 4462 }, { "completion_length": 223.4795913696289, "epoch": 0.44910691823899374, "grad_norm": 0.7722669839859009, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.862973690032959, "reward_std": 0.17184039950370789, "rewards/accuracy_reward": 0.8629737198352814, "rewards/format_reward": 1.0, "step": 4463 }, { "completion_length": 242.20407104492188, "epoch": 0.4492075471698113, "grad_norm": 0.8255513310432434, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5746962428092957, "reward_std": 0.2290954813361168, "rewards/accuracy_reward": 0.595104455947876, "rewards/format_reward": 0.9795918166637421, "step": 4464 }, { "completion_length": 279.10203552246094, "epoch": 0.44930817610062895, "grad_norm": 0.9379883408546448, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6499042510986328, "reward_std": 0.29617927968502045, "rewards/accuracy_reward": 0.6601083278656006, "rewards/format_reward": 0.9897959232330322, "step": 4465 }, { "completion_length": 179.11224365234375, "epoch": 0.44940880503144653, "grad_norm": 2.4783084392547607, "kl": 0.0626220703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8478458523750305, "reward_std": 0.18990695476531982, "rewards/accuracy_reward": 0.8682539761066437, "rewards/format_reward": 0.9795918464660645, "step": 4466 }, { "completion_length": 212.05101776123047, "epoch": 0.44950943396226417, "grad_norm": 0.900276243686676, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7111449241638184, "reward_std": 0.19575828313827515, "rewards/accuracy_reward": 0.7213490307331085, "rewards/format_reward": 0.9897959232330322, "step": 4467 }, { "completion_length": 230.4897918701172, "epoch": 0.44961006289308175, "grad_norm": 0.2683919370174408, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8265305757522583, "reward_std": 0.05289251729846001, "rewards/accuracy_reward": 0.8265305757522583, "rewards/format_reward": 1.0, "step": 4468 }, { "completion_length": 237.7653045654297, "epoch": 0.4497106918238994, "grad_norm": 0.6895603537559509, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.67558753490448, "reward_std": 0.11675257096067071, "rewards/accuracy_reward": 0.685791552066803, "rewards/format_reward": 0.9897959232330322, "step": 4469 }, { "completion_length": 147.62244415283203, "epoch": 0.44981132075471697, "grad_norm": 6.259212970733643, "kl": 0.10498046875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.684565544128418, "reward_std": 0.12004555389285088, "rewards/accuracy_reward": 0.6845655739307404, "rewards/format_reward": 1.0, "step": 4470 }, { "completion_length": 226.02040100097656, "epoch": 0.4499119496855346, "grad_norm": 0.9421848058700562, "kl": 0.062744140625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6577854752540588, "reward_std": 0.29746773838996887, "rewards/accuracy_reward": 0.6679896414279938, "rewards/format_reward": 0.9897959232330322, "step": 4471 }, { "completion_length": 182.51020050048828, "epoch": 0.4500125786163522, "grad_norm": 0.8685166835784912, "kl": 0.101806640625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.748774528503418, "reward_std": 0.12718511000275612, "rewards/accuracy_reward": 0.7487745881080627, "rewards/format_reward": 1.0, "step": 4472 }, { "completion_length": 220.9693832397461, "epoch": 0.4501132075471698, "grad_norm": 0.6683990955352783, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7060009241104126, "reward_std": 0.19089809758588672, "rewards/accuracy_reward": 0.7264091372489929, "rewards/format_reward": 0.9795918166637421, "step": 4473 }, { "completion_length": 166.0, "epoch": 0.4502138364779874, "grad_norm": 0.6138845086097717, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7561585903167725, "reward_std": 0.12167736887931824, "rewards/accuracy_reward": 0.7663625776767731, "rewards/format_reward": 0.9897959232330322, "step": 4474 }, { "completion_length": 266.1632537841797, "epoch": 0.45031446540880504, "grad_norm": 0.6009877920150757, "kl": 0.048828125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6876391768455505, "reward_std": 0.1831147000193596, "rewards/accuracy_reward": 0.6876392364501953, "rewards/format_reward": 1.0, "step": 4475 }, { "completion_length": 187.1326446533203, "epoch": 0.4504150943396226, "grad_norm": 3.8430137634277344, "kl": 0.124755859375, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.6673506498336792, "reward_std": 0.1352389231324196, "rewards/accuracy_reward": 0.6673506796360016, "rewards/format_reward": 1.0, "step": 4476 }, { "completion_length": 189.11223602294922, "epoch": 0.45051572327044026, "grad_norm": 1.1983672380447388, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.832199513912201, "reward_std": 0.1921774223446846, "rewards/accuracy_reward": 0.8321995437145233, "rewards/format_reward": 1.0, "step": 4477 }, { "completion_length": 219.1530532836914, "epoch": 0.45061635220125784, "grad_norm": 1.4047412872314453, "kl": 0.116455078125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7418058514595032, "reward_std": 0.17178687453269958, "rewards/accuracy_reward": 0.7418057918548584, "rewards/format_reward": 1.0, "step": 4478 }, { "completion_length": 173.2551040649414, "epoch": 0.4507169811320755, "grad_norm": 0.7523118257522583, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8971088528633118, "reward_std": 0.11674976348876953, "rewards/accuracy_reward": 0.9073128998279572, "rewards/format_reward": 0.9897959232330322, "step": 4479 }, { "completion_length": 218.62244415283203, "epoch": 0.45081761006289306, "grad_norm": 0.540158748626709, "kl": 0.10595703125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8293079733848572, "reward_std": 0.12729813158512115, "rewards/accuracy_reward": 0.8497161567211151, "rewards/format_reward": 0.9795918464660645, "step": 4480 }, { "completion_length": 194.9285659790039, "epoch": 0.4509182389937107, "grad_norm": 1.760014533996582, "kl": 0.071533203125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.5398444533348083, "reward_std": 0.3221743032336235, "rewards/accuracy_reward": 0.5398445129394531, "rewards/format_reward": 1.0, "step": 4481 }, { "completion_length": 179.89795684814453, "epoch": 0.4510188679245283, "grad_norm": 0.6104201674461365, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7904875874519348, "reward_std": 0.13144781440496445, "rewards/accuracy_reward": 0.8108957707881927, "rewards/format_reward": 0.9795918464660645, "step": 4482 }, { "completion_length": 224.0306167602539, "epoch": 0.4511194968553459, "grad_norm": 0.7608858346939087, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7108843922615051, "reward_std": 0.19220630824565887, "rewards/accuracy_reward": 0.7108843624591827, "rewards/format_reward": 1.0, "step": 4483 }, { "completion_length": 259.9285583496094, "epoch": 0.4512201257861635, "grad_norm": 0.9252803921699524, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.4989641308784485, "reward_std": 0.1609959527850151, "rewards/accuracy_reward": 0.5193722993135452, "rewards/format_reward": 0.9795918166637421, "step": 4484 }, { "completion_length": 202.41836547851562, "epoch": 0.45132075471698113, "grad_norm": 1.7598789930343628, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.616836667060852, "reward_std": 0.1523767039179802, "rewards/accuracy_reward": 0.6168367266654968, "rewards/format_reward": 1.0, "step": 4485 }, { "completion_length": 238.65306091308594, "epoch": 0.45142138364779877, "grad_norm": 22.06527328491211, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.812584936618805, "reward_std": 0.1910884976387024, "rewards/accuracy_reward": 0.8534013330936432, "rewards/format_reward": 0.9591836631298065, "step": 4486 }, { "completion_length": 226.38774871826172, "epoch": 0.45152201257861635, "grad_norm": 1.299784779548645, "kl": 0.101806640625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6471368074417114, "reward_std": 0.15494350343942642, "rewards/accuracy_reward": 0.657340943813324, "rewards/format_reward": 0.9897959232330322, "step": 4487 }, { "completion_length": 238.4693832397461, "epoch": 0.451622641509434, "grad_norm": 0.9978132843971252, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6605609059333801, "reward_std": 0.19821403175592422, "rewards/accuracy_reward": 0.6809690594673157, "rewards/format_reward": 0.9795918166637421, "step": 4488 }, { "completion_length": 172.19387817382812, "epoch": 0.45172327044025157, "grad_norm": 0.884941041469574, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8074829578399658, "reward_std": 0.08242611587047577, "rewards/accuracy_reward": 0.8074829578399658, "rewards/format_reward": 1.0, "step": 4489 }, { "completion_length": 240.87754821777344, "epoch": 0.4518238993710692, "grad_norm": 0.7813083529472351, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6227301955223083, "reward_std": 0.20686031877994537, "rewards/accuracy_reward": 0.622730165719986, "rewards/format_reward": 1.0, "step": 4490 }, { "completion_length": 220.448974609375, "epoch": 0.4519245283018868, "grad_norm": 0.8791075944900513, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.5566568970680237, "reward_std": 0.2433190420269966, "rewards/accuracy_reward": 0.577065110206604, "rewards/format_reward": 0.9795918166637421, "step": 4491 }, { "completion_length": 223.18366241455078, "epoch": 0.4520251572327044, "grad_norm": 1.161481499671936, "kl": 0.10400390625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7184196710586548, "reward_std": 0.16427329927682877, "rewards/accuracy_reward": 0.7184196412563324, "rewards/format_reward": 1.0, "step": 4492 }, { "completion_length": 195.0204086303711, "epoch": 0.452125786163522, "grad_norm": 0.8588380813598633, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8462584614753723, "reward_std": 0.11893150210380554, "rewards/accuracy_reward": 0.8462584614753723, "rewards/format_reward": 1.0, "step": 4493 }, { "completion_length": 170.93877410888672, "epoch": 0.45222641509433964, "grad_norm": 1.0531902313232422, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7693614959716797, "reward_std": 0.2041567638516426, "rewards/accuracy_reward": 0.7693615853786469, "rewards/format_reward": 1.0, "step": 4494 }, { "completion_length": 182.4285659790039, "epoch": 0.4523270440251572, "grad_norm": 0.5740090012550354, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.827557384967804, "reward_std": 0.11465670168399811, "rewards/accuracy_reward": 0.827557384967804, "rewards/format_reward": 1.0, "step": 4495 }, { "completion_length": 222.4285659790039, "epoch": 0.45242767295597486, "grad_norm": 1.0268837213516235, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7186272740364075, "reward_std": 0.21403034031391144, "rewards/accuracy_reward": 0.7288313806056976, "rewards/format_reward": 0.9897959232330322, "step": 4496 }, { "completion_length": 221.9183578491211, "epoch": 0.45252830188679244, "grad_norm": 2.718498468399048, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7448979616165161, "reward_std": 0.15402612835168839, "rewards/accuracy_reward": 0.7551020383834839, "rewards/format_reward": 0.9897959232330322, "step": 4497 }, { "completion_length": 225.87754821777344, "epoch": 0.4526289308176101, "grad_norm": 1.2209460735321045, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.4766627550125122, "reward_std": 0.2149103507399559, "rewards/accuracy_reward": 0.5072750151157379, "rewards/format_reward": 0.9693877398967743, "step": 4498 }, { "completion_length": 204.73468780517578, "epoch": 0.45272955974842766, "grad_norm": 0.5295360684394836, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8170769214630127, "reward_std": 0.10300750657916069, "rewards/accuracy_reward": 0.8476892411708832, "rewards/format_reward": 0.9693877398967743, "step": 4499 }, { "completion_length": 223.7040786743164, "epoch": 0.4528301886792453, "grad_norm": 0.8131310939788818, "kl": 0.09326171875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7618246674537659, "reward_std": 0.14634516090154648, "rewards/accuracy_reward": 0.7720287442207336, "rewards/format_reward": 0.9897959232330322, "step": 4500 }, { "completion_length": 228.77549743652344, "epoch": 0.4529308176100629, "grad_norm": 0.8591498136520386, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7440990805625916, "reward_std": 0.21502635627985, "rewards/accuracy_reward": 0.7440991997718811, "rewards/format_reward": 1.0, "step": 4501 }, { "completion_length": 267.3061065673828, "epoch": 0.4530314465408805, "grad_norm": 1.8423062562942505, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.742932140827179, "reward_std": 0.18361657112836838, "rewards/accuracy_reward": 0.7633403241634369, "rewards/format_reward": 0.9795918166637421, "step": 4502 }, { "completion_length": 282.8367385864258, "epoch": 0.4531320754716981, "grad_norm": 2.8809807300567627, "kl": 0.1123046875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7433170080184937, "reward_std": 0.17148006707429886, "rewards/accuracy_reward": 0.7433169782161713, "rewards/format_reward": 1.0, "step": 4503 }, { "completion_length": 272.7346954345703, "epoch": 0.45323270440251573, "grad_norm": 3.6147406101226807, "kl": 0.111328125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.6481459736824036, "reward_std": 0.22944609075784683, "rewards/accuracy_reward": 0.6787583231925964, "rewards/format_reward": 0.9693877398967743, "step": 4504 }, { "completion_length": 258.84693908691406, "epoch": 0.4533333333333333, "grad_norm": 1.4102696180343628, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6459690928459167, "reward_std": 0.19156508520245552, "rewards/accuracy_reward": 0.6561732590198517, "rewards/format_reward": 0.9897959232330322, "step": 4505 }, { "completion_length": 269.5816192626953, "epoch": 0.45343396226415095, "grad_norm": 0.5940940380096436, "kl": 0.0782470703125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6735569834709167, "reward_std": 0.19981946423649788, "rewards/accuracy_reward": 0.6939651668071747, "rewards/format_reward": 0.9795918464660645, "step": 4506 }, { "completion_length": 181.37754821777344, "epoch": 0.45353459119496853, "grad_norm": 0.7486650347709656, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.9054834246635437, "reward_std": 0.036832697689533234, "rewards/accuracy_reward": 0.9054834246635437, "rewards/format_reward": 1.0, "step": 4507 }, { "completion_length": 253.7653045654297, "epoch": 0.45363522012578616, "grad_norm": 0.6487442851066589, "kl": 0.0753173828125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7134947776794434, "reward_std": 0.1655537486076355, "rewards/accuracy_reward": 0.7134948074817657, "rewards/format_reward": 1.0, "step": 4508 }, { "completion_length": 243.7346954345703, "epoch": 0.45373584905660375, "grad_norm": 0.43551668524742126, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.922060251235962, "reward_std": 0.03273606114089489, "rewards/accuracy_reward": 0.9220602810382843, "rewards/format_reward": 1.0, "step": 4509 }, { "completion_length": 199.2346954345703, "epoch": 0.4538364779874214, "grad_norm": 0.6991949081420898, "kl": 0.0665283203125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7797213792800903, "reward_std": 0.19807066768407822, "rewards/accuracy_reward": 0.8001295328140259, "rewards/format_reward": 0.9795918464660645, "step": 4510 }, { "completion_length": 307.10203552246094, "epoch": 0.453937106918239, "grad_norm": 0.7530950307846069, "kl": 0.121826171875, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.733437955379486, "reward_std": 0.16812004148960114, "rewards/accuracy_reward": 0.753846138715744, "rewards/format_reward": 0.9795918166637421, "step": 4511 }, { "completion_length": 260.7040786743164, "epoch": 0.4540377358490566, "grad_norm": 0.987235963344574, "kl": 0.095458984375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7418572902679443, "reward_std": 0.21092556416988373, "rewards/accuracy_reward": 0.7622655034065247, "rewards/format_reward": 0.9795918464660645, "step": 4512 }, { "completion_length": 282.0, "epoch": 0.45413836477987424, "grad_norm": 1.0440421104431152, "kl": 0.102294921875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6155454516410828, "reward_std": 0.18403154611587524, "rewards/accuracy_reward": 0.615545392036438, "rewards/format_reward": 1.0, "step": 4513 }, { "completion_length": 245.65306091308594, "epoch": 0.4542389937106918, "grad_norm": 0.6174454689025879, "kl": 0.1142578125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.7004956007003784, "reward_std": 0.10307896882295609, "rewards/accuracy_reward": 0.710699737071991, "rewards/format_reward": 0.9897959232330322, "step": 4514 }, { "completion_length": 252.83673095703125, "epoch": 0.45433962264150946, "grad_norm": 0.589566707611084, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7978619933128357, "reward_std": 0.22670908272266388, "rewards/accuracy_reward": 0.8182701468467712, "rewards/format_reward": 0.9795918166637421, "step": 4515 }, { "completion_length": 245.04080963134766, "epoch": 0.45444025157232704, "grad_norm": 0.5606390833854675, "kl": 0.091064453125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8453927040100098, "reward_std": 0.06968476250767708, "rewards/accuracy_reward": 0.8453926742076874, "rewards/format_reward": 1.0, "step": 4516 }, { "completion_length": 253.20407104492188, "epoch": 0.4545408805031447, "grad_norm": 0.8189567923545837, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6311001777648926, "reward_std": 0.13025174662470818, "rewards/accuracy_reward": 0.6311002671718597, "rewards/format_reward": 1.0, "step": 4517 }, { "completion_length": 168.9897918701172, "epoch": 0.45464150943396225, "grad_norm": 0.6960116028785706, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7877550721168518, "reward_std": 0.11928880959749222, "rewards/accuracy_reward": 0.7979591190814972, "rewards/format_reward": 0.9897959232330322, "step": 4518 }, { "completion_length": 254.21427154541016, "epoch": 0.4547421383647799, "grad_norm": 1.5172773599624634, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7679463028907776, "reward_std": 0.16355175524950027, "rewards/accuracy_reward": 0.7679463624954224, "rewards/format_reward": 1.0, "step": 4519 }, { "completion_length": 265.58162689208984, "epoch": 0.45484276729559747, "grad_norm": 1.0029035806655884, "kl": 0.102783203125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.687191367149353, "reward_std": 0.2343236654996872, "rewards/accuracy_reward": 0.7178036272525787, "rewards/format_reward": 0.9693877398967743, "step": 4520 }, { "completion_length": 264.448974609375, "epoch": 0.4549433962264151, "grad_norm": 0.7495366334915161, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.779319167137146, "reward_std": 0.21891819685697556, "rewards/accuracy_reward": 0.7997273504734039, "rewards/format_reward": 0.9795918166637421, "step": 4521 }, { "completion_length": 300.79591369628906, "epoch": 0.4550440251572327, "grad_norm": 0.8325996398925781, "kl": 0.099365234375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6018902659416199, "reward_std": 0.2095983698964119, "rewards/accuracy_reward": 0.6222984492778778, "rewards/format_reward": 0.9795918464660645, "step": 4522 }, { "completion_length": 293.5918273925781, "epoch": 0.4551446540880503, "grad_norm": 1.2917704582214355, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6182944178581238, "reward_std": 0.2313796579837799, "rewards/accuracy_reward": 0.6489066779613495, "rewards/format_reward": 0.9693877398967743, "step": 4523 }, { "completion_length": 203.57142639160156, "epoch": 0.4552452830188679, "grad_norm": 1.3759117126464844, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7069087028503418, "reward_std": 0.2551370710134506, "rewards/accuracy_reward": 0.7273168563842773, "rewards/format_reward": 0.9795918166637421, "step": 4524 }, { "completion_length": 213.9591827392578, "epoch": 0.45534591194968554, "grad_norm": 0.9434623718261719, "kl": 0.06103515625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7609623670578003, "reward_std": 0.16378602385520935, "rewards/accuracy_reward": 0.7711665034294128, "rewards/format_reward": 0.9897959232330322, "step": 4525 }, { "completion_length": 278.7550964355469, "epoch": 0.4554465408805031, "grad_norm": 0.5661987066268921, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8176019787788391, "reward_std": 0.19771810621023178, "rewards/accuracy_reward": 0.8278061151504517, "rewards/format_reward": 0.9897959232330322, "step": 4526 }, { "completion_length": 211.9591827392578, "epoch": 0.45554716981132076, "grad_norm": 1.292114019393921, "kl": 0.0762939453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.9183672666549683, "reward_std": 0.11917256191372871, "rewards/accuracy_reward": 0.938775509595871, "rewards/format_reward": 0.9795918464660645, "step": 4527 }, { "completion_length": 260.7142868041992, "epoch": 0.45564779874213834, "grad_norm": 0.7513285875320435, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.708983838558197, "reward_std": 0.14283134043216705, "rewards/accuracy_reward": 0.7395961284637451, "rewards/format_reward": 0.9693877398967743, "step": 4528 }, { "completion_length": 216.85713958740234, "epoch": 0.455748427672956, "grad_norm": 0.6024720668792725, "kl": 0.059326171875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8018388748168945, "reward_std": 0.1447247713804245, "rewards/accuracy_reward": 0.8120430409908295, "rewards/format_reward": 0.9897959232330322, "step": 4529 }, { "completion_length": 292.87754821777344, "epoch": 0.45584905660377356, "grad_norm": 0.6286967396736145, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7555201053619385, "reward_std": 0.12936003133654594, "rewards/accuracy_reward": 0.7657241821289062, "rewards/format_reward": 0.9897959232330322, "step": 4530 }, { "completion_length": 298.56121826171875, "epoch": 0.4559496855345912, "grad_norm": 0.5636200308799744, "kl": 0.0780029296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7949613332748413, "reward_std": 0.11144815757870674, "rewards/accuracy_reward": 0.8051653504371643, "rewards/format_reward": 0.9897959232330322, "step": 4531 }, { "completion_length": 255.2448959350586, "epoch": 0.4560503144654088, "grad_norm": 1.235205054283142, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6744636297225952, "reward_std": 0.2678125575184822, "rewards/accuracy_reward": 0.7050758898258209, "rewards/format_reward": 0.9693877398967743, "step": 4532 }, { "completion_length": 234.27550506591797, "epoch": 0.4561509433962264, "grad_norm": 1.001962661743164, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6861968040466309, "reward_std": 0.10664525628089905, "rewards/accuracy_reward": 0.7066049873828888, "rewards/format_reward": 0.9795918464660645, "step": 4533 }, { "completion_length": 323.83673095703125, "epoch": 0.45625157232704405, "grad_norm": 0.48993274569511414, "kl": 0.0643310546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6975628733634949, "reward_std": 0.17744459956884384, "rewards/accuracy_reward": 0.7383792996406555, "rewards/format_reward": 0.9591836333274841, "step": 4534 }, { "completion_length": 191.4897918701172, "epoch": 0.45635220125786163, "grad_norm": 1.6040109395980835, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8712055683135986, "reward_std": 0.16088785976171494, "rewards/accuracy_reward": 0.8916137516498566, "rewards/format_reward": 0.9795918166637421, "step": 4535 }, { "completion_length": 183.85713958740234, "epoch": 0.45645283018867927, "grad_norm": 1.1125822067260742, "kl": 0.0665283203125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8110936880111694, "reward_std": 0.1397624835371971, "rewards/accuracy_reward": 0.8212977051734924, "rewards/format_reward": 0.9897959232330322, "step": 4536 }, { "completion_length": 243.33673095703125, "epoch": 0.45655345911949685, "grad_norm": 0.48644590377807617, "kl": 0.101806640625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8516847491264343, "reward_std": 0.040768953040242195, "rewards/accuracy_reward": 0.8516848683357239, "rewards/format_reward": 1.0, "step": 4537 }, { "completion_length": 275.3163299560547, "epoch": 0.4566540880503145, "grad_norm": 0.4869832694530487, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7712968587875366, "reward_std": 0.19103455543518066, "rewards/accuracy_reward": 0.8019091188907623, "rewards/format_reward": 0.9693877398967743, "step": 4538 }, { "completion_length": 293.07141876220703, "epoch": 0.45675471698113207, "grad_norm": 0.41292881965637207, "kl": 0.0517578125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7137389183044434, "reward_std": 0.11314164847135544, "rewards/accuracy_reward": 0.7443511486053467, "rewards/format_reward": 0.9693877398967743, "step": 4539 }, { "completion_length": 200.1836700439453, "epoch": 0.4568553459119497, "grad_norm": 2.004685163497925, "kl": 0.1171875, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.6771393418312073, "reward_std": 0.1356818899512291, "rewards/accuracy_reward": 0.687343418598175, "rewards/format_reward": 0.9897959232330322, "step": 4540 }, { "completion_length": 277.7550964355469, "epoch": 0.4569559748427673, "grad_norm": 0.7839315533638, "kl": 0.0706787109375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7590453028678894, "reward_std": 0.16692251712083817, "rewards/accuracy_reward": 0.7794535458087921, "rewards/format_reward": 0.9795918464660645, "step": 4541 }, { "completion_length": 281.2653045654297, "epoch": 0.4570566037735849, "grad_norm": 0.6295017600059509, "kl": 0.0625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7562505602836609, "reward_std": 0.14121656119823456, "rewards/accuracy_reward": 0.7664546072483063, "rewards/format_reward": 0.9897959232330322, "step": 4542 }, { "completion_length": 269.6428451538086, "epoch": 0.4571572327044025, "grad_norm": 0.7950707674026489, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.86028254032135, "reward_std": 0.2072991356253624, "rewards/accuracy_reward": 0.9010988771915436, "rewards/format_reward": 0.9591836333274841, "step": 4543 }, { "completion_length": 211.9081573486328, "epoch": 0.45725786163522014, "grad_norm": 0.7790220379829407, "kl": 0.10009765625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7871984839439392, "reward_std": 0.2037104293704033, "rewards/accuracy_reward": 0.8178107440471649, "rewards/format_reward": 0.9693877398967743, "step": 4544 }, { "completion_length": 266.9387664794922, "epoch": 0.4573584905660377, "grad_norm": 0.6502781510353088, "kl": 0.0863037109375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7614879608154297, "reward_std": 0.2944106310606003, "rewards/accuracy_reward": 0.8023043572902679, "rewards/format_reward": 0.9591836631298065, "step": 4545 }, { "completion_length": 195.37754821777344, "epoch": 0.45745911949685536, "grad_norm": 0.6161113381385803, "kl": 0.1025390625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7614795565605164, "reward_std": 0.1732444204390049, "rewards/accuracy_reward": 0.7818877398967743, "rewards/format_reward": 0.9795918166637421, "step": 4546 }, { "completion_length": 275.39795684814453, "epoch": 0.45755974842767294, "grad_norm": 2.8318307399749756, "kl": 0.0738525390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.711261808872223, "reward_std": 0.10843856353312731, "rewards/accuracy_reward": 0.7214658558368683, "rewards/format_reward": 0.9897959232330322, "step": 4547 }, { "completion_length": 257.62244415283203, "epoch": 0.4576603773584906, "grad_norm": 1.5733203887939453, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7509320378303528, "reward_std": 0.15621977299451828, "rewards/accuracy_reward": 0.7713401913642883, "rewards/format_reward": 0.9795918166637421, "step": 4548 }, { "completion_length": 195.9897918701172, "epoch": 0.45776100628930816, "grad_norm": 1.3365784883499146, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6868584156036377, "reward_std": 0.1840219497680664, "rewards/accuracy_reward": 0.6868583858013153, "rewards/format_reward": 1.0, "step": 4549 }, { "completion_length": 201.3571319580078, "epoch": 0.4578616352201258, "grad_norm": 2.9371721744537354, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8210574984550476, "reward_std": 0.11612031981348991, "rewards/accuracy_reward": 0.831261545419693, "rewards/format_reward": 0.9897959232330322, "step": 4550 }, { "completion_length": 205.41836547851562, "epoch": 0.4579622641509434, "grad_norm": 0.41104555130004883, "kl": 0.112060546875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.8632652163505554, "reward_std": 0.06882175803184509, "rewards/accuracy_reward": 0.873469352722168, "rewards/format_reward": 0.9897959232330322, "step": 4551 }, { "completion_length": 187.9591827392578, "epoch": 0.458062893081761, "grad_norm": 0.8356376886367798, "kl": 0.0770263671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6581631898880005, "reward_std": 0.14516311138868332, "rewards/accuracy_reward": 0.6785714030265808, "rewards/format_reward": 0.9795918464660645, "step": 4552 }, { "completion_length": 312.7448959350586, "epoch": 0.4581635220125786, "grad_norm": 1.1467596292495728, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6799098253250122, "reward_std": 0.17302343249320984, "rewards/accuracy_reward": 0.7003180384635925, "rewards/format_reward": 0.9795918464660645, "step": 4553 }, { "completion_length": 344.29591369628906, "epoch": 0.45826415094339623, "grad_norm": 0.420316219329834, "kl": 0.064697265625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6719546914100647, "reward_std": 0.10801782459020615, "rewards/accuracy_reward": 0.6821587681770325, "rewards/format_reward": 0.9897959232330322, "step": 4554 }, { "completion_length": 277.3571319580078, "epoch": 0.4583647798742138, "grad_norm": 0.7355754375457764, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5991774201393127, "reward_std": 0.17560072988271713, "rewards/accuracy_reward": 0.629789724946022, "rewards/format_reward": 0.9693877398967743, "step": 4555 }, { "completion_length": 201.7653045654297, "epoch": 0.45846540880503145, "grad_norm": 0.41310572624206543, "kl": 0.0867919921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8207199573516846, "reward_std": 0.1266525536775589, "rewards/accuracy_reward": 0.8513321876525879, "rewards/format_reward": 0.9693877398967743, "step": 4556 }, { "completion_length": 230.2959213256836, "epoch": 0.45856603773584903, "grad_norm": 6.862829685211182, "kl": 0.1072998046875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7648810148239136, "reward_std": 0.07805096358060837, "rewards/accuracy_reward": 0.7750850021839142, "rewards/format_reward": 0.9897959232330322, "step": 4557 }, { "completion_length": 193.91836547851562, "epoch": 0.45866666666666667, "grad_norm": 0.4774899184703827, "kl": 0.101318359375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.83788400888443, "reward_std": 0.10156209673732519, "rewards/accuracy_reward": 0.8480881154537201, "rewards/format_reward": 0.9897959232330322, "step": 4558 }, { "completion_length": 184.32652282714844, "epoch": 0.4587672955974843, "grad_norm": 0.5893211960792542, "kl": 0.062744140625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8408163189888, "reward_std": 0.06479390989989042, "rewards/accuracy_reward": 0.8408163189888, "rewards/format_reward": 1.0, "step": 4559 }, { "completion_length": 222.7346954345703, "epoch": 0.4588679245283019, "grad_norm": 1.1512571573257446, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7198434472084045, "reward_std": 0.26279179006814957, "rewards/accuracy_reward": 0.7402516305446625, "rewards/format_reward": 0.9795918464660645, "step": 4560 }, { "completion_length": 244.94896697998047, "epoch": 0.4589685534591195, "grad_norm": 0.578850269317627, "kl": 0.057373046875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7130345106124878, "reward_std": 0.2169843167066574, "rewards/accuracy_reward": 0.7232385575771332, "rewards/format_reward": 0.9897959232330322, "step": 4561 }, { "completion_length": 254.7040786743164, "epoch": 0.4590691823899371, "grad_norm": 0.38870587944984436, "kl": 0.09423828125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7475666999816895, "reward_std": 0.10833195690065622, "rewards/accuracy_reward": 0.7577707767486572, "rewards/format_reward": 0.9897959232330322, "step": 4562 }, { "completion_length": 199.12244415283203, "epoch": 0.45916981132075474, "grad_norm": 1.7909135818481445, "kl": 0.135498046875, "learning_rate": 1e-06, "loss": 0.0054, "reward": 1.719604253768921, "reward_std": 0.08389044366776943, "rewards/accuracy_reward": 0.7298083007335663, "rewards/format_reward": 0.9897959232330322, "step": 4563 }, { "completion_length": 260.56121826171875, "epoch": 0.4592704402515723, "grad_norm": 0.9996359944343567, "kl": 0.0540771484375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7367590069770813, "reward_std": 0.18414703756570816, "rewards/accuracy_reward": 0.7571671307086945, "rewards/format_reward": 0.9795918464660645, "step": 4564 }, { "completion_length": 232.13265228271484, "epoch": 0.45937106918238996, "grad_norm": 0.7628062963485718, "kl": 0.097412109375, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.722111701965332, "reward_std": 0.2217540293931961, "rewards/accuracy_reward": 0.7425197958946228, "rewards/format_reward": 0.9795918464660645, "step": 4565 }, { "completion_length": 176.89795684814453, "epoch": 0.45947169811320754, "grad_norm": 0.4523567259311676, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7857142686843872, "reward_std": 0.10335781052708626, "rewards/accuracy_reward": 0.8061224222183228, "rewards/format_reward": 0.9795918166637421, "step": 4566 }, { "completion_length": 194.948974609375, "epoch": 0.4595723270440252, "grad_norm": 0.48545515537261963, "kl": 0.071533203125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.747675359249115, "reward_std": 0.06207279581576586, "rewards/accuracy_reward": 0.747675359249115, "rewards/format_reward": 1.0, "step": 4567 }, { "completion_length": 293.8061065673828, "epoch": 0.45967295597484276, "grad_norm": 0.7371797561645508, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6320330500602722, "reward_std": 0.2059762328863144, "rewards/accuracy_reward": 0.6524412035942078, "rewards/format_reward": 0.9795918464660645, "step": 4568 }, { "completion_length": 301.2244873046875, "epoch": 0.4597735849056604, "grad_norm": 0.5862246751785278, "kl": 0.0626220703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6915717124938965, "reward_std": 0.18606184422969818, "rewards/accuracy_reward": 0.6915716528892517, "rewards/format_reward": 1.0, "step": 4569 }, { "completion_length": 235.69387817382812, "epoch": 0.459874213836478, "grad_norm": 0.932627260684967, "kl": 0.0870361328125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7885487079620361, "reward_std": 0.16541235893964767, "rewards/accuracy_reward": 0.7885487377643585, "rewards/format_reward": 1.0, "step": 4570 }, { "completion_length": 215.4285659790039, "epoch": 0.4599748427672956, "grad_norm": 1.5646190643310547, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6029496788978577, "reward_std": 0.18011688441038132, "rewards/accuracy_reward": 0.6029497385025024, "rewards/format_reward": 1.0, "step": 4571 }, { "completion_length": 207.7142791748047, "epoch": 0.4600754716981132, "grad_norm": 0.5503832697868347, "kl": 0.059814453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.611904799938202, "reward_std": 0.10929432883858681, "rewards/accuracy_reward": 0.6119047701358795, "rewards/format_reward": 1.0, "step": 4572 }, { "completion_length": 212.46937561035156, "epoch": 0.46017610062893083, "grad_norm": 0.5266093611717224, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7711095809936523, "reward_std": 0.12454871088266373, "rewards/accuracy_reward": 0.7711096405982971, "rewards/format_reward": 1.0, "step": 4573 }, { "completion_length": 201.4897918701172, "epoch": 0.4602767295597484, "grad_norm": 0.9408403038978577, "kl": 0.102783203125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8476881384849548, "reward_std": 0.20814981311559677, "rewards/accuracy_reward": 0.8578922152519226, "rewards/format_reward": 0.9897959232330322, "step": 4574 }, { "completion_length": 219.15306091308594, "epoch": 0.46037735849056605, "grad_norm": 0.8817070722579956, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7397269010543823, "reward_std": 0.2319895103573799, "rewards/accuracy_reward": 0.7601350843906403, "rewards/format_reward": 0.9795918464660645, "step": 4575 }, { "completion_length": 238.27550506591797, "epoch": 0.4604779874213836, "grad_norm": 2.075552463531494, "kl": 0.1126708984375, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.6228200197219849, "reward_std": 0.08667689561843872, "rewards/accuracy_reward": 0.6228200197219849, "rewards/format_reward": 1.0, "step": 4576 }, { "completion_length": 326.74488830566406, "epoch": 0.46057861635220126, "grad_norm": 0.9026625156402588, "kl": 0.098876953125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.5553666949272156, "reward_std": 0.15077663213014603, "rewards/accuracy_reward": 0.555366724729538, "rewards/format_reward": 1.0, "step": 4577 }, { "completion_length": 265.65306091308594, "epoch": 0.46067924528301885, "grad_norm": 0.4752224087715149, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7272220849990845, "reward_std": 0.12444159761071205, "rewards/accuracy_reward": 0.7272220253944397, "rewards/format_reward": 1.0, "step": 4578 }, { "completion_length": 181.58162689208984, "epoch": 0.4607798742138365, "grad_norm": 0.5809259414672852, "kl": 0.1044921875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.9257194995880127, "reward_std": 0.0781886987388134, "rewards/accuracy_reward": 0.9257194995880127, "rewards/format_reward": 1.0, "step": 4579 }, { "completion_length": 232.10204315185547, "epoch": 0.46088050314465406, "grad_norm": 0.40537145733833313, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7856292128562927, "reward_std": 0.07003082567825913, "rewards/accuracy_reward": 0.7856292128562927, "rewards/format_reward": 1.0, "step": 4580 }, { "completion_length": 192.2448959350586, "epoch": 0.4609811320754717, "grad_norm": 0.6603013277053833, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.5622119903564453, "reward_std": 0.1382376290857792, "rewards/accuracy_reward": 0.5826201140880585, "rewards/format_reward": 0.9795918166637421, "step": 4581 }, { "completion_length": 302.87754821777344, "epoch": 0.4610817610062893, "grad_norm": 0.4386887848377228, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.5552664995193481, "reward_std": 0.1613009348511696, "rewards/accuracy_reward": 0.5756747275590897, "rewards/format_reward": 0.9795918464660645, "step": 4582 }, { "completion_length": 165.55101776123047, "epoch": 0.4611823899371069, "grad_norm": 0.8829078078269958, "kl": 0.091064453125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8086734414100647, "reward_std": 0.10480373352766037, "rewards/accuracy_reward": 0.8086734414100647, "rewards/format_reward": 1.0, "step": 4583 }, { "completion_length": 259.80611419677734, "epoch": 0.46128301886792455, "grad_norm": 0.7753334641456604, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6944880485534668, "reward_std": 0.20996148884296417, "rewards/accuracy_reward": 0.7046920955181122, "rewards/format_reward": 0.9897959232330322, "step": 4584 }, { "completion_length": 279.4591827392578, "epoch": 0.46138364779874214, "grad_norm": 0.7749009132385254, "kl": 0.06689453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5719159245491028, "reward_std": 0.20765237510204315, "rewards/accuracy_reward": 0.5719159543514252, "rewards/format_reward": 1.0, "step": 4585 }, { "completion_length": 253.80612182617188, "epoch": 0.4614842767295598, "grad_norm": 1.1705121994018555, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.683077096939087, "reward_std": 0.24625850468873978, "rewards/accuracy_reward": 0.6932811737060547, "rewards/format_reward": 0.9897959232330322, "step": 4586 }, { "completion_length": 171.52040100097656, "epoch": 0.46158490566037735, "grad_norm": 0.4774743914604187, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8750566244125366, "reward_std": 0.10339648835361004, "rewards/accuracy_reward": 0.875056654214859, "rewards/format_reward": 1.0, "step": 4587 }, { "completion_length": 212.14285278320312, "epoch": 0.461685534591195, "grad_norm": 3.834336519241333, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7960752844810486, "reward_std": 0.09754297509789467, "rewards/accuracy_reward": 0.7960753440856934, "rewards/format_reward": 1.0, "step": 4588 }, { "completion_length": 290.06121826171875, "epoch": 0.46178616352201257, "grad_norm": 1.0197376012802124, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6438239812850952, "reward_std": 0.3061581254005432, "rewards/accuracy_reward": 0.6642321646213531, "rewards/format_reward": 0.9795918166637421, "step": 4589 }, { "completion_length": 261.28570556640625, "epoch": 0.4618867924528302, "grad_norm": 1.583125114440918, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6936158537864685, "reward_std": 0.17003224790096283, "rewards/accuracy_reward": 0.6936158835887909, "rewards/format_reward": 1.0, "step": 4590 }, { "completion_length": 164.28571319580078, "epoch": 0.4619874213836478, "grad_norm": 0.6268602609634399, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7252063751220703, "reward_std": 0.14456936717033386, "rewards/accuracy_reward": 0.7456145584583282, "rewards/format_reward": 0.9795918464660645, "step": 4591 }, { "completion_length": 283.1224365234375, "epoch": 0.4620880503144654, "grad_norm": 0.5376284122467041, "kl": 0.0596923828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6823129057884216, "reward_std": 0.15015685930848122, "rewards/accuracy_reward": 0.682312935590744, "rewards/format_reward": 1.0, "step": 4592 }, { "completion_length": 252.59183502197266, "epoch": 0.462188679245283, "grad_norm": 0.6862483024597168, "kl": 0.100830078125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6563286781311035, "reward_std": 0.13907483592629433, "rewards/accuracy_reward": 0.6767367571592331, "rewards/format_reward": 0.9795918464660645, "step": 4593 }, { "completion_length": 239.80612182617188, "epoch": 0.46228930817610064, "grad_norm": 0.45066794753074646, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.704828679561615, "reward_std": 0.09270167350769043, "rewards/accuracy_reward": 0.7048286497592926, "rewards/format_reward": 1.0, "step": 4594 }, { "completion_length": 286.32653045654297, "epoch": 0.4623899371069182, "grad_norm": 0.9325999021530151, "kl": 0.0706787109375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7330715656280518, "reward_std": 0.1221211701631546, "rewards/accuracy_reward": 0.7534798085689545, "rewards/format_reward": 0.9795918464660645, "step": 4595 }, { "completion_length": 155.77550506591797, "epoch": 0.46249056603773586, "grad_norm": 1.0207438468933105, "kl": 0.0582275390625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8308905363082886, "reward_std": 0.053247157484292984, "rewards/accuracy_reward": 0.8308905363082886, "rewards/format_reward": 1.0, "step": 4596 }, { "completion_length": 244.24488830566406, "epoch": 0.46259119496855344, "grad_norm": 0.6335040926933289, "kl": 0.0887451171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6390113830566406, "reward_std": 0.13507971167564392, "rewards/accuracy_reward": 0.6492154598236084, "rewards/format_reward": 0.9897959232330322, "step": 4597 }, { "completion_length": 272.6428527832031, "epoch": 0.4626918238993711, "grad_norm": 0.659935474395752, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.668744444847107, "reward_std": 0.1495993286371231, "rewards/accuracy_reward": 0.6789484322071075, "rewards/format_reward": 0.9897959232330322, "step": 4598 }, { "completion_length": 235.69386291503906, "epoch": 0.46279245283018866, "grad_norm": 0.5754866600036621, "kl": 0.10400390625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.758635699748993, "reward_std": 0.1417369544506073, "rewards/accuracy_reward": 0.7688397169113159, "rewards/format_reward": 0.9897959232330322, "step": 4599 }, { "completion_length": 249.83673095703125, "epoch": 0.4628930817610063, "grad_norm": 0.36035069823265076, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8370897769927979, "reward_std": 0.08448325097560883, "rewards/accuracy_reward": 0.847293883562088, "rewards/format_reward": 0.9897959232330322, "step": 4600 }, { "completion_length": 215.4591827392578, "epoch": 0.4629937106918239, "grad_norm": 0.6667174696922302, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7956636548042297, "reward_std": 0.13628783449530602, "rewards/accuracy_reward": 0.8058677911758423, "rewards/format_reward": 0.9897959232330322, "step": 4601 }, { "completion_length": 282.9285583496094, "epoch": 0.4630943396226415, "grad_norm": 0.9010543823242188, "kl": 0.06689453125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8395949006080627, "reward_std": 0.19755268841981888, "rewards/accuracy_reward": 0.8600030839443207, "rewards/format_reward": 0.9795918464660645, "step": 4602 }, { "completion_length": 253.39795684814453, "epoch": 0.4631949685534591, "grad_norm": 0.5351211428642273, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8161969780921936, "reward_std": 0.16006576269865036, "rewards/accuracy_reward": 0.8366051316261292, "rewards/format_reward": 0.9795918464660645, "step": 4603 }, { "completion_length": 247.30612182617188, "epoch": 0.46329559748427673, "grad_norm": 0.957590639591217, "kl": 0.0589599609375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7346938252449036, "reward_std": 0.20802105963230133, "rewards/accuracy_reward": 0.7551020085811615, "rewards/format_reward": 0.9795918166637421, "step": 4604 }, { "completion_length": 256.34693908691406, "epoch": 0.4633962264150943, "grad_norm": 0.7562270164489746, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6504001021385193, "reward_std": 0.13060395047068596, "rewards/accuracy_reward": 0.6504001319408417, "rewards/format_reward": 1.0, "step": 4605 }, { "completion_length": 263.32652282714844, "epoch": 0.46349685534591195, "grad_norm": 0.35890910029411316, "kl": 0.066162109375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.860544204711914, "reward_std": 0.16084171831607819, "rewards/accuracy_reward": 0.8911564648151398, "rewards/format_reward": 0.9693877398967743, "step": 4606 }, { "completion_length": 254.36734771728516, "epoch": 0.46359748427672953, "grad_norm": 0.5616324543952942, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8327913880348206, "reward_std": 0.09521399438381195, "rewards/accuracy_reward": 0.8429954648017883, "rewards/format_reward": 0.9897959232330322, "step": 4607 }, { "completion_length": 241.1836700439453, "epoch": 0.46369811320754717, "grad_norm": 0.564706027507782, "kl": 0.10302734375, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.6611766815185547, "reward_std": 0.13158540800213814, "rewards/accuracy_reward": 0.6713808178901672, "rewards/format_reward": 0.9897959232330322, "step": 4608 }, { "completion_length": 240.18366241455078, "epoch": 0.4637987421383648, "grad_norm": 5.135120391845703, "kl": 0.108642578125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7327414155006409, "reward_std": 0.17648138105869293, "rewards/accuracy_reward": 0.7633536756038666, "rewards/format_reward": 0.9693877398967743, "step": 4609 }, { "completion_length": 213.2653045654297, "epoch": 0.4638993710691824, "grad_norm": 0.8109884262084961, "kl": 0.1064453125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7564625144004822, "reward_std": 0.13605335727334023, "rewards/accuracy_reward": 0.7666666805744171, "rewards/format_reward": 0.9897959232330322, "step": 4610 }, { "completion_length": 198.16326141357422, "epoch": 0.464, "grad_norm": 0.744026243686676, "kl": 0.058837890625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7597762942314148, "reward_std": 0.12664875388145447, "rewards/accuracy_reward": 0.7699804306030273, "rewards/format_reward": 0.9897959232330322, "step": 4611 }, { "completion_length": 248.06121826171875, "epoch": 0.4641006289308176, "grad_norm": 0.985410749912262, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6768707633018494, "reward_std": 0.2029198408126831, "rewards/accuracy_reward": 0.6870748102664948, "rewards/format_reward": 0.9897959232330322, "step": 4612 }, { "completion_length": 236.24488830566406, "epoch": 0.46420125786163524, "grad_norm": 0.5085622072219849, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.627346932888031, "reward_std": 0.12795960158109665, "rewards/accuracy_reward": 0.627346932888031, "rewards/format_reward": 1.0, "step": 4613 }, { "completion_length": 223.2142791748047, "epoch": 0.4643018867924528, "grad_norm": 0.7024155259132385, "kl": 0.071533203125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.806963562965393, "reward_std": 0.20208939164876938, "rewards/accuracy_reward": 0.8069634139537811, "rewards/format_reward": 1.0, "step": 4614 }, { "completion_length": 334.78570556640625, "epoch": 0.46440251572327046, "grad_norm": 0.8071175813674927, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.675158143043518, "reward_std": 0.2844606339931488, "rewards/accuracy_reward": 0.7261785566806793, "rewards/format_reward": 0.9489795565605164, "step": 4615 }, { "completion_length": 328.7244873046875, "epoch": 0.46450314465408804, "grad_norm": 0.9004327058792114, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6602473855018616, "reward_std": 0.19683489203453064, "rewards/accuracy_reward": 0.6602473855018616, "rewards/format_reward": 1.0, "step": 4616 }, { "completion_length": 282.1836700439453, "epoch": 0.4646037735849057, "grad_norm": 0.6004738807678223, "kl": 0.102783203125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6819449663162231, "reward_std": 0.15098752081394196, "rewards/accuracy_reward": 0.7023531794548035, "rewards/format_reward": 0.9795918464660645, "step": 4617 }, { "completion_length": 237.06121063232422, "epoch": 0.46470440251572326, "grad_norm": 0.8376272320747375, "kl": 0.0711669921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8571428060531616, "reward_std": 0.15069954097270966, "rewards/accuracy_reward": 0.8673469424247742, "rewards/format_reward": 0.9897959232330322, "step": 4618 }, { "completion_length": 223.63265228271484, "epoch": 0.4648050314465409, "grad_norm": 0.6539837718009949, "kl": 0.11474609375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.772762954235077, "reward_std": 0.14084261283278465, "rewards/accuracy_reward": 0.7727629840373993, "rewards/format_reward": 1.0, "step": 4619 }, { "completion_length": 257.72447967529297, "epoch": 0.4649056603773585, "grad_norm": 0.2588489353656769, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8622448444366455, "reward_std": 0.0550973117351532, "rewards/accuracy_reward": 0.8724489808082581, "rewards/format_reward": 0.9897959232330322, "step": 4620 }, { "completion_length": 308.7142791748047, "epoch": 0.4650062893081761, "grad_norm": 0.7500117421150208, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7130372524261475, "reward_std": 0.14990999549627304, "rewards/accuracy_reward": 0.72324138879776, "rewards/format_reward": 0.9897959232330322, "step": 4621 }, { "completion_length": 259.27550506591797, "epoch": 0.4651069182389937, "grad_norm": 1.8051178455352783, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5852261781692505, "reward_std": 0.1428018882870674, "rewards/accuracy_reward": 0.5954302251338959, "rewards/format_reward": 0.9897959232330322, "step": 4622 }, { "completion_length": 235.34693145751953, "epoch": 0.46520754716981133, "grad_norm": 0.6287155151367188, "kl": 0.0689697265625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7144402861595154, "reward_std": 0.17307258397340775, "rewards/accuracy_reward": 0.7552566230297089, "rewards/format_reward": 0.9591836631298065, "step": 4623 }, { "completion_length": 190.27550506591797, "epoch": 0.4653081761006289, "grad_norm": 0.5772069692611694, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7558132410049438, "reward_std": 0.11593083664774895, "rewards/accuracy_reward": 0.7558131814002991, "rewards/format_reward": 1.0, "step": 4624 }, { "completion_length": 293.9897918701172, "epoch": 0.46540880503144655, "grad_norm": 0.7077244520187378, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.738280713558197, "reward_std": 0.1751022506505251, "rewards/accuracy_reward": 0.738280713558197, "rewards/format_reward": 1.0, "step": 4625 }, { "completion_length": 330.5816345214844, "epoch": 0.46550943396226413, "grad_norm": 0.5192602872848511, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6930996179580688, "reward_std": 0.18016968667507172, "rewards/accuracy_reward": 0.7135078310966492, "rewards/format_reward": 0.9795918166637421, "step": 4626 }, { "completion_length": 222.8877410888672, "epoch": 0.46561006289308177, "grad_norm": 0.8121945261955261, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8072664737701416, "reward_std": 0.10707519203424454, "rewards/accuracy_reward": 0.8072665333747864, "rewards/format_reward": 1.0, "step": 4627 }, { "completion_length": 135.01020431518555, "epoch": 0.46571069182389935, "grad_norm": 1.4858566522598267, "kl": 0.119873046875, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.8395690321922302, "reward_std": 0.15321870520710945, "rewards/accuracy_reward": 0.8395691514015198, "rewards/format_reward": 1.0, "step": 4628 }, { "completion_length": 290.32652282714844, "epoch": 0.465811320754717, "grad_norm": 0.589249849319458, "kl": 0.066162109375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8513119220733643, "reward_std": 0.16248979791998863, "rewards/accuracy_reward": 0.8717201054096222, "rewards/format_reward": 0.9795918166637421, "step": 4629 }, { "completion_length": 291.10203552246094, "epoch": 0.46591194968553457, "grad_norm": 1.6342287063598633, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.479164719581604, "reward_std": 0.2084781751036644, "rewards/accuracy_reward": 0.49957285821437836, "rewards/format_reward": 0.9795918464660645, "step": 4630 }, { "completion_length": 215.4183578491211, "epoch": 0.4660125786163522, "grad_norm": 0.5274726152420044, "kl": 0.13037109375, "learning_rate": 1e-06, "loss": 0.0052, "reward": 1.803079903125763, "reward_std": 0.09044812619686127, "rewards/accuracy_reward": 0.8030799627304077, "rewards/format_reward": 1.0, "step": 4631 }, { "completion_length": 298.27549743652344, "epoch": 0.46611320754716984, "grad_norm": 0.6595010161399841, "kl": 0.0576171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.768303096294403, "reward_std": 0.18569556809961796, "rewards/accuracy_reward": 0.778507262468338, "rewards/format_reward": 0.9897959232330322, "step": 4632 }, { "completion_length": 204.87754821777344, "epoch": 0.4662138364779874, "grad_norm": 0.450399786233902, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8148688077926636, "reward_std": 0.08370297960937023, "rewards/accuracy_reward": 0.825072854757309, "rewards/format_reward": 0.9897959232330322, "step": 4633 }, { "completion_length": 282.79590606689453, "epoch": 0.46631446540880506, "grad_norm": 0.666420578956604, "kl": 0.0760498046875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6990929245948792, "reward_std": 0.13469130545854568, "rewards/accuracy_reward": 0.6990929841995239, "rewards/format_reward": 1.0, "step": 4634 }, { "completion_length": 251.8877410888672, "epoch": 0.46641509433962264, "grad_norm": 0.5698531866073608, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.725473165512085, "reward_std": 0.17730475962162018, "rewards/accuracy_reward": 0.7458813786506653, "rewards/format_reward": 0.9795918166637421, "step": 4635 }, { "completion_length": 277.69386291503906, "epoch": 0.4665157232704403, "grad_norm": 0.7224482893943787, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6052295565605164, "reward_std": 0.10224250704050064, "rewards/accuracy_reward": 0.6052295714616776, "rewards/format_reward": 1.0, "step": 4636 }, { "completion_length": 196.32653045654297, "epoch": 0.46661635220125786, "grad_norm": 0.8074936866760254, "kl": 0.103515625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7124205827713013, "reward_std": 0.08602590532973409, "rewards/accuracy_reward": 0.7124205231666565, "rewards/format_reward": 1.0, "step": 4637 }, { "completion_length": 216.33673095703125, "epoch": 0.4667169811320755, "grad_norm": 0.9975447654724121, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7800424098968506, "reward_std": 0.12060463055968285, "rewards/accuracy_reward": 0.780042439699173, "rewards/format_reward": 1.0, "step": 4638 }, { "completion_length": 254.2040786743164, "epoch": 0.4668176100628931, "grad_norm": 0.7255004048347473, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8112663626670837, "reward_std": 0.09842182323336601, "rewards/accuracy_reward": 0.8214704990386963, "rewards/format_reward": 0.9897959232330322, "step": 4639 }, { "completion_length": 290.8877410888672, "epoch": 0.4669182389937107, "grad_norm": 0.37833094596862793, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7496871948242188, "reward_std": 0.10007587820291519, "rewards/accuracy_reward": 0.7598913013935089, "rewards/format_reward": 0.9897959232330322, "step": 4640 }, { "completion_length": 228.96939086914062, "epoch": 0.4670188679245283, "grad_norm": 0.8175063133239746, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7437483072280884, "reward_std": 0.03587903827428818, "rewards/accuracy_reward": 0.7437483072280884, "rewards/format_reward": 1.0, "step": 4641 }, { "completion_length": 351.8775329589844, "epoch": 0.46711949685534593, "grad_norm": 0.7352232336997986, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.5971439480781555, "reward_std": 0.25924350321292877, "rewards/accuracy_reward": 0.6073480844497681, "rewards/format_reward": 0.9897959232330322, "step": 4642 }, { "completion_length": 190.15306091308594, "epoch": 0.4672201257861635, "grad_norm": 1.210227370262146, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.751875102519989, "reward_std": 0.11370741948485374, "rewards/accuracy_reward": 0.762079119682312, "rewards/format_reward": 0.9897959232330322, "step": 4643 }, { "completion_length": 279.53060150146484, "epoch": 0.46732075471698115, "grad_norm": 0.6475959420204163, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.699522852897644, "reward_std": 0.10954604670405388, "rewards/accuracy_reward": 0.699522852897644, "rewards/format_reward": 1.0, "step": 4644 }, { "completion_length": 294.5306091308594, "epoch": 0.4674213836477987, "grad_norm": 1.1327866315841675, "kl": 0.0675048828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8214285373687744, "reward_std": 0.10113359801471233, "rewards/accuracy_reward": 0.8316326439380646, "rewards/format_reward": 0.9897959232330322, "step": 4645 }, { "completion_length": 209.05101776123047, "epoch": 0.46752201257861636, "grad_norm": 0.5573446154594421, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.877551019191742, "reward_std": 0.10641694068908691, "rewards/accuracy_reward": 0.8979591727256775, "rewards/format_reward": 0.9795918464660645, "step": 4646 }, { "completion_length": 286.2550964355469, "epoch": 0.46762264150943395, "grad_norm": 0.9194929003715515, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7177483439445496, "reward_std": 0.1999962106347084, "rewards/accuracy_reward": 0.7381564080715179, "rewards/format_reward": 0.9795918464660645, "step": 4647 }, { "completion_length": 282.4285583496094, "epoch": 0.4677232704402516, "grad_norm": 0.567040741443634, "kl": 0.09765625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.823123037815094, "reward_std": 0.15362491831183434, "rewards/accuracy_reward": 0.8435312807559967, "rewards/format_reward": 0.9795918166637421, "step": 4648 }, { "completion_length": 253.79591369628906, "epoch": 0.46782389937106916, "grad_norm": 0.8312582969665527, "kl": 0.1065673828125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.747120976448059, "reward_std": 0.169883631169796, "rewards/accuracy_reward": 0.7573250234127045, "rewards/format_reward": 0.9897959232330322, "step": 4649 }, { "completion_length": 216.09183502197266, "epoch": 0.4679245283018868, "grad_norm": 0.5980643033981323, "kl": 0.12451171875, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.686028242111206, "reward_std": 0.10960783809423447, "rewards/accuracy_reward": 0.6860282570123672, "rewards/format_reward": 1.0, "step": 4650 }, { "completion_length": 172.96939086914062, "epoch": 0.4680251572327044, "grad_norm": 0.9126737713813782, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7491415739059448, "reward_std": 0.10997525602579117, "rewards/accuracy_reward": 0.7491414844989777, "rewards/format_reward": 1.0, "step": 4651 }, { "completion_length": 236.22447967529297, "epoch": 0.468125786163522, "grad_norm": 0.7037020325660706, "kl": 0.109375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.745183289051056, "reward_std": 0.1784319281578064, "rewards/accuracy_reward": 0.7553874552249908, "rewards/format_reward": 0.9897959232330322, "step": 4652 }, { "completion_length": 193.06121826171875, "epoch": 0.4682264150943396, "grad_norm": 0.812079668045044, "kl": 0.099853515625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6713139414787292, "reward_std": 0.19924592226743698, "rewards/accuracy_reward": 0.681518018245697, "rewards/format_reward": 0.9897959232330322, "step": 4653 }, { "completion_length": 261.6020278930664, "epoch": 0.46832704402515724, "grad_norm": 0.5611026883125305, "kl": 0.052978515625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8137044310569763, "reward_std": 0.16023527830839157, "rewards/accuracy_reward": 0.8239085674285889, "rewards/format_reward": 0.9897959232330322, "step": 4654 }, { "completion_length": 264.9693832397461, "epoch": 0.4684276729559748, "grad_norm": 0.6232698559761047, "kl": 0.093017578125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7343294620513916, "reward_std": 0.21617724001407623, "rewards/accuracy_reward": 0.7547376155853271, "rewards/format_reward": 0.9795918464660645, "step": 4655 }, { "completion_length": 220.7551040649414, "epoch": 0.46852830188679245, "grad_norm": 0.5261341333389282, "kl": 0.114990234375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.8040435314178467, "reward_std": 0.07887400686740875, "rewards/accuracy_reward": 0.8142476677894592, "rewards/format_reward": 0.9897959232330322, "step": 4656 }, { "completion_length": 291.3877410888672, "epoch": 0.4686289308176101, "grad_norm": 0.8989919424057007, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7145852446556091, "reward_std": 0.21655013412237167, "rewards/accuracy_reward": 0.7145852148532867, "rewards/format_reward": 1.0, "step": 4657 }, { "completion_length": 257.9285659790039, "epoch": 0.46872955974842767, "grad_norm": 0.973126232624054, "kl": 0.107666015625, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.5954084396362305, "reward_std": 0.2577657587826252, "rewards/accuracy_reward": 0.6362246870994568, "rewards/format_reward": 0.9591836333274841, "step": 4658 }, { "completion_length": 238.81631469726562, "epoch": 0.4688301886792453, "grad_norm": 0.717144250869751, "kl": 0.09033203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.89549720287323, "reward_std": 0.04828980192542076, "rewards/accuracy_reward": 0.9057013094425201, "rewards/format_reward": 0.9897959232330322, "step": 4659 }, { "completion_length": 243.0102081298828, "epoch": 0.4689308176100629, "grad_norm": 0.8697975873947144, "kl": 0.09375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7115336656570435, "reward_std": 0.22185814380645752, "rewards/accuracy_reward": 0.7319419085979462, "rewards/format_reward": 0.9795918166637421, "step": 4660 }, { "completion_length": 283.39795684814453, "epoch": 0.4690314465408805, "grad_norm": 0.6116425395011902, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7971330881118774, "reward_std": 0.2109610103070736, "rewards/accuracy_reward": 0.8073371350765228, "rewards/format_reward": 0.9897959232330322, "step": 4661 }, { "completion_length": 207.14285278320312, "epoch": 0.4691320754716981, "grad_norm": 0.48456239700317383, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8582199811935425, "reward_std": 0.06964767351746559, "rewards/accuracy_reward": 0.8786280751228333, "rewards/format_reward": 0.9795918464660645, "step": 4662 }, { "completion_length": 271.42857360839844, "epoch": 0.46923270440251574, "grad_norm": 0.5794146060943604, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7238680720329285, "reward_std": 0.2426633983850479, "rewards/accuracy_reward": 0.7646845579147339, "rewards/format_reward": 0.9591836631298065, "step": 4663 }, { "completion_length": 180.55101776123047, "epoch": 0.4693333333333333, "grad_norm": 0.5250838398933411, "kl": 0.105224609375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7815029621124268, "reward_std": 0.06811601296067238, "rewards/accuracy_reward": 0.7815030813217163, "rewards/format_reward": 1.0, "step": 4664 }, { "completion_length": 244.55101776123047, "epoch": 0.46943396226415096, "grad_norm": 1.7433573007583618, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6926406621932983, "reward_std": 0.23175477236509323, "rewards/accuracy_reward": 0.7334569990634918, "rewards/format_reward": 0.9591836333274841, "step": 4665 }, { "completion_length": 244.55101776123047, "epoch": 0.46953459119496854, "grad_norm": 0.5806277394294739, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7327780723571777, "reward_std": 0.12836530059576035, "rewards/accuracy_reward": 0.7429821789264679, "rewards/format_reward": 0.9897959232330322, "step": 4666 }, { "completion_length": 287.60203552246094, "epoch": 0.4696352201257862, "grad_norm": 1.1829051971435547, "kl": 0.098876953125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6049619913101196, "reward_std": 0.11593261733651161, "rewards/accuracy_reward": 0.6049620807170868, "rewards/format_reward": 1.0, "step": 4667 }, { "completion_length": 256.79591369628906, "epoch": 0.46973584905660376, "grad_norm": 1.041872501373291, "kl": 0.11767578125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.6592273712158203, "reward_std": 0.3393756300210953, "rewards/accuracy_reward": 0.689839631319046, "rewards/format_reward": 0.9693877398967743, "step": 4668 }, { "completion_length": 204.45917510986328, "epoch": 0.4698364779874214, "grad_norm": 1.2283209562301636, "kl": 0.1015625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8100906610488892, "reward_std": 0.2237769067287445, "rewards/accuracy_reward": 0.8100906908512115, "rewards/format_reward": 1.0, "step": 4669 }, { "completion_length": 240.89794921875, "epoch": 0.469937106918239, "grad_norm": 4.028901100158691, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7064494490623474, "reward_std": 0.20818976312875748, "rewards/accuracy_reward": 0.7064494490623474, "rewards/format_reward": 1.0, "step": 4670 }, { "completion_length": 228.4795913696289, "epoch": 0.4700377358490566, "grad_norm": 0.8694692850112915, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7281782031059265, "reward_std": 0.19044458866119385, "rewards/accuracy_reward": 0.7281782925128937, "rewards/format_reward": 1.0, "step": 4671 }, { "completion_length": 159.32653045654297, "epoch": 0.4701383647798742, "grad_norm": 0.6526732444763184, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7755101323127747, "reward_std": 0.08317594975233078, "rewards/accuracy_reward": 0.7857142686843872, "rewards/format_reward": 0.9897959232330322, "step": 4672 }, { "completion_length": 274.07142639160156, "epoch": 0.47023899371069183, "grad_norm": 0.6871602535247803, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.691610872745514, "reward_std": 0.14497216790914536, "rewards/accuracy_reward": 0.7018148899078369, "rewards/format_reward": 0.9897959232330322, "step": 4673 }, { "completion_length": 266.10203552246094, "epoch": 0.4703396226415094, "grad_norm": 5.228780269622803, "kl": 0.089111328125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.795918345451355, "reward_std": 0.27464015781879425, "rewards/accuracy_reward": 0.8061224222183228, "rewards/format_reward": 0.9897959232330322, "step": 4674 }, { "completion_length": 280.29591369628906, "epoch": 0.47044025157232705, "grad_norm": 0.8100632429122925, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6878284811973572, "reward_std": 0.17370430752635002, "rewards/accuracy_reward": 0.7184407114982605, "rewards/format_reward": 0.9693877398967743, "step": 4675 }, { "completion_length": 228.51019287109375, "epoch": 0.47054088050314463, "grad_norm": 0.6927164793014526, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.878902792930603, "reward_std": 0.0955718345940113, "rewards/accuracy_reward": 0.878902792930603, "rewards/format_reward": 1.0, "step": 4676 }, { "completion_length": 235.78571319580078, "epoch": 0.47064150943396227, "grad_norm": 1.0479031801223755, "kl": 0.104248046875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.6773219108581543, "reward_std": 0.13698602840304375, "rewards/accuracy_reward": 0.6875260770320892, "rewards/format_reward": 0.9897959232330322, "step": 4677 }, { "completion_length": 195.14285278320312, "epoch": 0.47074213836477985, "grad_norm": 0.523550271987915, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.877551019191742, "reward_std": 0.13498730212450027, "rewards/accuracy_reward": 0.8979591727256775, "rewards/format_reward": 0.9795918464660645, "step": 4678 }, { "completion_length": 233.31632232666016, "epoch": 0.4708427672955975, "grad_norm": 0.7041056752204895, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.5170801877975464, "reward_std": 0.1395934298634529, "rewards/accuracy_reward": 0.5272842347621918, "rewards/format_reward": 0.9897959232330322, "step": 4679 }, { "completion_length": 219.72447967529297, "epoch": 0.47094339622641507, "grad_norm": 0.3902057111263275, "kl": 0.0966796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.771622896194458, "reward_std": 0.049974849447607994, "rewards/accuracy_reward": 0.7716229259967804, "rewards/format_reward": 1.0, "step": 4680 }, { "completion_length": 290.2142868041992, "epoch": 0.4710440251572327, "grad_norm": 0.7918054461479187, "kl": 0.0933837890625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7447124123573303, "reward_std": 0.1782313473522663, "rewards/accuracy_reward": 0.7549165189266205, "rewards/format_reward": 0.9897959232330322, "step": 4681 }, { "completion_length": 238.48979949951172, "epoch": 0.47114465408805034, "grad_norm": 0.8626653552055359, "kl": 0.0823974609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.722643256187439, "reward_std": 0.19539161026477814, "rewards/accuracy_reward": 0.7430515289306641, "rewards/format_reward": 0.9795918464660645, "step": 4682 }, { "completion_length": 200.56121826171875, "epoch": 0.4712452830188679, "grad_norm": 0.3314509689807892, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.9042385816574097, "reward_std": 0.055379413068294525, "rewards/accuracy_reward": 0.9042386114597321, "rewards/format_reward": 1.0, "step": 4683 }, { "completion_length": 318.4897918701172, "epoch": 0.47134591194968556, "grad_norm": 1.6335077285766602, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8061224222183228, "reward_std": 0.23824258148670197, "rewards/accuracy_reward": 0.8163264989852905, "rewards/format_reward": 0.9897959232330322, "step": 4684 }, { "completion_length": 247.60204315185547, "epoch": 0.47144654088050314, "grad_norm": 2.243847131729126, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8538265228271484, "reward_std": 0.09724102914333344, "rewards/accuracy_reward": 0.8640305995941162, "rewards/format_reward": 0.9897959232330322, "step": 4685 }, { "completion_length": 269.6122360229492, "epoch": 0.4715471698113208, "grad_norm": 1.0617746114730835, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8035835027694702, "reward_std": 0.2772316411137581, "rewards/accuracy_reward": 0.8239917159080505, "rewards/format_reward": 0.9795918166637421, "step": 4686 }, { "completion_length": 269.02040100097656, "epoch": 0.47164779874213836, "grad_norm": 0.7137766480445862, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6713868975639343, "reward_std": 0.1788799874484539, "rewards/accuracy_reward": 0.6815910041332245, "rewards/format_reward": 0.9897959232330322, "step": 4687 }, { "completion_length": 179.9285659790039, "epoch": 0.471748427672956, "grad_norm": 0.45802363753318787, "kl": 0.09814453125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8775509595870972, "reward_std": 0.06517763808369637, "rewards/accuracy_reward": 0.8877550959587097, "rewards/format_reward": 0.9897959232330322, "step": 4688 }, { "completion_length": 260.7959213256836, "epoch": 0.4718490566037736, "grad_norm": 0.7414945960044861, "kl": 0.0709228515625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.802721083164215, "reward_std": 0.14502697438001633, "rewards/accuracy_reward": 0.8027210533618927, "rewards/format_reward": 1.0, "step": 4689 }, { "completion_length": 286.3367156982422, "epoch": 0.4719496855345912, "grad_norm": 0.8253647685050964, "kl": 0.114501953125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.608474850654602, "reward_std": 0.1865454763174057, "rewards/accuracy_reward": 0.6390870809555054, "rewards/format_reward": 0.9693877398967743, "step": 4690 }, { "completion_length": 262.0918273925781, "epoch": 0.4720503144654088, "grad_norm": 0.5222426056861877, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6773083806037903, "reward_std": 0.13344185426831245, "rewards/accuracy_reward": 0.6875124871730804, "rewards/format_reward": 0.9897959232330322, "step": 4691 }, { "completion_length": 181.37754821777344, "epoch": 0.47215094339622643, "grad_norm": 1.5201547145843506, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7448979020118713, "reward_std": 0.1079898476600647, "rewards/accuracy_reward": 0.7653061151504517, "rewards/format_reward": 0.9795918166637421, "step": 4692 }, { "completion_length": 271.8061218261719, "epoch": 0.472251572327044, "grad_norm": 0.984809935092926, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.650301992893219, "reward_std": 0.12569303438067436, "rewards/accuracy_reward": 0.6605061590671539, "rewards/format_reward": 0.9897959232330322, "step": 4693 }, { "completion_length": 256.6224365234375, "epoch": 0.47235220125786165, "grad_norm": 0.6673398017883301, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6402795314788818, "reward_std": 0.17601139098405838, "rewards/accuracy_reward": 0.6606877148151398, "rewards/format_reward": 0.9795918166637421, "step": 4694 }, { "completion_length": 194.9387664794922, "epoch": 0.47245283018867923, "grad_norm": 0.45961862802505493, "kl": 0.09814453125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8211450576782227, "reward_std": 0.08531290292739868, "rewards/accuracy_reward": 0.821145087480545, "rewards/format_reward": 1.0, "step": 4695 }, { "completion_length": 239.25508880615234, "epoch": 0.47255345911949687, "grad_norm": 0.38027265667915344, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7600906491279602, "reward_std": 0.14944690093398094, "rewards/accuracy_reward": 0.7907029092311859, "rewards/format_reward": 0.9693877398967743, "step": 4696 }, { "completion_length": 200.83673095703125, "epoch": 0.47265408805031445, "grad_norm": 0.6435437798500061, "kl": 0.111083984375, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.8152443170547485, "reward_std": 0.11798936687409878, "rewards/accuracy_reward": 0.8152442574501038, "rewards/format_reward": 1.0, "step": 4697 }, { "completion_length": 241.846923828125, "epoch": 0.4727547169811321, "grad_norm": 1.071803092956543, "kl": 0.05810546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7196709513664246, "reward_std": 0.19240384548902512, "rewards/accuracy_reward": 0.7298751175403595, "rewards/format_reward": 0.9897959232330322, "step": 4698 }, { "completion_length": 277.1122360229492, "epoch": 0.47285534591194966, "grad_norm": 0.7576318383216858, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.625886619091034, "reward_std": 0.16991058737039566, "rewards/accuracy_reward": 0.6360906958580017, "rewards/format_reward": 0.9897959232330322, "step": 4699 }, { "completion_length": 236.1020278930664, "epoch": 0.4729559748427673, "grad_norm": 0.5834524035453796, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6103325486183167, "reward_std": 0.1862589716911316, "rewards/accuracy_reward": 0.6307406723499298, "rewards/format_reward": 0.9795918464660645, "step": 4700 }, { "completion_length": 304.846923828125, "epoch": 0.4730566037735849, "grad_norm": 0.6205686926841736, "kl": 0.0556640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7856736779212952, "reward_std": 0.17517922073602676, "rewards/accuracy_reward": 0.7856737673282623, "rewards/format_reward": 1.0, "step": 4701 }, { "completion_length": 203.64285278320312, "epoch": 0.4731572327044025, "grad_norm": 0.3998032808303833, "kl": 0.104736328125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.9012934565544128, "reward_std": 0.044911742210388184, "rewards/accuracy_reward": 0.90129354596138, "rewards/format_reward": 1.0, "step": 4702 }, { "completion_length": 154.62244415283203, "epoch": 0.4732578616352201, "grad_norm": 1.3474059104919434, "kl": 0.101806640625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8978421688079834, "reward_std": 0.10967757180333138, "rewards/accuracy_reward": 0.9080462753772736, "rewards/format_reward": 0.9897959232330322, "step": 4703 }, { "completion_length": 210.4897918701172, "epoch": 0.47335849056603774, "grad_norm": 0.4460242986679077, "kl": 0.1044921875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8577530980110168, "reward_std": 0.07831869274377823, "rewards/accuracy_reward": 0.8577531278133392, "rewards/format_reward": 1.0, "step": 4704 }, { "completion_length": 259.62244415283203, "epoch": 0.4734591194968553, "grad_norm": 0.8135376572608948, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8299319744110107, "reward_std": 0.07676002383232117, "rewards/accuracy_reward": 0.8401360809803009, "rewards/format_reward": 0.9897959232330322, "step": 4705 }, { "completion_length": 238.75509643554688, "epoch": 0.47355974842767296, "grad_norm": 0.258592814207077, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.861289918422699, "reward_std": 0.06116168946027756, "rewards/accuracy_reward": 0.8714940547943115, "rewards/format_reward": 0.9897959232330322, "step": 4706 }, { "completion_length": 207.9081573486328, "epoch": 0.4736603773584906, "grad_norm": 0.5322889685630798, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7414966225624084, "reward_std": 0.08884849399328232, "rewards/accuracy_reward": 0.7414965927600861, "rewards/format_reward": 1.0, "step": 4707 }, { "completion_length": 289.12245178222656, "epoch": 0.4737610062893082, "grad_norm": 0.3924558162689209, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8564801216125488, "reward_std": 0.10406021028757095, "rewards/accuracy_reward": 0.856480211019516, "rewards/format_reward": 1.0, "step": 4708 }, { "completion_length": 255.2448959350586, "epoch": 0.4738616352201258, "grad_norm": 0.8013818264007568, "kl": 0.0689697265625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7091071009635925, "reward_std": 0.13593409210443497, "rewards/accuracy_reward": 0.7193112373352051, "rewards/format_reward": 0.9897959232330322, "step": 4709 }, { "completion_length": 257.9591827392578, "epoch": 0.4739622641509434, "grad_norm": 0.4305962026119232, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7040815949440002, "reward_std": 0.17316748201847076, "rewards/accuracy_reward": 0.7346938848495483, "rewards/format_reward": 0.9693877398967743, "step": 4710 }, { "completion_length": 223.11224365234375, "epoch": 0.474062893081761, "grad_norm": 1.0655150413513184, "kl": 0.097900390625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6952258944511414, "reward_std": 0.20378027856349945, "rewards/accuracy_reward": 0.7258382141590118, "rewards/format_reward": 0.9693877398967743, "step": 4711 }, { "completion_length": 248.06121826171875, "epoch": 0.4741635220125786, "grad_norm": 0.9791117906570435, "kl": 0.097412109375, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6577743291854858, "reward_std": 0.11724457703530788, "rewards/accuracy_reward": 0.6679784804582596, "rewards/format_reward": 0.9897959232330322, "step": 4712 }, { "completion_length": 136.4693832397461, "epoch": 0.47426415094339625, "grad_norm": 0.5718299150466919, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.969387710094452, "reward_std": 0.08099238574504852, "rewards/accuracy_reward": 0.9795918166637421, "rewards/format_reward": 0.9897959232330322, "step": 4713 }, { "completion_length": 300.32652282714844, "epoch": 0.4743647798742138, "grad_norm": 0.4321286976337433, "kl": 0.0677490234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.845679759979248, "reward_std": 0.1103995218873024, "rewards/accuracy_reward": 0.8558838665485382, "rewards/format_reward": 0.9897959232330322, "step": 4714 }, { "completion_length": 217.69387817382812, "epoch": 0.47446540880503146, "grad_norm": 0.49282580614089966, "kl": 0.0843505859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.788925290107727, "reward_std": 0.05311031639575958, "rewards/accuracy_reward": 0.7889253497123718, "rewards/format_reward": 1.0, "step": 4715 }, { "completion_length": 213.11224365234375, "epoch": 0.47456603773584904, "grad_norm": 0.8969278931617737, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7455884218215942, "reward_std": 0.18182969093322754, "rewards/accuracy_reward": 0.7557925581932068, "rewards/format_reward": 0.9897959232330322, "step": 4716 }, { "completion_length": 186.61224365234375, "epoch": 0.4746666666666667, "grad_norm": 0.3886275291442871, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7961074709892273, "reward_std": 0.06533818878233433, "rewards/accuracy_reward": 0.7961075603961945, "rewards/format_reward": 1.0, "step": 4717 }, { "completion_length": 171.23468780517578, "epoch": 0.47476729559748426, "grad_norm": 1.3578779697418213, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6707221269607544, "reward_std": 0.14766480773687363, "rewards/accuracy_reward": 0.6809261739253998, "rewards/format_reward": 0.9897959232330322, "step": 4718 }, { "completion_length": 277.75508880615234, "epoch": 0.4748679245283019, "grad_norm": 1.6351943016052246, "kl": 0.103515625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6845738887786865, "reward_std": 0.14874187856912613, "rewards/accuracy_reward": 0.6947780251502991, "rewards/format_reward": 0.9897959232330322, "step": 4719 }, { "completion_length": 200.6326446533203, "epoch": 0.4749685534591195, "grad_norm": 0.8590339422225952, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8718616962432861, "reward_std": 0.10034157056361437, "rewards/accuracy_reward": 0.8820658624172211, "rewards/format_reward": 0.9897959232330322, "step": 4720 }, { "completion_length": 239.18366241455078, "epoch": 0.4750691823899371, "grad_norm": 0.49793919920921326, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6546310186386108, "reward_std": 0.11501102149486542, "rewards/accuracy_reward": 0.6648351550102234, "rewards/format_reward": 0.9897959232330322, "step": 4721 }, { "completion_length": 209.79591369628906, "epoch": 0.4751698113207547, "grad_norm": 0.8789170384407043, "kl": 0.08740234375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8249202966690063, "reward_std": 0.16448501497507095, "rewards/accuracy_reward": 0.8351243436336517, "rewards/format_reward": 0.9897959232330322, "step": 4722 }, { "completion_length": 258.1326446533203, "epoch": 0.47527044025157233, "grad_norm": 0.4877343773841858, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8992630243301392, "reward_std": 0.12026579678058624, "rewards/accuracy_reward": 0.9094671010971069, "rewards/format_reward": 0.9897959232330322, "step": 4723 }, { "completion_length": 205.7142791748047, "epoch": 0.4753710691823899, "grad_norm": 0.8099907040596008, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6953352093696594, "reward_std": 0.1226097010076046, "rewards/accuracy_reward": 0.6953352689743042, "rewards/format_reward": 1.0, "step": 4724 }, { "completion_length": 181.6326446533203, "epoch": 0.47547169811320755, "grad_norm": 0.41587692499160767, "kl": 0.12353515625, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.9081632494926453, "reward_std": 0.12611785531044006, "rewards/accuracy_reward": 0.9591836631298065, "rewards/format_reward": 0.9489795863628387, "step": 4725 }, { "completion_length": 270.3163299560547, "epoch": 0.47557232704402513, "grad_norm": 0.6320234537124634, "kl": 0.0667724609375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7010689973831177, "reward_std": 0.15790897607803345, "rewards/accuracy_reward": 0.7010689973831177, "rewards/format_reward": 1.0, "step": 4726 }, { "completion_length": 244.31632232666016, "epoch": 0.47567295597484277, "grad_norm": 0.9975257515907288, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7523809671401978, "reward_std": 0.15216399729251862, "rewards/accuracy_reward": 0.7727891206741333, "rewards/format_reward": 0.9795918464660645, "step": 4727 }, { "completion_length": 291.05101013183594, "epoch": 0.47577358490566035, "grad_norm": 1.0153709650039673, "kl": 0.1044921875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.6473923325538635, "reward_std": 0.216109499335289, "rewards/accuracy_reward": 0.6678004264831543, "rewards/format_reward": 0.9795918464660645, "step": 4728 }, { "completion_length": 242.84693145751953, "epoch": 0.475874213836478, "grad_norm": 0.7242733240127563, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7823129892349243, "reward_std": 0.11039818823337555, "rewards/accuracy_reward": 0.7823129296302795, "rewards/format_reward": 1.0, "step": 4729 }, { "completion_length": 191.36734008789062, "epoch": 0.4759748427672956, "grad_norm": 1.2623602151870728, "kl": 0.125244140625, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.6650383472442627, "reward_std": 0.17784736678004265, "rewards/accuracy_reward": 0.6854465305805206, "rewards/format_reward": 0.9795918464660645, "step": 4730 }, { "completion_length": 249.46939086914062, "epoch": 0.4760754716981132, "grad_norm": 0.8051787614822388, "kl": 0.0770263671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.785708725452423, "reward_std": 0.2599799111485481, "rewards/accuracy_reward": 0.7959127724170685, "rewards/format_reward": 0.9897959232330322, "step": 4731 }, { "completion_length": 282.8061218261719, "epoch": 0.47617610062893084, "grad_norm": 0.48133933544158936, "kl": 0.0784912109375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8244279026985168, "reward_std": 0.18845159932971, "rewards/accuracy_reward": 0.834632009267807, "rewards/format_reward": 0.9897959232330322, "step": 4732 }, { "completion_length": 212.2653045654297, "epoch": 0.4762767295597484, "grad_norm": 0.5941667556762695, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8656299710273743, "reward_std": 0.13083817064762115, "rewards/accuracy_reward": 0.8758341372013092, "rewards/format_reward": 0.9897959232330322, "step": 4733 }, { "completion_length": 207.80612182617188, "epoch": 0.47637735849056606, "grad_norm": 0.73969566822052, "kl": 0.116943359375, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.8521193265914917, "reward_std": 0.1122869960963726, "rewards/accuracy_reward": 0.8623233735561371, "rewards/format_reward": 0.9897959232330322, "step": 4734 }, { "completion_length": 254.2244873046875, "epoch": 0.47647798742138364, "grad_norm": 0.9115119576454163, "kl": 0.09716796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6802411079406738, "reward_std": 0.1616475023329258, "rewards/accuracy_reward": 0.6904452443122864, "rewards/format_reward": 0.9897959232330322, "step": 4735 }, { "completion_length": 254.11224365234375, "epoch": 0.4765786163522013, "grad_norm": 0.8370291590690613, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8435352444648743, "reward_std": 0.14088009297847748, "rewards/accuracy_reward": 0.8435352146625519, "rewards/format_reward": 1.0, "step": 4736 }, { "completion_length": 325.6326446533203, "epoch": 0.47667924528301886, "grad_norm": 0.5215827226638794, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6479302644729614, "reward_std": 0.22083104401826859, "rewards/accuracy_reward": 0.6785425245761871, "rewards/format_reward": 0.9693877398967743, "step": 4737 }, { "completion_length": 318.22447204589844, "epoch": 0.4767798742138365, "grad_norm": 0.6056405305862427, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7232011556625366, "reward_std": 0.19953469932079315, "rewards/accuracy_reward": 0.7232011556625366, "rewards/format_reward": 1.0, "step": 4738 }, { "completion_length": 220.53060913085938, "epoch": 0.4768805031446541, "grad_norm": 0.6763073801994324, "kl": 0.0909423828125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8463820815086365, "reward_std": 0.11880111321806908, "rewards/accuracy_reward": 0.8565862476825714, "rewards/format_reward": 0.9897959232330322, "step": 4739 }, { "completion_length": 226.2653045654297, "epoch": 0.4769811320754717, "grad_norm": 1.2281180620193481, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7543049454689026, "reward_std": 0.15079449862241745, "rewards/accuracy_reward": 0.764508992433548, "rewards/format_reward": 0.9897959232330322, "step": 4740 }, { "completion_length": 234.5, "epoch": 0.4770817610062893, "grad_norm": 0.3002874255180359, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8489663004875183, "reward_std": 0.045009125024080276, "rewards/accuracy_reward": 0.8591704070568085, "rewards/format_reward": 0.9897959232330322, "step": 4741 }, { "completion_length": 235.02040100097656, "epoch": 0.47718238993710693, "grad_norm": 0.49112075567245483, "kl": 0.116943359375, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.8764090538024902, "reward_std": 0.23109740763902664, "rewards/accuracy_reward": 0.9172253608703613, "rewards/format_reward": 0.9591836631298065, "step": 4742 }, { "completion_length": 194.16326141357422, "epoch": 0.4772830188679245, "grad_norm": 0.6693532466888428, "kl": 0.1298828125, "learning_rate": 1e-06, "loss": 0.0052, "reward": 1.732962191104889, "reward_std": 0.11419062316417694, "rewards/accuracy_reward": 0.7431663274765015, "rewards/format_reward": 0.9897959232330322, "step": 4743 }, { "completion_length": 238.75508880615234, "epoch": 0.47738364779874215, "grad_norm": 1.0635186433792114, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7442640662193298, "reward_std": 0.10612152516841888, "rewards/accuracy_reward": 0.7544681429862976, "rewards/format_reward": 0.9897959232330322, "step": 4744 }, { "completion_length": 213.448974609375, "epoch": 0.47748427672955973, "grad_norm": 0.3202647864818573, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8769557476043701, "reward_std": 0.055018847808241844, "rewards/accuracy_reward": 0.8871598541736603, "rewards/format_reward": 0.9897959232330322, "step": 4745 }, { "completion_length": 250.27550506591797, "epoch": 0.47758490566037737, "grad_norm": 3.217059373855591, "kl": 0.21875, "learning_rate": 1e-06, "loss": 0.0088, "reward": 1.7331368327140808, "reward_std": 0.11517747864127159, "rewards/accuracy_reward": 0.7433409690856934, "rewards/format_reward": 0.9897959232330322, "step": 4746 }, { "completion_length": 320.27549743652344, "epoch": 0.47768553459119495, "grad_norm": 0.9615257382392883, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.552380919456482, "reward_std": 0.17999329790472984, "rewards/accuracy_reward": 0.5727890729904175, "rewards/format_reward": 0.9795918166637421, "step": 4747 }, { "completion_length": 252.60203552246094, "epoch": 0.4777861635220126, "grad_norm": 1.22380793094635, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.77787446975708, "reward_std": 0.22989268600940704, "rewards/accuracy_reward": 0.7778745591640472, "rewards/format_reward": 1.0, "step": 4748 }, { "completion_length": 231.89794921875, "epoch": 0.47788679245283017, "grad_norm": 0.8986846208572388, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.738095223903656, "reward_std": 0.13580807484686375, "rewards/accuracy_reward": 0.7482992708683014, "rewards/format_reward": 0.9897959232330322, "step": 4749 }, { "completion_length": 194.61224365234375, "epoch": 0.4779874213836478, "grad_norm": 0.7404953241348267, "kl": 0.091064453125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.80206698179245, "reward_std": 0.11353940889239311, "rewards/accuracy_reward": 0.8020670115947723, "rewards/format_reward": 1.0, "step": 4750 }, { "completion_length": 186.08162689208984, "epoch": 0.4780880503144654, "grad_norm": 1.267076849937439, "kl": 0.104736328125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7680271863937378, "reward_std": 0.19127196818590164, "rewards/accuracy_reward": 0.7680272459983826, "rewards/format_reward": 1.0, "step": 4751 }, { "completion_length": 254.31632232666016, "epoch": 0.478188679245283, "grad_norm": 0.4777323603630066, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.715466022491455, "reward_std": 0.11899874545633793, "rewards/accuracy_reward": 0.7358740568161011, "rewards/format_reward": 0.9795918166637421, "step": 4752 }, { "completion_length": 254.64285278320312, "epoch": 0.4782893081761006, "grad_norm": 0.6471741795539856, "kl": 0.0614013671875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.706997036933899, "reward_std": 0.18308581411838531, "rewards/accuracy_reward": 0.7274052202701569, "rewards/format_reward": 0.9795918166637421, "step": 4753 }, { "completion_length": 313.5408172607422, "epoch": 0.47838993710691824, "grad_norm": 0.7252688407897949, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7114537954330444, "reward_std": 0.1782427877187729, "rewards/accuracy_reward": 0.721657782793045, "rewards/format_reward": 0.9897959232330322, "step": 4754 }, { "completion_length": 135.4897918701172, "epoch": 0.4784905660377359, "grad_norm": 1.3636815547943115, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8453513979911804, "reward_std": 0.06450273562222719, "rewards/accuracy_reward": 0.8453514873981476, "rewards/format_reward": 1.0, "step": 4755 }, { "completion_length": 230.84693908691406, "epoch": 0.47859119496855346, "grad_norm": 0.7266821265220642, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6657191514968872, "reward_std": 0.24750813841819763, "rewards/accuracy_reward": 0.6759231984615326, "rewards/format_reward": 0.9897959232330322, "step": 4756 }, { "completion_length": 199.03060913085938, "epoch": 0.4786918238993711, "grad_norm": 1.3671857118606567, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8413459658622742, "reward_std": 0.2324111983180046, "rewards/accuracy_reward": 0.8617542386054993, "rewards/format_reward": 0.9795918464660645, "step": 4757 }, { "completion_length": 246.53060150146484, "epoch": 0.4787924528301887, "grad_norm": 2.0610790252685547, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6508259773254395, "reward_std": 0.24553199112415314, "rewards/accuracy_reward": 0.6814382672309875, "rewards/format_reward": 0.9693877398967743, "step": 4758 }, { "completion_length": 160.75509643554688, "epoch": 0.4788930817610063, "grad_norm": 0.41606172919273376, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.855725646018982, "reward_std": 0.034870266914367676, "rewards/accuracy_reward": 0.8557256162166595, "rewards/format_reward": 1.0, "step": 4759 }, { "completion_length": 206.23468780517578, "epoch": 0.4789937106918239, "grad_norm": 1.0143178701400757, "kl": 0.05810546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8752834796905518, "reward_std": 0.14918231964111328, "rewards/accuracy_reward": 0.8854875266551971, "rewards/format_reward": 0.9897959232330322, "step": 4760 }, { "completion_length": 284.33673095703125, "epoch": 0.47909433962264153, "grad_norm": 1.4381273984909058, "kl": 0.10205078125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6301470398902893, "reward_std": 0.23952583968639374, "rewards/accuracy_reward": 0.6403512060642242, "rewards/format_reward": 0.9897959232330322, "step": 4761 }, { "completion_length": 208.4693832397461, "epoch": 0.4791949685534591, "grad_norm": 0.7117005586624146, "kl": 0.0721435546875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8634222745895386, "reward_std": 0.18933824449777603, "rewards/accuracy_reward": 0.8736263811588287, "rewards/format_reward": 0.9897959232330322, "step": 4762 }, { "completion_length": 211.2346954345703, "epoch": 0.47929559748427675, "grad_norm": 0.904240608215332, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5849186182022095, "reward_std": 0.25228167325258255, "rewards/accuracy_reward": 0.6155308336019516, "rewards/format_reward": 0.9693877398967743, "step": 4763 }, { "completion_length": 224.80612182617188, "epoch": 0.47939622641509433, "grad_norm": 0.9153409600257874, "kl": 0.102783203125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.5766651034355164, "reward_std": 0.24113906919956207, "rewards/accuracy_reward": 0.6072774231433868, "rewards/format_reward": 0.9693877398967743, "step": 4764 }, { "completion_length": 214.69387817382812, "epoch": 0.47949685534591197, "grad_norm": 0.827627420425415, "kl": 0.09521484375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7043863534927368, "reward_std": 0.18308168277144432, "rewards/accuracy_reward": 0.7145904898643494, "rewards/format_reward": 0.9897959232330322, "step": 4765 }, { "completion_length": 249.99999237060547, "epoch": 0.47959748427672955, "grad_norm": 0.6030886173248291, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8105058073997498, "reward_std": 0.16501452773809433, "rewards/accuracy_reward": 0.8309139609336853, "rewards/format_reward": 0.9795918464660645, "step": 4766 }, { "completion_length": 243.2448959350586, "epoch": 0.4796981132075472, "grad_norm": 0.9048119187355042, "kl": 0.10009765625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7749123573303223, "reward_std": 0.18058770522475243, "rewards/accuracy_reward": 0.7953205108642578, "rewards/format_reward": 0.9795918464660645, "step": 4767 }, { "completion_length": 177.79591369628906, "epoch": 0.47979874213836476, "grad_norm": 1.1867389678955078, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7666805386543274, "reward_std": 0.2062811702489853, "rewards/accuracy_reward": 0.7666806280612946, "rewards/format_reward": 1.0, "step": 4768 }, { "completion_length": 192.19387817382812, "epoch": 0.4798993710691824, "grad_norm": 0.8748278617858887, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.776643991470337, "reward_std": 0.21252990514039993, "rewards/accuracy_reward": 0.7766439616680145, "rewards/format_reward": 1.0, "step": 4769 }, { "completion_length": 210.23468017578125, "epoch": 0.48, "grad_norm": 0.7584051489830017, "kl": 0.09765625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7656055688858032, "reward_std": 0.10086188092827797, "rewards/accuracy_reward": 0.7758096754550934, "rewards/format_reward": 0.9897959232330322, "step": 4770 }, { "completion_length": 168.61224365234375, "epoch": 0.4801006289308176, "grad_norm": 0.7418932914733887, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8250728845596313, "reward_std": 0.13039079308509827, "rewards/accuracy_reward": 0.8352769613265991, "rewards/format_reward": 0.9897959232330322, "step": 4771 }, { "completion_length": 194.02040100097656, "epoch": 0.4802012578616352, "grad_norm": 0.7112899422645569, "kl": 0.089111328125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.841001808643341, "reward_std": 0.11802654340863228, "rewards/accuracy_reward": 0.8512059450149536, "rewards/format_reward": 0.9897959232330322, "step": 4772 }, { "completion_length": 279.78570556640625, "epoch": 0.48030188679245284, "grad_norm": 0.7040631175041199, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.663313865661621, "reward_std": 0.1287742257118225, "rewards/accuracy_reward": 0.6735179871320724, "rewards/format_reward": 0.9897959232330322, "step": 4773 }, { "completion_length": 223.65306091308594, "epoch": 0.4804025157232704, "grad_norm": 0.7427046298980713, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6887754797935486, "reward_std": 0.12845928221940994, "rewards/accuracy_reward": 0.6989795565605164, "rewards/format_reward": 0.9897959232330322, "step": 4774 }, { "completion_length": 176.7653045654297, "epoch": 0.48050314465408805, "grad_norm": 9.131853103637695, "kl": 0.204345703125, "learning_rate": 1e-06, "loss": 0.0082, "reward": 1.8791458010673523, "reward_std": 0.1307203695178032, "rewards/accuracy_reward": 0.8893499970436096, "rewards/format_reward": 0.9897959232330322, "step": 4775 }, { "completion_length": 157.37754821777344, "epoch": 0.48060377358490564, "grad_norm": 0.7845602035522461, "kl": 0.098876953125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8060606122016907, "reward_std": 0.08633994963020086, "rewards/accuracy_reward": 0.8060605227947235, "rewards/format_reward": 1.0, "step": 4776 }, { "completion_length": 174.6734619140625, "epoch": 0.4807044025157233, "grad_norm": 30.438430786132812, "kl": 0.124267578125, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.8006941676139832, "reward_std": 0.08369026705622673, "rewards/accuracy_reward": 0.8006941974163055, "rewards/format_reward": 1.0, "step": 4777 }, { "completion_length": 188.1938705444336, "epoch": 0.48080503144654085, "grad_norm": 0.7499307990074158, "kl": 0.098388671875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8299660086631775, "reward_std": 0.14298345148563385, "rewards/accuracy_reward": 0.8503742218017578, "rewards/format_reward": 0.9795918464660645, "step": 4778 }, { "completion_length": 243.03060913085938, "epoch": 0.4809056603773585, "grad_norm": 0.7497156858444214, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7028183937072754, "reward_std": 0.14271975308656693, "rewards/accuracy_reward": 0.7028183937072754, "rewards/format_reward": 1.0, "step": 4779 }, { "completion_length": 239.66326141357422, "epoch": 0.4810062893081761, "grad_norm": 0.5135916471481323, "kl": 0.1044921875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7222332954406738, "reward_std": 0.13375210016965866, "rewards/accuracy_reward": 0.7528455853462219, "rewards/format_reward": 0.9693877398967743, "step": 4780 }, { "completion_length": 198.14285278320312, "epoch": 0.4811069182389937, "grad_norm": 0.5802666544914246, "kl": 0.0845947265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7571427822113037, "reward_std": 0.19496501982212067, "rewards/accuracy_reward": 0.7775510251522064, "rewards/format_reward": 0.9795918464660645, "step": 4781 }, { "completion_length": 219.4693832397461, "epoch": 0.48120754716981134, "grad_norm": 0.76018887758255, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7167593836784363, "reward_std": 0.17243214696645737, "rewards/accuracy_reward": 0.7167594134807587, "rewards/format_reward": 1.0, "step": 4782 }, { "completion_length": 250.56122589111328, "epoch": 0.4813081761006289, "grad_norm": 0.5379917025566101, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7408957481384277, "reward_std": 0.15229281038045883, "rewards/accuracy_reward": 0.7510998845100403, "rewards/format_reward": 0.9897959232330322, "step": 4783 }, { "completion_length": 131.48979568481445, "epoch": 0.48140880503144656, "grad_norm": 0.5205562114715576, "kl": 0.110595703125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.819436252117157, "reward_std": 0.10223523899912834, "rewards/accuracy_reward": 0.8296404182910919, "rewards/format_reward": 0.9897959232330322, "step": 4784 }, { "completion_length": 178.26529693603516, "epoch": 0.48150943396226414, "grad_norm": 0.3910713493824005, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8527224659919739, "reward_std": 0.1308678202331066, "rewards/accuracy_reward": 0.8629265427589417, "rewards/format_reward": 0.9897959232330322, "step": 4785 }, { "completion_length": 234.53060913085938, "epoch": 0.4816100628930818, "grad_norm": 0.7550440430641174, "kl": 0.115234375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.692551076412201, "reward_std": 0.2046102210879326, "rewards/accuracy_reward": 0.7231632471084595, "rewards/format_reward": 0.9693877398967743, "step": 4786 }, { "completion_length": 246.1734619140625, "epoch": 0.48171069182389936, "grad_norm": 0.33070412278175354, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8653061389923096, "reward_std": 0.12031924724578857, "rewards/accuracy_reward": 0.8959183394908905, "rewards/format_reward": 0.9693877398967743, "step": 4787 }, { "completion_length": 185.38775634765625, "epoch": 0.481811320754717, "grad_norm": 0.8653303980827332, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7993197441101074, "reward_std": 0.16894324868917465, "rewards/accuracy_reward": 0.8095237910747528, "rewards/format_reward": 0.9897959232330322, "step": 4788 }, { "completion_length": 207.40816497802734, "epoch": 0.4819119496855346, "grad_norm": 1.092666506767273, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7653409242630005, "reward_std": 0.14672167971730232, "rewards/accuracy_reward": 0.7653409838676453, "rewards/format_reward": 1.0, "step": 4789 }, { "completion_length": 233.33673095703125, "epoch": 0.4820125786163522, "grad_norm": 0.24234461784362793, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7789115905761719, "reward_std": 0.048419730737805367, "rewards/accuracy_reward": 0.7789115905761719, "rewards/format_reward": 1.0, "step": 4790 }, { "completion_length": 249.27549743652344, "epoch": 0.4821132075471698, "grad_norm": 1.046561360359192, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7338435649871826, "reward_std": 0.26721706986427307, "rewards/accuracy_reward": 0.7950680255889893, "rewards/format_reward": 0.938775509595871, "step": 4791 }, { "completion_length": 156.63265228271484, "epoch": 0.48221383647798743, "grad_norm": 2.530268430709839, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6414835453033447, "reward_std": 0.151047982275486, "rewards/accuracy_reward": 0.6414835304021835, "rewards/format_reward": 1.0, "step": 4792 }, { "completion_length": 221.05101776123047, "epoch": 0.482314465408805, "grad_norm": 1.04180908203125, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7523809671401978, "reward_std": 0.14001121371984482, "rewards/accuracy_reward": 0.7523809373378754, "rewards/format_reward": 1.0, "step": 4793 }, { "completion_length": 146.82652282714844, "epoch": 0.48241509433962265, "grad_norm": 0.24243947863578796, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.9956268072128296, "reward_std": 0.011570342816412449, "rewards/accuracy_reward": 0.9956267774105072, "rewards/format_reward": 1.0, "step": 4794 }, { "completion_length": 228.14285278320312, "epoch": 0.48251572327044023, "grad_norm": 0.48352715373039246, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8064793944358826, "reward_std": 0.04696762003004551, "rewards/accuracy_reward": 0.8064794540405273, "rewards/format_reward": 1.0, "step": 4795 }, { "completion_length": 245.3571319580078, "epoch": 0.48261635220125787, "grad_norm": 0.6203755736351013, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7234126329421997, "reward_std": 0.09961369726806879, "rewards/accuracy_reward": 0.7234127223491669, "rewards/format_reward": 1.0, "step": 4796 }, { "completion_length": 213.7142791748047, "epoch": 0.48271698113207545, "grad_norm": 0.8965386748313904, "kl": 0.10595703125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.6738054156303406, "reward_std": 0.2135435715317726, "rewards/accuracy_reward": 0.673805445432663, "rewards/format_reward": 1.0, "step": 4797 }, { "completion_length": 192.34693908691406, "epoch": 0.4828176100628931, "grad_norm": 0.3950197100639343, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7781693935394287, "reward_std": 0.0857749730348587, "rewards/accuracy_reward": 0.798577606678009, "rewards/format_reward": 0.9795918464660645, "step": 4798 }, { "completion_length": 251.7244873046875, "epoch": 0.48291823899371067, "grad_norm": 0.5588343143463135, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.721117079257965, "reward_std": 0.06787139549851418, "rewards/accuracy_reward": 0.7211171686649323, "rewards/format_reward": 1.0, "step": 4799 }, { "completion_length": 243.39794921875, "epoch": 0.4830188679245283, "grad_norm": 0.7840772867202759, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6370791792869568, "reward_std": 0.05189705640077591, "rewards/accuracy_reward": 0.6370792388916016, "rewards/format_reward": 1.0, "step": 4800 }, { "completion_length": 223.6836700439453, "epoch": 0.4831194968553459, "grad_norm": 1.29057776927948, "kl": 0.1083984375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.6079976558685303, "reward_std": 0.2205490544438362, "rewards/accuracy_reward": 0.6386098861694336, "rewards/format_reward": 0.9693877398967743, "step": 4801 }, { "completion_length": 167.7244873046875, "epoch": 0.4832201257861635, "grad_norm": 1.0182894468307495, "kl": 0.0684814453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.814650058746338, "reward_std": 0.20490466803312302, "rewards/accuracy_reward": 0.8248542249202728, "rewards/format_reward": 0.9897959232330322, "step": 4802 }, { "completion_length": 267.38775634765625, "epoch": 0.4833207547169811, "grad_norm": 1.0526732206344604, "kl": 0.064697265625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8529435396194458, "reward_std": 0.1442551612854004, "rewards/accuracy_reward": 0.863147646188736, "rewards/format_reward": 0.9897959232330322, "step": 4803 }, { "completion_length": 201.08162689208984, "epoch": 0.48342138364779874, "grad_norm": 0.9584518074989319, "kl": 0.0804443359375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7122448682785034, "reward_std": 0.18931367993354797, "rewards/accuracy_reward": 0.7224489748477936, "rewards/format_reward": 0.9897959232330322, "step": 4804 }, { "completion_length": 234.57142639160156, "epoch": 0.4835220125786164, "grad_norm": 0.6965649127960205, "kl": 0.0966796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6516934633255005, "reward_std": 0.21700947731733322, "rewards/accuracy_reward": 0.682305708527565, "rewards/format_reward": 0.9693877398967743, "step": 4805 }, { "completion_length": 192.7959213256836, "epoch": 0.48362264150943396, "grad_norm": 1.416953682899475, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.9097424149513245, "reward_std": 0.10063864849507809, "rewards/accuracy_reward": 0.9301506280899048, "rewards/format_reward": 0.9795918166637421, "step": 4806 }, { "completion_length": 174.80612182617188, "epoch": 0.4837232704402516, "grad_norm": 0.9060747623443604, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.836248755455017, "reward_std": 0.13574540801346302, "rewards/accuracy_reward": 0.8464528322219849, "rewards/format_reward": 0.9897959232330322, "step": 4807 }, { "completion_length": 143.58162689208984, "epoch": 0.4838238993710692, "grad_norm": 0.7077674865722656, "kl": 0.1064453125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.8469387888908386, "reward_std": 0.08099238574504852, "rewards/accuracy_reward": 0.8469387590885162, "rewards/format_reward": 1.0, "step": 4808 }, { "completion_length": 174.67346954345703, "epoch": 0.4839245283018868, "grad_norm": 0.44549989700317383, "kl": 0.09375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8269280791282654, "reward_std": 0.041392721235752106, "rewards/accuracy_reward": 0.8269281983375549, "rewards/format_reward": 1.0, "step": 4809 }, { "completion_length": 276.2346954345703, "epoch": 0.4840251572327044, "grad_norm": 1.0111560821533203, "kl": 0.09130859375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.699822187423706, "reward_std": 0.19482047110795975, "rewards/accuracy_reward": 0.7202304303646088, "rewards/format_reward": 0.9795918464660645, "step": 4810 }, { "completion_length": 217.79591369628906, "epoch": 0.48412578616352203, "grad_norm": 0.7305612564086914, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7683531641960144, "reward_std": 0.12960843741893768, "rewards/accuracy_reward": 0.7785573303699493, "rewards/format_reward": 0.9897959232330322, "step": 4811 }, { "completion_length": 212.25509643554688, "epoch": 0.4842264150943396, "grad_norm": 0.3666321337223053, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.724157691001892, "reward_std": 0.025484411977231503, "rewards/accuracy_reward": 0.7241576015949249, "rewards/format_reward": 1.0, "step": 4812 }, { "completion_length": 281.1020278930664, "epoch": 0.48432704402515725, "grad_norm": 1.0530141592025757, "kl": 0.0765380859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6017617583274841, "reward_std": 0.28750016540288925, "rewards/accuracy_reward": 0.6425780653953552, "rewards/format_reward": 0.9591836631298065, "step": 4813 }, { "completion_length": 226.2653045654297, "epoch": 0.48442767295597483, "grad_norm": 0.6128957867622375, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7901867628097534, "reward_std": 0.11082278564572334, "rewards/accuracy_reward": 0.8207990825176239, "rewards/format_reward": 0.9693877398967743, "step": 4814 }, { "completion_length": 246.53060913085938, "epoch": 0.48452830188679247, "grad_norm": 0.6923327445983887, "kl": 0.103271484375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7629637122154236, "reward_std": 0.1419987641274929, "rewards/accuracy_reward": 0.7629638016223907, "rewards/format_reward": 1.0, "step": 4815 }, { "completion_length": 284.3571319580078, "epoch": 0.48462893081761005, "grad_norm": 0.9408493638038635, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7794314622879028, "reward_std": 0.19090524315834045, "rewards/accuracy_reward": 0.789635568857193, "rewards/format_reward": 0.9897959232330322, "step": 4816 }, { "completion_length": 316.6224365234375, "epoch": 0.4847295597484277, "grad_norm": 0.6010851263999939, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6821761727333069, "reward_std": 0.14955990202724934, "rewards/accuracy_reward": 0.6923801898956299, "rewards/format_reward": 0.9897959232330322, "step": 4817 }, { "completion_length": 225.77550506591797, "epoch": 0.48483018867924527, "grad_norm": 0.7877667546272278, "kl": 0.11376953125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.690840482711792, "reward_std": 0.21414756029844284, "rewards/accuracy_reward": 0.7112487256526947, "rewards/format_reward": 0.9795918166637421, "step": 4818 }, { "completion_length": 205.78571319580078, "epoch": 0.4849308176100629, "grad_norm": 0.7320484519004822, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8979591131210327, "reward_std": 0.20016494393348694, "rewards/accuracy_reward": 0.9285714030265808, "rewards/format_reward": 0.9693877398967743, "step": 4819 }, { "completion_length": 170.84693908691406, "epoch": 0.4850314465408805, "grad_norm": 0.35285401344299316, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.77068430185318, "reward_std": 0.047194696962833405, "rewards/accuracy_reward": 0.7706842720508575, "rewards/format_reward": 1.0, "step": 4820 }, { "completion_length": 268.77550506591797, "epoch": 0.4851320754716981, "grad_norm": 0.883317232131958, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6687366962432861, "reward_std": 0.17533320933580399, "rewards/accuracy_reward": 0.6789407730102539, "rewards/format_reward": 0.9897959232330322, "step": 4821 }, { "completion_length": 202.8775405883789, "epoch": 0.4852327044025157, "grad_norm": 0.19705379009246826, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.795918345451355, "reward_std": 0.026997461915016174, "rewards/accuracy_reward": 0.795918345451355, "rewards/format_reward": 1.0, "step": 4822 }, { "completion_length": 194.4285659790039, "epoch": 0.48533333333333334, "grad_norm": 4.268762111663818, "kl": 0.292724609375, "learning_rate": 1e-06, "loss": 0.0117, "reward": 1.6758062243461609, "reward_std": 0.15026599913835526, "rewards/accuracy_reward": 0.6860102117061615, "rewards/format_reward": 0.9897959232330322, "step": 4823 }, { "completion_length": 205.28571319580078, "epoch": 0.4854339622641509, "grad_norm": 2.0711569786071777, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.817460298538208, "reward_std": 0.2522667795419693, "rewards/accuracy_reward": 0.8582766056060791, "rewards/format_reward": 0.9591836631298065, "step": 4824 }, { "completion_length": 261.85713958740234, "epoch": 0.48553459119496856, "grad_norm": 2.0513601303100586, "kl": 0.1123046875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.6033359169960022, "reward_std": 0.3059035763144493, "rewards/accuracy_reward": 0.6645603179931641, "rewards/format_reward": 0.938775509595871, "step": 4825 }, { "completion_length": 198.81632232666016, "epoch": 0.48563522012578614, "grad_norm": 9.354511260986328, "kl": 0.24365234375, "learning_rate": 1e-06, "loss": 0.0097, "reward": 1.609369158744812, "reward_std": 0.19934412091970444, "rewards/accuracy_reward": 0.6297773718833923, "rewards/format_reward": 0.9795918166637421, "step": 4826 }, { "completion_length": 206.39794921875, "epoch": 0.4857358490566038, "grad_norm": 0.8649026155471802, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8336422443389893, "reward_std": 0.1839408352971077, "rewards/accuracy_reward": 0.8438464105129242, "rewards/format_reward": 0.9897959232330322, "step": 4827 }, { "completion_length": 199.15306091308594, "epoch": 0.4858364779874214, "grad_norm": 0.6978653073310852, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7172592878341675, "reward_std": 0.18136218190193176, "rewards/accuracy_reward": 0.72746342420578, "rewards/format_reward": 0.9897959232330322, "step": 4828 }, { "completion_length": 198.87754821777344, "epoch": 0.485937106918239, "grad_norm": 0.8164907693862915, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.9183672666549683, "reward_std": 0.10335781052708626, "rewards/accuracy_reward": 0.918367326259613, "rewards/format_reward": 1.0, "step": 4829 }, { "completion_length": 218.4693832397461, "epoch": 0.48603773584905663, "grad_norm": 0.9455909729003906, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.682296633720398, "reward_std": 0.3066771477460861, "rewards/accuracy_reward": 0.7027048766613007, "rewards/format_reward": 0.9795918464660645, "step": 4830 }, { "completion_length": 258.09183502197266, "epoch": 0.4861383647798742, "grad_norm": 0.9392332434654236, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.629305362701416, "reward_std": 0.1930093765258789, "rewards/accuracy_reward": 0.639509454369545, "rewards/format_reward": 0.9897959232330322, "step": 4831 }, { "completion_length": 226.34693145751953, "epoch": 0.48623899371069185, "grad_norm": 1.2266778945922852, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7184887528419495, "reward_std": 0.186491958796978, "rewards/accuracy_reward": 0.7286929190158844, "rewards/format_reward": 0.9897959232330322, "step": 4832 }, { "completion_length": 286.08162689208984, "epoch": 0.48633962264150943, "grad_norm": 0.8762904405593872, "kl": 0.0667724609375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.628547728061676, "reward_std": 0.23323939740657806, "rewards/accuracy_reward": 0.648955911397934, "rewards/format_reward": 0.9795918166637421, "step": 4833 }, { "completion_length": 219.85713958740234, "epoch": 0.48644025157232706, "grad_norm": 0.5732334852218628, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.726803719997406, "reward_std": 0.1088336743414402, "rewards/accuracy_reward": 0.726803719997406, "rewards/format_reward": 1.0, "step": 4834 }, { "completion_length": 264.87754821777344, "epoch": 0.48654088050314465, "grad_norm": 0.9402608871459961, "kl": 0.103271484375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7067408561706543, "reward_std": 0.2371494546532631, "rewards/accuracy_reward": 0.7271490097045898, "rewards/format_reward": 0.9795918166637421, "step": 4835 }, { "completion_length": 238.846923828125, "epoch": 0.4866415094339623, "grad_norm": 0.5486835837364197, "kl": 0.0657958984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7897958755493164, "reward_std": 0.152779221534729, "rewards/accuracy_reward": 0.800000011920929, "rewards/format_reward": 0.9897959232330322, "step": 4836 }, { "completion_length": 198.77550506591797, "epoch": 0.48674213836477986, "grad_norm": 1.294238567352295, "kl": 0.144775390625, "learning_rate": 1e-06, "loss": 0.0058, "reward": 1.6491392254829407, "reward_std": 0.2000008150935173, "rewards/accuracy_reward": 0.669547438621521, "rewards/format_reward": 0.9795918464660645, "step": 4837 }, { "completion_length": 188.23468780517578, "epoch": 0.4868427672955975, "grad_norm": 0.892350971698761, "kl": 0.0865478515625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.9010071158409119, "reward_std": 0.16863467916846275, "rewards/accuracy_reward": 0.9112111926078796, "rewards/format_reward": 0.9897959232330322, "step": 4838 }, { "completion_length": 213.86734008789062, "epoch": 0.4869433962264151, "grad_norm": 0.6884859204292297, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7998055815696716, "reward_std": 0.18990978226065636, "rewards/accuracy_reward": 0.8202137649059296, "rewards/format_reward": 0.9795918464660645, "step": 4839 }, { "completion_length": 273.9387741088867, "epoch": 0.4870440251572327, "grad_norm": 1.4233161211013794, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5045421123504639, "reward_std": 0.20436374843120575, "rewards/accuracy_reward": 0.5249503254890442, "rewards/format_reward": 0.9795918166637421, "step": 4840 }, { "completion_length": 204.04080963134766, "epoch": 0.4871446540880503, "grad_norm": 0.779191255569458, "kl": 0.0718994140625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8571428060531616, "reward_std": 0.13001416623592377, "rewards/accuracy_reward": 0.8673469126224518, "rewards/format_reward": 0.9897959232330322, "step": 4841 }, { "completion_length": 231.60203170776367, "epoch": 0.48724528301886794, "grad_norm": 0.737473726272583, "kl": 0.1181640625, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.6882991194725037, "reward_std": 0.12428133934736252, "rewards/accuracy_reward": 0.6985031962394714, "rewards/format_reward": 0.9897959232330322, "step": 4842 }, { "completion_length": 210.86734008789062, "epoch": 0.4873459119496855, "grad_norm": 0.5207542181015015, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8678571581840515, "reward_std": 0.11270562559366226, "rewards/accuracy_reward": 0.8780612051486969, "rewards/format_reward": 0.9897959232330322, "step": 4843 }, { "completion_length": 204.74488830566406, "epoch": 0.48744654088050315, "grad_norm": 0.8021518588066101, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.9306645393371582, "reward_std": 0.0964675322175026, "rewards/accuracy_reward": 0.9408686459064484, "rewards/format_reward": 0.9897959232330322, "step": 4844 }, { "completion_length": 277.55101013183594, "epoch": 0.48754716981132074, "grad_norm": 0.5978388786315918, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.5481589436531067, "reward_std": 0.1293957531452179, "rewards/accuracy_reward": 0.558363139629364, "rewards/format_reward": 0.9897959232330322, "step": 4845 }, { "completion_length": 193.50000381469727, "epoch": 0.48764779874213837, "grad_norm": 0.73763108253479, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8454081416130066, "reward_std": 0.05466986075043678, "rewards/accuracy_reward": 0.8454081416130066, "rewards/format_reward": 1.0, "step": 4846 }, { "completion_length": 229.13265228271484, "epoch": 0.48774842767295595, "grad_norm": 1.1337454319000244, "kl": 0.102783203125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.761937439441681, "reward_std": 0.17993716150522232, "rewards/accuracy_reward": 0.7619374096393585, "rewards/format_reward": 1.0, "step": 4847 }, { "completion_length": 283.87754821777344, "epoch": 0.4878490566037736, "grad_norm": 0.8580414652824402, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6170526146888733, "reward_std": 0.2035229429602623, "rewards/accuracy_reward": 0.6374607980251312, "rewards/format_reward": 0.9795918166637421, "step": 4848 }, { "completion_length": 251.4795913696289, "epoch": 0.48794968553459117, "grad_norm": 0.7680156230926514, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.805960476398468, "reward_std": 0.11818242073059082, "rewards/accuracy_reward": 0.8365727066993713, "rewards/format_reward": 0.9693877398967743, "step": 4849 }, { "completion_length": 262.60203552246094, "epoch": 0.4880503144654088, "grad_norm": 0.9795816540718079, "kl": 0.1015625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7447314262390137, "reward_std": 0.1583823785185814, "rewards/accuracy_reward": 0.7753438055515289, "rewards/format_reward": 0.9693877398967743, "step": 4850 }, { "completion_length": 245.87754821777344, "epoch": 0.4881509433962264, "grad_norm": 0.9522464275360107, "kl": 0.095458984375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.5742859840393066, "reward_std": 0.19129637628793716, "rewards/accuracy_reward": 0.5946940779685974, "rewards/format_reward": 0.9795918166637421, "step": 4851 }, { "completion_length": 257.3367233276367, "epoch": 0.488251572327044, "grad_norm": 1.1851056814193726, "kl": 0.093994140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.873207688331604, "reward_std": 0.09990130364894867, "rewards/accuracy_reward": 0.8834117949008942, "rewards/format_reward": 0.9897959232330322, "step": 4852 }, { "completion_length": 200.4081573486328, "epoch": 0.48835220125786166, "grad_norm": 1.5465079545974731, "kl": 0.1005859375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.5962504744529724, "reward_std": 0.16888321191072464, "rewards/accuracy_reward": 0.606454536318779, "rewards/format_reward": 0.9897959232330322, "step": 4853 }, { "completion_length": 231.83673095703125, "epoch": 0.48845283018867924, "grad_norm": 0.306025892496109, "kl": 0.066162109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.736796498298645, "reward_std": 0.06022327393293381, "rewards/accuracy_reward": 0.7470006048679352, "rewards/format_reward": 0.9897959232330322, "step": 4854 }, { "completion_length": 298.33673095703125, "epoch": 0.4885534591194969, "grad_norm": 5.5860066413879395, "kl": 0.20263671875, "learning_rate": 1e-06, "loss": 0.0081, "reward": 1.7856943607330322, "reward_std": 0.1790916696190834, "rewards/accuracy_reward": 0.8061025142669678, "rewards/format_reward": 0.9795918464660645, "step": 4855 }, { "completion_length": 204.78571319580078, "epoch": 0.48865408805031446, "grad_norm": 1.2750791311264038, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.782507300376892, "reward_std": 0.17283204942941666, "rewards/accuracy_reward": 0.7825073003768921, "rewards/format_reward": 1.0, "step": 4856 }, { "completion_length": 257.89795684814453, "epoch": 0.4887547169811321, "grad_norm": 0.5636690258979797, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6523510217666626, "reward_std": 0.06900321878492832, "rewards/accuracy_reward": 0.652351051568985, "rewards/format_reward": 1.0, "step": 4857 }, { "completion_length": 223.34693908691406, "epoch": 0.4888553459119497, "grad_norm": 0.692338228225708, "kl": 0.1005859375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.725472629070282, "reward_std": 0.14030721783638, "rewards/accuracy_reward": 0.7458808720111847, "rewards/format_reward": 0.9795918464660645, "step": 4858 }, { "completion_length": 246.08162689208984, "epoch": 0.4889559748427673, "grad_norm": 0.9893331527709961, "kl": 0.114501953125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.8497715592384338, "reward_std": 0.2221105955541134, "rewards/accuracy_reward": 0.8905879557132721, "rewards/format_reward": 0.9591836631298065, "step": 4859 }, { "completion_length": 204.84693908691406, "epoch": 0.4890566037735849, "grad_norm": 1.2232911586761475, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7551673650741577, "reward_std": 0.13657523691654205, "rewards/accuracy_reward": 0.7653715014457703, "rewards/format_reward": 0.9897959232330322, "step": 4860 }, { "completion_length": 187.07142639160156, "epoch": 0.48915723270440253, "grad_norm": 0.6807588934898376, "kl": 0.0740966796875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7993196845054626, "reward_std": 0.12817171216011047, "rewards/accuracy_reward": 0.7993197441101074, "rewards/format_reward": 1.0, "step": 4861 }, { "completion_length": 271.6224365234375, "epoch": 0.4892578616352201, "grad_norm": 0.6095892786979675, "kl": 0.059326171875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.771847128868103, "reward_std": 0.16179828345775604, "rewards/accuracy_reward": 0.7820512652397156, "rewards/format_reward": 0.9897959232330322, "step": 4862 }, { "completion_length": 219.948974609375, "epoch": 0.48935849056603775, "grad_norm": 1.369349479675293, "kl": 0.111572265625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7298889756202698, "reward_std": 0.17830424010753632, "rewards/accuracy_reward": 0.7298890054225922, "rewards/format_reward": 1.0, "step": 4863 }, { "completion_length": 235.8775405883789, "epoch": 0.48945911949685533, "grad_norm": 0.9020184278488159, "kl": 0.0709228515625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8136820793151855, "reward_std": 0.18986928462982178, "rewards/accuracy_reward": 0.8340902328491211, "rewards/format_reward": 0.9795918464660645, "step": 4864 }, { "completion_length": 242.2653045654297, "epoch": 0.48955974842767297, "grad_norm": 0.5229983329772949, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.656373143196106, "reward_std": 0.06259693391621113, "rewards/accuracy_reward": 0.6563731133937836, "rewards/format_reward": 1.0, "step": 4865 }, { "completion_length": 214.4081573486328, "epoch": 0.48966037735849055, "grad_norm": 0.6544706225395203, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8707482814788818, "reward_std": 0.1045607253909111, "rewards/accuracy_reward": 0.8707482218742371, "rewards/format_reward": 1.0, "step": 4866 }, { "completion_length": 220.89794921875, "epoch": 0.4897610062893082, "grad_norm": 0.6085944771766663, "kl": 0.1007080078125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.679411232471466, "reward_std": 0.14615334197878838, "rewards/accuracy_reward": 0.6794112920761108, "rewards/format_reward": 1.0, "step": 4867 }, { "completion_length": 235.92855834960938, "epoch": 0.48986163522012577, "grad_norm": 0.513401210308075, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7349436283111572, "reward_std": 0.09870735928416252, "rewards/accuracy_reward": 0.745147705078125, "rewards/format_reward": 0.9897959232330322, "step": 4868 }, { "completion_length": 235.30612182617188, "epoch": 0.4899622641509434, "grad_norm": 0.6242847442626953, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7560296654701233, "reward_std": 0.1293450929224491, "rewards/accuracy_reward": 0.7764378190040588, "rewards/format_reward": 0.9795918464660645, "step": 4869 }, { "completion_length": 235.12244415283203, "epoch": 0.490062893081761, "grad_norm": 0.8352572917938232, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7321717739105225, "reward_std": 0.114337969571352, "rewards/accuracy_reward": 0.7321718335151672, "rewards/format_reward": 1.0, "step": 4870 }, { "completion_length": 328.3061065673828, "epoch": 0.4901635220125786, "grad_norm": 0.5936374664306641, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7086183428764343, "reward_std": 0.1268807016313076, "rewards/accuracy_reward": 0.7290265560150146, "rewards/format_reward": 0.9795918464660645, "step": 4871 }, { "completion_length": 203.39794921875, "epoch": 0.4902641509433962, "grad_norm": 1.5113147497177124, "kl": 0.103759765625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7623091340065002, "reward_std": 0.2021441012620926, "rewards/accuracy_reward": 0.772513210773468, "rewards/format_reward": 0.9897959232330322, "step": 4872 }, { "completion_length": 171.51020050048828, "epoch": 0.49036477987421384, "grad_norm": 0.8336308002471924, "kl": 0.11181640625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.9563491940498352, "reward_std": 0.0829911082983017, "rewards/accuracy_reward": 0.966553270816803, "rewards/format_reward": 0.9897959232330322, "step": 4873 }, { "completion_length": 177.17346954345703, "epoch": 0.4904654088050314, "grad_norm": 0.665899932384491, "kl": 0.10546875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.5755664706230164, "reward_std": 0.11627421155571938, "rewards/accuracy_reward": 0.5755664706230164, "rewards/format_reward": 1.0, "step": 4874 }, { "completion_length": 244.75508880615234, "epoch": 0.49056603773584906, "grad_norm": 0.6925340890884399, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.48773193359375, "reward_std": 0.19124487042427063, "rewards/accuracy_reward": 0.5081401765346527, "rewards/format_reward": 0.9795918464660645, "step": 4875 }, { "completion_length": 231.32652282714844, "epoch": 0.49066666666666664, "grad_norm": 1.0315810441970825, "kl": 0.116455078125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.614586055278778, "reward_std": 0.3040629178285599, "rewards/accuracy_reward": 0.6451983451843262, "rewards/format_reward": 0.9693877398967743, "step": 4876 }, { "completion_length": 177.29591369628906, "epoch": 0.4907672955974843, "grad_norm": 1.1462571620941162, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7444605231285095, "reward_std": 0.07428032532334328, "rewards/accuracy_reward": 0.7444606125354767, "rewards/format_reward": 1.0, "step": 4877 }, { "completion_length": 202.39794921875, "epoch": 0.4908679245283019, "grad_norm": 0.9868799448013306, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8932234048843384, "reward_std": 0.10485908389091492, "rewards/accuracy_reward": 0.8932234942913055, "rewards/format_reward": 1.0, "step": 4878 }, { "completion_length": 204.69387817382812, "epoch": 0.4909685534591195, "grad_norm": 1.0335369110107422, "kl": 0.101806640625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8808192014694214, "reward_std": 0.07547148736193776, "rewards/accuracy_reward": 0.880819171667099, "rewards/format_reward": 1.0, "step": 4879 }, { "completion_length": 175.64285278320312, "epoch": 0.49106918238993713, "grad_norm": 0.5536149740219116, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8834189176559448, "reward_std": 0.09931807219982147, "rewards/accuracy_reward": 0.893623024225235, "rewards/format_reward": 0.9897959232330322, "step": 4880 }, { "completion_length": 247.24488830566406, "epoch": 0.4911698113207547, "grad_norm": 0.475271612405777, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7646656036376953, "reward_std": 0.10655024275183678, "rewards/accuracy_reward": 0.7748696804046631, "rewards/format_reward": 0.9897959232330322, "step": 4881 }, { "completion_length": 223.9591827392578, "epoch": 0.49127044025157235, "grad_norm": 1.0273176431655884, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7688451409339905, "reward_std": 0.19886457920074463, "rewards/accuracy_reward": 0.7790492177009583, "rewards/format_reward": 0.9897959232330322, "step": 4882 }, { "completion_length": 222.14286041259766, "epoch": 0.49137106918238993, "grad_norm": 0.7630578875541687, "kl": 0.101806640625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8144711256027222, "reward_std": 0.17551599442958832, "rewards/accuracy_reward": 0.8246752917766571, "rewards/format_reward": 0.9897959232330322, "step": 4883 }, { "completion_length": 178.59183502197266, "epoch": 0.49147169811320757, "grad_norm": 0.6657330393791199, "kl": 0.094970703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.786394476890564, "reward_std": 0.09622275829315186, "rewards/accuracy_reward": 0.7863945364952087, "rewards/format_reward": 1.0, "step": 4884 }, { "completion_length": 215.948974609375, "epoch": 0.49157232704402515, "grad_norm": 1.0375306606292725, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7688045501708984, "reward_std": 0.1599884293973446, "rewards/accuracy_reward": 0.7790087163448334, "rewards/format_reward": 0.9897959232330322, "step": 4885 }, { "completion_length": 199.6326446533203, "epoch": 0.4916729559748428, "grad_norm": 0.9439947605133057, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8909090161323547, "reward_std": 0.10844730958342552, "rewards/accuracy_reward": 0.9011131525039673, "rewards/format_reward": 0.9897959232330322, "step": 4886 }, { "completion_length": 232.01020050048828, "epoch": 0.49177358490566037, "grad_norm": 1.3545422554016113, "kl": 0.10205078125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8043503761291504, "reward_std": 0.17805518954992294, "rewards/accuracy_reward": 0.8451667428016663, "rewards/format_reward": 0.9591836333274841, "step": 4887 }, { "completion_length": 203.38774871826172, "epoch": 0.491874213836478, "grad_norm": 0.347550630569458, "kl": 0.125732421875, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.8209123611450195, "reward_std": 0.01767267193645239, "rewards/accuracy_reward": 0.8209123611450195, "rewards/format_reward": 1.0, "step": 4888 }, { "completion_length": 179.21428298950195, "epoch": 0.4919748427672956, "grad_norm": 1.266100287437439, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.808389961719513, "reward_std": 0.1289878711104393, "rewards/accuracy_reward": 0.8185940682888031, "rewards/format_reward": 0.9897959232330322, "step": 4889 }, { "completion_length": 194.40816497802734, "epoch": 0.4920754716981132, "grad_norm": 1.017807960510254, "kl": 0.098876953125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7335600852966309, "reward_std": 0.15054573118686676, "rewards/accuracy_reward": 0.7539681792259216, "rewards/format_reward": 0.9795918166637421, "step": 4890 }, { "completion_length": 206.87754821777344, "epoch": 0.4921761006289308, "grad_norm": 0.4411521852016449, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.9211825132369995, "reward_std": 0.07798636890947819, "rewards/accuracy_reward": 0.9313865602016449, "rewards/format_reward": 0.9897959232330322, "step": 4891 }, { "completion_length": 230.2142791748047, "epoch": 0.49227672955974844, "grad_norm": 0.6693297624588013, "kl": 0.112060546875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.8098484873771667, "reward_std": 0.030690141953527927, "rewards/accuracy_reward": 0.8098484873771667, "rewards/format_reward": 1.0, "step": 4892 }, { "completion_length": 199.2551040649414, "epoch": 0.492377358490566, "grad_norm": 1.511189579963684, "kl": 0.10693359375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.6833333373069763, "reward_std": 0.10098336264491081, "rewards/accuracy_reward": 0.6935374140739441, "rewards/format_reward": 0.9897959232330322, "step": 4893 }, { "completion_length": 184.60204315185547, "epoch": 0.49247798742138366, "grad_norm": 0.44934046268463135, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8617191910743713, "reward_std": 0.07618824392557144, "rewards/accuracy_reward": 0.8719232976436615, "rewards/format_reward": 0.9897959232330322, "step": 4894 }, { "completion_length": 159.67346954345703, "epoch": 0.49257861635220124, "grad_norm": 1.5540691614151, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8582766652107239, "reward_std": 0.16738979518413544, "rewards/accuracy_reward": 0.8582766652107239, "rewards/format_reward": 1.0, "step": 4895 }, { "completion_length": 199.14286041259766, "epoch": 0.4926792452830189, "grad_norm": 1.1022104024887085, "kl": 0.0986328125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.762710988521576, "reward_std": 0.18631907552480698, "rewards/accuracy_reward": 0.7627108991146088, "rewards/format_reward": 1.0, "step": 4896 }, { "completion_length": 245.9795913696289, "epoch": 0.49277987421383646, "grad_norm": 0.5853568315505981, "kl": 0.0635986328125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8265305757522583, "reward_std": 0.13498730212450027, "rewards/accuracy_reward": 0.8469387590885162, "rewards/format_reward": 0.9795918464660645, "step": 4897 }, { "completion_length": 206.48979949951172, "epoch": 0.4928805031446541, "grad_norm": 0.6957160234451294, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7603078484535217, "reward_std": 0.1517604272812605, "rewards/accuracy_reward": 0.7603077590465546, "rewards/format_reward": 1.0, "step": 4898 }, { "completion_length": 201.87754821777344, "epoch": 0.4929811320754717, "grad_norm": 0.30896809697151184, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8881475329399109, "reward_std": 0.03647006303071976, "rewards/accuracy_reward": 0.8881475925445557, "rewards/format_reward": 1.0, "step": 4899 }, { "completion_length": 190.61223602294922, "epoch": 0.4930817610062893, "grad_norm": 2.2040045261383057, "kl": 0.08740234375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7514517307281494, "reward_std": 0.16827724128961563, "rewards/accuracy_reward": 0.7820640206336975, "rewards/format_reward": 0.9693877398967743, "step": 4900 }, { "completion_length": 188.53060913085938, "epoch": 0.4931823899371069, "grad_norm": 0.4381922483444214, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7424861192703247, "reward_std": 0.0656522735953331, "rewards/accuracy_reward": 0.7526901066303253, "rewards/format_reward": 0.9897959232330322, "step": 4901 }, { "completion_length": 165.1326446533203, "epoch": 0.4932830188679245, "grad_norm": 0.42176365852355957, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.9371852278709412, "reward_std": 0.026132196187973022, "rewards/accuracy_reward": 0.9371852874755859, "rewards/format_reward": 1.0, "step": 4902 }, { "completion_length": 193.2653045654297, "epoch": 0.49338364779874216, "grad_norm": 0.8859517574310303, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.797278881072998, "reward_std": 0.16456154733896255, "rewards/accuracy_reward": 0.797278881072998, "rewards/format_reward": 1.0, "step": 4903 }, { "completion_length": 239.67346954345703, "epoch": 0.49348427672955975, "grad_norm": 0.296044260263443, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6446042656898499, "reward_std": 0.041388511657714844, "rewards/accuracy_reward": 0.65480837225914, "rewards/format_reward": 0.9897959232330322, "step": 4904 }, { "completion_length": 243.2551040649414, "epoch": 0.4935849056603774, "grad_norm": 0.7793812155723572, "kl": 0.049072265625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6850744485855103, "reward_std": 0.137783695012331, "rewards/accuracy_reward": 0.6952785551548004, "rewards/format_reward": 0.9897959232330322, "step": 4905 }, { "completion_length": 248.9693832397461, "epoch": 0.49368553459119496, "grad_norm": 0.9615302085876465, "kl": 0.132568359375, "learning_rate": 1e-06, "loss": 0.0053, "reward": 1.5690122842788696, "reward_std": 0.1670832261443138, "rewards/accuracy_reward": 0.5690122991800308, "rewards/format_reward": 1.0, "step": 4906 }, { "completion_length": 265.0918273925781, "epoch": 0.4937861635220126, "grad_norm": 0.7120948433876038, "kl": 0.109130859375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.6009897589683533, "reward_std": 0.15866431593894958, "rewards/accuracy_reward": 0.6213980317115784, "rewards/format_reward": 0.9795918464660645, "step": 4907 }, { "completion_length": 245.30611419677734, "epoch": 0.4938867924528302, "grad_norm": 0.5289466381072998, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7684979438781738, "reward_std": 0.10412321798503399, "rewards/accuracy_reward": 0.778702050447464, "rewards/format_reward": 0.9897959232330322, "step": 4908 }, { "completion_length": 207.42857360839844, "epoch": 0.4939874213836478, "grad_norm": 0.7213405966758728, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6652692556381226, "reward_std": 0.14941181428730488, "rewards/accuracy_reward": 0.6856773793697357, "rewards/format_reward": 0.9795918166637421, "step": 4909 }, { "completion_length": 222.11224365234375, "epoch": 0.4940880503144654, "grad_norm": 0.6385487914085388, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.918367326259613, "reward_std": 0.10412057116627693, "rewards/accuracy_reward": 0.9285714030265808, "rewards/format_reward": 0.9897959232330322, "step": 4910 }, { "completion_length": 196.60203552246094, "epoch": 0.49418867924528304, "grad_norm": 1.1364006996154785, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.827403724193573, "reward_std": 0.19298964738845825, "rewards/accuracy_reward": 0.8478118479251862, "rewards/format_reward": 0.9795918464660645, "step": 4911 }, { "completion_length": 253.9081573486328, "epoch": 0.4942893081761006, "grad_norm": 0.9414361119270325, "kl": 0.130859375, "learning_rate": 1e-06, "loss": 0.0052, "reward": 1.5933790802955627, "reward_std": 0.1866876557469368, "rewards/accuracy_reward": 0.6341954171657562, "rewards/format_reward": 0.9591836631298065, "step": 4912 }, { "completion_length": 238.16326141357422, "epoch": 0.49438993710691825, "grad_norm": 1.5048046112060547, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7876408696174622, "reward_std": 0.09649087488651276, "rewards/accuracy_reward": 0.8080490827560425, "rewards/format_reward": 0.9795918166637421, "step": 4913 }, { "completion_length": 271.07142639160156, "epoch": 0.49449056603773583, "grad_norm": 0.8835852146148682, "kl": 0.108154296875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.5971291661262512, "reward_std": 0.17514153569936752, "rewards/accuracy_reward": 0.6175372898578644, "rewards/format_reward": 0.9795918464660645, "step": 4914 }, { "completion_length": 230.97958374023438, "epoch": 0.49459119496855347, "grad_norm": 0.8496811985969543, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.62742018699646, "reward_std": 0.2459905818104744, "rewards/accuracy_reward": 0.6478283405303955, "rewards/format_reward": 0.9795918464660645, "step": 4915 }, { "completion_length": 197.13265228271484, "epoch": 0.49469182389937105, "grad_norm": 0.9413219094276428, "kl": 0.102783203125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7467297315597534, "reward_std": 0.1550457701086998, "rewards/accuracy_reward": 0.767137885093689, "rewards/format_reward": 0.9795918464660645, "step": 4916 }, { "completion_length": 257.8367233276367, "epoch": 0.4947924528301887, "grad_norm": 0.9896474480628967, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8833818435668945, "reward_std": 0.10588780045509338, "rewards/accuracy_reward": 0.8833818733692169, "rewards/format_reward": 1.0, "step": 4917 }, { "completion_length": 280.29591369628906, "epoch": 0.49489308176100627, "grad_norm": 1.1525532007217407, "kl": 0.10986328125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.5014401078224182, "reward_std": 0.257733091711998, "rewards/accuracy_reward": 0.5422563552856445, "rewards/format_reward": 0.9591836631298065, "step": 4918 }, { "completion_length": 181.6836700439453, "epoch": 0.4949937106918239, "grad_norm": 0.6387333273887634, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7897958755493164, "reward_std": 0.10143008176237345, "rewards/accuracy_reward": 0.7897959351539612, "rewards/format_reward": 1.0, "step": 4919 }, { "completion_length": 231.38774871826172, "epoch": 0.4950943396226415, "grad_norm": 1.4544717073440552, "kl": 0.12353515625, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.5153255462646484, "reward_std": 0.26642436534166336, "rewards/accuracy_reward": 0.5459378808736801, "rewards/format_reward": 0.9693877398967743, "step": 4920 }, { "completion_length": 218.45917510986328, "epoch": 0.4951949685534591, "grad_norm": 1.1382993459701538, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7541743516921997, "reward_std": 0.21538645029067993, "rewards/accuracy_reward": 0.7745825052261353, "rewards/format_reward": 0.9795918464660645, "step": 4921 }, { "completion_length": 189.58163452148438, "epoch": 0.4952955974842767, "grad_norm": 0.48410969972610474, "kl": 0.0953369140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6292619705200195, "reward_std": 0.13924242556095123, "rewards/accuracy_reward": 0.6598742604255676, "rewards/format_reward": 0.9693877398967743, "step": 4922 }, { "completion_length": 291.0714111328125, "epoch": 0.49539622641509434, "grad_norm": 0.5644989013671875, "kl": 0.108642578125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7984950542449951, "reward_std": 0.19774854183197021, "rewards/accuracy_reward": 0.8393114507198334, "rewards/format_reward": 0.9591836631298065, "step": 4923 }, { "completion_length": 186.71428680419922, "epoch": 0.4954968553459119, "grad_norm": 0.9336463212966919, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7456899881362915, "reward_std": 0.1296745352447033, "rewards/accuracy_reward": 0.7558940649032593, "rewards/format_reward": 0.9897959232330322, "step": 4924 }, { "completion_length": 174.17346954345703, "epoch": 0.49559748427672956, "grad_norm": 0.3317992389202118, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.876722514629364, "reward_std": 0.014239666052162647, "rewards/accuracy_reward": 0.8767224848270416, "rewards/format_reward": 1.0, "step": 4925 }, { "completion_length": 217.95917510986328, "epoch": 0.4956981132075472, "grad_norm": 1.5860340595245361, "kl": 0.1259765625, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.7316591143608093, "reward_std": 0.12755125388503075, "rewards/accuracy_reward": 0.7316591739654541, "rewards/format_reward": 1.0, "step": 4926 }, { "completion_length": 177.56121826171875, "epoch": 0.4957987421383648, "grad_norm": 0.5073120594024658, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8448979258537292, "reward_std": 0.08884849399328232, "rewards/accuracy_reward": 0.8448979258537292, "rewards/format_reward": 1.0, "step": 4927 }, { "completion_length": 214.87754821777344, "epoch": 0.4958993710691824, "grad_norm": 0.7025574445724487, "kl": 0.0748291015625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7833819389343262, "reward_std": 0.11081837117671967, "rewards/accuracy_reward": 0.7833819389343262, "rewards/format_reward": 1.0, "step": 4928 }, { "completion_length": 191.93877410888672, "epoch": 0.496, "grad_norm": 3.1887595653533936, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.697959065437317, "reward_std": 0.11055347323417664, "rewards/accuracy_reward": 0.718367338180542, "rewards/format_reward": 0.9795918166637421, "step": 4929 }, { "completion_length": 219.53060913085938, "epoch": 0.49610062893081763, "grad_norm": 1.2452657222747803, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8005494475364685, "reward_std": 0.14787910878658295, "rewards/accuracy_reward": 0.8107535243034363, "rewards/format_reward": 0.9897959232330322, "step": 4930 }, { "completion_length": 281.4081497192383, "epoch": 0.4962012578616352, "grad_norm": 2.9169042110443115, "kl": 0.0694580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6306122541427612, "reward_std": 0.09142526797950268, "rewards/accuracy_reward": 0.6408163160085678, "rewards/format_reward": 0.9897959232330322, "step": 4931 }, { "completion_length": 143.9795913696289, "epoch": 0.49630188679245285, "grad_norm": 1.6297686100006104, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.805254876613617, "reward_std": 0.14483319595456123, "rewards/accuracy_reward": 0.8052547574043274, "rewards/format_reward": 1.0, "step": 4932 }, { "completion_length": 222.39795684814453, "epoch": 0.49640251572327043, "grad_norm": 0.5813524127006531, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.682189166545868, "reward_std": 0.16849827021360397, "rewards/accuracy_reward": 0.6923933029174805, "rewards/format_reward": 0.9897959232330322, "step": 4933 }, { "completion_length": 238.55101776123047, "epoch": 0.49650314465408807, "grad_norm": 0.784790575504303, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.531902551651001, "reward_std": 0.26081324741244316, "rewards/accuracy_reward": 0.5625148415565491, "rewards/format_reward": 0.9693877398967743, "step": 4934 }, { "completion_length": 179.62244415283203, "epoch": 0.49660377358490565, "grad_norm": 0.5601709485054016, "kl": 0.107666015625, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.6210996508598328, "reward_std": 0.09202015772461891, "rewards/accuracy_reward": 0.6313037127256393, "rewards/format_reward": 0.9897959232330322, "step": 4935 }, { "completion_length": 241.8775405883789, "epoch": 0.4967044025157233, "grad_norm": 0.6649903655052185, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7857142686843872, "reward_std": 0.15402613580226898, "rewards/accuracy_reward": 0.795918345451355, "rewards/format_reward": 0.9897959232330322, "step": 4936 }, { "completion_length": 271.72447967529297, "epoch": 0.49680503144654087, "grad_norm": 0.6319305300712585, "kl": 0.093017578125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.4947235584259033, "reward_std": 0.166815847158432, "rewards/accuracy_reward": 0.5151316821575165, "rewards/format_reward": 0.9795918464660645, "step": 4937 }, { "completion_length": 153.6836700439453, "epoch": 0.4969056603773585, "grad_norm": 0.41042834520339966, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.918367326259613, "reward_std": 0.10335781425237656, "rewards/accuracy_reward": 0.9285714328289032, "rewards/format_reward": 0.9897959232330322, "step": 4938 }, { "completion_length": 196.05101776123047, "epoch": 0.4970062893081761, "grad_norm": 0.8686167597770691, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7901204824447632, "reward_std": 0.11101321503520012, "rewards/accuracy_reward": 0.7901205718517303, "rewards/format_reward": 1.0, "step": 4939 }, { "completion_length": 222.01020050048828, "epoch": 0.4971069182389937, "grad_norm": 0.9722061157226562, "kl": 0.1025390625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6330598592758179, "reward_std": 0.1818888820707798, "rewards/accuracy_reward": 0.6330599188804626, "rewards/format_reward": 1.0, "step": 4940 }, { "completion_length": 218.52040100097656, "epoch": 0.4972075471698113, "grad_norm": 0.5757729411125183, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6658816933631897, "reward_std": 0.13980232924222946, "rewards/accuracy_reward": 0.6862899363040924, "rewards/format_reward": 0.9795918464660645, "step": 4941 }, { "completion_length": 262.57142639160156, "epoch": 0.49730817610062894, "grad_norm": 0.7009809017181396, "kl": 0.09765625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6403513550758362, "reward_std": 0.1946173757314682, "rewards/accuracy_reward": 0.6505555212497711, "rewards/format_reward": 0.9897959232330322, "step": 4942 }, { "completion_length": 193.37754821777344, "epoch": 0.4974088050314465, "grad_norm": 1.9912331104278564, "kl": 0.1572265625, "learning_rate": 1e-06, "loss": 0.0063, "reward": 1.7066326141357422, "reward_std": 0.19331305474042892, "rewards/accuracy_reward": 0.7270407974720001, "rewards/format_reward": 0.9795918464660645, "step": 4943 }, { "completion_length": 253.60204315185547, "epoch": 0.49750943396226416, "grad_norm": 0.47381290793418884, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8645691871643066, "reward_std": 0.09593497961759567, "rewards/accuracy_reward": 0.8747732937335968, "rewards/format_reward": 0.9897959232330322, "step": 4944 }, { "completion_length": 253.34693145751953, "epoch": 0.49761006289308174, "grad_norm": 0.5059123039245605, "kl": 0.11328125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7921459674835205, "reward_std": 0.17471230402588844, "rewards/accuracy_reward": 0.8329622149467468, "rewards/format_reward": 0.9591836631298065, "step": 4945 }, { "completion_length": 247.57142639160156, "epoch": 0.4977106918238994, "grad_norm": 0.9363774657249451, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7521644234657288, "reward_std": 0.07043789327144623, "rewards/accuracy_reward": 0.7521644830703735, "rewards/format_reward": 1.0, "step": 4946 }, { "completion_length": 263.27549743652344, "epoch": 0.49781132075471696, "grad_norm": 0.7491059303283691, "kl": 0.0906982421875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7937219142913818, "reward_std": 0.22762055695056915, "rewards/accuracy_reward": 0.803926020860672, "rewards/format_reward": 0.9897959232330322, "step": 4947 }, { "completion_length": 219.88774871826172, "epoch": 0.4979119496855346, "grad_norm": 0.7311815023422241, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8177841901779175, "reward_std": 0.24097781628370285, "rewards/accuracy_reward": 0.8688046038150787, "rewards/format_reward": 0.9489795565605164, "step": 4948 }, { "completion_length": 206.35713958740234, "epoch": 0.4980125786163522, "grad_norm": 1.071073055267334, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7851299047470093, "reward_std": 0.15392089262604713, "rewards/accuracy_reward": 0.7851299345493317, "rewards/format_reward": 1.0, "step": 4949 }, { "completion_length": 186.55101776123047, "epoch": 0.4981132075471698, "grad_norm": 2.1953117847442627, "kl": 0.099365234375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.5207998156547546, "reward_std": 0.1835612952709198, "rewards/accuracy_reward": 0.541207954287529, "rewards/format_reward": 0.9795918166637421, "step": 4950 }, { "completion_length": 256.7550964355469, "epoch": 0.49821383647798745, "grad_norm": 0.8930234313011169, "kl": 0.119384765625, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.7350854873657227, "reward_std": 0.14274248480796814, "rewards/accuracy_reward": 0.7861059904098511, "rewards/format_reward": 0.9489795863628387, "step": 4951 }, { "completion_length": 214.10203552246094, "epoch": 0.49831446540880503, "grad_norm": 0.9075384140014648, "kl": 0.107666015625, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7888092398643494, "reward_std": 0.23507621139287949, "rewards/accuracy_reward": 0.8194214105606079, "rewards/format_reward": 0.9693877398967743, "step": 4952 }, { "completion_length": 149.65306091308594, "epoch": 0.49841509433962267, "grad_norm": 1.2719134092330933, "kl": 0.1279296875, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.7272483110427856, "reward_std": 0.2298879772424698, "rewards/accuracy_reward": 0.7782687544822693, "rewards/format_reward": 0.9489795863628387, "step": 4953 }, { "completion_length": 212.4183578491211, "epoch": 0.49851572327044025, "grad_norm": 0.8727229237556458, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6550621390342712, "reward_std": 0.26037096977233887, "rewards/accuracy_reward": 0.7060825824737549, "rewards/format_reward": 0.9489795565605164, "step": 4954 }, { "completion_length": 294.4183654785156, "epoch": 0.4986163522012579, "grad_norm": 1.0004295110702515, "kl": 0.1142578125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.5902621150016785, "reward_std": 0.20699871331453323, "rewards/accuracy_reward": 0.6208743304014206, "rewards/format_reward": 0.9693877398967743, "step": 4955 }, { "completion_length": 242.61223602294922, "epoch": 0.49871698113207547, "grad_norm": 0.5274947285652161, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7001728415489197, "reward_std": 0.19983858615159988, "rewards/accuracy_reward": 0.7307851016521454, "rewards/format_reward": 0.9693877398967743, "step": 4956 }, { "completion_length": 167.88774871826172, "epoch": 0.4988176100628931, "grad_norm": 0.6581611037254333, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8084547519683838, "reward_std": 0.1285407766699791, "rewards/accuracy_reward": 0.8288629651069641, "rewards/format_reward": 0.9795918166637421, "step": 4957 }, { "completion_length": 215.22447967529297, "epoch": 0.4989182389937107, "grad_norm": 0.9418533444404602, "kl": 0.100341796875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6775590181350708, "reward_std": 0.2889188826084137, "rewards/accuracy_reward": 0.7387835085391998, "rewards/format_reward": 0.9387754797935486, "step": 4958 }, { "completion_length": 242.98978424072266, "epoch": 0.4990188679245283, "grad_norm": 0.701977550983429, "kl": 0.135498046875, "learning_rate": 1e-06, "loss": 0.0054, "reward": 1.7342918515205383, "reward_std": 0.18511932715773582, "rewards/accuracy_reward": 0.7649040818214417, "rewards/format_reward": 0.9693877398967743, "step": 4959 }, { "completion_length": 236.90816497802734, "epoch": 0.4991194968553459, "grad_norm": 0.7625026702880859, "kl": 0.062744140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6379008293151855, "reward_std": 0.14688795059919357, "rewards/accuracy_reward": 0.6583090126514435, "rewards/format_reward": 0.9795918464660645, "step": 4960 }, { "completion_length": 262.2142791748047, "epoch": 0.49922012578616354, "grad_norm": 0.8298355340957642, "kl": 0.0533447265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6306032538414001, "reward_std": 0.3367747813463211, "rewards/accuracy_reward": 0.6714197099208832, "rewards/format_reward": 0.9591836631298065, "step": 4961 }, { "completion_length": 214.17346954345703, "epoch": 0.4993207547169811, "grad_norm": 3.2334225177764893, "kl": 0.0767822265625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7212099432945251, "reward_std": 0.27688198536634445, "rewards/accuracy_reward": 0.7518221735954285, "rewards/format_reward": 0.9693877398967743, "step": 4962 }, { "completion_length": 186.0204086303711, "epoch": 0.49942138364779876, "grad_norm": 1.344386339187622, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6960778832435608, "reward_std": 0.24186492711305618, "rewards/accuracy_reward": 0.7164859771728516, "rewards/format_reward": 0.9795918464660645, "step": 4963 }, { "completion_length": 216.1836700439453, "epoch": 0.49952201257861634, "grad_norm": 0.687527596950531, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7944753170013428, "reward_std": 0.10308316070586443, "rewards/accuracy_reward": 0.7944753766059875, "rewards/format_reward": 1.0, "step": 4964 }, { "completion_length": 221.13265228271484, "epoch": 0.499622641509434, "grad_norm": 0.631860613822937, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7653312683105469, "reward_std": 0.2133843582123518, "rewards/accuracy_reward": 0.7857394516468048, "rewards/format_reward": 0.9795918166637421, "step": 4965 }, { "completion_length": 177.09182739257812, "epoch": 0.49972327044025155, "grad_norm": 0.8125877380371094, "kl": 0.09375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6753689646720886, "reward_std": 0.11672315374016762, "rewards/accuracy_reward": 0.675368994474411, "rewards/format_reward": 1.0, "step": 4966 }, { "completion_length": 155.78570556640625, "epoch": 0.4998238993710692, "grad_norm": 0.7660629153251648, "kl": 0.1063232421875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8053980469703674, "reward_std": 0.08971695974469185, "rewards/accuracy_reward": 0.8053980469703674, "rewards/format_reward": 1.0, "step": 4967 }, { "completion_length": 256.9693908691406, "epoch": 0.4999245283018868, "grad_norm": 0.5584716200828552, "kl": 0.09033203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7640644311904907, "reward_std": 0.0989719107747078, "rewards/accuracy_reward": 0.7640644609928131, "rewards/format_reward": 1.0, "step": 4968 }, { "completion_length": 243.79591369628906, "epoch": 0.5000251572327044, "grad_norm": 0.9550023674964905, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7169322967529297, "reward_std": 0.12587561830878258, "rewards/accuracy_reward": 0.7271362841129303, "rewards/format_reward": 0.9897959232330322, "step": 4969 }, { "completion_length": 171.26530075073242, "epoch": 0.500125786163522, "grad_norm": 0.18318364024162292, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.9897959232330322, "reward_std": 0.026997461915016174, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.9897959232330322, "step": 4970 }, { "completion_length": 179.4591827392578, "epoch": 0.5002264150943396, "grad_norm": 1.1486148834228516, "kl": 0.118408203125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.6619065403938293, "reward_std": 0.14952551573514938, "rewards/accuracy_reward": 0.6721106171607971, "rewards/format_reward": 0.9897959232330322, "step": 4971 }, { "completion_length": 258.33673095703125, "epoch": 0.5003270440251573, "grad_norm": 0.6761021614074707, "kl": 0.095458984375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8473355174064636, "reward_std": 0.1423741802573204, "rewards/accuracy_reward": 0.8779478669166565, "rewards/format_reward": 0.9693877398967743, "step": 4972 }, { "completion_length": 187.27550506591797, "epoch": 0.5004276729559748, "grad_norm": 0.5037219524383545, "kl": 0.0718994140625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8426019549369812, "reward_std": 0.05311660235747695, "rewards/accuracy_reward": 0.842602014541626, "rewards/format_reward": 1.0, "step": 4973 }, { "completion_length": 177.01020050048828, "epoch": 0.5005283018867924, "grad_norm": 0.9762531518936157, "kl": 0.1201171875, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.8191730976104736, "reward_std": 0.11808564141392708, "rewards/accuracy_reward": 0.8293772041797638, "rewards/format_reward": 0.9897959232330322, "step": 4974 }, { "completion_length": 199.32653045654297, "epoch": 0.5006289308176101, "grad_norm": 0.8970268368721008, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6978906393051147, "reward_std": 0.15221915766596794, "rewards/accuracy_reward": 0.7387069761753082, "rewards/format_reward": 0.9591836631298065, "step": 4975 }, { "completion_length": 237.7142791748047, "epoch": 0.5007295597484277, "grad_norm": 1.12375807762146, "kl": 0.0482177734375, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7857142090797424, "reward_std": 0.13821138814091682, "rewards/accuracy_reward": 0.8061224222183228, "rewards/format_reward": 0.9795918166637421, "step": 4976 }, { "completion_length": 284.948974609375, "epoch": 0.5008301886792453, "grad_norm": 0.8841550350189209, "kl": 0.0821533203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.5902066230773926, "reward_std": 0.24175682663917542, "rewards/accuracy_reward": 0.6004107743501663, "rewards/format_reward": 0.9897959232330322, "step": 4977 }, { "completion_length": 191.86734008789062, "epoch": 0.5009308176100629, "grad_norm": 2.4203145503997803, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7384044528007507, "reward_std": 0.1749011054635048, "rewards/accuracy_reward": 0.7486084997653961, "rewards/format_reward": 0.9897959232330322, "step": 4978 }, { "completion_length": 183.77550506591797, "epoch": 0.5010314465408805, "grad_norm": 0.7145485877990723, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8505101203918457, "reward_std": 0.15705366805195808, "rewards/accuracy_reward": 0.8811224699020386, "rewards/format_reward": 0.9693877398967743, "step": 4979 }, { "completion_length": 228.65306091308594, "epoch": 0.5011320754716981, "grad_norm": 0.9088283777236938, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7288562655448914, "reward_std": 0.17554091662168503, "rewards/accuracy_reward": 0.7390603721141815, "rewards/format_reward": 0.9897959232330322, "step": 4980 }, { "completion_length": 281.89794921875, "epoch": 0.5012327044025158, "grad_norm": 0.7126973271369934, "kl": 0.108642578125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7061049938201904, "reward_std": 0.224705271422863, "rewards/accuracy_reward": 0.7367173135280609, "rewards/format_reward": 0.9693877398967743, "step": 4981 }, { "completion_length": 271.58162689208984, "epoch": 0.5013333333333333, "grad_norm": 0.722080409526825, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8159892559051514, "reward_std": 0.24306365102529526, "rewards/accuracy_reward": 0.8261932730674744, "rewards/format_reward": 0.9897959232330322, "step": 4982 }, { "completion_length": 279.55101013183594, "epoch": 0.5014339622641509, "grad_norm": 0.5204454660415649, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6157347559928894, "reward_std": 0.12723458558321, "rewards/accuracy_reward": 0.636142909526825, "rewards/format_reward": 0.9795918464660645, "step": 4983 }, { "completion_length": 228.06121826171875, "epoch": 0.5015345911949686, "grad_norm": 2.1477103233337402, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.829931914806366, "reward_std": 0.13864652439951897, "rewards/accuracy_reward": 0.8401360213756561, "rewards/format_reward": 0.9897959232330322, "step": 4984 }, { "completion_length": 272.6530532836914, "epoch": 0.5016352201257862, "grad_norm": 0.9105746150016785, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7680889964103699, "reward_std": 0.284355565905571, "rewards/accuracy_reward": 0.7987012565135956, "rewards/format_reward": 0.9693877398967743, "step": 4985 }, { "completion_length": 313.9591827392578, "epoch": 0.5017358490566037, "grad_norm": 0.6659881472587585, "kl": 0.0635986328125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7135751247406006, "reward_std": 0.2470514327287674, "rewards/accuracy_reward": 0.7339833378791809, "rewards/format_reward": 0.9795918464660645, "step": 4986 }, { "completion_length": 268.9795837402344, "epoch": 0.5018364779874214, "grad_norm": 0.9095332026481628, "kl": 0.0478515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.7489470839500427, "reward_std": 0.25043974071741104, "rewards/accuracy_reward": 0.7693553268909454, "rewards/format_reward": 0.9795918464660645, "step": 4987 }, { "completion_length": 197.32653045654297, "epoch": 0.501937106918239, "grad_norm": 0.46751171350479126, "kl": 0.045654296875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7448979020118713, "reward_std": 0.1158459484577179, "rewards/accuracy_reward": 0.7551020383834839, "rewards/format_reward": 0.9897959232330322, "step": 4988 }, { "completion_length": 242.83673095703125, "epoch": 0.5020377358490566, "grad_norm": 0.5695253014564514, "kl": 0.0799560546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7463397979736328, "reward_std": 0.15214326232671738, "rewards/accuracy_reward": 0.7871561646461487, "rewards/format_reward": 0.9591836333274841, "step": 4989 }, { "completion_length": 248.1530532836914, "epoch": 0.5021383647798742, "grad_norm": 0.9279359579086304, "kl": 0.138427734375, "learning_rate": 1e-06, "loss": 0.0055, "reward": 1.6954385042190552, "reward_std": 0.19867445528507233, "rewards/accuracy_reward": 0.6954384744167328, "rewards/format_reward": 1.0, "step": 4990 }, { "completion_length": 233.53060150146484, "epoch": 0.5022389937106918, "grad_norm": 0.9308122992515564, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7312283515930176, "reward_std": 0.16651330888271332, "rewards/accuracy_reward": 0.741432398557663, "rewards/format_reward": 0.9897959232330322, "step": 4991 }, { "completion_length": 246.7653045654297, "epoch": 0.5023396226415094, "grad_norm": 1.017710566520691, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8209052085876465, "reward_std": 0.1591494306921959, "rewards/accuracy_reward": 0.8311093151569366, "rewards/format_reward": 0.9897959232330322, "step": 4992 }, { "completion_length": 257.08162689208984, "epoch": 0.5024402515723271, "grad_norm": 0.6838573813438416, "kl": 0.0750732421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6739553213119507, "reward_std": 0.21924357116222382, "rewards/accuracy_reward": 0.6841593980789185, "rewards/format_reward": 0.9897959232330322, "step": 4993 }, { "completion_length": 217.28570556640625, "epoch": 0.5025408805031446, "grad_norm": 0.3047111928462982, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7961889505386353, "reward_std": 0.03229646384716034, "rewards/accuracy_reward": 0.7961889803409576, "rewards/format_reward": 1.0, "step": 4994 }, { "completion_length": 242.47958374023438, "epoch": 0.5026415094339622, "grad_norm": 0.7252097725868225, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8222788572311401, "reward_std": 0.13891933485865593, "rewards/accuracy_reward": 0.8426870703697205, "rewards/format_reward": 0.9795918464660645, "step": 4995 }, { "completion_length": 321.7857208251953, "epoch": 0.5027421383647799, "grad_norm": 0.7026520371437073, "kl": 0.0504150390625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.8354713916778564, "reward_std": 0.2593572586774826, "rewards/accuracy_reward": 0.8558794558048248, "rewards/format_reward": 0.9795918464660645, "step": 4996 }, { "completion_length": 197.4897918701172, "epoch": 0.5028427672955975, "grad_norm": 0.2996717095375061, "kl": 0.0491943359375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7574853301048279, "reward_std": 0.08979466930031776, "rewards/accuracy_reward": 0.7778935134410858, "rewards/format_reward": 0.9795918464660645, "step": 4997 }, { "completion_length": 247.4591827392578, "epoch": 0.502943396226415, "grad_norm": 0.967225193977356, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.659701943397522, "reward_std": 0.1767757534980774, "rewards/accuracy_reward": 0.6699060499668121, "rewards/format_reward": 0.9897959232330322, "step": 4998 }, { "completion_length": 255.10203552246094, "epoch": 0.5030440251572327, "grad_norm": 0.518444299697876, "kl": 0.11865234375, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.661264181137085, "reward_std": 0.15812261775135994, "rewards/accuracy_reward": 0.6714683175086975, "rewards/format_reward": 0.9897959232330322, "step": 4999 }, { "completion_length": 212.63265228271484, "epoch": 0.5031446540880503, "grad_norm": 0.8465983271598816, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8921282291412354, "reward_std": 0.1327827200293541, "rewards/accuracy_reward": 0.9023323357105255, "rewards/format_reward": 0.9897959232330322, "step": 5000 }, { "completion_length": 249.78571319580078, "epoch": 0.503245283018868, "grad_norm": 0.6816108226776123, "kl": 0.0865478515625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8618298172950745, "reward_std": 0.108705073595047, "rewards/accuracy_reward": 0.8618297874927521, "rewards/format_reward": 1.0, "step": 5001 }, { "completion_length": 253.25508880615234, "epoch": 0.5033459119496856, "grad_norm": 0.9595901966094971, "kl": 0.117919921875, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.6796240210533142, "reward_std": 0.22261976450681686, "rewards/accuracy_reward": 0.7102362513542175, "rewards/format_reward": 0.9693877398967743, "step": 5002 }, { "completion_length": 237.99999237060547, "epoch": 0.5034465408805031, "grad_norm": 0.43978151679039, "kl": 0.0556640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.800680160522461, "reward_std": 0.08890028670430183, "rewards/accuracy_reward": 0.8006802499294281, "rewards/format_reward": 1.0, "step": 5003 }, { "completion_length": 276.2550964355469, "epoch": 0.5035471698113207, "grad_norm": 0.6340532302856445, "kl": 0.0513916015625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7472060322761536, "reward_std": 0.16463660448789597, "rewards/accuracy_reward": 0.7574101090431213, "rewards/format_reward": 0.9897959232330322, "step": 5004 }, { "completion_length": 224.9591827392578, "epoch": 0.5036477987421384, "grad_norm": 1.115044355392456, "kl": 0.0966796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7028816938400269, "reward_std": 0.19385602325201035, "rewards/accuracy_reward": 0.7334939241409302, "rewards/format_reward": 0.9693877398967743, "step": 5005 }, { "completion_length": 254.9897918701172, "epoch": 0.503748427672956, "grad_norm": 0.8585566282272339, "kl": 0.09716796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.810374140739441, "reward_std": 0.23290744423866272, "rewards/accuracy_reward": 0.87159863114357, "rewards/format_reward": 0.9387754797935486, "step": 5006 }, { "completion_length": 257.4081573486328, "epoch": 0.5038490566037735, "grad_norm": 0.6135386228561401, "kl": 0.05029296875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6835408210754395, "reward_std": 0.13963313773274422, "rewards/accuracy_reward": 0.683540940284729, "rewards/format_reward": 1.0, "step": 5007 }, { "completion_length": 250.11224746704102, "epoch": 0.5039496855345912, "grad_norm": 0.7459693551063538, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7800453305244446, "reward_std": 0.2084490805864334, "rewards/accuracy_reward": 0.8004534840583801, "rewards/format_reward": 0.9795918166637421, "step": 5008 }, { "completion_length": 148.18367385864258, "epoch": 0.5040503144654088, "grad_norm": 0.6555361151695251, "kl": 0.0733642578125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.901209831237793, "reward_std": 0.06791037507355213, "rewards/accuracy_reward": 0.9012097716331482, "rewards/format_reward": 1.0, "step": 5009 }, { "completion_length": 198.2040786743164, "epoch": 0.5041509433962265, "grad_norm": 0.5818530321121216, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8056122064590454, "reward_std": 0.12709227204322815, "rewards/accuracy_reward": 0.8056122362613678, "rewards/format_reward": 1.0, "step": 5010 }, { "completion_length": 288.6326446533203, "epoch": 0.504251572327044, "grad_norm": 0.6481471657752991, "kl": 0.110595703125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.690902292728424, "reward_std": 0.150467898696661, "rewards/accuracy_reward": 0.711310476064682, "rewards/format_reward": 0.9795918464660645, "step": 5011 }, { "completion_length": 223.03060913085938, "epoch": 0.5043522012578616, "grad_norm": 1.062549114227295, "kl": 0.103515625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.84312504529953, "reward_std": 0.1647598296403885, "rewards/accuracy_reward": 0.8431250751018524, "rewards/format_reward": 1.0, "step": 5012 }, { "completion_length": 203.57142639160156, "epoch": 0.5044528301886793, "grad_norm": 1021.0195922851562, "kl": 0.60107421875, "learning_rate": 1e-06, "loss": 0.024, "reward": 1.772108793258667, "reward_std": 0.17489881813526154, "rewards/accuracy_reward": 0.7721088230609894, "rewards/format_reward": 1.0, "step": 5013 }, { "completion_length": 208.28571319580078, "epoch": 0.5045534591194969, "grad_norm": 0.6015095710754395, "kl": 0.0614013671875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6420372128486633, "reward_std": 0.13667885586619377, "rewards/accuracy_reward": 0.6522412300109863, "rewards/format_reward": 0.9897959232330322, "step": 5014 }, { "completion_length": 178.4897918701172, "epoch": 0.5046540880503144, "grad_norm": 0.5637004375457764, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8111289739608765, "reward_std": 0.07179713249206543, "rewards/accuracy_reward": 0.8111290037631989, "rewards/format_reward": 1.0, "step": 5015 }, { "completion_length": 184.9081573486328, "epoch": 0.5047547169811321, "grad_norm": 0.22261366248130798, "kl": 0.052001953125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.929879605770111, "reward_std": 0.06621775403618813, "rewards/accuracy_reward": 0.9400837123394012, "rewards/format_reward": 0.9897959232330322, "step": 5016 }, { "completion_length": 171.05101776123047, "epoch": 0.5048553459119497, "grad_norm": 0.7606609463691711, "kl": 0.09375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.833951711654663, "reward_std": 0.11995800584554672, "rewards/accuracy_reward": 0.844155877828598, "rewards/format_reward": 0.9897959232330322, "step": 5017 }, { "completion_length": 201.71428680419922, "epoch": 0.5049559748427673, "grad_norm": 1.2648743391036987, "kl": 0.110595703125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.751266598701477, "reward_std": 0.14949678629636765, "rewards/accuracy_reward": 0.7512665390968323, "rewards/format_reward": 1.0, "step": 5018 }, { "completion_length": 241.2448959350586, "epoch": 0.5050566037735849, "grad_norm": 0.6654538512229919, "kl": 0.119873046875, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.6731685996055603, "reward_std": 0.09654968231916428, "rewards/accuracy_reward": 0.6935767829418182, "rewards/format_reward": 0.9795918464660645, "step": 5019 }, { "completion_length": 222.5306167602539, "epoch": 0.5051572327044025, "grad_norm": 0.9254222512245178, "kl": 0.0643310546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5988337993621826, "reward_std": 0.1342386081814766, "rewards/accuracy_reward": 0.598833829164505, "rewards/format_reward": 1.0, "step": 5020 }, { "completion_length": 236.90816497802734, "epoch": 0.5052578616352201, "grad_norm": 0.7008564472198486, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7235970497131348, "reward_std": 0.20649707317352295, "rewards/accuracy_reward": 0.7235971093177795, "rewards/format_reward": 1.0, "step": 5021 }, { "completion_length": 217.9285659790039, "epoch": 0.5053584905660378, "grad_norm": 1.6048717498779297, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6843291521072388, "reward_std": 0.21001612395048141, "rewards/accuracy_reward": 0.7047373652458191, "rewards/format_reward": 0.9795918166637421, "step": 5022 }, { "completion_length": 228.6836700439453, "epoch": 0.5054591194968553, "grad_norm": 2.4266178607940674, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7167935371398926, "reward_std": 0.2260640412569046, "rewards/accuracy_reward": 0.7372016906738281, "rewards/format_reward": 0.9795918166637421, "step": 5023 }, { "completion_length": 256.52040100097656, "epoch": 0.5055597484276729, "grad_norm": 1.009899616241455, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.716604769229889, "reward_std": 0.1830706223845482, "rewards/accuracy_reward": 0.7268088459968567, "rewards/format_reward": 0.9897959232330322, "step": 5024 }, { "completion_length": 242.2244873046875, "epoch": 0.5056603773584906, "grad_norm": 0.5330179929733276, "kl": 0.0960693359375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.727255642414093, "reward_std": 0.0771559365093708, "rewards/accuracy_reward": 0.7272556722164154, "rewards/format_reward": 1.0, "step": 5025 }, { "completion_length": 180.94898223876953, "epoch": 0.5057610062893082, "grad_norm": 1.014628291130066, "kl": 0.1015625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8877551555633545, "reward_std": 0.044179607182741165, "rewards/accuracy_reward": 0.8877551555633545, "rewards/format_reward": 1.0, "step": 5026 }, { "completion_length": 246.45917510986328, "epoch": 0.5058616352201258, "grad_norm": 1.2111848592758179, "kl": 0.0699462890625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7762362360954285, "reward_std": 0.14777270704507828, "rewards/accuracy_reward": 0.7966443300247192, "rewards/format_reward": 0.9795918166637421, "step": 5027 }, { "completion_length": 230.84693908691406, "epoch": 0.5059622641509434, "grad_norm": 0.6642394065856934, "kl": 0.104736328125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7893320322036743, "reward_std": 0.0975803155452013, "rewards/accuracy_reward": 0.7893321216106415, "rewards/format_reward": 1.0, "step": 5028 }, { "completion_length": 161.9387664794922, "epoch": 0.506062893081761, "grad_norm": 0.6704601049423218, "kl": 0.102783203125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.878911554813385, "reward_std": 0.14155936241149902, "rewards/accuracy_reward": 0.9197279214859009, "rewards/format_reward": 0.9591836333274841, "step": 5029 }, { "completion_length": 216.948974609375, "epoch": 0.5061635220125786, "grad_norm": 0.5973432660102844, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8642547130584717, "reward_std": 0.11331995204091072, "rewards/accuracy_reward": 0.8642547428607941, "rewards/format_reward": 1.0, "step": 5030 }, { "completion_length": 220.28570556640625, "epoch": 0.5062641509433963, "grad_norm": 0.9865796566009521, "kl": 0.098388671875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6985995173454285, "reward_std": 0.17027116939425468, "rewards/accuracy_reward": 0.7088036239147186, "rewards/format_reward": 0.9897959232330322, "step": 5031 }, { "completion_length": 211.07142639160156, "epoch": 0.5063647798742138, "grad_norm": 0.8195310831069946, "kl": 0.0789794921875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7755101323127747, "reward_std": 0.19153589382767677, "rewards/accuracy_reward": 0.7857142686843872, "rewards/format_reward": 0.9897959232330322, "step": 5032 }, { "completion_length": 208.09183502197266, "epoch": 0.5064654088050314, "grad_norm": 0.7461174726486206, "kl": 0.0889892578125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8558441400527954, "reward_std": 0.10652963072061539, "rewards/accuracy_reward": 0.8558441400527954, "rewards/format_reward": 1.0, "step": 5033 }, { "completion_length": 242.9693832397461, "epoch": 0.5065660377358491, "grad_norm": 0.5401979088783264, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.768528938293457, "reward_std": 0.12578733637928963, "rewards/accuracy_reward": 0.7889372110366821, "rewards/format_reward": 0.9795918166637421, "step": 5034 }, { "completion_length": 208.80611419677734, "epoch": 0.5066666666666667, "grad_norm": 0.827152669429779, "kl": 0.10302734375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7396198511123657, "reward_std": 0.1753057986497879, "rewards/accuracy_reward": 0.7600280344486237, "rewards/format_reward": 0.9795918464660645, "step": 5035 }, { "completion_length": 326.39794921875, "epoch": 0.5067672955974842, "grad_norm": 0.6279213428497314, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.5302148461341858, "reward_std": 0.22728731855750084, "rewards/accuracy_reward": 0.5608271062374115, "rewards/format_reward": 0.9693877398967743, "step": 5036 }, { "completion_length": 211.55101776123047, "epoch": 0.5068679245283019, "grad_norm": 0.5504342317581177, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8133437633514404, "reward_std": 0.10195350646972656, "rewards/accuracy_reward": 0.8235478699207306, "rewards/format_reward": 0.9897959232330322, "step": 5037 }, { "completion_length": 185.58162689208984, "epoch": 0.5069685534591195, "grad_norm": 5.692886829376221, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8002135157585144, "reward_std": 0.10195811372250319, "rewards/accuracy_reward": 0.8104175925254822, "rewards/format_reward": 0.9897959232330322, "step": 5038 }, { "completion_length": 180.54080963134766, "epoch": 0.5070691823899371, "grad_norm": 0.9056167602539062, "kl": 0.065185546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8301992416381836, "reward_std": 0.17497656494379044, "rewards/accuracy_reward": 0.8404033184051514, "rewards/format_reward": 0.9897959232330322, "step": 5039 }, { "completion_length": 214.84693145751953, "epoch": 0.5071698113207547, "grad_norm": 0.8206852674484253, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7413926720619202, "reward_std": 0.11131516844034195, "rewards/accuracy_reward": 0.7413927912712097, "rewards/format_reward": 1.0, "step": 5040 }, { "completion_length": 214.1836700439453, "epoch": 0.5072704402515723, "grad_norm": 1.043078899383545, "kl": 0.109375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7192195653915405, "reward_std": 0.2955106347799301, "rewards/accuracy_reward": 0.7294236123561859, "rewards/format_reward": 0.9897959232330322, "step": 5041 }, { "completion_length": 220.3775405883789, "epoch": 0.5073710691823899, "grad_norm": 0.6119822859764099, "kl": 0.0802001953125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8190627694129944, "reward_std": 0.14552105963230133, "rewards/accuracy_reward": 0.8496749997138977, "rewards/format_reward": 0.9693877398967743, "step": 5042 }, { "completion_length": 215.51019287109375, "epoch": 0.5074716981132076, "grad_norm": 0.8376485109329224, "kl": 0.09423828125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8614256381988525, "reward_std": 0.16154934838414192, "rewards/accuracy_reward": 0.8818339407444, "rewards/format_reward": 0.9795918166637421, "step": 5043 }, { "completion_length": 207.1326446533203, "epoch": 0.5075723270440251, "grad_norm": 0.5734276175498962, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6713637709617615, "reward_std": 0.12831005826592445, "rewards/accuracy_reward": 0.691771924495697, "rewards/format_reward": 0.9795918166637421, "step": 5044 }, { "completion_length": 227.80611419677734, "epoch": 0.5076729559748427, "grad_norm": 0.6977515816688538, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.786784052848816, "reward_std": 0.11807456985116005, "rewards/accuracy_reward": 0.7969882190227509, "rewards/format_reward": 0.9897959232330322, "step": 5045 }, { "completion_length": 231.38775634765625, "epoch": 0.5077735849056604, "grad_norm": 0.6958316564559937, "kl": 0.109619140625, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.780210256576538, "reward_std": 0.12080342322587967, "rewards/accuracy_reward": 0.7802102267742157, "rewards/format_reward": 1.0, "step": 5046 }, { "completion_length": 173.9591827392578, "epoch": 0.507874213836478, "grad_norm": 2.380316972732544, "kl": 0.0986328125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.652466058731079, "reward_std": 0.10084103420376778, "rewards/accuracy_reward": 0.6524661779403687, "rewards/format_reward": 1.0, "step": 5047 }, { "completion_length": 193.16326141357422, "epoch": 0.5079748427672955, "grad_norm": 0.6844357848167419, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8715986609458923, "reward_std": 0.11843333765864372, "rewards/accuracy_reward": 0.8818026781082153, "rewards/format_reward": 0.9897959232330322, "step": 5048 }, { "completion_length": 228.9183578491211, "epoch": 0.5080754716981132, "grad_norm": 2.2579424381256104, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.759848415851593, "reward_std": 0.13641650415956974, "rewards/accuracy_reward": 0.7598485052585602, "rewards/format_reward": 1.0, "step": 5049 }, { "completion_length": 226.4591827392578, "epoch": 0.5081761006289308, "grad_norm": 1.1301686763763428, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6781075596809387, "reward_std": 0.13280561938881874, "rewards/accuracy_reward": 0.6985157430171967, "rewards/format_reward": 0.9795918464660645, "step": 5050 }, { "completion_length": 245.33673095703125, "epoch": 0.5082767295597485, "grad_norm": 0.6853586435317993, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.745869755744934, "reward_std": 0.21039291098713875, "rewards/accuracy_reward": 0.7560738623142242, "rewards/format_reward": 0.9897959232330322, "step": 5051 }, { "completion_length": 215.51020050048828, "epoch": 0.5083773584905661, "grad_norm": 0.6387204527854919, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8444650769233704, "reward_std": 0.18758435547351837, "rewards/accuracy_reward": 0.8648731410503387, "rewards/format_reward": 0.9795918166637421, "step": 5052 }, { "completion_length": 230.53060913085938, "epoch": 0.5084779874213836, "grad_norm": 0.5140377283096313, "kl": 0.09130859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7570456266403198, "reward_std": 0.1244032084941864, "rewards/accuracy_reward": 0.7978619933128357, "rewards/format_reward": 0.9591836631298065, "step": 5053 }, { "completion_length": 290.9795913696289, "epoch": 0.5085786163522013, "grad_norm": 0.45237839221954346, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8775509595870972, "reward_std": 0.07303375005722046, "rewards/accuracy_reward": 0.8775509893894196, "rewards/format_reward": 1.0, "step": 5054 }, { "completion_length": 172.59183502197266, "epoch": 0.5086792452830189, "grad_norm": 0.9251919984817505, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7940868139266968, "reward_std": 0.13461196050047874, "rewards/accuracy_reward": 0.8042909204959869, "rewards/format_reward": 0.9897959232330322, "step": 5055 }, { "completion_length": 184.9285659790039, "epoch": 0.5087798742138365, "grad_norm": 0.6296919584274292, "kl": 0.0848388671875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8516749739646912, "reward_std": 0.11098637897521257, "rewards/accuracy_reward": 0.8618790805339813, "rewards/format_reward": 0.9897959232330322, "step": 5056 }, { "completion_length": 216.94896697998047, "epoch": 0.508880503144654, "grad_norm": 0.3913858234882355, "kl": 0.0694580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8748299479484558, "reward_std": 0.12304526939988136, "rewards/accuracy_reward": 0.8850339949131012, "rewards/format_reward": 0.9897959232330322, "step": 5057 }, { "completion_length": 242.55101013183594, "epoch": 0.5089811320754717, "grad_norm": 0.4232484698295593, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8665003776550293, "reward_std": 0.08246702700853348, "rewards/accuracy_reward": 0.8665003776550293, "rewards/format_reward": 1.0, "step": 5058 }, { "completion_length": 246.448974609375, "epoch": 0.5090817610062893, "grad_norm": 0.7662995457649231, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6826943755149841, "reward_std": 0.1275762841105461, "rewards/accuracy_reward": 0.7031024694442749, "rewards/format_reward": 0.9795918464660645, "step": 5059 }, { "completion_length": 263.4387664794922, "epoch": 0.509182389937107, "grad_norm": 0.6488785743713379, "kl": 0.0787353515625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7329931855201721, "reward_std": 0.15919150412082672, "rewards/accuracy_reward": 0.7431972920894623, "rewards/format_reward": 0.9897959232330322, "step": 5060 }, { "completion_length": 199.36734008789062, "epoch": 0.5092830188679245, "grad_norm": 0.6137634515762329, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.9285714030265808, "reward_std": 0.06970714777708054, "rewards/accuracy_reward": 0.9285714328289032, "rewards/format_reward": 1.0, "step": 5061 }, { "completion_length": 240.87754821777344, "epoch": 0.5093836477987421, "grad_norm": 0.8004067540168762, "kl": 0.1064453125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.6967735290527344, "reward_std": 0.1655954122543335, "rewards/accuracy_reward": 0.7069776654243469, "rewards/format_reward": 0.9897959232330322, "step": 5062 }, { "completion_length": 227.91836547851562, "epoch": 0.5094842767295598, "grad_norm": 1.0725808143615723, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7932084798812866, "reward_std": 0.19638492912054062, "rewards/accuracy_reward": 0.8136166334152222, "rewards/format_reward": 0.9795918166637421, "step": 5063 }, { "completion_length": 226.9897918701172, "epoch": 0.5095849056603774, "grad_norm": 0.6776474118232727, "kl": 0.0831298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7135347127914429, "reward_std": 0.1638834998011589, "rewards/accuracy_reward": 0.723738819360733, "rewards/format_reward": 0.9897959232330322, "step": 5064 }, { "completion_length": 224.64285278320312, "epoch": 0.5096855345911949, "grad_norm": 5.8818793296813965, "kl": 0.0538330078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.643620491027832, "reward_std": 0.22772132605314255, "rewards/accuracy_reward": 0.6436206102371216, "rewards/format_reward": 1.0, "step": 5065 }, { "completion_length": 216.5, "epoch": 0.5097861635220126, "grad_norm": 1.3376542329788208, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.9271137118339539, "reward_std": 0.1354149580001831, "rewards/accuracy_reward": 0.9271136820316315, "rewards/format_reward": 1.0, "step": 5066 }, { "completion_length": 251.71428680419922, "epoch": 0.5098867924528302, "grad_norm": 0.5334334373474121, "kl": 0.0762939453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7047595381736755, "reward_std": 0.13442093413323164, "rewards/accuracy_reward": 0.7251677215099335, "rewards/format_reward": 0.9795918166637421, "step": 5067 }, { "completion_length": 223.9285659790039, "epoch": 0.5099874213836478, "grad_norm": 0.5063884258270264, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7037724256515503, "reward_std": 0.08480571769177914, "rewards/accuracy_reward": 0.7037723660469055, "rewards/format_reward": 1.0, "step": 5068 }, { "completion_length": 174.61224365234375, "epoch": 0.5100880503144654, "grad_norm": 1.3494889736175537, "kl": 0.0733642578125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.9387754201889038, "reward_std": 0.10003120824694633, "rewards/accuracy_reward": 0.9387754797935486, "rewards/format_reward": 1.0, "step": 5069 }, { "completion_length": 250.7040786743164, "epoch": 0.510188679245283, "grad_norm": 0.13794143497943878, "kl": 0.057373046875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7904762029647827, "reward_std": 0.007142857648432255, "rewards/accuracy_reward": 0.7904762029647827, "rewards/format_reward": 1.0, "step": 5070 }, { "completion_length": 265.17346954345703, "epoch": 0.5102893081761006, "grad_norm": 0.8908107280731201, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6012592911720276, "reward_std": 0.08223158121109009, "rewards/accuracy_reward": 0.6012592911720276, "rewards/format_reward": 1.0, "step": 5071 }, { "completion_length": 246.95917510986328, "epoch": 0.5103899371069183, "grad_norm": 0.9491437673568726, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.694521188735962, "reward_std": 0.1968473121523857, "rewards/accuracy_reward": 0.6945212185382843, "rewards/format_reward": 1.0, "step": 5072 }, { "completion_length": 256.26529693603516, "epoch": 0.5104905660377359, "grad_norm": 0.8487616181373596, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.698000431060791, "reward_std": 0.1844140738248825, "rewards/accuracy_reward": 0.7184085249900818, "rewards/format_reward": 0.9795918166637421, "step": 5073 }, { "completion_length": 316.6122360229492, "epoch": 0.5105911949685534, "grad_norm": 0.5419268608093262, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.591029703617096, "reward_std": 0.08946127817034721, "rewards/accuracy_reward": 0.6012338995933533, "rewards/format_reward": 0.9897959232330322, "step": 5074 }, { "completion_length": 253.47958374023438, "epoch": 0.5106918238993711, "grad_norm": 0.7730454206466675, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.850310742855072, "reward_std": 0.19649670645594597, "rewards/accuracy_reward": 0.8605147898197174, "rewards/format_reward": 0.9897959232330322, "step": 5075 }, { "completion_length": 245.1836700439453, "epoch": 0.5107924528301887, "grad_norm": 3.9423060417175293, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7193391919136047, "reward_std": 0.12039859965443611, "rewards/accuracy_reward": 0.7193391025066376, "rewards/format_reward": 1.0, "step": 5076 }, { "completion_length": 237.6734619140625, "epoch": 0.5108930817610063, "grad_norm": 0.5823720693588257, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7448979020118713, "reward_std": 0.20236971974372864, "rewards/accuracy_reward": 0.7653061151504517, "rewards/format_reward": 0.9795918166637421, "step": 5077 }, { "completion_length": 234.22447967529297, "epoch": 0.5109937106918239, "grad_norm": 1.0420812368392944, "kl": 0.0733642578125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6999027729034424, "reward_std": 0.18579018861055374, "rewards/accuracy_reward": 0.7203109860420227, "rewards/format_reward": 0.9795918464660645, "step": 5078 }, { "completion_length": 228.81632232666016, "epoch": 0.5110943396226415, "grad_norm": 0.7317007184028625, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7966201305389404, "reward_std": 0.1080809198319912, "rewards/accuracy_reward": 0.7966200411319733, "rewards/format_reward": 1.0, "step": 5079 }, { "completion_length": 270.8061218261719, "epoch": 0.5111949685534591, "grad_norm": 1.26204514503479, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7457817792892456, "reward_std": 0.19806838035583496, "rewards/accuracy_reward": 0.7559858560562134, "rewards/format_reward": 0.9897959232330322, "step": 5080 }, { "completion_length": 242.29590606689453, "epoch": 0.5112955974842768, "grad_norm": 0.7546953558921814, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6630135774612427, "reward_std": 0.16032477468252182, "rewards/accuracy_reward": 0.6732176840305328, "rewards/format_reward": 0.9897959232330322, "step": 5081 }, { "completion_length": 252.8877410888672, "epoch": 0.5113962264150943, "grad_norm": 0.912480890750885, "kl": 0.0765380859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8571428656578064, "reward_std": 0.1270286738872528, "rewards/accuracy_reward": 0.8673469424247742, "rewards/format_reward": 0.9897959232330322, "step": 5082 }, { "completion_length": 216.9693832397461, "epoch": 0.5114968553459119, "grad_norm": 0.5793787837028503, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7694082856178284, "reward_std": 0.12980405986309052, "rewards/accuracy_reward": 0.7796124815940857, "rewards/format_reward": 0.9897959232330322, "step": 5083 }, { "completion_length": 208.59183502197266, "epoch": 0.5115974842767296, "grad_norm": 0.6288704872131348, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7674440145492554, "reward_std": 0.13532808423042297, "rewards/accuracy_reward": 0.7776481509208679, "rewards/format_reward": 0.9897959232330322, "step": 5084 }, { "completion_length": 283.4183654785156, "epoch": 0.5116981132075472, "grad_norm": 0.6738763451576233, "kl": 0.132568359375, "learning_rate": 1e-06, "loss": 0.0053, "reward": 1.6920841336250305, "reward_std": 0.14303984865546227, "rewards/accuracy_reward": 0.7226963639259338, "rewards/format_reward": 0.9693877398967743, "step": 5085 }, { "completion_length": 268.97957611083984, "epoch": 0.5117987421383647, "grad_norm": 2.750230550765991, "kl": 0.096923828125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8335961699485779, "reward_std": 0.17271364480257034, "rewards/accuracy_reward": 0.8438002169132233, "rewards/format_reward": 0.9897959232330322, "step": 5086 }, { "completion_length": 220.07142639160156, "epoch": 0.5118993710691824, "grad_norm": 0.6250574588775635, "kl": 0.044921875, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.6979591846466064, "reward_std": 0.08625977858901024, "rewards/accuracy_reward": 0.7081632018089294, "rewards/format_reward": 0.9897959232330322, "step": 5087 }, { "completion_length": 313.77549743652344, "epoch": 0.512, "grad_norm": 0.628746747970581, "kl": 0.0792236328125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6804543137550354, "reward_std": 0.09069999866187572, "rewards/accuracy_reward": 0.7008624374866486, "rewards/format_reward": 0.9795918464660645, "step": 5088 }, { "completion_length": 290.5816345214844, "epoch": 0.5121006289308176, "grad_norm": 0.3958872854709625, "kl": 0.0721435546875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7071428298950195, "reward_std": 0.09139302372932434, "rewards/accuracy_reward": 0.7173469066619873, "rewards/format_reward": 0.9897959232330322, "step": 5089 }, { "completion_length": 246.02040100097656, "epoch": 0.5122012578616352, "grad_norm": 0.3234238922595978, "kl": 0.0513916015625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8259353041648865, "reward_std": 0.05068443715572357, "rewards/accuracy_reward": 0.8361394107341766, "rewards/format_reward": 0.9897959232330322, "step": 5090 }, { "completion_length": 220.83673095703125, "epoch": 0.5123018867924528, "grad_norm": 0.7355228662490845, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5977891087532043, "reward_std": 0.17622831836342812, "rewards/accuracy_reward": 0.6181972920894623, "rewards/format_reward": 0.9795918166637421, "step": 5091 }, { "completion_length": 228.97958374023438, "epoch": 0.5124025157232704, "grad_norm": 0.5723645091056824, "kl": 0.0726318359375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8050259947776794, "reward_std": 0.10342598240822554, "rewards/accuracy_reward": 0.815230131149292, "rewards/format_reward": 0.9897959232330322, "step": 5092 }, { "completion_length": 234.91836547851562, "epoch": 0.5125031446540881, "grad_norm": 0.579516589641571, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7160714268684387, "reward_std": 0.1477733589708805, "rewards/accuracy_reward": 0.7160713970661163, "rewards/format_reward": 1.0, "step": 5093 }, { "completion_length": 279.7857131958008, "epoch": 0.5126037735849056, "grad_norm": 0.5624598264694214, "kl": 0.0765380859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.66915625333786, "reward_std": 0.1525965854525566, "rewards/accuracy_reward": 0.6895644962787628, "rewards/format_reward": 0.9795918166637421, "step": 5094 }, { "completion_length": 282.54080963134766, "epoch": 0.5127044025157232, "grad_norm": 0.7739222645759583, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8256580233573914, "reward_std": 0.14052731543779373, "rewards/accuracy_reward": 0.8358619511127472, "rewards/format_reward": 0.9897959232330322, "step": 5095 }, { "completion_length": 256.4183578491211, "epoch": 0.5128050314465409, "grad_norm": 0.6928626894950867, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8033047914505005, "reward_std": 0.08904707804322243, "rewards/accuracy_reward": 0.8033048510551453, "rewards/format_reward": 1.0, "step": 5096 }, { "completion_length": 196.80611419677734, "epoch": 0.5129056603773585, "grad_norm": 0.6221914887428284, "kl": 0.09765625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7696352005004883, "reward_std": 0.09150808304548264, "rewards/accuracy_reward": 0.7696351110935211, "rewards/format_reward": 1.0, "step": 5097 }, { "completion_length": 228.9285659790039, "epoch": 0.5130062893081762, "grad_norm": 1.0168278217315674, "kl": 0.1025390625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.843081295490265, "reward_std": 0.11264291778206825, "rewards/accuracy_reward": 0.8430813252925873, "rewards/format_reward": 1.0, "step": 5098 }, { "completion_length": 286.9183654785156, "epoch": 0.5131069182389937, "grad_norm": 0.5065007209777832, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7973667979240417, "reward_std": 0.15711618959903717, "rewards/accuracy_reward": 0.8177751004695892, "rewards/format_reward": 0.9795918166637421, "step": 5099 }, { "completion_length": 257.5612106323242, "epoch": 0.5132075471698113, "grad_norm": 1.670879602432251, "kl": 0.140380859375, "learning_rate": 1e-06, "loss": 0.0056, "reward": 1.7962672710418701, "reward_std": 0.12639480456709862, "rewards/accuracy_reward": 0.8064713478088379, "rewards/format_reward": 0.9897959232330322, "step": 5100 }, { "completion_length": 243.26529693603516, "epoch": 0.513308176100629, "grad_norm": 0.5002002120018005, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7664399147033691, "reward_std": 0.12419357523322105, "rewards/accuracy_reward": 0.7766439318656921, "rewards/format_reward": 0.9897959232330322, "step": 5101 }, { "completion_length": 277.948974609375, "epoch": 0.5134088050314466, "grad_norm": 0.5217273235321045, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7493320107460022, "reward_std": 0.13727395609021187, "rewards/accuracy_reward": 0.7493320107460022, "rewards/format_reward": 1.0, "step": 5102 }, { "completion_length": 221.4897918701172, "epoch": 0.5135094339622641, "grad_norm": 0.5380019545555115, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8477736115455627, "reward_std": 0.1629122942686081, "rewards/accuracy_reward": 0.8783859014511108, "rewards/format_reward": 0.9693877398967743, "step": 5103 }, { "completion_length": 176.91836547851562, "epoch": 0.5136100628930818, "grad_norm": 1.5881884098052979, "kl": 0.116455078125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.8627362251281738, "reward_std": 0.10055889934301376, "rewards/accuracy_reward": 0.8627361357212067, "rewards/format_reward": 1.0, "step": 5104 }, { "completion_length": 299.4183654785156, "epoch": 0.5137106918238994, "grad_norm": 0.6229979991912842, "kl": 0.0494384765625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6457725167274475, "reward_std": 0.16461026668548584, "rewards/accuracy_reward": 0.6661807298660278, "rewards/format_reward": 0.9795918166637421, "step": 5105 }, { "completion_length": 358.5, "epoch": 0.513811320754717, "grad_norm": 0.7977020740509033, "kl": 0.0916748046875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6743969917297363, "reward_std": 0.17691578716039658, "rewards/accuracy_reward": 0.6846011281013489, "rewards/format_reward": 0.9897959232330322, "step": 5106 }, { "completion_length": 332.9285583496094, "epoch": 0.5139119496855346, "grad_norm": 0.4041019380092621, "kl": 0.0587158203125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7752186059951782, "reward_std": 0.08871376886963844, "rewards/accuracy_reward": 0.7752186357975006, "rewards/format_reward": 1.0, "step": 5107 }, { "completion_length": 339.1326446533203, "epoch": 0.5140125786163522, "grad_norm": 1.1273818016052246, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.4486340284347534, "reward_std": 0.2797224894165993, "rewards/accuracy_reward": 0.4894503355026245, "rewards/format_reward": 0.9591836333274841, "step": 5108 }, { "completion_length": 280.60203552246094, "epoch": 0.5141132075471698, "grad_norm": 0.6095657348632812, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.619749903678894, "reward_std": 0.16094982624053955, "rewards/accuracy_reward": 0.6401581466197968, "rewards/format_reward": 0.9795918166637421, "step": 5109 }, { "completion_length": 255.10203552246094, "epoch": 0.5142138364779875, "grad_norm": 0.3468155562877655, "kl": 0.0633544921875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7953673005104065, "reward_std": 0.09384174458682537, "rewards/accuracy_reward": 0.8055713474750519, "rewards/format_reward": 0.9897959232330322, "step": 5110 }, { "completion_length": 211.99999237060547, "epoch": 0.514314465408805, "grad_norm": 0.6459230780601501, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7897748947143555, "reward_std": 0.20429765433073044, "rewards/accuracy_reward": 0.7999789416790009, "rewards/format_reward": 0.9897959232330322, "step": 5111 }, { "completion_length": 304.51019287109375, "epoch": 0.5144150943396226, "grad_norm": 0.7417438626289368, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.80454683303833, "reward_std": 0.1531563624739647, "rewards/accuracy_reward": 0.8045468330383301, "rewards/format_reward": 1.0, "step": 5112 }, { "completion_length": 223.7448959350586, "epoch": 0.5145157232704403, "grad_norm": 1.0421602725982666, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8145124316215515, "reward_std": 0.07657656446099281, "rewards/accuracy_reward": 0.8145124316215515, "rewards/format_reward": 1.0, "step": 5113 }, { "completion_length": 291.9693908691406, "epoch": 0.5146163522012579, "grad_norm": 0.528472363948822, "kl": 0.0443115234375, "learning_rate": 1e-06, "loss": 0.0018, "reward": 1.7256721258163452, "reward_std": 0.15113872289657593, "rewards/accuracy_reward": 0.7358762323856354, "rewards/format_reward": 0.9897959232330322, "step": 5114 }, { "completion_length": 299.79591369628906, "epoch": 0.5147169811320754, "grad_norm": 0.5443190932273865, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5765151381492615, "reward_std": 0.19127875566482544, "rewards/accuracy_reward": 0.5867192298173904, "rewards/format_reward": 0.9897959232330322, "step": 5115 }, { "completion_length": 189.2040786743164, "epoch": 0.5148176100628931, "grad_norm": 1.5671775341033936, "kl": 0.10888671875, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7531943321228027, "reward_std": 0.21131941676139832, "rewards/accuracy_reward": 0.7838065922260284, "rewards/format_reward": 0.9693877398967743, "step": 5116 }, { "completion_length": 223.7040786743164, "epoch": 0.5149182389937107, "grad_norm": 0.7541121244430542, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8027211427688599, "reward_std": 0.08886084519326687, "rewards/accuracy_reward": 0.8027210533618927, "rewards/format_reward": 1.0, "step": 5117 }, { "completion_length": 250.5, "epoch": 0.5150188679245283, "grad_norm": 0.27149999141693115, "kl": 0.0478515625, "learning_rate": 1e-06, "loss": 0.0019, "reward": 1.8380951881408691, "reward_std": 0.06037934869527817, "rewards/accuracy_reward": 0.8482993245124817, "rewards/format_reward": 0.9897959232330322, "step": 5118 }, { "completion_length": 227.21428680419922, "epoch": 0.5151194968553459, "grad_norm": 0.39703965187072754, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.700455904006958, "reward_std": 0.14621608145534992, "rewards/accuracy_reward": 0.7208641171455383, "rewards/format_reward": 0.9795918166637421, "step": 5119 }, { "completion_length": 289.1326446533203, "epoch": 0.5152201257861635, "grad_norm": 0.6986662745475769, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7259658575057983, "reward_std": 0.17355316132307053, "rewards/accuracy_reward": 0.756578117609024, "rewards/format_reward": 0.9693877398967743, "step": 5120 }, { "completion_length": 188.23468780517578, "epoch": 0.5153207547169811, "grad_norm": 0.8542618751525879, "kl": 0.1024169921875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8723959922790527, "reward_std": 0.11846276745200157, "rewards/accuracy_reward": 0.8928041756153107, "rewards/format_reward": 0.9795918464660645, "step": 5121 }, { "completion_length": 278.6428527832031, "epoch": 0.5154213836477988, "grad_norm": 0.5749688744544983, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6748594641685486, "reward_std": 0.20873238146305084, "rewards/accuracy_reward": 0.7054716944694519, "rewards/format_reward": 0.9693877398967743, "step": 5122 }, { "completion_length": 250.21428680419922, "epoch": 0.5155220125786164, "grad_norm": 1.2570525407791138, "kl": 0.110595703125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.5966315865516663, "reward_std": 0.2858899012207985, "rewards/accuracy_reward": 0.6068356782197952, "rewards/format_reward": 0.9897959232330322, "step": 5123 }, { "completion_length": 332.60203552246094, "epoch": 0.5156226415094339, "grad_norm": 0.7650814652442932, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8441471457481384, "reward_std": 0.1985618770122528, "rewards/accuracy_reward": 0.8645553290843964, "rewards/format_reward": 0.9795918166637421, "step": 5124 }, { "completion_length": 209.9795913696289, "epoch": 0.5157232704402516, "grad_norm": 0.7807583808898926, "kl": 0.0546875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.80436509847641, "reward_std": 0.2023686319589615, "rewards/accuracy_reward": 0.8349773287773132, "rewards/format_reward": 0.9693877398967743, "step": 5125 }, { "completion_length": 292.1224365234375, "epoch": 0.5158238993710692, "grad_norm": 0.4197084605693817, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6910318732261658, "reward_std": 0.06256309151649475, "rewards/accuracy_reward": 0.7012360990047455, "rewards/format_reward": 0.9897959232330322, "step": 5126 }, { "completion_length": 270.67346954345703, "epoch": 0.5159245283018868, "grad_norm": 0.813694417476654, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8097917437553406, "reward_std": 0.09280332177877426, "rewards/accuracy_reward": 0.8097917437553406, "rewards/format_reward": 1.0, "step": 5127 }, { "completion_length": 268.62245178222656, "epoch": 0.5160251572327044, "grad_norm": 0.875290036201477, "kl": 0.0538330078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7725947499275208, "reward_std": 0.14601022750139236, "rewards/accuracy_reward": 0.772594690322876, "rewards/format_reward": 1.0, "step": 5128 }, { "completion_length": 183.9693832397461, "epoch": 0.516125786163522, "grad_norm": 0.879813551902771, "kl": 0.127197265625, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.728638470172882, "reward_std": 0.19471470266580582, "rewards/accuracy_reward": 0.7694547474384308, "rewards/format_reward": 0.9591836333274841, "step": 5129 }, { "completion_length": 260.49999237060547, "epoch": 0.5162264150943396, "grad_norm": 1.003627061843872, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8469387292861938, "reward_std": 0.10788732394576073, "rewards/accuracy_reward": 0.8469387590885162, "rewards/format_reward": 1.0, "step": 5130 }, { "completion_length": 293.1836700439453, "epoch": 0.5163270440251573, "grad_norm": 0.6242102384567261, "kl": 0.0594482421875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6770496368408203, "reward_std": 0.21758968383073807, "rewards/accuracy_reward": 0.6974577903747559, "rewards/format_reward": 0.9795918464660645, "step": 5131 }, { "completion_length": 372.60203552246094, "epoch": 0.5164276729559748, "grad_norm": 0.8944466710090637, "kl": 0.051513671875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7282535433769226, "reward_std": 0.20809412747621536, "rewards/accuracy_reward": 0.7486617267131805, "rewards/format_reward": 0.9795918464660645, "step": 5132 }, { "completion_length": 186.75509643554688, "epoch": 0.5165283018867924, "grad_norm": 0.7621049880981445, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.67333984375, "reward_std": 0.1306202970445156, "rewards/accuracy_reward": 0.7039520442485809, "rewards/format_reward": 0.9693877398967743, "step": 5133 }, { "completion_length": 290.87754821777344, "epoch": 0.5166289308176101, "grad_norm": 0.6243653893470764, "kl": 0.056640625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8168904781341553, "reward_std": 0.10904457420110703, "rewards/accuracy_reward": 0.8270945250988007, "rewards/format_reward": 0.9897959232330322, "step": 5134 }, { "completion_length": 270.9795837402344, "epoch": 0.5167295597484277, "grad_norm": 0.5982673168182373, "kl": 0.06103515625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7110866904258728, "reward_std": 0.16607549041509628, "rewards/accuracy_reward": 0.7212908565998077, "rewards/format_reward": 0.9897959232330322, "step": 5135 }, { "completion_length": 267.96937561035156, "epoch": 0.5168301886792452, "grad_norm": 0.3980983793735504, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7689018249511719, "reward_std": 0.09411720186471939, "rewards/accuracy_reward": 0.7893100082874298, "rewards/format_reward": 0.9795918166637421, "step": 5136 }, { "completion_length": 222.07142639160156, "epoch": 0.5169308176100629, "grad_norm": 1.288254737854004, "kl": 0.09912109375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8462422490119934, "reward_std": 0.09829713776707649, "rewards/accuracy_reward": 0.8462422788143158, "rewards/format_reward": 1.0, "step": 5137 }, { "completion_length": 210.1938705444336, "epoch": 0.5170314465408805, "grad_norm": 0.8588035702705383, "kl": 0.1201171875, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.7366860508918762, "reward_std": 0.16809646040201187, "rewards/accuracy_reward": 0.7570942342281342, "rewards/format_reward": 0.9795918464660645, "step": 5138 }, { "completion_length": 204.25509643554688, "epoch": 0.5171320754716981, "grad_norm": 0.40273845195770264, "kl": 0.0966796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7740619778633118, "reward_std": 0.0759191308170557, "rewards/accuracy_reward": 0.7740620076656342, "rewards/format_reward": 1.0, "step": 5139 }, { "completion_length": 236.53060150146484, "epoch": 0.5172327044025157, "grad_norm": 0.4734170436859131, "kl": 0.0550537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7852842807769775, "reward_std": 0.11145263724029064, "rewards/accuracy_reward": 0.7852842807769775, "rewards/format_reward": 1.0, "step": 5140 }, { "completion_length": 275.89794921875, "epoch": 0.5173333333333333, "grad_norm": 0.6371392607688904, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.638363242149353, "reward_std": 0.11482786387205124, "rewards/accuracy_reward": 0.6485673189163208, "rewards/format_reward": 0.9897959232330322, "step": 5141 }, { "completion_length": 230.62244415283203, "epoch": 0.517433962264151, "grad_norm": 0.9866667985916138, "kl": 0.099853515625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.638093650341034, "reward_std": 0.2285119667649269, "rewards/accuracy_reward": 0.6585018932819366, "rewards/format_reward": 0.9795918464660645, "step": 5142 }, { "completion_length": 317.6734619140625, "epoch": 0.5175345911949686, "grad_norm": 0.9349038004875183, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7373974919319153, "reward_std": 0.20806122571229935, "rewards/accuracy_reward": 0.7578056454658508, "rewards/format_reward": 0.9795918464660645, "step": 5143 }, { "completion_length": 257.33673095703125, "epoch": 0.5176352201257861, "grad_norm": 0.46918785572052, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7719850540161133, "reward_std": 0.13732275366783142, "rewards/accuracy_reward": 0.792393296957016, "rewards/format_reward": 0.9795918166637421, "step": 5144 }, { "completion_length": 214.4591827392578, "epoch": 0.5177358490566037, "grad_norm": 0.515216588973999, "kl": 0.10595703125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8416834473609924, "reward_std": 0.13436175137758255, "rewards/accuracy_reward": 0.8518875539302826, "rewards/format_reward": 0.9897959232330322, "step": 5145 }, { "completion_length": 234.58162689208984, "epoch": 0.5178364779874214, "grad_norm": 1.170750617980957, "kl": 0.101318359375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.5630223751068115, "reward_std": 0.18630480021238327, "rewards/accuracy_reward": 0.5834304988384247, "rewards/format_reward": 0.9795918166637421, "step": 5146 }, { "completion_length": 207.4897918701172, "epoch": 0.517937106918239, "grad_norm": 0.8609780669212341, "kl": 0.09521484375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7814630270004272, "reward_std": 0.24929654598236084, "rewards/accuracy_reward": 0.8018712103366852, "rewards/format_reward": 0.9795918166637421, "step": 5147 }, { "completion_length": 183.01020050048828, "epoch": 0.5180377358490567, "grad_norm": 1.0154268741607666, "kl": 0.107421875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.6207703948020935, "reward_std": 0.1678370162844658, "rewards/accuracy_reward": 0.6411785781383514, "rewards/format_reward": 0.9795918464660645, "step": 5148 }, { "completion_length": 201.9897918701172, "epoch": 0.5181383647798742, "grad_norm": 1.3029537200927734, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.787755012512207, "reward_std": 0.1564827486872673, "rewards/accuracy_reward": 0.7877550721168518, "rewards/format_reward": 1.0, "step": 5149 }, { "completion_length": 195.36734008789062, "epoch": 0.5182389937106918, "grad_norm": 0.5133781433105469, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8400418162345886, "reward_std": 0.12339366599917412, "rewards/accuracy_reward": 0.8604499995708466, "rewards/format_reward": 0.9795918464660645, "step": 5150 }, { "completion_length": 269.38775634765625, "epoch": 0.5183396226415095, "grad_norm": 0.6714069843292236, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5903884768486023, "reward_std": 0.1478360891342163, "rewards/accuracy_reward": 0.6107965856790543, "rewards/format_reward": 0.9795918166637421, "step": 5151 }, { "completion_length": 287.84693908691406, "epoch": 0.5184402515723271, "grad_norm": 0.4811595678329468, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6833332777023315, "reward_std": 0.1213662251830101, "rewards/accuracy_reward": 0.6833333373069763, "rewards/format_reward": 1.0, "step": 5152 }, { "completion_length": 321.8367156982422, "epoch": 0.5185408805031446, "grad_norm": 0.6319775581359863, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7125948071479797, "reward_std": 0.1431354060769081, "rewards/accuracy_reward": 0.7330029606819153, "rewards/format_reward": 0.9795918166637421, "step": 5153 }, { "completion_length": 189.948974609375, "epoch": 0.5186415094339623, "grad_norm": 0.7116629481315613, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8577613234519958, "reward_std": 0.14456410706043243, "rewards/accuracy_reward": 0.8781694173812866, "rewards/format_reward": 0.9795918464660645, "step": 5154 }, { "completion_length": 264.84693145751953, "epoch": 0.5187421383647799, "grad_norm": 0.6220216155052185, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6128968000411987, "reward_std": 0.11539104953408241, "rewards/accuracy_reward": 0.6128968000411987, "rewards/format_reward": 1.0, "step": 5155 }, { "completion_length": 299.39794921875, "epoch": 0.5188427672955975, "grad_norm": 0.7331182360649109, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.750076413154602, "reward_std": 0.24281245470046997, "rewards/accuracy_reward": 0.7806887030601501, "rewards/format_reward": 0.9693877398967743, "step": 5156 }, { "completion_length": 172.43877410888672, "epoch": 0.518943396226415, "grad_norm": 1.0023212432861328, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8601166009902954, "reward_std": 0.11259953677654266, "rewards/accuracy_reward": 0.8601166009902954, "rewards/format_reward": 1.0, "step": 5157 }, { "completion_length": 255.81631469726562, "epoch": 0.5190440251572327, "grad_norm": 0.7879242897033691, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.646236002445221, "reward_std": 0.1820179671049118, "rewards/accuracy_reward": 0.6564401239156723, "rewards/format_reward": 0.9897959232330322, "step": 5158 }, { "completion_length": 224.07141876220703, "epoch": 0.5191446540880503, "grad_norm": 0.5933725833892822, "kl": 0.0667724609375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8714771270751953, "reward_std": 0.14120833575725555, "rewards/accuracy_reward": 0.902089387178421, "rewards/format_reward": 0.9693877398967743, "step": 5159 }, { "completion_length": 188.66326141357422, "epoch": 0.519245283018868, "grad_norm": 0.49048006534576416, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8508475422859192, "reward_std": 0.12796230241656303, "rewards/accuracy_reward": 0.861051619052887, "rewards/format_reward": 0.9897959232330322, "step": 5160 }, { "completion_length": 198.7244873046875, "epoch": 0.5193459119496855, "grad_norm": 0.4942949116230011, "kl": 0.094970703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.886181890964508, "reward_std": 0.11454259417951107, "rewards/accuracy_reward": 0.9065901339054108, "rewards/format_reward": 0.9795918166637421, "step": 5161 }, { "completion_length": 226.51020050048828, "epoch": 0.5194465408805031, "grad_norm": 15.595202445983887, "kl": 0.8359375, "learning_rate": 1e-06, "loss": 0.0333, "reward": 1.7626468539237976, "reward_std": 0.19675446301698685, "rewards/accuracy_reward": 0.7830550372600555, "rewards/format_reward": 0.9795918464660645, "step": 5162 }, { "completion_length": 211.89795684814453, "epoch": 0.5195471698113208, "grad_norm": 0.9198814034461975, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7532698512077332, "reward_std": 0.16058644652366638, "rewards/accuracy_reward": 0.7838821709156036, "rewards/format_reward": 0.9693877398967743, "step": 5163 }, { "completion_length": 300.52040100097656, "epoch": 0.5196477987421384, "grad_norm": 0.6815831661224365, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6963819861412048, "reward_std": 0.26152074337005615, "rewards/accuracy_reward": 0.7474025189876556, "rewards/format_reward": 0.9489795565605164, "step": 5164 }, { "completion_length": 273.0612258911133, "epoch": 0.5197484276729559, "grad_norm": 1.4291763305664062, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6317238211631775, "reward_std": 0.27075639367103577, "rewards/accuracy_reward": 0.682744100689888, "rewards/format_reward": 0.9489795863628387, "step": 5165 }, { "completion_length": 256.2448959350586, "epoch": 0.5198490566037736, "grad_norm": 0.6740837097167969, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6823900938034058, "reward_std": 0.16325442865490913, "rewards/accuracy_reward": 0.6823901981115341, "rewards/format_reward": 1.0, "step": 5166 }, { "completion_length": 282.49998474121094, "epoch": 0.5199496855345912, "grad_norm": 0.9006003737449646, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7986463904380798, "reward_std": 0.21571458876132965, "rewards/accuracy_reward": 0.8190546631813049, "rewards/format_reward": 0.9795918464660645, "step": 5167 }, { "completion_length": 243.55101776123047, "epoch": 0.5200503144654088, "grad_norm": 0.8790357708930969, "kl": 0.0672607421875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7687074542045593, "reward_std": 0.21924619749188423, "rewards/accuracy_reward": 0.8095238208770752, "rewards/format_reward": 0.9591836631298065, "step": 5168 }, { "completion_length": 250.0204086303711, "epoch": 0.5201509433962264, "grad_norm": 0.8439866304397583, "kl": 0.1015625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7153370380401611, "reward_std": 0.2446281909942627, "rewards/accuracy_reward": 0.7357451617717743, "rewards/format_reward": 0.9795918166637421, "step": 5169 }, { "completion_length": 250.8163299560547, "epoch": 0.520251572327044, "grad_norm": 0.9914862513542175, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6854591965675354, "reward_std": 0.20039157569408417, "rewards/accuracy_reward": 0.7160713970661163, "rewards/format_reward": 0.9693877398967743, "step": 5170 }, { "completion_length": 279.7550964355469, "epoch": 0.5203522012578616, "grad_norm": 1.0917805433273315, "kl": 0.1005859375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6588241457939148, "reward_std": 0.26560237258672714, "rewards/accuracy_reward": 0.6792323589324951, "rewards/format_reward": 0.9795918166637421, "step": 5171 }, { "completion_length": 195.57142639160156, "epoch": 0.5204528301886793, "grad_norm": 0.9941646456718445, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7828798294067383, "reward_std": 0.2249281406402588, "rewards/accuracy_reward": 0.8134920299053192, "rewards/format_reward": 0.9693877398967743, "step": 5172 }, { "completion_length": 221.89794921875, "epoch": 0.5205534591194969, "grad_norm": 1.2615934610366821, "kl": 0.11669921875, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.73261159658432, "reward_std": 0.22021842747926712, "rewards/accuracy_reward": 0.7530196607112885, "rewards/format_reward": 0.9795918166637421, "step": 5173 }, { "completion_length": 194.9897918701172, "epoch": 0.5206540880503144, "grad_norm": 0.34049418568611145, "kl": 0.133544921875, "learning_rate": 1e-06, "loss": 0.0053, "reward": 1.7830963134765625, "reward_std": 0.05357467755675316, "rewards/accuracy_reward": 0.783096194267273, "rewards/format_reward": 1.0, "step": 5174 }, { "completion_length": 178.41836547851562, "epoch": 0.5207547169811321, "grad_norm": 1.7970935106277466, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8105157613754272, "reward_std": 0.15495674312114716, "rewards/accuracy_reward": 0.8207198977470398, "rewards/format_reward": 0.9897959232330322, "step": 5175 }, { "completion_length": 264.5918426513672, "epoch": 0.5208553459119497, "grad_norm": 0.6777452826499939, "kl": 0.062744140625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7061224579811096, "reward_std": 0.186342753469944, "rewards/accuracy_reward": 0.7265305817127228, "rewards/format_reward": 0.9795918464660645, "step": 5176 }, { "completion_length": 249.28571319580078, "epoch": 0.5209559748427673, "grad_norm": 1.6816835403442383, "kl": 0.066162109375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8979591131210327, "reward_std": 0.15069952607154846, "rewards/accuracy_reward": 0.9183673560619354, "rewards/format_reward": 0.9795918166637421, "step": 5177 }, { "completion_length": 226.34693908691406, "epoch": 0.5210566037735849, "grad_norm": 1.6485635042190552, "kl": 0.106689453125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.8361161947250366, "reward_std": 0.1700369492173195, "rewards/accuracy_reward": 0.836116224527359, "rewards/format_reward": 1.0, "step": 5178 }, { "completion_length": 295.9591827392578, "epoch": 0.5211572327044025, "grad_norm": 0.7859987020492554, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6098108887672424, "reward_std": 0.2848392352461815, "rewards/accuracy_reward": 0.6914435923099518, "rewards/format_reward": 0.918367326259613, "step": 5179 }, { "completion_length": 396.24488830566406, "epoch": 0.5212578616352201, "grad_norm": 0.785659670829773, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.57204270362854, "reward_std": 0.2849395275115967, "rewards/accuracy_reward": 0.6536754071712494, "rewards/format_reward": 0.918367326259613, "step": 5180 }, { "completion_length": 274.948974609375, "epoch": 0.5213584905660378, "grad_norm": 0.8078676462173462, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.5899335145950317, "reward_std": 0.2379891499876976, "rewards/accuracy_reward": 0.6001376509666443, "rewards/format_reward": 0.9897959232330322, "step": 5181 }, { "completion_length": 259.83673095703125, "epoch": 0.5214591194968553, "grad_norm": 0.6839627027511597, "kl": 0.06689453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6883066892623901, "reward_std": 0.1542976163327694, "rewards/accuracy_reward": 0.7087149024009705, "rewards/format_reward": 0.9795918166637421, "step": 5182 }, { "completion_length": 317.2550964355469, "epoch": 0.5215597484276729, "grad_norm": 0.6673504114151001, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.5692920088768005, "reward_std": 0.15822775661945343, "rewards/accuracy_reward": 0.5692920684814453, "rewards/format_reward": 1.0, "step": 5183 }, { "completion_length": 188.59183502197266, "epoch": 0.5216603773584906, "grad_norm": 0.6973445415496826, "kl": 0.09521484375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8285309672355652, "reward_std": 0.1251239161938429, "rewards/accuracy_reward": 0.8489390909671783, "rewards/format_reward": 0.9795918166637421, "step": 5184 }, { "completion_length": 357.6530456542969, "epoch": 0.5217610062893082, "grad_norm": 0.6001148819923401, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.574895977973938, "reward_std": 0.2701331824064255, "rewards/accuracy_reward": 0.6259165108203888, "rewards/format_reward": 0.9489795565605164, "step": 5185 }, { "completion_length": 223.34693908691406, "epoch": 0.5218616352201257, "grad_norm": 0.6691948175430298, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.870791256427765, "reward_std": 0.19302191585302353, "rewards/accuracy_reward": 0.8911994695663452, "rewards/format_reward": 0.9795918464660645, "step": 5186 }, { "completion_length": 277.67346954345703, "epoch": 0.5219622641509434, "grad_norm": 0.5071256160736084, "kl": 0.0626220703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8043084144592285, "reward_std": 0.1010714047588408, "rewards/accuracy_reward": 0.8145124912261963, "rewards/format_reward": 0.9897959232330322, "step": 5187 }, { "completion_length": 253.23468780517578, "epoch": 0.522062893081761, "grad_norm": 0.5752239227294922, "kl": 0.0567626953125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6941351890563965, "reward_std": 0.14925821125507355, "rewards/accuracy_reward": 0.6941352188587189, "rewards/format_reward": 1.0, "step": 5188 }, { "completion_length": 185.53060913085938, "epoch": 0.5221635220125787, "grad_norm": 0.7454482316970825, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7959461212158203, "reward_std": 0.12401741184294224, "rewards/accuracy_reward": 0.8061502873897552, "rewards/format_reward": 0.9897959232330322, "step": 5189 }, { "completion_length": 255.95917510986328, "epoch": 0.5222641509433962, "grad_norm": 0.9072023034095764, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7402784824371338, "reward_std": 0.15210414119064808, "rewards/accuracy_reward": 0.7504825592041016, "rewards/format_reward": 0.9897959232330322, "step": 5190 }, { "completion_length": 299.55101013183594, "epoch": 0.5223647798742138, "grad_norm": 1.709825038909912, "kl": 0.0684814453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6313491463661194, "reward_std": 0.1374606117606163, "rewards/accuracy_reward": 0.6313491761684418, "rewards/format_reward": 1.0, "step": 5191 }, { "completion_length": 285.82652282714844, "epoch": 0.5224654088050315, "grad_norm": 0.5216066837310791, "kl": 0.0606689453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.810544192790985, "reward_std": 0.13834383711218834, "rewards/accuracy_reward": 0.8105441927909851, "rewards/format_reward": 1.0, "step": 5192 }, { "completion_length": 240.9387664794922, "epoch": 0.5225660377358491, "grad_norm": 0.6428364515304565, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7551019787788391, "reward_std": 0.17393124103546143, "rewards/accuracy_reward": 0.7551020085811615, "rewards/format_reward": 1.0, "step": 5193 }, { "completion_length": 180.75509643554688, "epoch": 0.5226666666666666, "grad_norm": 1.041545033454895, "kl": 0.0758056640625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8785228729248047, "reward_std": 0.11785447224974632, "rewards/accuracy_reward": 0.8989309370517731, "rewards/format_reward": 0.9795918464660645, "step": 5194 }, { "completion_length": 243.64285278320312, "epoch": 0.5227672955974842, "grad_norm": 0.8868624567985535, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7645500302314758, "reward_std": 0.16779401898384094, "rewards/accuracy_reward": 0.764550119638443, "rewards/format_reward": 1.0, "step": 5195 }, { "completion_length": 233.27550506591797, "epoch": 0.5228679245283019, "grad_norm": 0.5780641436576843, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.800000011920929, "reward_std": 0.17162487655878067, "rewards/accuracy_reward": 0.8204081356525421, "rewards/format_reward": 0.9795918166637421, "step": 5196 }, { "completion_length": 321.23468017578125, "epoch": 0.5229685534591195, "grad_norm": 0.7361943125724792, "kl": 0.0650634765625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6744275093078613, "reward_std": 0.19323339313268661, "rewards/accuracy_reward": 0.694835752248764, "rewards/format_reward": 0.9795918166637421, "step": 5197 }, { "completion_length": 238.33673095703125, "epoch": 0.5230691823899372, "grad_norm": 0.8825574517250061, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7456459999084473, "reward_std": 0.12525764480233192, "rewards/accuracy_reward": 0.7456459701061249, "rewards/format_reward": 1.0, "step": 5198 }, { "completion_length": 270.0816345214844, "epoch": 0.5231698113207547, "grad_norm": 0.3550160825252533, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.9096209406852722, "reward_std": 0.12171623110771179, "rewards/accuracy_reward": 0.9300291538238525, "rewards/format_reward": 0.9795918166637421, "step": 5199 }, { "completion_length": 243.28570556640625, "epoch": 0.5232704402515723, "grad_norm": 0.3055654466152191, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.918367326259613, "reward_std": 0.08099238574504852, "rewards/accuracy_reward": 0.9285714328289032, "rewards/format_reward": 0.9897959232330322, "step": 5200 }, { "completion_length": 276.4693908691406, "epoch": 0.52337106918239, "grad_norm": 0.5836833715438843, "kl": 0.0740966796875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.642660140991211, "reward_std": 0.21387933194637299, "rewards/accuracy_reward": 0.6630682647228241, "rewards/format_reward": 0.9795918166637421, "step": 5201 }, { "completion_length": 235.49999237060547, "epoch": 0.5234716981132076, "grad_norm": 0.8092175722122192, "kl": 0.071533203125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8272150754928589, "reward_std": 0.10652206465601921, "rewards/accuracy_reward": 0.8374192416667938, "rewards/format_reward": 0.9897959232330322, "step": 5202 }, { "completion_length": 237.9081573486328, "epoch": 0.5235723270440251, "grad_norm": 1.0052130222320557, "kl": 0.0853271484375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.672125220298767, "reward_std": 0.23324483633041382, "rewards/accuracy_reward": 0.6823293268680573, "rewards/format_reward": 0.9897959232330322, "step": 5203 }, { "completion_length": 216.6326446533203, "epoch": 0.5236729559748428, "grad_norm": 0.7129397392272949, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7551749348640442, "reward_std": 0.19341062009334564, "rewards/accuracy_reward": 0.7857871651649475, "rewards/format_reward": 0.9693877398967743, "step": 5204 }, { "completion_length": 219.4693832397461, "epoch": 0.5237735849056604, "grad_norm": 0.7737537622451782, "kl": 0.12548828125, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.7764462232589722, "reward_std": 0.20071500539779663, "rewards/accuracy_reward": 0.7968544363975525, "rewards/format_reward": 0.9795918464660645, "step": 5205 }, { "completion_length": 248.03060913085938, "epoch": 0.523874213836478, "grad_norm": 0.41695672273635864, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6687317490577698, "reward_std": 0.1306893415749073, "rewards/accuracy_reward": 0.668731763958931, "rewards/format_reward": 1.0, "step": 5206 }, { "completion_length": 228.61224365234375, "epoch": 0.5239748427672956, "grad_norm": 0.9939863681793213, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7148462533950806, "reward_std": 0.14448074251413345, "rewards/accuracy_reward": 0.7148463428020477, "rewards/format_reward": 1.0, "step": 5207 }, { "completion_length": 316.68365478515625, "epoch": 0.5240754716981132, "grad_norm": 0.9728083610534668, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6627069115638733, "reward_std": 0.14364323765039444, "rewards/accuracy_reward": 0.6831150203943253, "rewards/format_reward": 0.9795918464660645, "step": 5208 }, { "completion_length": 283.1632537841797, "epoch": 0.5241761006289308, "grad_norm": 0.8026014566421509, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6057149767875671, "reward_std": 0.20288535207509995, "rewards/accuracy_reward": 0.6159190535545349, "rewards/format_reward": 0.9897959232330322, "step": 5209 }, { "completion_length": 223.19387817382812, "epoch": 0.5242767295597485, "grad_norm": 0.9201270341873169, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.817946195602417, "reward_std": 0.08284449577331543, "rewards/accuracy_reward": 0.8179462254047394, "rewards/format_reward": 1.0, "step": 5210 }, { "completion_length": 296.8061065673828, "epoch": 0.524377358490566, "grad_norm": 0.6479277014732361, "kl": 0.059814453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7371675372123718, "reward_std": 0.15544729679822922, "rewards/accuracy_reward": 0.7473716139793396, "rewards/format_reward": 0.9897959232330322, "step": 5211 }, { "completion_length": 225.93877410888672, "epoch": 0.5244779874213836, "grad_norm": 0.7871312499046326, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8063887357711792, "reward_std": 0.1668041441589594, "rewards/accuracy_reward": 0.8063887655735016, "rewards/format_reward": 1.0, "step": 5212 }, { "completion_length": 221.4693832397461, "epoch": 0.5245786163522013, "grad_norm": 0.9217479825019836, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6904761791229248, "reward_std": 0.2282029092311859, "rewards/accuracy_reward": 0.7006802558898926, "rewards/format_reward": 0.9897959232330322, "step": 5213 }, { "completion_length": 183.13265228271484, "epoch": 0.5246792452830189, "grad_norm": 1.2414542436599731, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.729834794998169, "reward_std": 0.22407810389995575, "rewards/accuracy_reward": 0.7502429187297821, "rewards/format_reward": 0.9795918166637421, "step": 5214 }, { "completion_length": 208.12244415283203, "epoch": 0.5247798742138364, "grad_norm": 0.6327865719795227, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8493505716323853, "reward_std": 0.11336585134267807, "rewards/accuracy_reward": 0.8595547378063202, "rewards/format_reward": 0.9897959232330322, "step": 5215 }, { "completion_length": 260.49999237060547, "epoch": 0.5248805031446541, "grad_norm": 0.6805570721626282, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7423861622810364, "reward_std": 0.12990127503871918, "rewards/accuracy_reward": 0.742386132478714, "rewards/format_reward": 1.0, "step": 5216 }, { "completion_length": 221.9693832397461, "epoch": 0.5249811320754717, "grad_norm": 1.091187834739685, "kl": 0.0966796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.9455782771110535, "reward_std": 0.06672165170311928, "rewards/accuracy_reward": 0.9557823240756989, "rewards/format_reward": 0.9897959232330322, "step": 5217 }, { "completion_length": 262.49999237060547, "epoch": 0.5250817610062893, "grad_norm": 0.7694223523139954, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6995683312416077, "reward_std": 0.20946723222732544, "rewards/accuracy_reward": 0.7199765741825104, "rewards/format_reward": 0.9795918166637421, "step": 5218 }, { "completion_length": 246.59182739257812, "epoch": 0.5251823899371069, "grad_norm": 0.6572251319885254, "kl": 0.0684814453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8839517831802368, "reward_std": 0.14364241808652878, "rewards/accuracy_reward": 0.9043599367141724, "rewards/format_reward": 0.9795918166637421, "step": 5219 }, { "completion_length": 200.9183578491211, "epoch": 0.5252830188679245, "grad_norm": 1.166761875152588, "kl": 0.12548828125, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.7816635966300964, "reward_std": 0.11285248026251793, "rewards/accuracy_reward": 0.7816635966300964, "rewards/format_reward": 1.0, "step": 5220 }, { "completion_length": 280.0816192626953, "epoch": 0.5253836477987421, "grad_norm": 0.5254616737365723, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.9187074303627014, "reward_std": 0.07742632925510406, "rewards/accuracy_reward": 0.9187074601650238, "rewards/format_reward": 1.0, "step": 5221 }, { "completion_length": 268.448974609375, "epoch": 0.5254842767295598, "grad_norm": 0.9092671275138855, "kl": 0.0643310546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.661418378353119, "reward_std": 0.21183720231056213, "rewards/accuracy_reward": 0.6920306384563446, "rewards/format_reward": 0.9693877398967743, "step": 5222 }, { "completion_length": 268.96937561035156, "epoch": 0.5255849056603774, "grad_norm": 0.48498910665512085, "kl": 0.09130859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7748997807502747, "reward_std": 0.0772976316511631, "rewards/accuracy_reward": 0.795307844877243, "rewards/format_reward": 0.9795918166637421, "step": 5223 }, { "completion_length": 228.19387817382812, "epoch": 0.5256855345911949, "grad_norm": 0.6237272620201111, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7376545667648315, "reward_std": 0.0692916614934802, "rewards/accuracy_reward": 0.7478586137294769, "rewards/format_reward": 0.9897959232330322, "step": 5224 }, { "completion_length": 225.9285659790039, "epoch": 0.5257861635220126, "grad_norm": 1.8009730577468872, "kl": 0.097412109375, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7982190251350403, "reward_std": 0.2337443307042122, "rewards/accuracy_reward": 0.7982190549373627, "rewards/format_reward": 1.0, "step": 5225 }, { "completion_length": 320.7244873046875, "epoch": 0.5258867924528302, "grad_norm": 0.6276435256004333, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6423157453536987, "reward_std": 0.14011242799460888, "rewards/accuracy_reward": 0.6525197923183441, "rewards/format_reward": 0.9897959232330322, "step": 5226 }, { "completion_length": 242.56122589111328, "epoch": 0.5259874213836478, "grad_norm": 0.6865702271461487, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8833070993423462, "reward_std": 0.15533389151096344, "rewards/accuracy_reward": 0.883307158946991, "rewards/format_reward": 1.0, "step": 5227 }, { "completion_length": 226.4897918701172, "epoch": 0.5260880503144654, "grad_norm": 0.6480965614318848, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.727139949798584, "reward_std": 0.16506676748394966, "rewards/accuracy_reward": 0.7373440563678741, "rewards/format_reward": 0.9897959232330322, "step": 5228 }, { "completion_length": 197.948974609375, "epoch": 0.526188679245283, "grad_norm": 0.6723690629005432, "kl": 0.1064453125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.8165326714515686, "reward_std": 0.14020515978336334, "rewards/accuracy_reward": 0.8165326714515686, "rewards/format_reward": 1.0, "step": 5229 }, { "completion_length": 269.06121826171875, "epoch": 0.5262893081761006, "grad_norm": 0.9192283749580383, "kl": 0.09521484375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6941995024681091, "reward_std": 0.16835393011569977, "rewards/accuracy_reward": 0.6941995024681091, "rewards/format_reward": 1.0, "step": 5230 }, { "completion_length": 212.89795684814453, "epoch": 0.5263899371069183, "grad_norm": 0.5624850392341614, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8491905331611633, "reward_std": 0.09919120743870735, "rewards/accuracy_reward": 0.8491905629634857, "rewards/format_reward": 1.0, "step": 5231 }, { "completion_length": 210.56121826171875, "epoch": 0.5264905660377358, "grad_norm": 0.9196571111679077, "kl": 0.09765625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7111935019493103, "reward_std": 0.11799495294690132, "rewards/accuracy_reward": 0.7111935317516327, "rewards/format_reward": 1.0, "step": 5232 }, { "completion_length": 270.8163299560547, "epoch": 0.5265911949685534, "grad_norm": 0.4825630486011505, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7688543796539307, "reward_std": 0.1570558287203312, "rewards/accuracy_reward": 0.7892625629901886, "rewards/format_reward": 0.9795918166637421, "step": 5233 }, { "completion_length": 210.2244873046875, "epoch": 0.5266918238993711, "grad_norm": 1.0214226245880127, "kl": 0.1103515625, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.793245792388916, "reward_std": 0.2018771693110466, "rewards/accuracy_reward": 0.7932458817958832, "rewards/format_reward": 1.0, "step": 5234 }, { "completion_length": 246.9897918701172, "epoch": 0.5267924528301887, "grad_norm": 0.808918833732605, "kl": 0.0853271484375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7925344705581665, "reward_std": 0.22541704028844833, "rewards/accuracy_reward": 0.8027385771274567, "rewards/format_reward": 0.9897959232330322, "step": 5235 }, { "completion_length": 252.28571319580078, "epoch": 0.5268930817610062, "grad_norm": 1.021119236946106, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.671302855014801, "reward_std": 0.17161810398101807, "rewards/accuracy_reward": 0.6815068870782852, "rewards/format_reward": 0.9897959232330322, "step": 5236 }, { "completion_length": 215.07142639160156, "epoch": 0.5269937106918239, "grad_norm": 0.48865488171577454, "kl": 0.0517578125, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7769532203674316, "reward_std": 0.05673887301236391, "rewards/accuracy_reward": 0.7769532203674316, "rewards/format_reward": 1.0, "step": 5237 }, { "completion_length": 236.56122589111328, "epoch": 0.5270943396226415, "grad_norm": 0.987099826335907, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8650760054588318, "reward_std": 0.11804336309432983, "rewards/accuracy_reward": 0.8650760054588318, "rewards/format_reward": 1.0, "step": 5238 }, { "completion_length": 163.08162689208984, "epoch": 0.5271949685534592, "grad_norm": 0.37925809621810913, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8805004358291626, "reward_std": 0.1119472123682499, "rewards/accuracy_reward": 0.8907045423984528, "rewards/format_reward": 0.9897959232330322, "step": 5239 }, { "completion_length": 204.16326141357422, "epoch": 0.5272955974842767, "grad_norm": 0.9195266962051392, "kl": 0.0704345703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7407898306846619, "reward_std": 0.16017349436879158, "rewards/accuracy_reward": 0.7407898306846619, "rewards/format_reward": 1.0, "step": 5240 }, { "completion_length": 206.55101776123047, "epoch": 0.5273962264150943, "grad_norm": 0.5524552464485168, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7755101323127747, "reward_std": 0.08099237829446793, "rewards/accuracy_reward": 0.7857142686843872, "rewards/format_reward": 0.9897959232330322, "step": 5241 }, { "completion_length": 225.82653045654297, "epoch": 0.527496855345912, "grad_norm": 0.8801888823509216, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6958653330802917, "reward_std": 0.2199248969554901, "rewards/accuracy_reward": 0.6958653330802917, "rewards/format_reward": 1.0, "step": 5242 }, { "completion_length": 194.66326141357422, "epoch": 0.5275974842767296, "grad_norm": 0.44990289211273193, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7380467057228088, "reward_std": 0.08586550503969193, "rewards/accuracy_reward": 0.7482506632804871, "rewards/format_reward": 0.9897959232330322, "step": 5243 }, { "completion_length": 207.39795684814453, "epoch": 0.5276981132075471, "grad_norm": 0.5595727562904358, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6528226137161255, "reward_std": 0.0928700789809227, "rewards/accuracy_reward": 0.6528226733207703, "rewards/format_reward": 1.0, "step": 5244 }, { "completion_length": 217.61223602294922, "epoch": 0.5277987421383648, "grad_norm": 0.9078994989395142, "kl": 0.089111328125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7873530983924866, "reward_std": 0.16220341622829437, "rewards/accuracy_reward": 0.7873530983924866, "rewards/format_reward": 1.0, "step": 5245 }, { "completion_length": 215.1734619140625, "epoch": 0.5278993710691824, "grad_norm": 0.9339219331741333, "kl": 0.0911865234375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7122294902801514, "reward_std": 0.17127049714326859, "rewards/accuracy_reward": 0.7428417801856995, "rewards/format_reward": 0.9693877398967743, "step": 5246 }, { "completion_length": 255.6326446533203, "epoch": 0.528, "grad_norm": 0.6084516048431396, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7525902390480042, "reward_std": 0.21696817129850388, "rewards/accuracy_reward": 0.7627943456172943, "rewards/format_reward": 0.9897959232330322, "step": 5247 }, { "completion_length": 276.4285583496094, "epoch": 0.5281006289308177, "grad_norm": 0.46972939372062683, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6112419962882996, "reward_std": 0.11917585879564285, "rewards/accuracy_reward": 0.6214460730552673, "rewards/format_reward": 0.9897959232330322, "step": 5248 }, { "completion_length": 230.61223602294922, "epoch": 0.5282012578616352, "grad_norm": 0.4325009286403656, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8677841424942017, "reward_std": 0.09384746849536896, "rewards/accuracy_reward": 0.8677842319011688, "rewards/format_reward": 1.0, "step": 5249 }, { "completion_length": 242.61224365234375, "epoch": 0.5283018867924528, "grad_norm": 0.5028538703918457, "kl": 0.059814453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8715505003929138, "reward_std": 0.13269216194748878, "rewards/accuracy_reward": 0.8715504705905914, "rewards/format_reward": 1.0, "step": 5250 }, { "completion_length": 279.38775634765625, "epoch": 0.5284025157232705, "grad_norm": 0.5435006618499756, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7914394736289978, "reward_std": 0.17408320680260658, "rewards/accuracy_reward": 0.7914395034313202, "rewards/format_reward": 1.0, "step": 5251 }, { "completion_length": 265.34693145751953, "epoch": 0.5285031446540881, "grad_norm": 0.795112133026123, "kl": 0.058837890625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.5928894877433777, "reward_std": 0.21985014528036118, "rewards/accuracy_reward": 0.6132977157831192, "rewards/format_reward": 0.9795918166637421, "step": 5252 }, { "completion_length": 323.0714111328125, "epoch": 0.5286037735849056, "grad_norm": 0.5318983197212219, "kl": 0.098388671875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7225611805915833, "reward_std": 0.08629873022437096, "rewards/accuracy_reward": 0.7225611805915833, "rewards/format_reward": 1.0, "step": 5253 }, { "completion_length": 288.11224365234375, "epoch": 0.5287044025157233, "grad_norm": 0.9019546508789062, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6172089576721191, "reward_std": 0.19577071070671082, "rewards/accuracy_reward": 0.6478212177753448, "rewards/format_reward": 0.9693877398967743, "step": 5254 }, { "completion_length": 187.32652282714844, "epoch": 0.5288050314465409, "grad_norm": 0.46722981333732605, "kl": 0.131103515625, "learning_rate": 1e-06, "loss": 0.0053, "reward": 1.8168712854385376, "reward_std": 0.03655119612812996, "rewards/accuracy_reward": 0.8168713450431824, "rewards/format_reward": 1.0, "step": 5255 }, { "completion_length": 273.55101013183594, "epoch": 0.5289056603773585, "grad_norm": 1.2364506721496582, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6835188269615173, "reward_std": 0.2225690335035324, "rewards/accuracy_reward": 0.7243351638317108, "rewards/format_reward": 0.9591836333274841, "step": 5256 }, { "completion_length": 225.9795913696289, "epoch": 0.5290062893081761, "grad_norm": 1.784254550933838, "kl": 0.110107421875, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.867140769958496, "reward_std": 0.14479666203260422, "rewards/accuracy_reward": 0.8671407699584961, "rewards/format_reward": 1.0, "step": 5257 }, { "completion_length": 246.6836700439453, "epoch": 0.5291069182389937, "grad_norm": 1.0773757696151733, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7161184549331665, "reward_std": 0.14737153053283691, "rewards/accuracy_reward": 0.7263224720954895, "rewards/format_reward": 0.9897959232330322, "step": 5258 }, { "completion_length": 233.1734619140625, "epoch": 0.5292075471698113, "grad_norm": 0.7623109817504883, "kl": 0.0889892578125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8302985429763794, "reward_std": 0.134148009121418, "rewards/accuracy_reward": 0.8405026495456696, "rewards/format_reward": 0.9897959232330322, "step": 5259 }, { "completion_length": 267.9591827392578, "epoch": 0.529308176100629, "grad_norm": 1.602513074874878, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6888296604156494, "reward_std": 0.13661042600870132, "rewards/accuracy_reward": 0.6888296902179718, "rewards/format_reward": 1.0, "step": 5260 }, { "completion_length": 195.39795684814453, "epoch": 0.5294088050314465, "grad_norm": 0.8072880506515503, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7836796641349792, "reward_std": 0.148716039955616, "rewards/accuracy_reward": 0.793883740901947, "rewards/format_reward": 0.9897959232330322, "step": 5261 }, { "completion_length": 226.35713958740234, "epoch": 0.5295094339622641, "grad_norm": 0.6082744002342224, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.87852281332016, "reward_std": 0.08827473223209381, "rewards/accuracy_reward": 0.8785228133201599, "rewards/format_reward": 1.0, "step": 5262 }, { "completion_length": 242.47958374023438, "epoch": 0.5296100628930818, "grad_norm": 0.7332689762115479, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8092970252037048, "reward_std": 0.1155029684305191, "rewards/accuracy_reward": 0.8092970550060272, "rewards/format_reward": 1.0, "step": 5263 }, { "completion_length": 178.2142791748047, "epoch": 0.5297106918238994, "grad_norm": 0.8056946992874146, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8679574131965637, "reward_std": 0.1563028022646904, "rewards/accuracy_reward": 0.8781614899635315, "rewards/format_reward": 0.9897959232330322, "step": 5264 }, { "completion_length": 227.11224365234375, "epoch": 0.5298113207547169, "grad_norm": 0.5178210139274597, "kl": 0.095703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8690584301948547, "reward_std": 0.0746341273188591, "rewards/accuracy_reward": 0.8690583407878876, "rewards/format_reward": 1.0, "step": 5265 }, { "completion_length": 209.14285278320312, "epoch": 0.5299119496855346, "grad_norm": 0.4666619598865509, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7819017171859741, "reward_std": 0.1000908836722374, "rewards/accuracy_reward": 0.7921058237552643, "rewards/format_reward": 0.9897959232330322, "step": 5266 }, { "completion_length": 245.87754821777344, "epoch": 0.5300125786163522, "grad_norm": 0.3280256688594818, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8852933049201965, "reward_std": 0.03379400400444865, "rewards/accuracy_reward": 0.8954972624778748, "rewards/format_reward": 0.9897959232330322, "step": 5267 }, { "completion_length": 270.3775405883789, "epoch": 0.5301132075471698, "grad_norm": 3.5154008865356445, "kl": 0.1552734375, "learning_rate": 1e-06, "loss": 0.0062, "reward": 1.6345598697662354, "reward_std": 0.23805158585309982, "rewards/accuracy_reward": 0.6651721000671387, "rewards/format_reward": 0.9693877398967743, "step": 5268 }, { "completion_length": 248.57142639160156, "epoch": 0.5302138364779875, "grad_norm": 0.7626384496688843, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.659176528453827, "reward_std": 0.12589940754696727, "rewards/accuracy_reward": 0.6693806052207947, "rewards/format_reward": 0.9897959232330322, "step": 5269 }, { "completion_length": 304.83673095703125, "epoch": 0.530314465408805, "grad_norm": 0.23641785979270935, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8709028959274292, "reward_std": 0.04320027679204941, "rewards/accuracy_reward": 0.881106972694397, "rewards/format_reward": 0.9897959232330322, "step": 5270 }, { "completion_length": 197.9081573486328, "epoch": 0.5304150943396226, "grad_norm": 0.8508731722831726, "kl": 0.09912109375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8354145288467407, "reward_std": 0.14502724632620811, "rewards/accuracy_reward": 0.8456186652183533, "rewards/format_reward": 0.9897959232330322, "step": 5271 }, { "completion_length": 311.5408172607422, "epoch": 0.5305157232704403, "grad_norm": 0.3792566955089569, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7653061151504517, "reward_std": 0.05399492383003235, "rewards/accuracy_reward": 0.7653061151504517, "rewards/format_reward": 1.0, "step": 5272 }, { "completion_length": 216.24488830566406, "epoch": 0.5306163522012579, "grad_norm": 0.39751318097114563, "kl": 0.0687255859375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7813410758972168, "reward_std": 0.0969010517001152, "rewards/accuracy_reward": 0.7813411056995392, "rewards/format_reward": 1.0, "step": 5273 }, { "completion_length": 198.1836700439453, "epoch": 0.5307169811320754, "grad_norm": 0.7636528611183167, "kl": 0.10595703125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8763612508773804, "reward_std": 0.16075430065393448, "rewards/accuracy_reward": 0.8865653574466705, "rewards/format_reward": 0.9897959232330322, "step": 5274 }, { "completion_length": 241.82652282714844, "epoch": 0.5308176100628931, "grad_norm": 0.781901478767395, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8673468828201294, "reward_std": 0.12370206788182259, "rewards/accuracy_reward": 0.8775509893894196, "rewards/format_reward": 0.9897959232330322, "step": 5275 }, { "completion_length": 256.84693145751953, "epoch": 0.5309182389937107, "grad_norm": 0.36454105377197266, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6939024925231934, "reward_std": 0.1110931895673275, "rewards/accuracy_reward": 0.7143106758594513, "rewards/format_reward": 0.9795918166637421, "step": 5276 }, { "completion_length": 235.06122589111328, "epoch": 0.5310188679245283, "grad_norm": 0.7393170595169067, "kl": 0.09326171875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7708274126052856, "reward_std": 0.11070775240659714, "rewards/accuracy_reward": 0.770827442407608, "rewards/format_reward": 1.0, "step": 5277 }, { "completion_length": 266.9387741088867, "epoch": 0.5311194968553459, "grad_norm": 0.46281716227531433, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.9081632494926453, "reward_std": 0.039724184200167656, "rewards/accuracy_reward": 0.90816330909729, "rewards/format_reward": 1.0, "step": 5278 }, { "completion_length": 219.1530532836914, "epoch": 0.5312201257861635, "grad_norm": 0.684807538986206, "kl": 0.110107421875, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7726395726203918, "reward_std": 0.09327061101794243, "rewards/accuracy_reward": 0.7726396024227142, "rewards/format_reward": 1.0, "step": 5279 }, { "completion_length": 213.948974609375, "epoch": 0.5313207547169811, "grad_norm": 0.539188027381897, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.778911530971527, "reward_std": 0.13783666864037514, "rewards/accuracy_reward": 0.799319714307785, "rewards/format_reward": 0.9795918464660645, "step": 5280 }, { "completion_length": 266.2550964355469, "epoch": 0.5314213836477988, "grad_norm": 1.0504660606384277, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7195401191711426, "reward_std": 0.1513393670320511, "rewards/accuracy_reward": 0.7297443151473999, "rewards/format_reward": 0.9897959232330322, "step": 5281 }, { "completion_length": 323.56121826171875, "epoch": 0.5315220125786163, "grad_norm": 0.6020273566246033, "kl": 0.0784912109375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5183050632476807, "reward_std": 0.24299383908510208, "rewards/accuracy_reward": 0.538713276386261, "rewards/format_reward": 0.9795918166637421, "step": 5282 }, { "completion_length": 234.68367767333984, "epoch": 0.531622641509434, "grad_norm": 0.846143901348114, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8411079049110413, "reward_std": 0.0481383390724659, "rewards/accuracy_reward": 0.8411078453063965, "rewards/format_reward": 1.0, "step": 5283 }, { "completion_length": 275.12245178222656, "epoch": 0.5317232704402516, "grad_norm": 0.7449972033500671, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.5475762486457825, "reward_std": 0.21259286999702454, "rewards/accuracy_reward": 0.5679845064878464, "rewards/format_reward": 0.9795918166637421, "step": 5284 }, { "completion_length": 282.1836700439453, "epoch": 0.5318238993710692, "grad_norm": 1.4264830350875854, "kl": 0.0692138671875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.4890934824943542, "reward_std": 0.23361147940158844, "rewards/accuracy_reward": 0.5095017105340958, "rewards/format_reward": 0.9795918166637421, "step": 5285 }, { "completion_length": 178.78571319580078, "epoch": 0.5319245283018867, "grad_norm": 1.188994288444519, "kl": 0.1009521484375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.87316232919693, "reward_std": 0.092153025791049, "rewards/accuracy_reward": 0.8731623291969299, "rewards/format_reward": 1.0, "step": 5286 }, { "completion_length": 285.0306091308594, "epoch": 0.5320251572327044, "grad_norm": 0.6467674970626831, "kl": 0.0672607421875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8326228260993958, "reward_std": 0.10727612301707268, "rewards/accuracy_reward": 0.8428269624710083, "rewards/format_reward": 0.9897959232330322, "step": 5287 }, { "completion_length": 351.7346954345703, "epoch": 0.532125786163522, "grad_norm": 1.208949327468872, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6134862303733826, "reward_std": 0.24906117469072342, "rewards/accuracy_reward": 0.6440983712673187, "rewards/format_reward": 0.9693877398967743, "step": 5288 }, { "completion_length": 225.5408172607422, "epoch": 0.5322264150943397, "grad_norm": 0.5086024403572083, "kl": 0.10791015625, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.8240363001823425, "reward_std": 0.07298501208424568, "rewards/accuracy_reward": 0.8342403769493103, "rewards/format_reward": 0.9897959232330322, "step": 5289 }, { "completion_length": 257.448974609375, "epoch": 0.5323270440251572, "grad_norm": 0.8382519483566284, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.814285695552826, "reward_std": 0.15111637860536575, "rewards/accuracy_reward": 0.8448979258537292, "rewards/format_reward": 0.9693877398967743, "step": 5290 }, { "completion_length": 279.0918426513672, "epoch": 0.5324276729559748, "grad_norm": 0.873874843120575, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6348155736923218, "reward_std": 0.19696108996868134, "rewards/accuracy_reward": 0.6348156034946442, "rewards/format_reward": 1.0, "step": 5291 }, { "completion_length": 233.87754821777344, "epoch": 0.5325283018867925, "grad_norm": 0.47965946793556213, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.836219310760498, "reward_std": 0.0654619513079524, "rewards/accuracy_reward": 0.8362193405628204, "rewards/format_reward": 1.0, "step": 5292 }, { "completion_length": 233.83672332763672, "epoch": 0.5326289308176101, "grad_norm": 1.201819658279419, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8728057146072388, "reward_std": 0.16008037328720093, "rewards/accuracy_reward": 0.8830098509788513, "rewards/format_reward": 0.9897959232330322, "step": 5293 }, { "completion_length": 251.35713958740234, "epoch": 0.5327295597484277, "grad_norm": 0.7485486268997192, "kl": 0.0714111328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7477535605430603, "reward_std": 0.16482193395495415, "rewards/accuracy_reward": 0.7579576969146729, "rewards/format_reward": 0.9897959232330322, "step": 5294 }, { "completion_length": 298.1836700439453, "epoch": 0.5328301886792453, "grad_norm": 0.8922075033187866, "kl": 0.0755615234375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.614657461643219, "reward_std": 0.17205801606178284, "rewards/accuracy_reward": 0.6248615086078644, "rewards/format_reward": 0.9897959232330322, "step": 5295 }, { "completion_length": 230.88774871826172, "epoch": 0.5329308176100629, "grad_norm": 1.3864432573318481, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7062076330184937, "reward_std": 0.13257763534784317, "rewards/accuracy_reward": 0.7470239400863647, "rewards/format_reward": 0.9591836631298065, "step": 5296 }, { "completion_length": 175.55101776123047, "epoch": 0.5330314465408805, "grad_norm": 0.7524858117103577, "kl": 0.145751953125, "learning_rate": 1e-06, "loss": 0.0058, "reward": 1.8184556365013123, "reward_std": 0.06481175869703293, "rewards/accuracy_reward": 0.8184556663036346, "rewards/format_reward": 1.0, "step": 5297 }, { "completion_length": 225.14285278320312, "epoch": 0.5331320754716982, "grad_norm": 0.4289589524269104, "kl": 0.056640625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7889941334724426, "reward_std": 0.05343756452202797, "rewards/accuracy_reward": 0.788994163274765, "rewards/format_reward": 1.0, "step": 5298 }, { "completion_length": 206.20407104492188, "epoch": 0.5332327044025157, "grad_norm": 0.7782706618309021, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7499999403953552, "reward_std": 0.10932519286870956, "rewards/accuracy_reward": 0.7602040767669678, "rewards/format_reward": 0.9897959232330322, "step": 5299 }, { "completion_length": 290.7653045654297, "epoch": 0.5333333333333333, "grad_norm": 1.218931794166565, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6639941334724426, "reward_std": 0.2067907527089119, "rewards/accuracy_reward": 0.7048105001449585, "rewards/format_reward": 0.9591836631298065, "step": 5300 }, { "completion_length": 330.3163299560547, "epoch": 0.533433962264151, "grad_norm": 0.5475854277610779, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.733916461467743, "reward_std": 0.19836997985839844, "rewards/accuracy_reward": 0.7543246150016785, "rewards/format_reward": 0.9795918464660645, "step": 5301 }, { "completion_length": 258.2040786743164, "epoch": 0.5335345911949686, "grad_norm": 0.5213103294372559, "kl": 0.054931640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.652380883693695, "reward_std": 0.07697796169668436, "rewards/accuracy_reward": 0.6523809134960175, "rewards/format_reward": 1.0, "step": 5302 }, { "completion_length": 225.23468780517578, "epoch": 0.5336352201257861, "grad_norm": 0.6617404222488403, "kl": 0.0697021484375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.9365484118461609, "reward_std": 0.06942924484610558, "rewards/accuracy_reward": 0.9365484118461609, "rewards/format_reward": 1.0, "step": 5303 }, { "completion_length": 211.2142791748047, "epoch": 0.5337358490566038, "grad_norm": 0.5753795504570007, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8095403909683228, "reward_std": 0.13718444108963013, "rewards/accuracy_reward": 0.8095404803752899, "rewards/format_reward": 1.0, "step": 5304 }, { "completion_length": 221.4387664794922, "epoch": 0.5338364779874214, "grad_norm": 0.5967491865158081, "kl": 0.0615234375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7724003195762634, "reward_std": 0.14450311660766602, "rewards/accuracy_reward": 0.7826044261455536, "rewards/format_reward": 0.9897959232330322, "step": 5305 }, { "completion_length": 179.05101776123047, "epoch": 0.533937106918239, "grad_norm": 14127.517578125, "kl": 170.03125, "learning_rate": 1e-06, "loss": 6.7907, "reward": 1.82845801115036, "reward_std": 0.19327611476182938, "rewards/accuracy_reward": 0.848866194486618, "rewards/format_reward": 0.9795918464660645, "step": 5306 }, { "completion_length": 221.0816192626953, "epoch": 0.5340377358490566, "grad_norm": 1.074134349822998, "kl": 0.09033203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7566730976104736, "reward_std": 0.17161405086517334, "rewards/accuracy_reward": 0.756673127412796, "rewards/format_reward": 1.0, "step": 5307 }, { "completion_length": 266.67346954345703, "epoch": 0.5341383647798742, "grad_norm": 0.6826211810112, "kl": 0.0650634765625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7139724493026733, "reward_std": 0.22834782302379608, "rewards/accuracy_reward": 0.7241765856742859, "rewards/format_reward": 0.9897959232330322, "step": 5308 }, { "completion_length": 310.8673324584961, "epoch": 0.5342389937106918, "grad_norm": 0.5474175214767456, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.875996172428131, "reward_std": 0.1457301303744316, "rewards/accuracy_reward": 0.896404355764389, "rewards/format_reward": 0.9795918166637421, "step": 5309 }, { "completion_length": 260.0204086303711, "epoch": 0.5343396226415095, "grad_norm": 0.6574349999427795, "kl": 0.065185546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8102040886878967, "reward_std": 0.13204443454742432, "rewards/accuracy_reward": 0.8204081654548645, "rewards/format_reward": 0.9897959232330322, "step": 5310 }, { "completion_length": 300.55101013183594, "epoch": 0.534440251572327, "grad_norm": 1.2467867136001587, "kl": 0.1083984375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.738308608531952, "reward_std": 0.2611820325255394, "rewards/accuracy_reward": 0.7587167620658875, "rewards/format_reward": 0.9795918464660645, "step": 5311 }, { "completion_length": 255.7040786743164, "epoch": 0.5345408805031446, "grad_norm": 0.7738228440284729, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.75492525100708, "reward_std": 0.17893773317337036, "rewards/accuracy_reward": 0.7753334641456604, "rewards/format_reward": 0.9795918464660645, "step": 5312 }, { "completion_length": 208.82652282714844, "epoch": 0.5346415094339623, "grad_norm": 1.1489546298980713, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7179343700408936, "reward_std": 0.22938800603151321, "rewards/accuracy_reward": 0.7383425533771515, "rewards/format_reward": 0.9795918464660645, "step": 5313 }, { "completion_length": 319.4897918701172, "epoch": 0.5347421383647799, "grad_norm": 1.7123242616653442, "kl": 0.0675048828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6460575461387634, "reward_std": 0.1555232983082533, "rewards/accuracy_reward": 0.6562616527080536, "rewards/format_reward": 0.9897959232330322, "step": 5314 }, { "completion_length": 326.1326599121094, "epoch": 0.5348427672955974, "grad_norm": 0.5922532677650452, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.741111159324646, "reward_std": 0.08708547055721283, "rewards/accuracy_reward": 0.7411112487316132, "rewards/format_reward": 1.0, "step": 5315 }, { "completion_length": 271.5102005004883, "epoch": 0.5349433962264151, "grad_norm": 1.106450080871582, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6761595606803894, "reward_std": 0.2728807181119919, "rewards/accuracy_reward": 0.7271799445152283, "rewards/format_reward": 0.9489795863628387, "step": 5316 }, { "completion_length": 353.74488830566406, "epoch": 0.5350440251572327, "grad_norm": 0.5163819193840027, "kl": 0.0631103515625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6833288669586182, "reward_std": 0.23800313472747803, "rewards/accuracy_reward": 0.7139410972595215, "rewards/format_reward": 0.9693877398967743, "step": 5317 }, { "completion_length": 292.33673095703125, "epoch": 0.5351446540880503, "grad_norm": 0.6176170706748962, "kl": 0.100341796875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6796367168426514, "reward_std": 0.15565603598952293, "rewards/accuracy_reward": 0.6898407638072968, "rewards/format_reward": 0.9897959232330322, "step": 5318 }, { "completion_length": 269.4897994995117, "epoch": 0.535245283018868, "grad_norm": 0.5860708951950073, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8155226111412048, "reward_std": 0.13348162546753883, "rewards/accuracy_reward": 0.8155226409435272, "rewards/format_reward": 1.0, "step": 5319 }, { "completion_length": 205.74488830566406, "epoch": 0.5353459119496855, "grad_norm": 1.1184033155441284, "kl": 0.093994140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7773035168647766, "reward_std": 0.14373598247766495, "rewards/accuracy_reward": 0.7773036360740662, "rewards/format_reward": 1.0, "step": 5320 }, { "completion_length": 241.58162689208984, "epoch": 0.5354465408805031, "grad_norm": 0.7629849314689636, "kl": 0.08740234375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.9217331409454346, "reward_std": 0.10445507988333702, "rewards/accuracy_reward": 0.9421413838863373, "rewards/format_reward": 0.9795918166637421, "step": 5321 }, { "completion_length": 243.61223602294922, "epoch": 0.5355471698113208, "grad_norm": 1.0121593475341797, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8142467737197876, "reward_std": 0.17956281453371048, "rewards/accuracy_reward": 0.81424680352211, "rewards/format_reward": 1.0, "step": 5322 }, { "completion_length": 325.3877410888672, "epoch": 0.5356477987421384, "grad_norm": 0.42995893955230713, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6652331948280334, "reward_std": 0.1338164247572422, "rewards/accuracy_reward": 0.6754372715950012, "rewards/format_reward": 0.9897959232330322, "step": 5323 }, { "completion_length": 271.7346878051758, "epoch": 0.5357484276729559, "grad_norm": 0.5562853217124939, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6802590489387512, "reward_std": 0.23140480369329453, "rewards/accuracy_reward": 0.7210754156112671, "rewards/format_reward": 0.9591836333274841, "step": 5324 }, { "completion_length": 268.02040100097656, "epoch": 0.5358490566037736, "grad_norm": 0.8378008604049683, "kl": 0.0714111328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.809147298336029, "reward_std": 0.14000224135816097, "rewards/accuracy_reward": 0.8091472685337067, "rewards/format_reward": 1.0, "step": 5325 }, { "completion_length": 280.83673095703125, "epoch": 0.5359496855345912, "grad_norm": 0.3124403953552246, "kl": 0.062744140625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6236494183540344, "reward_std": 0.05717109702527523, "rewards/accuracy_reward": 0.6338535100221634, "rewards/format_reward": 0.9897959232330322, "step": 5326 }, { "completion_length": 251.02040100097656, "epoch": 0.5360503144654089, "grad_norm": 1.5784170627593994, "kl": 0.123779296875, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.7368849515914917, "reward_std": 0.2417835295200348, "rewards/accuracy_reward": 0.7470889985561371, "rewards/format_reward": 0.9897959232330322, "step": 5327 }, { "completion_length": 251.46939086914062, "epoch": 0.5361509433962264, "grad_norm": 1.1560720205307007, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8061224222183228, "reward_std": 0.21920377016067505, "rewards/accuracy_reward": 0.8163264989852905, "rewards/format_reward": 0.9897959232330322, "step": 5328 }, { "completion_length": 271.8775405883789, "epoch": 0.536251572327044, "grad_norm": 1.1045633554458618, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7015793323516846, "reward_std": 0.2019156515598297, "rewards/accuracy_reward": 0.7321915924549103, "rewards/format_reward": 0.9693877398967743, "step": 5329 }, { "completion_length": 260.5102005004883, "epoch": 0.5363522012578616, "grad_norm": 1.7020936012268066, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8479619026184082, "reward_std": 0.12192308530211449, "rewards/accuracy_reward": 0.847961962223053, "rewards/format_reward": 1.0, "step": 5330 }, { "completion_length": 249.75509643554688, "epoch": 0.5364528301886793, "grad_norm": 1.2547690868377686, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6794086694717407, "reward_std": 0.20143817365169525, "rewards/accuracy_reward": 0.6896127462387085, "rewards/format_reward": 0.9897959232330322, "step": 5331 }, { "completion_length": 287.99998474121094, "epoch": 0.5365534591194968, "grad_norm": 0.6225063800811768, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.717818796634674, "reward_std": 0.19453341886401176, "rewards/accuracy_reward": 0.7280229330062866, "rewards/format_reward": 0.9897959232330322, "step": 5332 }, { "completion_length": 248.53060150146484, "epoch": 0.5366540880503144, "grad_norm": 0.6841340065002441, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7200116515159607, "reward_std": 0.20768891274929047, "rewards/accuracy_reward": 0.760828047990799, "rewards/format_reward": 0.9591836631298065, "step": 5333 }, { "completion_length": 212.59182739257812, "epoch": 0.5367547169811321, "grad_norm": 0.9253286719322205, "kl": 0.0606689453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7782930731773376, "reward_std": 0.09120957273989916, "rewards/accuracy_reward": 0.7782931327819824, "rewards/format_reward": 1.0, "step": 5334 }, { "completion_length": 262.34693908691406, "epoch": 0.5368553459119497, "grad_norm": 0.39816755056381226, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7551020383834839, "reward_std": 0.09217509627342224, "rewards/accuracy_reward": 0.7653061151504517, "rewards/format_reward": 0.9897959232330322, "step": 5335 }, { "completion_length": 274.0, "epoch": 0.5369559748427672, "grad_norm": 0.5174372792243958, "kl": 0.09033203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8034528493881226, "reward_std": 0.1975226327776909, "rewards/accuracy_reward": 0.8136569559574127, "rewards/format_reward": 0.9897959232330322, "step": 5336 }, { "completion_length": 236.9285659790039, "epoch": 0.5370566037735849, "grad_norm": 0.5916594862937927, "kl": 0.12744140625, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.7814912796020508, "reward_std": 0.12592117488384247, "rewards/accuracy_reward": 0.8121035099029541, "rewards/format_reward": 0.9693877398967743, "step": 5337 }, { "completion_length": 298.2346954345703, "epoch": 0.5371572327044025, "grad_norm": 0.4742181897163391, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8739081025123596, "reward_std": 0.09202049300074577, "rewards/accuracy_reward": 0.8943161964416504, "rewards/format_reward": 0.9795918166637421, "step": 5338 }, { "completion_length": 270.07141876220703, "epoch": 0.5372578616352202, "grad_norm": 0.32696300745010376, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.9625850319862366, "reward_std": 0.09899069368839264, "rewards/accuracy_reward": 0.9727891385555267, "rewards/format_reward": 0.9897959232330322, "step": 5339 }, { "completion_length": 239.7040786743164, "epoch": 0.5373584905660377, "grad_norm": 1.0275415182113647, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7099300622940063, "reward_std": 0.19795990735292435, "rewards/accuracy_reward": 0.7303383350372314, "rewards/format_reward": 0.9795918166637421, "step": 5340 }, { "completion_length": 290.9387664794922, "epoch": 0.5374591194968553, "grad_norm": 0.6757930517196655, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.662459135055542, "reward_std": 0.09361834824085236, "rewards/accuracy_reward": 0.662459135055542, "rewards/format_reward": 1.0, "step": 5341 }, { "completion_length": 272.39796447753906, "epoch": 0.537559748427673, "grad_norm": 0.9526646137237549, "kl": 0.104736328125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.6634817123413086, "reward_std": 0.1823643334209919, "rewards/accuracy_reward": 0.6736857891082764, "rewards/format_reward": 0.9897959232330322, "step": 5342 }, { "completion_length": 278.5102081298828, "epoch": 0.5376603773584906, "grad_norm": 0.8318702578544617, "kl": 0.10400390625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7303637266159058, "reward_std": 0.16635307669639587, "rewards/accuracy_reward": 0.7405679821968079, "rewards/format_reward": 0.9897959232330322, "step": 5343 }, { "completion_length": 222.9795913696289, "epoch": 0.5377610062893082, "grad_norm": 0.8395318388938904, "kl": 0.102783203125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.735033929347992, "reward_std": 0.14564407989382744, "rewards/accuracy_reward": 0.7554421722888947, "rewards/format_reward": 0.9795918166637421, "step": 5344 }, { "completion_length": 287.7346954345703, "epoch": 0.5378616352201258, "grad_norm": 0.6198530793190002, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.799562692642212, "reward_std": 0.20900465548038483, "rewards/accuracy_reward": 0.8199708163738251, "rewards/format_reward": 0.9795918464660645, "step": 5345 }, { "completion_length": 248.45917510986328, "epoch": 0.5379622641509434, "grad_norm": 0.5654260516166687, "kl": 0.14453125, "learning_rate": 1e-06, "loss": 0.0058, "reward": 1.7292987704277039, "reward_std": 0.1277499832212925, "rewards/accuracy_reward": 0.7395028173923492, "rewards/format_reward": 0.9897959232330322, "step": 5346 }, { "completion_length": 261.0816345214844, "epoch": 0.538062893081761, "grad_norm": 0.7368282079696655, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6698088645935059, "reward_std": 0.18142931908369064, "rewards/accuracy_reward": 0.6902170479297638, "rewards/format_reward": 0.9795918464660645, "step": 5347 }, { "completion_length": 291.46937561035156, "epoch": 0.5381635220125787, "grad_norm": 0.4677811563014984, "kl": 0.10888671875, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.5679163932800293, "reward_std": 0.12996318936347961, "rewards/accuracy_reward": 0.5883245766162872, "rewards/format_reward": 0.9795918464660645, "step": 5348 }, { "completion_length": 266.9897918701172, "epoch": 0.5382641509433962, "grad_norm": 0.388339638710022, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6795918345451355, "reward_std": 0.1517886482179165, "rewards/accuracy_reward": 0.6897958815097809, "rewards/format_reward": 0.9897959232330322, "step": 5349 }, { "completion_length": 262.9795913696289, "epoch": 0.5383647798742138, "grad_norm": 0.634157121181488, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7967820167541504, "reward_std": 0.09703298658132553, "rewards/accuracy_reward": 0.8069860935211182, "rewards/format_reward": 0.9897959232330322, "step": 5350 }, { "completion_length": 286.2448959350586, "epoch": 0.5384654088050315, "grad_norm": 1.7317438125610352, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6245700120925903, "reward_std": 0.16630146652460098, "rewards/accuracy_reward": 0.6347741186618805, "rewards/format_reward": 0.9897959232330322, "step": 5351 }, { "completion_length": 204.9693832397461, "epoch": 0.5385660377358491, "grad_norm": 0.35974088311195374, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8301395773887634, "reward_std": 0.06104031251743436, "rewards/accuracy_reward": 0.8301395773887634, "rewards/format_reward": 1.0, "step": 5352 }, { "completion_length": 238.9693832397461, "epoch": 0.5386666666666666, "grad_norm": 1.3026378154754639, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6249759197235107, "reward_std": 0.20412246137857437, "rewards/accuracy_reward": 0.6453840732574463, "rewards/format_reward": 0.9795918464660645, "step": 5353 }, { "completion_length": 203.9897918701172, "epoch": 0.5387672955974843, "grad_norm": 1.7852897644042969, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8273705840110779, "reward_std": 0.19870394840836525, "rewards/accuracy_reward": 0.847778707742691, "rewards/format_reward": 0.9795918464660645, "step": 5354 }, { "completion_length": 235.30611419677734, "epoch": 0.5388679245283019, "grad_norm": 0.7575198411941528, "kl": 0.099365234375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8857142925262451, "reward_std": 0.11548355966806412, "rewards/accuracy_reward": 0.906122475862503, "rewards/format_reward": 0.9795918464660645, "step": 5355 }, { "completion_length": 303.27549743652344, "epoch": 0.5389685534591195, "grad_norm": 0.47135964035987854, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6162165403366089, "reward_std": 0.17464439570903778, "rewards/accuracy_reward": 0.6570328772068024, "rewards/format_reward": 0.9591836631298065, "step": 5356 }, { "completion_length": 192.16326141357422, "epoch": 0.5390691823899371, "grad_norm": 0.6102521419525146, "kl": 0.093994140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8730335235595703, "reward_std": 0.12275340780615807, "rewards/accuracy_reward": 0.8832376301288605, "rewards/format_reward": 0.9897959232330322, "step": 5357 }, { "completion_length": 342.9387664794922, "epoch": 0.5391698113207547, "grad_norm": 0.4791650176048279, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6370593309402466, "reward_std": 0.1310860626399517, "rewards/accuracy_reward": 0.6472634375095367, "rewards/format_reward": 0.9897959232330322, "step": 5358 }, { "completion_length": 322.10203552246094, "epoch": 0.5392704402515723, "grad_norm": 0.5924844145774841, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.699524700641632, "reward_std": 0.23525851219892502, "rewards/accuracy_reward": 0.7403410971164703, "rewards/format_reward": 0.9591836333274841, "step": 5359 }, { "completion_length": 240.1836700439453, "epoch": 0.53937106918239, "grad_norm": 0.7238698601722717, "kl": 0.0792236328125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8274375796318054, "reward_std": 0.13319055177271366, "rewards/accuracy_reward": 0.8274376094341278, "rewards/format_reward": 1.0, "step": 5360 }, { "completion_length": 236.33673095703125, "epoch": 0.5394716981132075, "grad_norm": 0.910508930683136, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6779334545135498, "reward_std": 0.22484664618968964, "rewards/accuracy_reward": 0.6881375908851624, "rewards/format_reward": 0.9897959232330322, "step": 5361 }, { "completion_length": 159.4387664794922, "epoch": 0.5395723270440251, "grad_norm": 0.7892806529998779, "kl": 0.106689453125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7036553025245667, "reward_std": 0.11516211926937103, "rewards/accuracy_reward": 0.7240634262561798, "rewards/format_reward": 0.9795918166637421, "step": 5362 }, { "completion_length": 220.31631469726562, "epoch": 0.5396729559748428, "grad_norm": 0.3876221179962158, "kl": 0.0721435546875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.757344901561737, "reward_std": 0.10061325691640377, "rewards/accuracy_reward": 0.7675490379333496, "rewards/format_reward": 0.9897959232330322, "step": 5363 }, { "completion_length": 218.26531219482422, "epoch": 0.5397735849056604, "grad_norm": 0.4577867090702057, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.899999976158142, "reward_std": 0.08044970780611038, "rewards/accuracy_reward": 0.9102040529251099, "rewards/format_reward": 0.9897959232330322, "step": 5364 }, { "completion_length": 283.4693832397461, "epoch": 0.5398742138364779, "grad_norm": 0.4105808138847351, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.741560935974121, "reward_std": 0.16586389765143394, "rewards/accuracy_reward": 0.7721732258796692, "rewards/format_reward": 0.9693877398967743, "step": 5365 }, { "completion_length": 248.74488830566406, "epoch": 0.5399748427672956, "grad_norm": 0.4463846683502197, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8163264989852905, "reward_std": 0.09217509999871254, "rewards/accuracy_reward": 0.8265306055545807, "rewards/format_reward": 0.9897959232330322, "step": 5366 }, { "completion_length": 256.6428527832031, "epoch": 0.5400754716981132, "grad_norm": 1.0019164085388184, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.608494520187378, "reward_std": 0.1907455176115036, "rewards/accuracy_reward": 0.6391068398952484, "rewards/format_reward": 0.9693877398967743, "step": 5367 }, { "completion_length": 366.61224365234375, "epoch": 0.5401761006289308, "grad_norm": 0.38127851486206055, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.727365493774414, "reward_std": 0.07199140265583992, "rewards/accuracy_reward": 0.7375695109367371, "rewards/format_reward": 0.9897959232330322, "step": 5368 }, { "completion_length": 265.7959213256836, "epoch": 0.5402767295597485, "grad_norm": 0.8744924664497375, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7077986598014832, "reward_std": 0.21753347665071487, "rewards/accuracy_reward": 0.7180027663707733, "rewards/format_reward": 0.9897959232330322, "step": 5369 }, { "completion_length": 252.4897918701172, "epoch": 0.540377358490566, "grad_norm": 0.8342390656471252, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8272715210914612, "reward_std": 0.11009008809924126, "rewards/accuracy_reward": 0.8272716403007507, "rewards/format_reward": 1.0, "step": 5370 }, { "completion_length": 230.14285278320312, "epoch": 0.5404779874213836, "grad_norm": 0.3710377514362335, "kl": 0.0853271484375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.9195963740348816, "reward_std": 0.10174901410937309, "rewards/accuracy_reward": 0.9298005104064941, "rewards/format_reward": 0.9897959232330322, "step": 5371 }, { "completion_length": 246.75509643554688, "epoch": 0.5405786163522013, "grad_norm": 0.25536730885505676, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.825860619544983, "reward_std": 0.046236214227974415, "rewards/accuracy_reward": 0.8258606195449829, "rewards/format_reward": 1.0, "step": 5372 }, { "completion_length": 204.35713958740234, "epoch": 0.5406792452830189, "grad_norm": 0.4895123541355133, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.9372448325157166, "reward_std": 0.10562878102064133, "rewards/accuracy_reward": 0.9474489390850067, "rewards/format_reward": 0.9897959232330322, "step": 5373 }, { "completion_length": 247.73468017578125, "epoch": 0.5407798742138364, "grad_norm": 0.4842635989189148, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8328474164009094, "reward_std": 0.10507797077298164, "rewards/accuracy_reward": 0.8430515229701996, "rewards/format_reward": 0.9897959232330322, "step": 5374 }, { "completion_length": 307.87754821777344, "epoch": 0.5408805031446541, "grad_norm": 0.7227833867073059, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6102349758148193, "reward_std": 0.17994852364063263, "rewards/accuracy_reward": 0.640847235918045, "rewards/format_reward": 0.9693877398967743, "step": 5375 }, { "completion_length": 245.71428680419922, "epoch": 0.5409811320754717, "grad_norm": 0.6015341877937317, "kl": 0.14208984375, "learning_rate": 1e-06, "loss": 0.0057, "reward": 1.663692593574524, "reward_std": 0.1817108392715454, "rewards/accuracy_reward": 0.694304883480072, "rewards/format_reward": 0.9693877398967743, "step": 5376 }, { "completion_length": 221.7142791748047, "epoch": 0.5410817610062894, "grad_norm": 0.7472147345542908, "kl": 0.113037109375, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7741024494171143, "reward_std": 0.14943023025989532, "rewards/accuracy_reward": 0.8047148287296295, "rewards/format_reward": 0.9693877398967743, "step": 5377 }, { "completion_length": 206.4285659790039, "epoch": 0.5411823899371069, "grad_norm": 0.4030730128288269, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.9183672666549683, "reward_std": 0.06628002412617207, "rewards/accuracy_reward": 0.918367326259613, "rewards/format_reward": 1.0, "step": 5378 }, { "completion_length": 291.4897918701172, "epoch": 0.5412830188679245, "grad_norm": 0.5231472253799438, "kl": 0.0731201171875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7767573595046997, "reward_std": 0.124642513692379, "rewards/accuracy_reward": 0.7869614362716675, "rewards/format_reward": 0.9897959232330322, "step": 5379 }, { "completion_length": 178.87754821777344, "epoch": 0.5413836477987422, "grad_norm": 1.3102667331695557, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7291059494018555, "reward_std": 0.16704729199409485, "rewards/accuracy_reward": 0.7291058897972107, "rewards/format_reward": 1.0, "step": 5380 }, { "completion_length": 206.9285659790039, "epoch": 0.5414842767295598, "grad_norm": 0.8544938564300537, "kl": 0.09375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6909090876579285, "reward_std": 0.20238276571035385, "rewards/accuracy_reward": 0.6909090876579285, "rewards/format_reward": 1.0, "step": 5381 }, { "completion_length": 240.2142791748047, "epoch": 0.5415849056603773, "grad_norm": 0.7651366591453552, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7772108912467957, "reward_std": 0.19930745661258698, "rewards/accuracy_reward": 0.7976190447807312, "rewards/format_reward": 0.9795918464660645, "step": 5382 }, { "completion_length": 237.33673858642578, "epoch": 0.541685534591195, "grad_norm": 0.5504235029220581, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6920663714408875, "reward_std": 0.07721907645463943, "rewards/accuracy_reward": 0.6920664012432098, "rewards/format_reward": 1.0, "step": 5383 }, { "completion_length": 245.34693908691406, "epoch": 0.5417861635220126, "grad_norm": 0.737220048904419, "kl": 0.09130859375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.759134829044342, "reward_std": 0.16642221435904503, "rewards/accuracy_reward": 0.7795431017875671, "rewards/format_reward": 0.9795918464660645, "step": 5384 }, { "completion_length": 260.1428527832031, "epoch": 0.5418867924528302, "grad_norm": 0.47002652287483215, "kl": 0.0499267578125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.8134065866470337, "reward_std": 0.08196701481938362, "rewards/accuracy_reward": 0.8236107230186462, "rewards/format_reward": 0.9897959232330322, "step": 5385 }, { "completion_length": 179.1836700439453, "epoch": 0.5419874213836477, "grad_norm": 0.9502768516540527, "kl": 0.105224609375, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.6818954348564148, "reward_std": 0.12682000920176506, "rewards/accuracy_reward": 0.6818954944610596, "rewards/format_reward": 1.0, "step": 5386 }, { "completion_length": 220.61224365234375, "epoch": 0.5420880503144654, "grad_norm": 0.5065683722496033, "kl": 0.0745849609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8318755626678467, "reward_std": 0.10027510859072208, "rewards/accuracy_reward": 0.8420796692371368, "rewards/format_reward": 0.9897959232330322, "step": 5387 }, { "completion_length": 262.9591751098633, "epoch": 0.542188679245283, "grad_norm": 0.8032950758934021, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6904820799827576, "reward_std": 0.18031538277864456, "rewards/accuracy_reward": 0.7006861567497253, "rewards/format_reward": 0.9897959232330322, "step": 5388 }, { "completion_length": 188.0204086303711, "epoch": 0.5422893081761007, "grad_norm": 0.5286028981208801, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8129969835281372, "reward_std": 0.09178251028060913, "rewards/accuracy_reward": 0.8436092138290405, "rewards/format_reward": 0.9693877398967743, "step": 5389 }, { "completion_length": 226.86734008789062, "epoch": 0.5423899371069182, "grad_norm": 0.6679721474647522, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.784548044204712, "reward_std": 0.16045787185430527, "rewards/accuracy_reward": 0.7947521507740021, "rewards/format_reward": 0.9897959232330322, "step": 5390 }, { "completion_length": 236.2448959350586, "epoch": 0.5424905660377358, "grad_norm": 0.9509451389312744, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7214367389678955, "reward_std": 0.21856918931007385, "rewards/accuracy_reward": 0.7316408455371857, "rewards/format_reward": 0.9897959232330322, "step": 5391 }, { "completion_length": 216.31632232666016, "epoch": 0.5425911949685535, "grad_norm": 0.6881844997406006, "kl": 0.10205078125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7436053156852722, "reward_std": 0.06656196527183056, "rewards/accuracy_reward": 0.7436053156852722, "rewards/format_reward": 1.0, "step": 5392 }, { "completion_length": 284.29591369628906, "epoch": 0.5426918238993711, "grad_norm": 1.8135778903961182, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6736364364624023, "reward_std": 0.20444577932357788, "rewards/accuracy_reward": 0.6940446197986603, "rewards/format_reward": 0.9795918166637421, "step": 5393 }, { "completion_length": 245.29590606689453, "epoch": 0.5427924528301887, "grad_norm": 0.7131624221801758, "kl": 0.0596923828125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7653060555458069, "reward_std": 0.10003120452165604, "rewards/accuracy_reward": 0.7653060853481293, "rewards/format_reward": 1.0, "step": 5394 }, { "completion_length": 327.5816192626953, "epoch": 0.5428930817610063, "grad_norm": 0.8629652261734009, "kl": 0.12158203125, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.729641854763031, "reward_std": 0.17242324352264404, "rewards/accuracy_reward": 0.7398459315299988, "rewards/format_reward": 0.9897959232330322, "step": 5395 }, { "completion_length": 251.6326446533203, "epoch": 0.5429937106918239, "grad_norm": 0.7098660469055176, "kl": 0.10009765625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.790374755859375, "reward_std": 0.14972089231014252, "rewards/accuracy_reward": 0.7903746962547302, "rewards/format_reward": 1.0, "step": 5396 }, { "completion_length": 197.17346954345703, "epoch": 0.5430943396226415, "grad_norm": 0.5525109767913818, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8798184990882874, "reward_std": 0.08158425986766815, "rewards/accuracy_reward": 0.8798185586929321, "rewards/format_reward": 1.0, "step": 5397 }, { "completion_length": 260.5816345214844, "epoch": 0.5431949685534592, "grad_norm": 1.176361083984375, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.5798361897468567, "reward_std": 0.22379717975854874, "rewards/accuracy_reward": 0.5798362195491791, "rewards/format_reward": 1.0, "step": 5398 }, { "completion_length": 266.39795684814453, "epoch": 0.5432955974842767, "grad_norm": 0.5950220823287964, "kl": 0.114990234375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.696026623249054, "reward_std": 0.17253416031599045, "rewards/accuracy_reward": 0.7164346873760223, "rewards/format_reward": 0.9795918166637421, "step": 5399 }, { "completion_length": 278.41835021972656, "epoch": 0.5433962264150943, "grad_norm": 0.5196862816810608, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.5595356822013855, "reward_std": 0.12875677831470966, "rewards/accuracy_reward": 0.5697397589683533, "rewards/format_reward": 0.9897959232330322, "step": 5400 }, { "completion_length": 195.4897918701172, "epoch": 0.543496855345912, "grad_norm": 1.6152499914169312, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6028514504432678, "reward_std": 0.14664006605744362, "rewards/accuracy_reward": 0.602851390838623, "rewards/format_reward": 1.0, "step": 5401 }, { "completion_length": 174.09183502197266, "epoch": 0.5435974842767296, "grad_norm": 0.6683408617973328, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8244898319244385, "reward_std": 0.10278992354869843, "rewards/accuracy_reward": 0.8346938490867615, "rewards/format_reward": 0.9897959232330322, "step": 5402 }, { "completion_length": 222.36734008789062, "epoch": 0.5436981132075471, "grad_norm": 0.5570723414421082, "kl": 0.0672607421875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8756957054138184, "reward_std": 0.13989593088626862, "rewards/accuracy_reward": 0.8961038887500763, "rewards/format_reward": 0.9795918464660645, "step": 5403 }, { "completion_length": 223.36734008789062, "epoch": 0.5437987421383648, "grad_norm": 0.5233839750289917, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7979590892791748, "reward_std": 0.09757459163665771, "rewards/accuracy_reward": 0.8183673620223999, "rewards/format_reward": 0.9795918464660645, "step": 5404 }, { "completion_length": 260.61224365234375, "epoch": 0.5438993710691824, "grad_norm": 0.9338873028755188, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.5305346846580505, "reward_std": 0.20873531699180603, "rewards/accuracy_reward": 0.5407387167215347, "rewards/format_reward": 0.9897959232330322, "step": 5405 }, { "completion_length": 211.65306091308594, "epoch": 0.544, "grad_norm": 0.8634466528892517, "kl": 0.108154296875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7687288522720337, "reward_std": 0.21745546907186508, "rewards/accuracy_reward": 0.7789329290390015, "rewards/format_reward": 0.9897959232330322, "step": 5406 }, { "completion_length": 278.2653045654297, "epoch": 0.5441006289308176, "grad_norm": 0.6404840350151062, "kl": 0.0986328125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6107851266860962, "reward_std": 0.13553806394338608, "rewards/accuracy_reward": 0.6107851266860962, "rewards/format_reward": 1.0, "step": 5407 }, { "completion_length": 266.55101776123047, "epoch": 0.5442012578616352, "grad_norm": 0.6689430475234985, "kl": 0.11328125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.654222846031189, "reward_std": 0.09149285405874252, "rewards/accuracy_reward": 0.6542229652404785, "rewards/format_reward": 1.0, "step": 5408 }, { "completion_length": 196.45917510986328, "epoch": 0.5443018867924528, "grad_norm": 1.006550908088684, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8433106541633606, "reward_std": 0.19275611639022827, "rewards/accuracy_reward": 0.8433106243610382, "rewards/format_reward": 1.0, "step": 5409 }, { "completion_length": 176.41836547851562, "epoch": 0.5444025157232705, "grad_norm": 1.0503804683685303, "kl": 0.11376953125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.863095223903656, "reward_std": 0.11926737427711487, "rewards/accuracy_reward": 0.8630952537059784, "rewards/format_reward": 1.0, "step": 5410 }, { "completion_length": 207.87754821777344, "epoch": 0.544503144654088, "grad_norm": 0.5511671900749207, "kl": 0.108642578125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7222222089767456, "reward_std": 0.11746398359537125, "rewards/accuracy_reward": 0.7222221791744232, "rewards/format_reward": 1.0, "step": 5411 }, { "completion_length": 264.2142791748047, "epoch": 0.5446037735849056, "grad_norm": 2.2681729793548584, "kl": 0.0692138671875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7189238667488098, "reward_std": 0.14316972345113754, "rewards/accuracy_reward": 0.7291279733181, "rewards/format_reward": 0.9897959232330322, "step": 5412 }, { "completion_length": 239.63265228271484, "epoch": 0.5447044025157233, "grad_norm": 0.8386971950531006, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7199691534042358, "reward_std": 0.15884927287697792, "rewards/accuracy_reward": 0.719969242811203, "rewards/format_reward": 1.0, "step": 5413 }, { "completion_length": 268.8571319580078, "epoch": 0.5448050314465409, "grad_norm": 0.6280302405357361, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8787453770637512, "reward_std": 0.064787482842803, "rewards/accuracy_reward": 0.8889495432376862, "rewards/format_reward": 0.9897959232330322, "step": 5414 }, { "completion_length": 192.1836700439453, "epoch": 0.5449056603773584, "grad_norm": 0.5648868680000305, "kl": 0.09912109375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.9641722440719604, "reward_std": 0.058913158252835274, "rewards/accuracy_reward": 0.9641722738742828, "rewards/format_reward": 1.0, "step": 5415 }, { "completion_length": 196.29591369628906, "epoch": 0.5450062893081761, "grad_norm": 1.009224772453308, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8552731275558472, "reward_std": 0.13015998154878616, "rewards/accuracy_reward": 0.8552731573581696, "rewards/format_reward": 1.0, "step": 5416 }, { "completion_length": 177.65306091308594, "epoch": 0.5451069182389937, "grad_norm": 0.6313890814781189, "kl": 0.108154296875, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7540557980537415, "reward_std": 0.10249561071395874, "rewards/accuracy_reward": 0.7642598748207092, "rewards/format_reward": 0.9897959232330322, "step": 5417 }, { "completion_length": 256.83673095703125, "epoch": 0.5452075471698113, "grad_norm": 0.5650300979614258, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8000463843345642, "reward_std": 0.1355990618467331, "rewards/accuracy_reward": 0.8102504312992096, "rewards/format_reward": 0.9897959232330322, "step": 5418 }, { "completion_length": 220.28570556640625, "epoch": 0.545308176100629, "grad_norm": 0.4101482927799225, "kl": 0.0728759765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.9182538986206055, "reward_std": 0.07205204665660858, "rewards/accuracy_reward": 0.9284580051898956, "rewards/format_reward": 0.9897959232330322, "step": 5419 }, { "completion_length": 199.44898223876953, "epoch": 0.5454088050314465, "grad_norm": 0.9213453531265259, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7830755710601807, "reward_std": 0.13094666600227356, "rewards/accuracy_reward": 0.7830756306648254, "rewards/format_reward": 1.0, "step": 5420 }, { "completion_length": 167.6326446533203, "epoch": 0.5455094339622641, "grad_norm": 1.4683483839035034, "kl": 0.0863037109375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7699228525161743, "reward_std": 0.12754915282130241, "rewards/accuracy_reward": 0.7699228823184967, "rewards/format_reward": 1.0, "step": 5421 }, { "completion_length": 189.4285659790039, "epoch": 0.5456100628930818, "grad_norm": 0.7550223469734192, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.9158163666725159, "reward_std": 0.09706027433276176, "rewards/accuracy_reward": 0.9362244307994843, "rewards/format_reward": 0.9795918464660645, "step": 5422 }, { "completion_length": 189.04080963134766, "epoch": 0.5457106918238994, "grad_norm": 1.1589128971099854, "kl": 0.1005859375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8521721959114075, "reward_std": 0.14571215584874153, "rewards/accuracy_reward": 0.8725802898406982, "rewards/format_reward": 0.9795918464660645, "step": 5423 }, { "completion_length": 208.47958374023438, "epoch": 0.5458113207547169, "grad_norm": 0.5098863244056702, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.811147153377533, "reward_std": 0.11240747570991516, "rewards/accuracy_reward": 0.8213512599468231, "rewards/format_reward": 0.9897959232330322, "step": 5424 }, { "completion_length": 214.29591369628906, "epoch": 0.5459119496855346, "grad_norm": 0.6025083661079407, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.804786205291748, "reward_std": 0.1487191990017891, "rewards/accuracy_reward": 0.8047861754894257, "rewards/format_reward": 1.0, "step": 5425 }, { "completion_length": 220.948974609375, "epoch": 0.5460125786163522, "grad_norm": 1.0677512884140015, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7429866790771484, "reward_std": 0.21924128383398056, "rewards/accuracy_reward": 0.7633948624134064, "rewards/format_reward": 0.9795918464660645, "step": 5426 }, { "completion_length": 196.33673095703125, "epoch": 0.5461132075471699, "grad_norm": 0.6704415678977966, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6684429049491882, "reward_std": 0.14377279579639435, "rewards/accuracy_reward": 0.6888510882854462, "rewards/format_reward": 0.9795918166637421, "step": 5427 }, { "completion_length": 196.4285659790039, "epoch": 0.5462138364779874, "grad_norm": 1.83766770362854, "kl": 0.118408203125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7223463654518127, "reward_std": 0.06339155789464712, "rewards/accuracy_reward": 0.7325504720211029, "rewards/format_reward": 0.9897959232330322, "step": 5428 }, { "completion_length": 165.66326141357422, "epoch": 0.546314465408805, "grad_norm": 1.2176198959350586, "kl": 0.09423828125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8060604929924011, "reward_std": 0.11933618411421776, "rewards/accuracy_reward": 0.8060605823993683, "rewards/format_reward": 1.0, "step": 5429 }, { "completion_length": 202.51019287109375, "epoch": 0.5464150943396227, "grad_norm": 5.251672744750977, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7090621590614319, "reward_std": 0.11249954998493195, "rewards/accuracy_reward": 0.7090621590614319, "rewards/format_reward": 1.0, "step": 5430 }, { "completion_length": 215.03060150146484, "epoch": 0.5465157232704403, "grad_norm": 0.6480401158332825, "kl": 0.100341796875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.809440553188324, "reward_std": 0.11345059797167778, "rewards/accuracy_reward": 0.8196446597576141, "rewards/format_reward": 0.9897959232330322, "step": 5431 }, { "completion_length": 196.62244415283203, "epoch": 0.5466163522012578, "grad_norm": 1.4713923931121826, "kl": 0.1376953125, "learning_rate": 1e-06, "loss": 0.0055, "reward": 1.784904420375824, "reward_std": 0.24778521806001663, "rewards/accuracy_reward": 0.8155167102813721, "rewards/format_reward": 0.9693877398967743, "step": 5432 }, { "completion_length": 206.89795684814453, "epoch": 0.5467169811320755, "grad_norm": 0.6347862482070923, "kl": 0.0838623046875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7816612124443054, "reward_std": 0.13881920278072357, "rewards/accuracy_reward": 0.7918651700019836, "rewards/format_reward": 0.9897959232330322, "step": 5433 }, { "completion_length": 238.6734619140625, "epoch": 0.5468176100628931, "grad_norm": 0.626264750957489, "kl": 0.091064453125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7249655723571777, "reward_std": 0.15558510273694992, "rewards/accuracy_reward": 0.7351696789264679, "rewards/format_reward": 0.9897959232330322, "step": 5434 }, { "completion_length": 263.05101013183594, "epoch": 0.5469182389937107, "grad_norm": 0.5883913636207581, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6732838153839111, "reward_std": 0.13009824976325035, "rewards/accuracy_reward": 0.6834879219532013, "rewards/format_reward": 0.9897959232330322, "step": 5435 }, { "completion_length": 255.92855834960938, "epoch": 0.5470188679245283, "grad_norm": 0.6230690479278564, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8454436659812927, "reward_std": 0.18306614086031914, "rewards/accuracy_reward": 0.8658517897129059, "rewards/format_reward": 0.9795918166637421, "step": 5436 }, { "completion_length": 213.44898223876953, "epoch": 0.5471194968553459, "grad_norm": 0.4641305208206177, "kl": 0.0633544921875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.9155732989311218, "reward_std": 0.07810654863715172, "rewards/accuracy_reward": 0.9155733585357666, "rewards/format_reward": 1.0, "step": 5437 }, { "completion_length": 280.6632537841797, "epoch": 0.5472201257861635, "grad_norm": 0.7957077622413635, "kl": 0.10107421875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.796721875667572, "reward_std": 0.10619991645216942, "rewards/accuracy_reward": 0.806926041841507, "rewards/format_reward": 0.9897959232330322, "step": 5438 }, { "completion_length": 295.7653045654297, "epoch": 0.5473207547169812, "grad_norm": 0.5867031812667847, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7429359555244446, "reward_std": 0.12234695255756378, "rewards/accuracy_reward": 0.75314000248909, "rewards/format_reward": 0.9897959232330322, "step": 5439 }, { "completion_length": 246.51020050048828, "epoch": 0.5474213836477987, "grad_norm": 1.0438933372497559, "kl": 0.094970703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6645021438598633, "reward_std": 0.15892285108566284, "rewards/accuracy_reward": 0.7155225276947021, "rewards/format_reward": 0.9489795565605164, "step": 5440 }, { "completion_length": 207.16326141357422, "epoch": 0.5475220125786163, "grad_norm": 2.20451283454895, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7298237085342407, "reward_std": 0.18568165600299835, "rewards/accuracy_reward": 0.7298237085342407, "rewards/format_reward": 1.0, "step": 5441 }, { "completion_length": 248.2551040649414, "epoch": 0.547622641509434, "grad_norm": 0.8949685096740723, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.5297558903694153, "reward_std": 0.22448492050170898, "rewards/accuracy_reward": 0.5399600565433502, "rewards/format_reward": 0.9897959232330322, "step": 5442 }, { "completion_length": 213.97958374023438, "epoch": 0.5477232704402516, "grad_norm": 0.7031914591789246, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7412374019622803, "reward_std": 0.17838650196790695, "rewards/accuracy_reward": 0.7616455852985382, "rewards/format_reward": 0.9795918464660645, "step": 5443 }, { "completion_length": 248.89796447753906, "epoch": 0.5478238993710692, "grad_norm": 0.8955296874046326, "kl": 0.10986328125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7725505828857422, "reward_std": 0.1817004382610321, "rewards/accuracy_reward": 0.7827546000480652, "rewards/format_reward": 0.9897959232330322, "step": 5444 }, { "completion_length": 187.5918312072754, "epoch": 0.5479245283018868, "grad_norm": 1.6122699975967407, "kl": 0.0655517578125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.9211353659629822, "reward_std": 0.10056059435009956, "rewards/accuracy_reward": 0.9211353957653046, "rewards/format_reward": 1.0, "step": 5445 }, { "completion_length": 265.0918273925781, "epoch": 0.5480251572327044, "grad_norm": 0.7942748069763184, "kl": 0.09033203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.630223572254181, "reward_std": 0.1527908742427826, "rewards/accuracy_reward": 0.6506316661834717, "rewards/format_reward": 0.9795918166637421, "step": 5446 }, { "completion_length": 228.97958374023438, "epoch": 0.548125786163522, "grad_norm": 0.44093966484069824, "kl": 0.0572509765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8877550959587097, "reward_std": 0.03632801026105881, "rewards/accuracy_reward": 0.8877550661563873, "rewards/format_reward": 1.0, "step": 5447 }, { "completion_length": 253.2551040649414, "epoch": 0.5482264150943397, "grad_norm": 0.7768518328666687, "kl": 0.0931396484375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6962124109268188, "reward_std": 0.16552798449993134, "rewards/accuracy_reward": 0.696212500333786, "rewards/format_reward": 1.0, "step": 5448 }, { "completion_length": 241.01020050048828, "epoch": 0.5483270440251572, "grad_norm": 1.3956003189086914, "kl": 0.09765625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6546878814697266, "reward_std": 0.20966332405805588, "rewards/accuracy_reward": 0.654687911272049, "rewards/format_reward": 1.0, "step": 5449 }, { "completion_length": 205.16326141357422, "epoch": 0.5484276729559748, "grad_norm": 0.6906430125236511, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8571428060531616, "reward_std": 0.11805073171854019, "rewards/accuracy_reward": 0.8979591727256775, "rewards/format_reward": 0.9591836631298065, "step": 5450 }, { "completion_length": 278.7346954345703, "epoch": 0.5485283018867925, "grad_norm": 1.0323787927627563, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7098483443260193, "reward_std": 0.18214983493089676, "rewards/accuracy_reward": 0.7098483741283417, "rewards/format_reward": 1.0, "step": 5451 }, { "completion_length": 223.62244415283203, "epoch": 0.5486289308176101, "grad_norm": 0.6429012417793274, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8639456033706665, "reward_std": 0.08999153599143028, "rewards/accuracy_reward": 0.8741496503353119, "rewards/format_reward": 0.9897959232330322, "step": 5452 }, { "completion_length": 210.68366241455078, "epoch": 0.5487295597484276, "grad_norm": 6.087787628173828, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.774644374847412, "reward_std": 0.24274568259716034, "rewards/accuracy_reward": 0.8052566051483154, "rewards/format_reward": 0.9693877398967743, "step": 5453 }, { "completion_length": 219.97958374023438, "epoch": 0.5488301886792453, "grad_norm": 0.5979025363922119, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7165069580078125, "reward_std": 0.10856546834111214, "rewards/accuracy_reward": 0.7267110049724579, "rewards/format_reward": 0.9897959232330322, "step": 5454 }, { "completion_length": 224.6224365234375, "epoch": 0.5489308176100629, "grad_norm": 0.6251236200332642, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.855555534362793, "reward_std": 0.08121808618307114, "rewards/accuracy_reward": 0.8657596111297607, "rewards/format_reward": 0.9897959232330322, "step": 5455 }, { "completion_length": 363.24488830566406, "epoch": 0.5490314465408805, "grad_norm": 0.8695476055145264, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.533419907093048, "reward_std": 0.19500505179166794, "rewards/accuracy_reward": 0.553828090429306, "rewards/format_reward": 0.9795918464660645, "step": 5456 }, { "completion_length": 229.3571319580078, "epoch": 0.5491320754716981, "grad_norm": 6.077476501464844, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7979590892791748, "reward_std": 0.0969710685312748, "rewards/accuracy_reward": 0.8081632554531097, "rewards/format_reward": 0.9897959232330322, "step": 5457 }, { "completion_length": 230.45917510986328, "epoch": 0.5492327044025157, "grad_norm": 0.6071351170539856, "kl": 0.095703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8010287880897522, "reward_std": 0.14803524315357208, "rewards/accuracy_reward": 0.8214369118213654, "rewards/format_reward": 0.9795918166637421, "step": 5458 }, { "completion_length": 234.2142791748047, "epoch": 0.5493333333333333, "grad_norm": 0.4697072505950928, "kl": 0.05810546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6920459866523743, "reward_std": 0.09825799986720085, "rewards/accuracy_reward": 0.692046046257019, "rewards/format_reward": 1.0, "step": 5459 }, { "completion_length": 268.29591369628906, "epoch": 0.549433962264151, "grad_norm": 0.695487916469574, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5878875851631165, "reward_std": 0.18996483087539673, "rewards/accuracy_reward": 0.6082957684993744, "rewards/format_reward": 0.9795918464660645, "step": 5460 }, { "completion_length": 283.9693832397461, "epoch": 0.5495345911949685, "grad_norm": 0.8899472951889038, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.81480473279953, "reward_std": 0.15091490373015404, "rewards/accuracy_reward": 0.8250088393688202, "rewards/format_reward": 0.9897959232330322, "step": 5461 }, { "completion_length": 213.86734771728516, "epoch": 0.5496352201257861, "grad_norm": 0.7297471761703491, "kl": 0.0625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6832571625709534, "reward_std": 0.2101380005478859, "rewards/accuracy_reward": 0.6832571923732758, "rewards/format_reward": 1.0, "step": 5462 }, { "completion_length": 206.89795684814453, "epoch": 0.5497358490566038, "grad_norm": 0.7855319976806641, "kl": 0.09375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.630589783191681, "reward_std": 0.1963721066713333, "rewards/accuracy_reward": 0.6509979665279388, "rewards/format_reward": 0.9795918464660645, "step": 5463 }, { "completion_length": 241.60203552246094, "epoch": 0.5498364779874214, "grad_norm": 1.2318227291107178, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.758898138999939, "reward_std": 0.19759554415941238, "rewards/accuracy_reward": 0.7588982284069061, "rewards/format_reward": 1.0, "step": 5464 }, { "completion_length": 203.9897918701172, "epoch": 0.5499371069182389, "grad_norm": 0.3290345370769501, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8823492527008057, "reward_std": 0.05672663915902376, "rewards/accuracy_reward": 0.8823493421077728, "rewards/format_reward": 1.0, "step": 5465 }, { "completion_length": 232.88774871826172, "epoch": 0.5500377358490566, "grad_norm": 0.3955770432949066, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7872969508171082, "reward_std": 0.06918105483055115, "rewards/accuracy_reward": 0.7872970402240753, "rewards/format_reward": 1.0, "step": 5466 }, { "completion_length": 239.4591827392578, "epoch": 0.5501383647798742, "grad_norm": 0.511570930480957, "kl": 0.11083984375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8672850728034973, "reward_std": 0.08928798511624336, "rewards/accuracy_reward": 0.8672851026058197, "rewards/format_reward": 1.0, "step": 5467 }, { "completion_length": 168.28570556640625, "epoch": 0.5502389937106918, "grad_norm": 1.172709584236145, "kl": 0.098388671875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7823129892349243, "reward_std": 0.12672308087348938, "rewards/accuracy_reward": 0.7925170063972473, "rewards/format_reward": 0.9897959232330322, "step": 5468 }, { "completion_length": 199.60203552246094, "epoch": 0.5503396226415095, "grad_norm": 0.9806456565856934, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8592645525932312, "reward_std": 0.1170160099864006, "rewards/accuracy_reward": 0.8694687485694885, "rewards/format_reward": 0.9897959232330322, "step": 5469 }, { "completion_length": 235.33673095703125, "epoch": 0.550440251572327, "grad_norm": 0.9237813353538513, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6273043751716614, "reward_std": 0.29148292541503906, "rewards/accuracy_reward": 0.6783247292041779, "rewards/format_reward": 0.9489795565605164, "step": 5470 }, { "completion_length": 240.93877410888672, "epoch": 0.5505408805031446, "grad_norm": 0.5506232380867004, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7380952835083008, "reward_std": 0.1978788524866104, "rewards/accuracy_reward": 0.7585034072399139, "rewards/format_reward": 0.9795918166637421, "step": 5471 }, { "completion_length": 210.2346954345703, "epoch": 0.5506415094339623, "grad_norm": 0.5145494341850281, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7834901809692383, "reward_std": 0.08503196761012077, "rewards/accuracy_reward": 0.7936941385269165, "rewards/format_reward": 0.9897959232330322, "step": 5472 }, { "completion_length": 261.10204315185547, "epoch": 0.5507421383647799, "grad_norm": 0.665353000164032, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6720561981201172, "reward_std": 0.12417604774236679, "rewards/accuracy_reward": 0.7026684582233429, "rewards/format_reward": 0.9693877398967743, "step": 5473 }, { "completion_length": 216.5204086303711, "epoch": 0.5508427672955974, "grad_norm": 0.8032530546188354, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7724003791809082, "reward_std": 0.16915159672498703, "rewards/accuracy_reward": 0.7724003493785858, "rewards/format_reward": 1.0, "step": 5474 }, { "completion_length": 292.05101013183594, "epoch": 0.5509433962264151, "grad_norm": 0.9190031290054321, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.619949460029602, "reward_std": 0.14096585288643837, "rewards/accuracy_reward": 0.6301535367965698, "rewards/format_reward": 0.9897959232330322, "step": 5475 }, { "completion_length": 234.08162689208984, "epoch": 0.5510440251572327, "grad_norm": 1.2606806755065918, "kl": 0.091064453125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7272108793258667, "reward_std": 0.21108409762382507, "rewards/accuracy_reward": 0.7374150454998016, "rewards/format_reward": 0.9897959232330322, "step": 5476 }, { "completion_length": 263.7142791748047, "epoch": 0.5511446540880504, "grad_norm": 0.45433101058006287, "kl": 0.0572509765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.738316833972931, "reward_std": 0.12046218104660511, "rewards/accuracy_reward": 0.7485208511352539, "rewards/format_reward": 0.9897959232330322, "step": 5477 }, { "completion_length": 260.47957611083984, "epoch": 0.5512452830188679, "grad_norm": 0.6730080246925354, "kl": 0.095458984375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7357142567634583, "reward_std": 0.1119847521185875, "rewards/accuracy_reward": 0.7459183633327484, "rewards/format_reward": 0.9897959232330322, "step": 5478 }, { "completion_length": 277.7755126953125, "epoch": 0.5513459119496855, "grad_norm": 0.7323831915855408, "kl": 0.14501953125, "learning_rate": 1e-06, "loss": 0.0058, "reward": 1.7137895822525024, "reward_std": 0.14789441972970963, "rewards/accuracy_reward": 0.7239936590194702, "rewards/format_reward": 0.9897959232330322, "step": 5479 }, { "completion_length": 312.05101013183594, "epoch": 0.5514465408805032, "grad_norm": 0.7096781730651855, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.5687420964241028, "reward_std": 0.1448436602950096, "rewards/accuracy_reward": 0.5687420964241028, "rewards/format_reward": 1.0, "step": 5480 }, { "completion_length": 196.13265228271484, "epoch": 0.5515471698113208, "grad_norm": 0.6754462122917175, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.9083002209663391, "reward_std": 0.12361026555299759, "rewards/accuracy_reward": 0.9185042977333069, "rewards/format_reward": 0.9897959232330322, "step": 5481 }, { "completion_length": 206.2040786743164, "epoch": 0.5516477987421383, "grad_norm": 0.7370221018791199, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7602230906486511, "reward_std": 0.08590647391974926, "rewards/accuracy_reward": 0.7704271674156189, "rewards/format_reward": 0.9897959232330322, "step": 5482 }, { "completion_length": 199.9897918701172, "epoch": 0.551748427672956, "grad_norm": 0.440283864736557, "kl": 0.101806640625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.879916489124298, "reward_std": 0.11659758165478706, "rewards/accuracy_reward": 0.900324672460556, "rewards/format_reward": 0.9795918166637421, "step": 5483 }, { "completion_length": 263.2244873046875, "epoch": 0.5518490566037736, "grad_norm": 0.6139205694198608, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6392083168029785, "reward_std": 0.12404043227434158, "rewards/accuracy_reward": 0.6596164405345917, "rewards/format_reward": 0.9795918166637421, "step": 5484 }, { "completion_length": 185.34693908691406, "epoch": 0.5519496855345912, "grad_norm": 0.05548763647675514, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 5485 }, { "completion_length": 177.95917510986328, "epoch": 0.5520503144654088, "grad_norm": 0.6698864102363586, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7865445017814636, "reward_std": 0.06978438049554825, "rewards/accuracy_reward": 0.786544531583786, "rewards/format_reward": 1.0, "step": 5486 }, { "completion_length": 204.6224479675293, "epoch": 0.5521509433962264, "grad_norm": 0.5543036460876465, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.777052879333496, "reward_std": 0.10909636318683624, "rewards/accuracy_reward": 0.7770528793334961, "rewards/format_reward": 1.0, "step": 5487 }, { "completion_length": 222.64285278320312, "epoch": 0.552251572327044, "grad_norm": 0.4202735424041748, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7096189856529236, "reward_std": 0.11344849318265915, "rewards/accuracy_reward": 0.719823032617569, "rewards/format_reward": 0.9897959232330322, "step": 5488 }, { "completion_length": 213.86734008789062, "epoch": 0.5523522012578617, "grad_norm": 0.938258707523346, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7413100004196167, "reward_std": 0.17248890548944473, "rewards/accuracy_reward": 0.7515140771865845, "rewards/format_reward": 0.9897959232330322, "step": 5489 }, { "completion_length": 213.7653045654297, "epoch": 0.5524528301886793, "grad_norm": 1.1891427040100098, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7095186114311218, "reward_std": 0.1917731910943985, "rewards/accuracy_reward": 0.719722718000412, "rewards/format_reward": 0.9897959232330322, "step": 5490 }, { "completion_length": 246.9387664794922, "epoch": 0.5525534591194968, "grad_norm": 0.3216352164745331, "kl": 0.05224609375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8061224222183228, "reward_std": 0.06517763808369637, "rewards/accuracy_reward": 0.8061224222183228, "rewards/format_reward": 1.0, "step": 5491 }, { "completion_length": 240.56121826171875, "epoch": 0.5526540880503145, "grad_norm": 0.6224284172058105, "kl": 0.0579833984375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7829932570457458, "reward_std": 0.14460619539022446, "rewards/accuracy_reward": 0.7829931974411011, "rewards/format_reward": 1.0, "step": 5492 }, { "completion_length": 265.52040100097656, "epoch": 0.5527547169811321, "grad_norm": 0.7811635136604309, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.677113652229309, "reward_std": 0.18081002682447433, "rewards/accuracy_reward": 0.6771136820316315, "rewards/format_reward": 1.0, "step": 5493 }, { "completion_length": 246.14285278320312, "epoch": 0.5528553459119497, "grad_norm": 0.567653238773346, "kl": 0.0738525390625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8190773129463196, "reward_std": 0.15356876701116562, "rewards/accuracy_reward": 0.8292813301086426, "rewards/format_reward": 0.9897959232330322, "step": 5494 }, { "completion_length": 183.03060150146484, "epoch": 0.5529559748427673, "grad_norm": 1.2441587448120117, "kl": 0.09423828125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7485347986221313, "reward_std": 0.16348731517791748, "rewards/accuracy_reward": 0.7587388455867767, "rewards/format_reward": 0.9897959232330322, "step": 5495 }, { "completion_length": 266.7551040649414, "epoch": 0.5530566037735849, "grad_norm": 0.7074794769287109, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6876980662345886, "reward_std": 0.2221144624054432, "rewards/accuracy_reward": 0.7183102667331696, "rewards/format_reward": 0.9693877398967743, "step": 5496 }, { "completion_length": 189.0408172607422, "epoch": 0.5531572327044025, "grad_norm": 0.7536935210227966, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7746031880378723, "reward_std": 0.12225153669714928, "rewards/accuracy_reward": 0.7746031284332275, "rewards/format_reward": 1.0, "step": 5497 }, { "completion_length": 186.12244415283203, "epoch": 0.5532578616352202, "grad_norm": 0.51264488697052, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8082991242408752, "reward_std": 0.11809458583593369, "rewards/accuracy_reward": 0.8287072479724884, "rewards/format_reward": 0.9795918464660645, "step": 5498 }, { "completion_length": 253.4285659790039, "epoch": 0.5533584905660377, "grad_norm": 0.9507772922515869, "kl": 0.0897216796875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.66286039352417, "reward_std": 0.24707213789224625, "rewards/accuracy_reward": 0.6832685172557831, "rewards/format_reward": 0.9795918166637421, "step": 5499 }, { "completion_length": 281.9387664794922, "epoch": 0.5534591194968553, "grad_norm": 0.44639846682548523, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8141399025917053, "reward_std": 0.11526613496243954, "rewards/accuracy_reward": 0.844752162694931, "rewards/format_reward": 0.9693877398967743, "step": 5500 }, { "completion_length": 340.6836700439453, "epoch": 0.553559748427673, "grad_norm": 0.5617629289627075, "kl": 0.0823974609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7531396746635437, "reward_std": 0.16217800974845886, "rewards/accuracy_reward": 0.7837519645690918, "rewards/format_reward": 0.9693877398967743, "step": 5501 }, { "completion_length": 259.9285659790039, "epoch": 0.5536603773584906, "grad_norm": 1.5359934568405151, "kl": 0.139404296875, "learning_rate": 1e-06, "loss": 0.0056, "reward": 1.5440945625305176, "reward_std": 0.17424143105745316, "rewards/accuracy_reward": 0.554298609495163, "rewards/format_reward": 0.9897959232330322, "step": 5502 }, { "completion_length": 222.53060150146484, "epoch": 0.5537610062893081, "grad_norm": 1.211800217628479, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6656566262245178, "reward_std": 0.22954539954662323, "rewards/accuracy_reward": 0.6860648095607758, "rewards/format_reward": 0.9795918464660645, "step": 5503 }, { "completion_length": 222.14285278320312, "epoch": 0.5538616352201258, "grad_norm": 0.9237629771232605, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8163264989852905, "reward_std": 0.1730649620294571, "rewards/accuracy_reward": 0.8265306055545807, "rewards/format_reward": 0.9897959232330322, "step": 5504 }, { "completion_length": 247.57142639160156, "epoch": 0.5539622641509434, "grad_norm": 0.32125237584114075, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.9490124583244324, "reward_std": 0.11575895920395851, "rewards/accuracy_reward": 0.9694206118583679, "rewards/format_reward": 0.9795918166637421, "step": 5505 }, { "completion_length": 165.87754821777344, "epoch": 0.554062893081761, "grad_norm": 0.7106085419654846, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.829931914806366, "reward_std": 0.08327582851052284, "rewards/accuracy_reward": 0.8503401279449463, "rewards/format_reward": 0.9795918166637421, "step": 5506 }, { "completion_length": 237.91836547851562, "epoch": 0.5541635220125786, "grad_norm": 1.0333545207977295, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.708814799785614, "reward_std": 0.28325171768665314, "rewards/accuracy_reward": 0.7496311962604523, "rewards/format_reward": 0.9591836333274841, "step": 5507 }, { "completion_length": 254.84693908691406, "epoch": 0.5542641509433962, "grad_norm": 1.2032581567764282, "kl": 0.111572265625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7161950469017029, "reward_std": 0.10097540728747845, "rewards/accuracy_reward": 0.71619513630867, "rewards/format_reward": 1.0, "step": 5508 }, { "completion_length": 204.4795913696289, "epoch": 0.5543647798742138, "grad_norm": 0.6822827458381653, "kl": 0.09423828125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7572767734527588, "reward_std": 0.1392403095960617, "rewards/accuracy_reward": 0.7878890633583069, "rewards/format_reward": 0.9693877398967743, "step": 5509 }, { "completion_length": 311.4183654785156, "epoch": 0.5544654088050315, "grad_norm": 0.8376002311706543, "kl": 0.09521484375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6856008768081665, "reward_std": 0.21630098670721054, "rewards/accuracy_reward": 0.72641721367836, "rewards/format_reward": 0.9591836333274841, "step": 5510 }, { "completion_length": 239.89794921875, "epoch": 0.554566037735849, "grad_norm": 1.4366663694381714, "kl": 0.125732421875, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.8051373958587646, "reward_std": 0.21042033284902573, "rewards/accuracy_reward": 0.8561578691005707, "rewards/format_reward": 0.9489795565605164, "step": 5511 }, { "completion_length": 205.11223602294922, "epoch": 0.5546666666666666, "grad_norm": 1.0057427883148193, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.821630835533142, "reward_std": 0.08311872184276581, "rewards/accuracy_reward": 0.8216307759284973, "rewards/format_reward": 1.0, "step": 5512 }, { "completion_length": 284.04080963134766, "epoch": 0.5547672955974843, "grad_norm": 0.6692885160446167, "kl": 0.10400390625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7364075183868408, "reward_std": 0.14612064138054848, "rewards/accuracy_reward": 0.7568156719207764, "rewards/format_reward": 0.9795918464660645, "step": 5513 }, { "completion_length": 219.05101776123047, "epoch": 0.5548679245283019, "grad_norm": 1.2474923133850098, "kl": 0.0860595703125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.765585720539093, "reward_std": 0.21590367704629898, "rewards/accuracy_reward": 0.7961979508399963, "rewards/format_reward": 0.9693877398967743, "step": 5514 }, { "completion_length": 242.71428680419922, "epoch": 0.5549685534591196, "grad_norm": 0.5656185150146484, "kl": 0.0787353515625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7129294276237488, "reward_std": 0.12606938555836678, "rewards/accuracy_reward": 0.7333377003669739, "rewards/format_reward": 0.9795918166637421, "step": 5515 }, { "completion_length": 251.87754821777344, "epoch": 0.5550691823899371, "grad_norm": 0.7050041556358337, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7186859250068665, "reward_std": 0.21581512689590454, "rewards/accuracy_reward": 0.7288900017738342, "rewards/format_reward": 0.9897959232330322, "step": 5516 }, { "completion_length": 181.13265228271484, "epoch": 0.5551698113207547, "grad_norm": 1.2017040252685547, "kl": 0.1181640625, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.6643118858337402, "reward_std": 0.22998087108135223, "rewards/accuracy_reward": 0.6847200691699982, "rewards/format_reward": 0.9795918166637421, "step": 5517 }, { "completion_length": 241.69387817382812, "epoch": 0.5552704402515724, "grad_norm": 0.7554966807365417, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7359101176261902, "reward_std": 0.16612377762794495, "rewards/accuracy_reward": 0.7665223777294159, "rewards/format_reward": 0.9693877398967743, "step": 5518 }, { "completion_length": 291.6326446533203, "epoch": 0.55537106918239, "grad_norm": 0.7657496929168701, "kl": 0.0535888671875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6556268334388733, "reward_std": 0.25104712694883347, "rewards/accuracy_reward": 0.6862391233444214, "rewards/format_reward": 0.9693877398967743, "step": 5519 }, { "completion_length": 315.2244873046875, "epoch": 0.5554716981132075, "grad_norm": 1.0681978464126587, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5624569654464722, "reward_std": 0.36385147273540497, "rewards/accuracy_reward": 0.5828650891780853, "rewards/format_reward": 0.9795918464660645, "step": 5520 }, { "completion_length": 231.7142791748047, "epoch": 0.5555723270440251, "grad_norm": 0.6245324015617371, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6921893954277039, "reward_std": 0.1830333173274994, "rewards/accuracy_reward": 0.7228017151355743, "rewards/format_reward": 0.9693877398967743, "step": 5521 }, { "completion_length": 207.58163452148438, "epoch": 0.5556729559748428, "grad_norm": 0.7515273690223694, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8821892142295837, "reward_std": 0.10113172978162766, "rewards/accuracy_reward": 0.8923932909965515, "rewards/format_reward": 0.9897959232330322, "step": 5522 }, { "completion_length": 226.37754821777344, "epoch": 0.5557735849056604, "grad_norm": 0.7712408304214478, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7887835502624512, "reward_std": 0.17982518672943115, "rewards/accuracy_reward": 0.7989876866340637, "rewards/format_reward": 0.9897959232330322, "step": 5523 }, { "completion_length": 238.6938705444336, "epoch": 0.555874213836478, "grad_norm": 0.76201993227005, "kl": 0.0794677734375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8190475106239319, "reward_std": 0.13908324390649796, "rewards/accuracy_reward": 0.839455783367157, "rewards/format_reward": 0.9795918166637421, "step": 5524 }, { "completion_length": 273.4795837402344, "epoch": 0.5559748427672956, "grad_norm": 1.234153389930725, "kl": 0.11474609375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.6504290103912354, "reward_std": 0.15411851182579994, "rewards/accuracy_reward": 0.6606330573558807, "rewards/format_reward": 0.9897959232330322, "step": 5525 }, { "completion_length": 251.10204315185547, "epoch": 0.5560754716981132, "grad_norm": 1.4952733516693115, "kl": 0.0892333984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7543767094612122, "reward_std": 0.1335640624165535, "rewards/accuracy_reward": 0.7747849225997925, "rewards/format_reward": 0.9795918166637421, "step": 5526 }, { "completion_length": 276.8163299560547, "epoch": 0.5561761006289309, "grad_norm": 1.0042768716812134, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5357260704040527, "reward_std": 0.3111910820007324, "rewards/accuracy_reward": 0.5561343431472778, "rewards/format_reward": 0.9795918464660645, "step": 5527 }, { "completion_length": 231.47958374023438, "epoch": 0.5562767295597484, "grad_norm": 0.9302679896354675, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8142017722129822, "reward_std": 0.22282271087169647, "rewards/accuracy_reward": 0.82440584897995, "rewards/format_reward": 0.9897959232330322, "step": 5528 }, { "completion_length": 217.80611419677734, "epoch": 0.556377358490566, "grad_norm": 0.5521347522735596, "kl": 0.091064453125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8475571870803833, "reward_std": 0.21097023785114288, "rewards/accuracy_reward": 0.888373464345932, "rewards/format_reward": 0.9591836631298065, "step": 5529 }, { "completion_length": 246.77550506591797, "epoch": 0.5564779874213837, "grad_norm": 0.7044316530227661, "kl": 0.0867919921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8442795276641846, "reward_std": 0.06883900612592697, "rewards/accuracy_reward": 0.8442795276641846, "rewards/format_reward": 1.0, "step": 5530 }, { "completion_length": 188.49999237060547, "epoch": 0.5565786163522013, "grad_norm": 1.1755867004394531, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7826462984085083, "reward_std": 0.1930842101573944, "rewards/accuracy_reward": 0.7928504049777985, "rewards/format_reward": 0.9897959232330322, "step": 5531 }, { "completion_length": 228.448974609375, "epoch": 0.5566792452830188, "grad_norm": 0.8350945711135864, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6717877388000488, "reward_std": 0.18949005007743835, "rewards/accuracy_reward": 0.6921960115432739, "rewards/format_reward": 0.9795918166637421, "step": 5532 }, { "completion_length": 180.1938705444336, "epoch": 0.5567798742138365, "grad_norm": 1.4892224073410034, "kl": 0.130126953125, "learning_rate": 1e-06, "loss": 0.0052, "reward": 1.667511761188507, "reward_std": 0.16687225177884102, "rewards/accuracy_reward": 0.6879200041294098, "rewards/format_reward": 0.9795918166637421, "step": 5533 }, { "completion_length": 272.07141876220703, "epoch": 0.5568805031446541, "grad_norm": 1.09552800655365, "kl": 0.12841796875, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.6273942589759827, "reward_std": 0.21777886897325516, "rewards/accuracy_reward": 0.6478024125099182, "rewards/format_reward": 0.9795918464660645, "step": 5534 }, { "completion_length": 212.87754821777344, "epoch": 0.5569811320754717, "grad_norm": 0.8373373746871948, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6401425004005432, "reward_std": 0.1082487441599369, "rewards/accuracy_reward": 0.6503466367721558, "rewards/format_reward": 0.9897959232330322, "step": 5535 }, { "completion_length": 223.93877410888672, "epoch": 0.5570817610062893, "grad_norm": 0.7704959511756897, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7050908207893372, "reward_std": 0.10858488082885742, "rewards/accuracy_reward": 0.7050908505916595, "rewards/format_reward": 1.0, "step": 5536 }, { "completion_length": 281.1836700439453, "epoch": 0.5571823899371069, "grad_norm": 0.9245504140853882, "kl": 0.11083984375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.5628786087036133, "reward_std": 0.20749768614768982, "rewards/accuracy_reward": 0.5628786385059357, "rewards/format_reward": 1.0, "step": 5537 }, { "completion_length": 172.29591369628906, "epoch": 0.5572830188679245, "grad_norm": 0.22719953954219818, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.942176878452301, "reward_std": 0.026997461915016174, "rewards/accuracy_reward": 0.942176878452301, "rewards/format_reward": 1.0, "step": 5538 }, { "completion_length": 256.4183654785156, "epoch": 0.5573836477987422, "grad_norm": 0.728308379650116, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.687755048274994, "reward_std": 0.153732568025589, "rewards/accuracy_reward": 0.6979591548442841, "rewards/format_reward": 0.9897959232330322, "step": 5539 }, { "completion_length": 237.16326141357422, "epoch": 0.5574842767295598, "grad_norm": 1.3352497816085815, "kl": 0.089111328125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7039563655853271, "reward_std": 0.11328009516000748, "rewards/accuracy_reward": 0.7039563953876495, "rewards/format_reward": 1.0, "step": 5540 }, { "completion_length": 211.30611419677734, "epoch": 0.5575849056603773, "grad_norm": 0.3950398862361908, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.9489795565605164, "reward_std": 0.05090690031647682, "rewards/accuracy_reward": 0.9489795863628387, "rewards/format_reward": 1.0, "step": 5541 }, { "completion_length": 208.87754821777344, "epoch": 0.557685534591195, "grad_norm": 1.0134944915771484, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.756313443183899, "reward_std": 0.15688452869653702, "rewards/accuracy_reward": 0.7767215967178345, "rewards/format_reward": 0.9795918166637421, "step": 5542 }, { "completion_length": 207.73468780517578, "epoch": 0.5577861635220126, "grad_norm": 0.7706683874130249, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6410430669784546, "reward_std": 0.17660777270793915, "rewards/accuracy_reward": 0.641043096780777, "rewards/format_reward": 1.0, "step": 5543 }, { "completion_length": 223.80611419677734, "epoch": 0.5578867924528302, "grad_norm": 1.4358159303665161, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.5640692710876465, "reward_std": 0.23758403956890106, "rewards/accuracy_reward": 0.5640692710876465, "rewards/format_reward": 1.0, "step": 5544 }, { "completion_length": 203.9183578491211, "epoch": 0.5579874213836478, "grad_norm": 3.362952709197998, "kl": 0.22509765625, "learning_rate": 1e-06, "loss": 0.009, "reward": 1.8435153365135193, "reward_std": 0.16485940665006638, "rewards/accuracy_reward": 0.8537193834781647, "rewards/format_reward": 0.9897959232330322, "step": 5545 }, { "completion_length": 247.7653045654297, "epoch": 0.5580880503144654, "grad_norm": 0.9800872206687927, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8381837606430054, "reward_std": 0.1777234524488449, "rewards/accuracy_reward": 0.8585919141769409, "rewards/format_reward": 0.9795918464660645, "step": 5546 }, { "completion_length": 180.19387817382812, "epoch": 0.558188679245283, "grad_norm": 0.6351377367973328, "kl": 0.1123046875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.886038899421692, "reward_std": 0.09244921430945396, "rewards/accuracy_reward": 0.8860389292240143, "rewards/format_reward": 1.0, "step": 5547 }, { "completion_length": 214.0, "epoch": 0.5582893081761007, "grad_norm": 0.5992380380630493, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.591099739074707, "reward_std": 0.07402683049440384, "rewards/accuracy_reward": 0.5910997986793518, "rewards/format_reward": 1.0, "step": 5548 }, { "completion_length": 240.6530532836914, "epoch": 0.5583899371069182, "grad_norm": 0.5868521332740784, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8419549465179443, "reward_std": 0.15534038841724396, "rewards/accuracy_reward": 0.8623631298542023, "rewards/format_reward": 0.9795918166637421, "step": 5549 }, { "completion_length": 212.25509643554688, "epoch": 0.5584905660377358, "grad_norm": 0.8594919443130493, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6263450384140015, "reward_std": 0.1019095927476883, "rewards/accuracy_reward": 0.6263450980186462, "rewards/format_reward": 1.0, "step": 5550 }, { "completion_length": 209.2448959350586, "epoch": 0.5585911949685535, "grad_norm": 0.6459029912948608, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7973760962486267, "reward_std": 0.18163014948368073, "rewards/accuracy_reward": 0.8381924331188202, "rewards/format_reward": 0.9591836631298065, "step": 5551 }, { "completion_length": 225.4387664794922, "epoch": 0.5586918238993711, "grad_norm": 0.5586177110671997, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7186962366104126, "reward_std": 0.1399804875254631, "rewards/accuracy_reward": 0.7391043901443481, "rewards/format_reward": 0.9795918464660645, "step": 5552 }, { "completion_length": 177.11223602294922, "epoch": 0.5587924528301886, "grad_norm": 0.5015910267829895, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8352769017219543, "reward_std": 0.09817423112690449, "rewards/accuracy_reward": 0.8352769017219543, "rewards/format_reward": 1.0, "step": 5553 }, { "completion_length": 218.05101776123047, "epoch": 0.5588930817610063, "grad_norm": 0.5519582629203796, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8095757365226746, "reward_std": 0.15702606737613678, "rewards/accuracy_reward": 0.8197798132896423, "rewards/format_reward": 0.9897959232330322, "step": 5554 }, { "completion_length": 187.92857360839844, "epoch": 0.5589937106918239, "grad_norm": 1.0894694328308105, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.857725977897644, "reward_std": 0.18341350182890892, "rewards/accuracy_reward": 0.8577259182929993, "rewards/format_reward": 1.0, "step": 5555 }, { "completion_length": 235.9693832397461, "epoch": 0.5590943396226415, "grad_norm": 0.6166605949401855, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7548786997795105, "reward_std": 0.12554848194122314, "rewards/accuracy_reward": 0.7752868235111237, "rewards/format_reward": 0.9795918166637421, "step": 5556 }, { "completion_length": 214.9081573486328, "epoch": 0.5591949685534591, "grad_norm": 0.8834818601608276, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8460482954978943, "reward_std": 0.13976997882127762, "rewards/accuracy_reward": 0.8460482954978943, "rewards/format_reward": 1.0, "step": 5557 }, { "completion_length": 297.18365478515625, "epoch": 0.5592955974842767, "grad_norm": 0.689169704914093, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6763591766357422, "reward_std": 0.23387560993433, "rewards/accuracy_reward": 0.6967673301696777, "rewards/format_reward": 0.9795918464660645, "step": 5558 }, { "completion_length": 201.04080963134766, "epoch": 0.5593962264150943, "grad_norm": 1.1640876531600952, "kl": 0.1142578125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.7857142090797424, "reward_std": 0.10335781797766685, "rewards/accuracy_reward": 0.795918345451355, "rewards/format_reward": 0.9897959232330322, "step": 5559 }, { "completion_length": 238.9387664794922, "epoch": 0.559496855345912, "grad_norm": 1.5397984981536865, "kl": 0.093017578125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8847025632858276, "reward_std": 0.06120148487389088, "rewards/accuracy_reward": 0.894906759262085, "rewards/format_reward": 0.9897959232330322, "step": 5560 }, { "completion_length": 225.22447967529297, "epoch": 0.5595974842767295, "grad_norm": 0.4248959422111511, "kl": 0.0576171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8520408272743225, "reward_std": 0.06749365478754044, "rewards/accuracy_reward": 0.8622449040412903, "rewards/format_reward": 0.9897959232330322, "step": 5561 }, { "completion_length": 184.4285659790039, "epoch": 0.5596981132075471, "grad_norm": 2.25050950050354, "kl": 0.1142578125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.8081632256507874, "reward_std": 0.1636498048901558, "rewards/accuracy_reward": 0.8081632554531097, "rewards/format_reward": 1.0, "step": 5562 }, { "completion_length": 252.4285659790039, "epoch": 0.5597987421383648, "grad_norm": 0.7586216926574707, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.787898600101471, "reward_std": 0.17347678169608116, "rewards/accuracy_reward": 0.8083067238330841, "rewards/format_reward": 0.9795918166637421, "step": 5563 }, { "completion_length": 261.49998474121094, "epoch": 0.5598993710691824, "grad_norm": 0.4334999918937683, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8163264989852905, "reward_std": 0.1079898476600647, "rewards/accuracy_reward": 0.8367346823215485, "rewards/format_reward": 0.9795918464660645, "step": 5564 }, { "completion_length": 236.59182739257812, "epoch": 0.56, "grad_norm": 0.5069215893745422, "kl": 0.103759765625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7515381574630737, "reward_std": 0.11511239036917686, "rewards/accuracy_reward": 0.7617422044277191, "rewards/format_reward": 0.9897959232330322, "step": 5565 }, { "completion_length": 140.60203552246094, "epoch": 0.5601006289308176, "grad_norm": 1.0364196300506592, "kl": 0.103271484375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.824306607246399, "reward_std": 0.12388507276773453, "rewards/accuracy_reward": 0.8243066370487213, "rewards/format_reward": 1.0, "step": 5566 }, { "completion_length": 251.83673858642578, "epoch": 0.5602012578616352, "grad_norm": 0.8717166185379028, "kl": 0.11328125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.6518839597702026, "reward_std": 0.2487318217754364, "rewards/accuracy_reward": 0.6927004158496857, "rewards/format_reward": 0.9591836333274841, "step": 5567 }, { "completion_length": 180.42857360839844, "epoch": 0.5603018867924529, "grad_norm": 0.27016279101371765, "kl": 0.05908203125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8027210235595703, "reward_std": 0.059754878282547, "rewards/accuracy_reward": 0.8027210831642151, "rewards/format_reward": 1.0, "step": 5568 }, { "completion_length": 211.55101776123047, "epoch": 0.5604025157232705, "grad_norm": 0.7380212545394897, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7942784428596497, "reward_std": 0.15872616320848465, "rewards/accuracy_reward": 0.8146865367889404, "rewards/format_reward": 0.9795918464660645, "step": 5569 }, { "completion_length": 238.30612182617188, "epoch": 0.560503144654088, "grad_norm": 0.4465292692184448, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7585096955299377, "reward_std": 0.07384301722049713, "rewards/accuracy_reward": 0.7585096955299377, "rewards/format_reward": 1.0, "step": 5570 }, { "completion_length": 212.8571319580078, "epoch": 0.5606037735849057, "grad_norm": 0.6061676740646362, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8625850081443787, "reward_std": 0.09447964653372765, "rewards/accuracy_reward": 0.8625850081443787, "rewards/format_reward": 1.0, "step": 5571 }, { "completion_length": 238.17345428466797, "epoch": 0.5607044025157233, "grad_norm": 0.9486914277076721, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7127065062522888, "reward_std": 0.2061174213886261, "rewards/accuracy_reward": 0.7229105532169342, "rewards/format_reward": 0.9897959232330322, "step": 5572 }, { "completion_length": 251.0816192626953, "epoch": 0.5608050314465409, "grad_norm": 1.3369245529174805, "kl": 0.05615234375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7798052430152893, "reward_std": 0.18149122223258018, "rewards/accuracy_reward": 0.7798052132129669, "rewards/format_reward": 1.0, "step": 5573 }, { "completion_length": 275.3265151977539, "epoch": 0.5609056603773585, "grad_norm": 0.5530380010604858, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6448052525520325, "reward_std": 0.15927769988775253, "rewards/accuracy_reward": 0.6550092399120331, "rewards/format_reward": 0.9897959232330322, "step": 5574 }, { "completion_length": 143.96938705444336, "epoch": 0.5610062893081761, "grad_norm": 0.6784111857414246, "kl": 0.116943359375, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.8748342990875244, "reward_std": 0.1104314923286438, "rewards/accuracy_reward": 0.8850384056568146, "rewards/format_reward": 0.9897959232330322, "step": 5575 }, { "completion_length": 209.85713958740234, "epoch": 0.5611069182389937, "grad_norm": 0.6152013540267944, "kl": 0.0592041015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8017724752426147, "reward_std": 0.12962967157363892, "rewards/accuracy_reward": 0.8119765818119049, "rewards/format_reward": 0.9897959232330322, "step": 5576 }, { "completion_length": 205.10204315185547, "epoch": 0.5612075471698114, "grad_norm": 0.9864986538887024, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7723153233528137, "reward_std": 0.18919315934181213, "rewards/accuracy_reward": 0.7723153531551361, "rewards/format_reward": 1.0, "step": 5577 }, { "completion_length": 317.8163146972656, "epoch": 0.5613081761006289, "grad_norm": 0.7736590504646301, "kl": 0.0615234375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.3907852172851562, "reward_std": 0.16923271119594574, "rewards/accuracy_reward": 0.39078524708747864, "rewards/format_reward": 1.0, "step": 5578 }, { "completion_length": 188.40816497802734, "epoch": 0.5614088050314465, "grad_norm": 0.8699218034744263, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8833937048912048, "reward_std": 0.12713966146111488, "rewards/accuracy_reward": 0.8833936750888824, "rewards/format_reward": 1.0, "step": 5579 }, { "completion_length": 213.4285659790039, "epoch": 0.5615094339622642, "grad_norm": 0.6518383026123047, "kl": 0.0596923828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.723809540271759, "reward_std": 0.12125225365161896, "rewards/accuracy_reward": 0.7340136170387268, "rewards/format_reward": 0.9897959232330322, "step": 5580 }, { "completion_length": 226.6020278930664, "epoch": 0.5616100628930818, "grad_norm": 0.6370638012886047, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8163264989852905, "reward_std": 0.18102359399199486, "rewards/accuracy_reward": 0.8367346823215485, "rewards/format_reward": 0.9795918166637421, "step": 5581 }, { "completion_length": 183.30612182617188, "epoch": 0.5617106918238993, "grad_norm": 1.165809154510498, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7084459066390991, "reward_std": 0.23956440389156342, "rewards/accuracy_reward": 0.7288540601730347, "rewards/format_reward": 0.9795918166637421, "step": 5582 }, { "completion_length": 222.66326141357422, "epoch": 0.561811320754717, "grad_norm": 1.00082266330719, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8573858737945557, "reward_std": 0.11791155487298965, "rewards/accuracy_reward": 0.8573857247829437, "rewards/format_reward": 1.0, "step": 5583 }, { "completion_length": 256.6020278930664, "epoch": 0.5619119496855346, "grad_norm": 0.6052194237709045, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7292104363441467, "reward_std": 0.10639620572328568, "rewards/accuracy_reward": 0.7394145131111145, "rewards/format_reward": 0.9897959232330322, "step": 5584 }, { "completion_length": 327.69386291503906, "epoch": 0.5620125786163522, "grad_norm": 0.38910943269729614, "kl": 0.056884765625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7442659139633179, "reward_std": 0.11752581223845482, "rewards/accuracy_reward": 0.754470020532608, "rewards/format_reward": 0.9897959232330322, "step": 5585 }, { "completion_length": 269.0102005004883, "epoch": 0.5621132075471698, "grad_norm": 0.4873662292957306, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6618289351463318, "reward_std": 0.1401524879038334, "rewards/accuracy_reward": 0.6618288606405258, "rewards/format_reward": 1.0, "step": 5586 }, { "completion_length": 256.57141876220703, "epoch": 0.5622138364779874, "grad_norm": 0.9688752889633179, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8001325130462646, "reward_std": 0.09572628885507584, "rewards/accuracy_reward": 0.8103365898132324, "rewards/format_reward": 0.9897959232330322, "step": 5587 }, { "completion_length": 282.4387664794922, "epoch": 0.562314465408805, "grad_norm": 0.6852831244468689, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5785215497016907, "reward_std": 0.13294783979654312, "rewards/accuracy_reward": 0.5785215795040131, "rewards/format_reward": 1.0, "step": 5588 }, { "completion_length": 237.74488830566406, "epoch": 0.5624150943396227, "grad_norm": 0.6074522733688354, "kl": 0.0875244140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8121268153190613, "reward_std": 0.12339507788419724, "rewards/accuracy_reward": 0.832535058259964, "rewards/format_reward": 0.9795918464660645, "step": 5589 }, { "completion_length": 222.27550506591797, "epoch": 0.5625157232704403, "grad_norm": 1.720691204071045, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.704945683479309, "reward_std": 0.18761362880468369, "rewards/accuracy_reward": 0.7355579137802124, "rewards/format_reward": 0.9693877398967743, "step": 5590 }, { "completion_length": 279.02040100097656, "epoch": 0.5626163522012578, "grad_norm": 0.7714917659759521, "kl": 0.05615234375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7154606580734253, "reward_std": 0.22849765419960022, "rewards/accuracy_reward": 0.7256647944450378, "rewards/format_reward": 0.9897959232330322, "step": 5591 }, { "completion_length": 232.16326904296875, "epoch": 0.5627169811320755, "grad_norm": 0.7052643299102783, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6929543614387512, "reward_std": 0.12054558843374252, "rewards/accuracy_reward": 0.6929543018341064, "rewards/format_reward": 1.0, "step": 5592 }, { "completion_length": 218.2040786743164, "epoch": 0.5628176100628931, "grad_norm": 0.8860108852386475, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8397958874702454, "reward_std": 0.25253520905971527, "rewards/accuracy_reward": 0.8499999642372131, "rewards/format_reward": 0.9897959232330322, "step": 5593 }, { "completion_length": 210.58162689208984, "epoch": 0.5629182389937107, "grad_norm": 0.7023053169250488, "kl": 0.0582275390625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7447869777679443, "reward_std": 0.13173653185367584, "rewards/accuracy_reward": 0.7549911439418793, "rewards/format_reward": 0.9897959232330322, "step": 5594 }, { "completion_length": 323.84693145751953, "epoch": 0.5630188679245283, "grad_norm": 0.4702785313129425, "kl": 0.0765380859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5226131081581116, "reward_std": 0.12760848551988602, "rewards/accuracy_reward": 0.5328171998262405, "rewards/format_reward": 0.9897959232330322, "step": 5595 }, { "completion_length": 211.33673095703125, "epoch": 0.5631194968553459, "grad_norm": 0.8026514053344727, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8067563772201538, "reward_std": 0.17467204481363297, "rewards/accuracy_reward": 0.827164500951767, "rewards/format_reward": 0.9795918166637421, "step": 5596 }, { "completion_length": 287.6530456542969, "epoch": 0.5632201257861635, "grad_norm": 0.825435996055603, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6542620062828064, "reward_std": 0.17666923254728317, "rewards/accuracy_reward": 0.6542620062828064, "rewards/format_reward": 1.0, "step": 5597 }, { "completion_length": 231.4795913696289, "epoch": 0.5633207547169812, "grad_norm": 1.0680564641952515, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7750231623649597, "reward_std": 0.2242964804172516, "rewards/accuracy_reward": 0.79543137550354, "rewards/format_reward": 0.9795918166637421, "step": 5598 }, { "completion_length": 294.60203552246094, "epoch": 0.5634213836477987, "grad_norm": 1.287684679031372, "kl": 0.0982666015625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7288806438446045, "reward_std": 0.2182541787624359, "rewards/accuracy_reward": 0.7594929039478302, "rewards/format_reward": 0.9693877398967743, "step": 5599 }, { "completion_length": 221.11224365234375, "epoch": 0.5635220125786163, "grad_norm": 0.7611852288246155, "kl": 0.133544921875, "learning_rate": 1e-06, "loss": 0.0053, "reward": 1.68756502866745, "reward_std": 0.15552501380443573, "rewards/accuracy_reward": 0.6977691948413849, "rewards/format_reward": 0.9897959232330322, "step": 5600 }, { "completion_length": 251.1530532836914, "epoch": 0.563622641509434, "grad_norm": 0.546486496925354, "kl": 0.118408203125, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.7361680269241333, "reward_std": 0.14341207779943943, "rewards/accuracy_reward": 0.7463721930980682, "rewards/format_reward": 0.9897959232330322, "step": 5601 }, { "completion_length": 262.8265151977539, "epoch": 0.5637232704402516, "grad_norm": 0.8081946969032288, "kl": 0.09423828125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8393074870109558, "reward_std": 0.09530328586697578, "rewards/accuracy_reward": 0.8393074870109558, "rewards/format_reward": 1.0, "step": 5602 }, { "completion_length": 246.26531219482422, "epoch": 0.5638238993710691, "grad_norm": 2.644864797592163, "kl": 0.107666015625, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7463012337684631, "reward_std": 0.12131700664758682, "rewards/accuracy_reward": 0.7565052509307861, "rewards/format_reward": 0.9897959232330322, "step": 5603 }, { "completion_length": 229.95917510986328, "epoch": 0.5639245283018868, "grad_norm": 0.46266037225723267, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6881537437438965, "reward_std": 0.06612590048462152, "rewards/accuracy_reward": 0.6881538331508636, "rewards/format_reward": 1.0, "step": 5604 }, { "completion_length": 204.62244415283203, "epoch": 0.5640251572327044, "grad_norm": 0.5106791257858276, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.9323747754096985, "reward_std": 0.06575810257345438, "rewards/accuracy_reward": 0.9425788223743439, "rewards/format_reward": 0.9897959232330322, "step": 5605 }, { "completion_length": 242.33672332763672, "epoch": 0.564125786163522, "grad_norm": 0.7180734276771545, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5594328045845032, "reward_std": 0.2169114127755165, "rewards/accuracy_reward": 0.5900450050830841, "rewards/format_reward": 0.9693877398967743, "step": 5606 }, { "completion_length": 309.5306091308594, "epoch": 0.5642264150943396, "grad_norm": 0.5519096851348877, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7150145173072815, "reward_std": 0.1364288032054901, "rewards/accuracy_reward": 0.7252185940742493, "rewards/format_reward": 0.9897959232330322, "step": 5607 }, { "completion_length": 246.6734619140625, "epoch": 0.5643270440251572, "grad_norm": 0.5109907984733582, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.706668198108673, "reward_std": 0.13156291097402573, "rewards/accuracy_reward": 0.7168722152709961, "rewards/format_reward": 0.9897959232330322, "step": 5608 }, { "completion_length": 186.63265228271484, "epoch": 0.5644276729559748, "grad_norm": 3.6666808128356934, "kl": 0.1099853515625, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8817253112792969, "reward_std": 0.1001129224896431, "rewards/accuracy_reward": 0.881725400686264, "rewards/format_reward": 1.0, "step": 5609 }, { "completion_length": 249.0204086303711, "epoch": 0.5645283018867925, "grad_norm": 0.9422626495361328, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.667695939540863, "reward_std": 0.20769166946411133, "rewards/accuracy_reward": 0.6881040632724762, "rewards/format_reward": 0.9795918464660645, "step": 5610 }, { "completion_length": 225.7040786743164, "epoch": 0.56462893081761, "grad_norm": 0.8462565541267395, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7651562094688416, "reward_std": 0.1424795724451542, "rewards/accuracy_reward": 0.7651562988758087, "rewards/format_reward": 1.0, "step": 5611 }, { "completion_length": 218.5204086303711, "epoch": 0.5647295597484276, "grad_norm": 1.1612077951431274, "kl": 0.095458984375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6717996001243591, "reward_std": 0.12595641613006592, "rewards/accuracy_reward": 0.6820036470890045, "rewards/format_reward": 0.9897959232330322, "step": 5612 }, { "completion_length": 269.46937561035156, "epoch": 0.5648301886792453, "grad_norm": 0.6124368906021118, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8187280297279358, "reward_std": 0.12521562352776527, "rewards/accuracy_reward": 0.818727970123291, "rewards/format_reward": 1.0, "step": 5613 }, { "completion_length": 222.28571319580078, "epoch": 0.5649308176100629, "grad_norm": 1.3160700798034668, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6053597331047058, "reward_std": 0.19924840331077576, "rewards/accuracy_reward": 0.625767856836319, "rewards/format_reward": 0.9795918166637421, "step": 5614 }, { "completion_length": 240.10203552246094, "epoch": 0.5650314465408806, "grad_norm": 0.7690516710281372, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8557701706886292, "reward_std": 0.14534058421850204, "rewards/accuracy_reward": 0.8659741878509521, "rewards/format_reward": 0.9897959232330322, "step": 5615 }, { "completion_length": 197.59183502197266, "epoch": 0.5651320754716981, "grad_norm": 0.9424370527267456, "kl": 0.0836181640625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8483301997184753, "reward_std": 0.15198394656181335, "rewards/accuracy_reward": 0.8687383830547333, "rewards/format_reward": 0.9795918464660645, "step": 5616 }, { "completion_length": 336.8061218261719, "epoch": 0.5652327044025157, "grad_norm": 0.732101321220398, "kl": 0.094970703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.5555084347724915, "reward_std": 0.1359192058444023, "rewards/accuracy_reward": 0.5759166181087494, "rewards/format_reward": 0.9795918464660645, "step": 5617 }, { "completion_length": 221.51020050048828, "epoch": 0.5653333333333334, "grad_norm": 0.9523360133171082, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8662735223770142, "reward_std": 0.14761483669281006, "rewards/accuracy_reward": 0.8662735819816589, "rewards/format_reward": 1.0, "step": 5618 }, { "completion_length": 269.4183578491211, "epoch": 0.565433962264151, "grad_norm": 0.7544288039207458, "kl": 0.0765380859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8639455437660217, "reward_std": 0.15184256434440613, "rewards/accuracy_reward": 0.8741496205329895, "rewards/format_reward": 0.9897959232330322, "step": 5619 }, { "completion_length": 262.60204315185547, "epoch": 0.5655345911949685, "grad_norm": 0.8831070065498352, "kl": 0.0789794921875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7244898080825806, "reward_std": 0.2525869533419609, "rewards/accuracy_reward": 0.7448979616165161, "rewards/format_reward": 0.9795918464660645, "step": 5620 }, { "completion_length": 227.33673095703125, "epoch": 0.5656352201257862, "grad_norm": 0.9709045886993408, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6804323196411133, "reward_std": 0.14737647771835327, "rewards/accuracy_reward": 0.6804323792457581, "rewards/format_reward": 1.0, "step": 5621 }, { "completion_length": 273.91835021972656, "epoch": 0.5657358490566038, "grad_norm": 2.347156286239624, "kl": 0.1015625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7327560186386108, "reward_std": 0.23408614844083786, "rewards/accuracy_reward": 0.7531643211841583, "rewards/format_reward": 0.9795918464660645, "step": 5622 }, { "completion_length": 231.4183578491211, "epoch": 0.5658364779874214, "grad_norm": 0.4276762306690216, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.800088346004486, "reward_std": 0.12273125350475311, "rewards/accuracy_reward": 0.8204965889453888, "rewards/format_reward": 0.9795918166637421, "step": 5623 }, { "completion_length": 184.49999237060547, "epoch": 0.565937106918239, "grad_norm": 0.8137652277946472, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7326202988624573, "reward_std": 0.16527822613716125, "rewards/accuracy_reward": 0.75302854180336, "rewards/format_reward": 0.9795918464660645, "step": 5624 }, { "completion_length": 288.52040100097656, "epoch": 0.5660377358490566, "grad_norm": 1.2142586708068848, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7631570100784302, "reward_std": 0.1472567655146122, "rewards/accuracy_reward": 0.7733611464500427, "rewards/format_reward": 0.9897959232330322, "step": 5625 }, { "completion_length": 234.28571319580078, "epoch": 0.5661383647798742, "grad_norm": 0.42657962441444397, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7262523174285889, "reward_std": 0.06703824177384377, "rewards/accuracy_reward": 0.7466604709625244, "rewards/format_reward": 0.9795918464660645, "step": 5626 }, { "completion_length": 239.1530532836914, "epoch": 0.5662389937106919, "grad_norm": 0.3504062294960022, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8302963972091675, "reward_std": 0.059145327657461166, "rewards/accuracy_reward": 0.8405005037784576, "rewards/format_reward": 0.9897959232330322, "step": 5627 }, { "completion_length": 239.79591369628906, "epoch": 0.5663396226415094, "grad_norm": 1.3387565612792969, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.760482370853424, "reward_std": 0.14581476151943207, "rewards/accuracy_reward": 0.7706864774227142, "rewards/format_reward": 0.9897959232330322, "step": 5628 }, { "completion_length": 258.17346954345703, "epoch": 0.566440251572327, "grad_norm": 0.5567646622657776, "kl": 0.0523681640625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7551020383834839, "reward_std": 0.18102359771728516, "rewards/accuracy_reward": 0.7755101919174194, "rewards/format_reward": 0.9795918464660645, "step": 5629 }, { "completion_length": 197.2448959350586, "epoch": 0.5665408805031447, "grad_norm": 0.44525840878486633, "kl": 0.115478515625, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.8318634629249573, "reward_std": 0.0813182070851326, "rewards/accuracy_reward": 0.8318634331226349, "rewards/format_reward": 1.0, "step": 5630 }, { "completion_length": 253.23468780517578, "epoch": 0.5666415094339623, "grad_norm": 0.6526846289634705, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.5939806699752808, "reward_std": 0.20030477643013, "rewards/accuracy_reward": 0.6041847467422485, "rewards/format_reward": 0.9897959232330322, "step": 5631 }, { "completion_length": 240.54080963134766, "epoch": 0.5667421383647798, "grad_norm": 3.391082763671875, "kl": 0.0511474609375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.8499072194099426, "reward_std": 0.07283752411603928, "rewards/accuracy_reward": 0.8601112961769104, "rewards/format_reward": 0.9897959232330322, "step": 5632 }, { "completion_length": 224.1326446533203, "epoch": 0.5668427672955975, "grad_norm": 0.8202300071716309, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8277453780174255, "reward_std": 0.18214626610279083, "rewards/accuracy_reward": 0.8277453780174255, "rewards/format_reward": 1.0, "step": 5633 }, { "completion_length": 242.04080963134766, "epoch": 0.5669433962264151, "grad_norm": 0.8563708066940308, "kl": 0.0657958984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.771726667881012, "reward_std": 0.09881599992513657, "rewards/accuracy_reward": 0.7819307744503021, "rewards/format_reward": 0.9897959232330322, "step": 5634 }, { "completion_length": 183.14285278320312, "epoch": 0.5670440251572327, "grad_norm": 0.9445395469665527, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8801166415214539, "reward_std": 0.10360337607562542, "rewards/accuracy_reward": 0.8903207182884216, "rewards/format_reward": 0.9897959232330322, "step": 5635 }, { "completion_length": 210.11223602294922, "epoch": 0.5671446540880503, "grad_norm": 0.6392830610275269, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8618075251579285, "reward_std": 0.1554907001554966, "rewards/accuracy_reward": 0.872011661529541, "rewards/format_reward": 0.9897959232330322, "step": 5636 }, { "completion_length": 276.8571319580078, "epoch": 0.5672452830188679, "grad_norm": 0.5162150263786316, "kl": 0.11767578125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.6978354454040527, "reward_std": 0.09790286049246788, "rewards/accuracy_reward": 0.6978354156017303, "rewards/format_reward": 1.0, "step": 5637 }, { "completion_length": 280.82652282714844, "epoch": 0.5673459119496855, "grad_norm": 0.7830498218536377, "kl": 0.05419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.761008083820343, "reward_std": 0.17536886408925056, "rewards/accuracy_reward": 0.7814161479473114, "rewards/format_reward": 0.9795918166637421, "step": 5638 }, { "completion_length": 259.61224365234375, "epoch": 0.5674465408805032, "grad_norm": 1.6742485761642456, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6425169706344604, "reward_std": 0.1908479481935501, "rewards/accuracy_reward": 0.6629251539707184, "rewards/format_reward": 0.9795918464660645, "step": 5639 }, { "completion_length": 221.24488830566406, "epoch": 0.5675471698113208, "grad_norm": 5.2277116775512695, "kl": 0.281494140625, "learning_rate": 1e-06, "loss": 0.0113, "reward": 1.8994274735450745, "reward_std": 0.09709398820996284, "rewards/accuracy_reward": 0.9096316397190094, "rewards/format_reward": 0.9897959232330322, "step": 5640 }, { "completion_length": 263.78570556640625, "epoch": 0.5676477987421383, "grad_norm": 0.40007898211479187, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8209750652313232, "reward_std": 0.10040072351694107, "rewards/accuracy_reward": 0.8209750950336456, "rewards/format_reward": 1.0, "step": 5641 }, { "completion_length": 178.2653045654297, "epoch": 0.567748427672956, "grad_norm": 3.6882224082946777, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.9077096581459045, "reward_std": 0.12886888533830643, "rewards/accuracy_reward": 0.9077097475528717, "rewards/format_reward": 1.0, "step": 5642 }, { "completion_length": 228.16326904296875, "epoch": 0.5678490566037736, "grad_norm": 0.6466816663742065, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.855102002620697, "reward_std": 0.16382713615894318, "rewards/accuracy_reward": 0.875510185956955, "rewards/format_reward": 0.9795918464660645, "step": 5643 }, { "completion_length": 230.2346954345703, "epoch": 0.5679496855345912, "grad_norm": 1.9735277891159058, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7433229684829712, "reward_std": 0.24687127023935318, "rewards/accuracy_reward": 0.7841393053531647, "rewards/format_reward": 0.9591836631298065, "step": 5644 }, { "completion_length": 260.4387664794922, "epoch": 0.5680503144654088, "grad_norm": 0.5037059783935547, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8192473649978638, "reward_std": 0.07631934434175491, "rewards/accuracy_reward": 0.8192474246025085, "rewards/format_reward": 1.0, "step": 5645 }, { "completion_length": 195.9795913696289, "epoch": 0.5681509433962264, "grad_norm": 0.9100867509841919, "kl": 0.0986328125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6966596841812134, "reward_std": 0.16013995930552483, "rewards/accuracy_reward": 0.7068639397621155, "rewards/format_reward": 0.9897959232330322, "step": 5646 }, { "completion_length": 173.53060913085938, "epoch": 0.568251572327044, "grad_norm": 1.200433611869812, "kl": 0.09326171875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7376655340194702, "reward_std": 0.13950985297560692, "rewards/accuracy_reward": 0.7376654744148254, "rewards/format_reward": 1.0, "step": 5647 }, { "completion_length": 235.88774871826172, "epoch": 0.5683522012578617, "grad_norm": 0.706159770488739, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7672541737556458, "reward_std": 0.12682022154331207, "rewards/accuracy_reward": 0.7774582207202911, "rewards/format_reward": 0.9897959232330322, "step": 5648 }, { "completion_length": 290.16326904296875, "epoch": 0.5684528301886792, "grad_norm": 1.1764404773712158, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5339694619178772, "reward_std": 0.16433120518922806, "rewards/accuracy_reward": 0.5441734790802002, "rewards/format_reward": 0.9897959232330322, "step": 5649 }, { "completion_length": 255.80611419677734, "epoch": 0.5685534591194968, "grad_norm": 0.5818873643875122, "kl": 0.1103515625, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8234066367149353, "reward_std": 0.07912743464112282, "rewards/accuracy_reward": 0.8336108028888702, "rewards/format_reward": 0.9897959232330322, "step": 5650 }, { "completion_length": 199.63265228271484, "epoch": 0.5686540880503145, "grad_norm": 1.2882421016693115, "kl": 0.097412109375, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6799646615982056, "reward_std": 0.17497553676366806, "rewards/accuracy_reward": 0.6901686787605286, "rewards/format_reward": 0.9897959232330322, "step": 5651 }, { "completion_length": 260.1428451538086, "epoch": 0.5687547169811321, "grad_norm": 0.9375289082527161, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6491041779518127, "reward_std": 0.16028815507888794, "rewards/accuracy_reward": 0.6593083441257477, "rewards/format_reward": 0.9897959232330322, "step": 5652 }, { "completion_length": 214.08162689208984, "epoch": 0.5688553459119496, "grad_norm": 0.682196319103241, "kl": 0.0526123046875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.9189653396606445, "reward_std": 0.07559489831328392, "rewards/accuracy_reward": 0.9189653694629669, "rewards/format_reward": 1.0, "step": 5653 }, { "completion_length": 282.5612106323242, "epoch": 0.5689559748427673, "grad_norm": 0.747431755065918, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5897958874702454, "reward_std": 0.20750980824232101, "rewards/accuracy_reward": 0.5999999642372131, "rewards/format_reward": 0.9897959232330322, "step": 5654 }, { "completion_length": 239.64285278320312, "epoch": 0.5690566037735849, "grad_norm": 0.6184073686599731, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6218534708023071, "reward_std": 0.10441500600427389, "rewards/accuracy_reward": 0.6218535006046295, "rewards/format_reward": 1.0, "step": 5655 }, { "completion_length": 228.30611419677734, "epoch": 0.5691572327044025, "grad_norm": 1.116278052330017, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7018007636070251, "reward_std": 0.17741771042346954, "rewards/accuracy_reward": 0.7120048403739929, "rewards/format_reward": 0.9897959232330322, "step": 5656 }, { "completion_length": 198.2653045654297, "epoch": 0.5692578616352201, "grad_norm": 0.9872930645942688, "kl": 0.103515625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8891615271568298, "reward_std": 0.14394891262054443, "rewards/accuracy_reward": 0.89936563372612, "rewards/format_reward": 0.9897959232330322, "step": 5657 }, { "completion_length": 228.7448959350586, "epoch": 0.5693584905660377, "grad_norm": 0.7699339389801025, "kl": 0.0902099609375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7585033774375916, "reward_std": 0.1045607216656208, "rewards/accuracy_reward": 0.7687074840068817, "rewards/format_reward": 0.9897959232330322, "step": 5658 }, { "completion_length": 208.38774871826172, "epoch": 0.5694591194968553, "grad_norm": 1.0610369443893433, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6403473615646362, "reward_std": 0.10676391236484051, "rewards/accuracy_reward": 0.6505514681339264, "rewards/format_reward": 0.9897959232330322, "step": 5659 }, { "completion_length": 226.23468780517578, "epoch": 0.569559748427673, "grad_norm": 0.4505615830421448, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.622737467288971, "reward_std": 0.07570607960224152, "rewards/accuracy_reward": 0.6329416483640671, "rewards/format_reward": 0.9897959232330322, "step": 5660 }, { "completion_length": 220.7448959350586, "epoch": 0.5696603773584905, "grad_norm": 1.0357468128204346, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7565340995788574, "reward_std": 0.14469056203961372, "rewards/accuracy_reward": 0.7565341591835022, "rewards/format_reward": 1.0, "step": 5661 }, { "completion_length": 171.03060913085938, "epoch": 0.5697610062893081, "grad_norm": 3.111790418624878, "kl": 0.09423828125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8402183055877686, "reward_std": 0.15593161433935165, "rewards/accuracy_reward": 0.8606264889240265, "rewards/format_reward": 0.9795918464660645, "step": 5662 }, { "completion_length": 269.1122360229492, "epoch": 0.5698616352201258, "grad_norm": 1.0378146171569824, "kl": 0.097900390625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6407918334007263, "reward_std": 0.1935146450996399, "rewards/accuracy_reward": 0.6407918930053711, "rewards/format_reward": 1.0, "step": 5663 }, { "completion_length": 211.56121826171875, "epoch": 0.5699622641509434, "grad_norm": 0.9860728979110718, "kl": 0.140625, "learning_rate": 1e-06, "loss": 0.0056, "reward": 1.5983921885490417, "reward_std": 0.27568208426237106, "rewards/accuracy_reward": 0.6290043294429779, "rewards/format_reward": 0.9693877398967743, "step": 5664 }, { "completion_length": 211.49999237060547, "epoch": 0.5700628930817611, "grad_norm": 0.3681088984012604, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7098876237869263, "reward_std": 0.07720949873328209, "rewards/accuracy_reward": 0.709887683391571, "rewards/format_reward": 1.0, "step": 5665 }, { "completion_length": 194.27550506591797, "epoch": 0.5701635220125786, "grad_norm": 3.929537773132324, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7857142686843872, "reward_std": 0.05399492383003235, "rewards/accuracy_reward": 0.7857142686843872, "rewards/format_reward": 1.0, "step": 5666 }, { "completion_length": 192.58163452148438, "epoch": 0.5702641509433962, "grad_norm": 0.6807048916816711, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7781168222427368, "reward_std": 0.10391294024884701, "rewards/accuracy_reward": 0.7883209586143494, "rewards/format_reward": 0.9897959232330322, "step": 5667 }, { "completion_length": 200.79591369628906, "epoch": 0.5703647798742139, "grad_norm": 0.7614066004753113, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7637752294540405, "reward_std": 0.1227456871420145, "rewards/accuracy_reward": 0.7637751996517181, "rewards/format_reward": 1.0, "step": 5668 }, { "completion_length": 233.9591827392578, "epoch": 0.5704654088050315, "grad_norm": 1.1849679946899414, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8280413746833801, "reward_std": 0.2070264294743538, "rewards/accuracy_reward": 0.8484494388103485, "rewards/format_reward": 0.9795918464660645, "step": 5669 }, { "completion_length": 239.09183502197266, "epoch": 0.570566037735849, "grad_norm": 0.7161692976951599, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8190475702285767, "reward_std": 0.155446644872427, "rewards/accuracy_reward": 0.8292517066001892, "rewards/format_reward": 0.9897959232330322, "step": 5670 }, { "completion_length": 234.38775634765625, "epoch": 0.5706666666666667, "grad_norm": 0.668667733669281, "kl": 0.0531005859375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.9217687249183655, "reward_std": 0.11651967838406563, "rewards/accuracy_reward": 0.9217686951160431, "rewards/format_reward": 1.0, "step": 5671 }, { "completion_length": 166.79591369628906, "epoch": 0.5707672955974843, "grad_norm": 1.1879124641418457, "kl": 0.09130859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6846011281013489, "reward_std": 0.09571827203035355, "rewards/accuracy_reward": 0.6846010982990265, "rewards/format_reward": 1.0, "step": 5672 }, { "completion_length": 233.14285278320312, "epoch": 0.5708679245283019, "grad_norm": 0.5182040333747864, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.635435938835144, "reward_std": 0.07527294382452965, "rewards/accuracy_reward": 0.6456400752067566, "rewards/format_reward": 0.9897959232330322, "step": 5673 }, { "completion_length": 263.51019287109375, "epoch": 0.5709685534591195, "grad_norm": 0.7865725755691528, "kl": 0.1142578125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.5942252278327942, "reward_std": 0.20167610794305801, "rewards/accuracy_reward": 0.6044294387102127, "rewards/format_reward": 0.9897959232330322, "step": 5674 }, { "completion_length": 202.88774871826172, "epoch": 0.5710691823899371, "grad_norm": 2.3988840579986572, "kl": 0.098388671875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7241787314414978, "reward_std": 0.18438620865345, "rewards/accuracy_reward": 0.7445869743824005, "rewards/format_reward": 0.9795918464660645, "step": 5675 }, { "completion_length": 195.1938705444336, "epoch": 0.5711698113207547, "grad_norm": 0.9649816751480103, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.5698097348213196, "reward_std": 0.21501729637384415, "rewards/accuracy_reward": 0.5902178883552551, "rewards/format_reward": 0.9795918464660645, "step": 5676 }, { "completion_length": 202.41836547851562, "epoch": 0.5712704402515724, "grad_norm": 0.6448673605918884, "kl": 0.119873046875, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.7146809697151184, "reward_std": 0.13313856720924377, "rewards/accuracy_reward": 0.7350892126560211, "rewards/format_reward": 0.9795918166637421, "step": 5677 }, { "completion_length": 242.0306167602539, "epoch": 0.5713710691823899, "grad_norm": 0.7905222773551941, "kl": 0.058837890625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7051830291748047, "reward_std": 0.17191962152719498, "rewards/accuracy_reward": 0.7051830291748047, "rewards/format_reward": 1.0, "step": 5678 }, { "completion_length": 285.3367233276367, "epoch": 0.5714716981132075, "grad_norm": 0.8608923554420471, "kl": 0.0643310546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7198491096496582, "reward_std": 0.22340168058872223, "rewards/accuracy_reward": 0.730053037405014, "rewards/format_reward": 0.9897959232330322, "step": 5679 }, { "completion_length": 188.83673095703125, "epoch": 0.5715723270440252, "grad_norm": 1.2329288721084595, "kl": 0.119384765625, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.7607399821281433, "reward_std": 0.12304532527923584, "rewards/accuracy_reward": 0.7709441184997559, "rewards/format_reward": 0.9897959232330322, "step": 5680 }, { "completion_length": 215.22447967529297, "epoch": 0.5716729559748428, "grad_norm": 0.3558204770088196, "kl": 0.0794677734375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.793206810951233, "reward_std": 0.039984822273254395, "rewards/accuracy_reward": 0.8034108579158783, "rewards/format_reward": 0.9897959232330322, "step": 5681 }, { "completion_length": 180.9693832397461, "epoch": 0.5717735849056603, "grad_norm": 0.4671427607536316, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8276047110557556, "reward_std": 0.0600878931581974, "rewards/accuracy_reward": 0.8276046812534332, "rewards/format_reward": 1.0, "step": 5682 }, { "completion_length": 174.62244415283203, "epoch": 0.571874213836478, "grad_norm": 1.2815738916397095, "kl": 0.09912109375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.654308259487152, "reward_std": 0.16803785040974617, "rewards/accuracy_reward": 0.654308408498764, "rewards/format_reward": 1.0, "step": 5683 }, { "completion_length": 183.23468780517578, "epoch": 0.5719748427672956, "grad_norm": 3.068599224090576, "kl": 0.0650634765625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7448979616165161, "reward_std": 0.11917255818843842, "rewards/accuracy_reward": 0.7448979616165161, "rewards/format_reward": 1.0, "step": 5684 }, { "completion_length": 180.93877410888672, "epoch": 0.5720754716981132, "grad_norm": 0.7009222507476807, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7767471075057983, "reward_std": 0.14568433910608292, "rewards/accuracy_reward": 0.7767470479011536, "rewards/format_reward": 1.0, "step": 5685 }, { "completion_length": 194.43877410888672, "epoch": 0.5721761006289309, "grad_norm": 3.532881259918213, "kl": 0.108154296875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.8293105959892273, "reward_std": 0.17548038810491562, "rewards/accuracy_reward": 0.8497187495231628, "rewards/format_reward": 0.9795918464660645, "step": 5686 }, { "completion_length": 272.0306091308594, "epoch": 0.5722767295597484, "grad_norm": 0.6293508410453796, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7995279431343079, "reward_std": 0.1984199583530426, "rewards/accuracy_reward": 0.8301403224468231, "rewards/format_reward": 0.9693877398967743, "step": 5687 }, { "completion_length": 262.30611419677734, "epoch": 0.572377358490566, "grad_norm": 0.8736405372619629, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7342299818992615, "reward_std": 0.15012969449162483, "rewards/accuracy_reward": 0.7546381950378418, "rewards/format_reward": 0.9795918464660645, "step": 5688 }, { "completion_length": 214.41836547851562, "epoch": 0.5724779874213837, "grad_norm": 0.6916998624801636, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8999027609825134, "reward_std": 0.17022988572716713, "rewards/accuracy_reward": 0.9305150508880615, "rewards/format_reward": 0.9693877398967743, "step": 5689 }, { "completion_length": 214.08162689208984, "epoch": 0.5725786163522013, "grad_norm": 3.574204683303833, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6397891640663147, "reward_std": 0.30451925098896027, "rewards/accuracy_reward": 0.680605411529541, "rewards/format_reward": 0.9591836631298065, "step": 5690 }, { "completion_length": 194.0, "epoch": 0.5726792452830188, "grad_norm": 0.7673447132110596, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7824674844741821, "reward_std": 0.14772123098373413, "rewards/accuracy_reward": 0.7824675142765045, "rewards/format_reward": 1.0, "step": 5691 }, { "completion_length": 180.49999237060547, "epoch": 0.5727798742138365, "grad_norm": 1.450200080871582, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.772510051727295, "reward_std": 0.2060481682419777, "rewards/accuracy_reward": 0.8031222820281982, "rewards/format_reward": 0.9693877398967743, "step": 5692 }, { "completion_length": 202.23468780517578, "epoch": 0.5728805031446541, "grad_norm": 0.6679629683494568, "kl": 0.0565185546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8559058904647827, "reward_std": 0.0947929285466671, "rewards/accuracy_reward": 0.8763141334056854, "rewards/format_reward": 0.9795918464660645, "step": 5693 }, { "completion_length": 224.53060913085938, "epoch": 0.5729811320754717, "grad_norm": 0.8819968104362488, "kl": 0.11572265625, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.7735400795936584, "reward_std": 0.1324123963713646, "rewards/accuracy_reward": 0.793948233127594, "rewards/format_reward": 0.9795918464660645, "step": 5694 }, { "completion_length": 216.4081573486328, "epoch": 0.5730817610062893, "grad_norm": 1.2628600597381592, "kl": 0.114990234375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.756105124950409, "reward_std": 0.30556730926036835, "rewards/accuracy_reward": 0.8071255683898926, "rewards/format_reward": 0.9489795565605164, "step": 5695 }, { "completion_length": 216.05101013183594, "epoch": 0.5731823899371069, "grad_norm": 1.9305531978607178, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7097910046577454, "reward_std": 0.3618377596139908, "rewards/accuracy_reward": 0.7506073713302612, "rewards/format_reward": 0.9591836631298065, "step": 5696 }, { "completion_length": 260.5102005004883, "epoch": 0.5732830188679245, "grad_norm": 1.3363252878189087, "kl": 0.102783203125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.770037055015564, "reward_std": 0.2111002802848816, "rewards/accuracy_reward": 0.7904452383518219, "rewards/format_reward": 0.9795918464660645, "step": 5697 }, { "completion_length": 229.34693908691406, "epoch": 0.5733836477987422, "grad_norm": 0.5708580017089844, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7201490998268127, "reward_std": 0.12739462032914162, "rewards/accuracy_reward": 0.7303532063961029, "rewards/format_reward": 0.9897959232330322, "step": 5698 }, { "completion_length": 241.61223602294922, "epoch": 0.5734842767295597, "grad_norm": 1.397451400756836, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7440099716186523, "reward_std": 0.12664325162768364, "rewards/accuracy_reward": 0.7542140185832977, "rewards/format_reward": 0.9897959232330322, "step": 5699 }, { "completion_length": 208.99999237060547, "epoch": 0.5735849056603773, "grad_norm": 0.9945037364959717, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.834954857826233, "reward_std": 0.11090050637722015, "rewards/accuracy_reward": 0.8349550068378448, "rewards/format_reward": 1.0, "step": 5700 }, { "completion_length": 215.29591369628906, "epoch": 0.573685534591195, "grad_norm": 0.7699742913246155, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6540653705596924, "reward_std": 0.2098132222890854, "rewards/accuracy_reward": 0.6744735538959503, "rewards/format_reward": 0.9795918464660645, "step": 5701 }, { "completion_length": 230.4591827392578, "epoch": 0.5737861635220126, "grad_norm": 0.7885407209396362, "kl": 0.1019287109375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.715545415878296, "reward_std": 0.15615898370742798, "rewards/accuracy_reward": 0.7461575567722321, "rewards/format_reward": 0.9693877398967743, "step": 5702 }, { "completion_length": 264.22447967529297, "epoch": 0.5738867924528301, "grad_norm": 0.579116940498352, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6548372507095337, "reward_std": 0.16161037050187588, "rewards/accuracy_reward": 0.6752453148365021, "rewards/format_reward": 0.9795918166637421, "step": 5703 }, { "completion_length": 257.8877487182617, "epoch": 0.5739874213836478, "grad_norm": 0.7324289083480835, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6964123249053955, "reward_std": 0.18074803054332733, "rewards/accuracy_reward": 0.7066164612770081, "rewards/format_reward": 0.9897959232330322, "step": 5704 }, { "completion_length": 287.87754821777344, "epoch": 0.5740880503144654, "grad_norm": 0.9620564579963684, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6559739112854004, "reward_std": 0.16962837427854538, "rewards/accuracy_reward": 0.6763821244239807, "rewards/format_reward": 0.9795918166637421, "step": 5705 }, { "completion_length": 321.051025390625, "epoch": 0.574188679245283, "grad_norm": 0.6799173951148987, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8213357329368591, "reward_std": 0.14151614904403687, "rewards/accuracy_reward": 0.8417439758777618, "rewards/format_reward": 0.9795918166637421, "step": 5706 }, { "completion_length": 225.10203552246094, "epoch": 0.5742893081761006, "grad_norm": 1.0059149265289307, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7669833898544312, "reward_std": 0.21015814691781998, "rewards/accuracy_reward": 0.7669834196567535, "rewards/format_reward": 1.0, "step": 5707 }, { "completion_length": 207.80611419677734, "epoch": 0.5743899371069182, "grad_norm": 0.8549038767814636, "kl": 0.132080078125, "learning_rate": 1e-06, "loss": 0.0053, "reward": 1.8622227311134338, "reward_std": 0.20271726697683334, "rewards/accuracy_reward": 0.8928350508213043, "rewards/format_reward": 0.9693877398967743, "step": 5708 }, { "completion_length": 324.3673400878906, "epoch": 0.5744905660377359, "grad_norm": 0.9855683445930481, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.551590919494629, "reward_std": 0.2419273927807808, "rewards/accuracy_reward": 0.5924072861671448, "rewards/format_reward": 0.9591836631298065, "step": 5709 }, { "completion_length": 195.49999237060547, "epoch": 0.5745911949685535, "grad_norm": 0.99917072057724, "kl": 0.0694580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7755101919174194, "reward_std": 0.11584595590829849, "rewards/accuracy_reward": 0.7857142984867096, "rewards/format_reward": 0.9897959232330322, "step": 5710 }, { "completion_length": 191.4795913696289, "epoch": 0.5746918238993711, "grad_norm": 0.5924642086029053, "kl": 0.0662841796875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7560738325119019, "reward_std": 0.15795090794563293, "rewards/accuracy_reward": 0.7764819860458374, "rewards/format_reward": 0.9795918166637421, "step": 5711 }, { "completion_length": 196.06122589111328, "epoch": 0.5747924528301886, "grad_norm": 0.9699711799621582, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8343271613121033, "reward_std": 0.09687286987900734, "rewards/accuracy_reward": 0.854735404253006, "rewards/format_reward": 0.9795918166637421, "step": 5712 }, { "completion_length": 282.448974609375, "epoch": 0.5748930817610063, "grad_norm": 1.390075445175171, "kl": 0.115966796875, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.7209386825561523, "reward_std": 0.36267784237861633, "rewards/accuracy_reward": 0.771959125995636, "rewards/format_reward": 0.9489795863628387, "step": 5713 }, { "completion_length": 209.28571319580078, "epoch": 0.5749937106918239, "grad_norm": 0.9509718418121338, "kl": 0.0865478515625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8677033185958862, "reward_std": 0.13801176100969315, "rewards/accuracy_reward": 0.8779073655605316, "rewards/format_reward": 0.9897959232330322, "step": 5714 }, { "completion_length": 200.79591369628906, "epoch": 0.5750943396226416, "grad_norm": 1.0461472272872925, "kl": 0.116943359375, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7973097562789917, "reward_std": 0.10466791689395905, "rewards/accuracy_reward": 0.8075138926506042, "rewards/format_reward": 0.9897959232330322, "step": 5715 }, { "completion_length": 281.23468017578125, "epoch": 0.5751949685534591, "grad_norm": 0.3693430423736572, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8303722143173218, "reward_std": 0.07050850428640842, "rewards/accuracy_reward": 0.8303722441196442, "rewards/format_reward": 1.0, "step": 5716 }, { "completion_length": 251.75509643554688, "epoch": 0.5752955974842767, "grad_norm": 0.750939130783081, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8195423483848572, "reward_std": 0.13374368846416473, "rewards/accuracy_reward": 0.8195423483848572, "rewards/format_reward": 1.0, "step": 5717 }, { "completion_length": 194.1938705444336, "epoch": 0.5753962264150944, "grad_norm": 0.6880670785903931, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7653060555458069, "reward_std": 0.1573527306318283, "rewards/accuracy_reward": 0.7755101919174194, "rewards/format_reward": 0.9897959232330322, "step": 5718 }, { "completion_length": 270.37754821777344, "epoch": 0.575496855345912, "grad_norm": 0.41902610659599304, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7551019787788391, "reward_std": 0.10003121197223663, "rewards/accuracy_reward": 0.7653061151504517, "rewards/format_reward": 0.9897959232330322, "step": 5719 }, { "completion_length": 237.76529693603516, "epoch": 0.5755974842767295, "grad_norm": 0.9998295307159424, "kl": 0.0677490234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8909340500831604, "reward_std": 0.09394278563559055, "rewards/accuracy_reward": 0.8909341394901276, "rewards/format_reward": 1.0, "step": 5720 }, { "completion_length": 217.4693832397461, "epoch": 0.5756981132075472, "grad_norm": 0.22855864465236664, "kl": 0.09130859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7501236200332642, "reward_std": 0.030934505630284548, "rewards/accuracy_reward": 0.7603277266025543, "rewards/format_reward": 0.9897959232330322, "step": 5721 }, { "completion_length": 194.71428680419922, "epoch": 0.5757987421383648, "grad_norm": 0.49779585003852844, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8073592782020569, "reward_std": 0.1182488203048706, "rewards/accuracy_reward": 0.8175634145736694, "rewards/format_reward": 0.9897959232330322, "step": 5722 }, { "completion_length": 239.12245178222656, "epoch": 0.5758993710691824, "grad_norm": 0.906204104423523, "kl": 0.10498046875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.6705988645553589, "reward_std": 0.1922476813197136, "rewards/accuracy_reward": 0.6910070180892944, "rewards/format_reward": 0.9795918464660645, "step": 5723 }, { "completion_length": 214.2040786743164, "epoch": 0.576, "grad_norm": 1.3168635368347168, "kl": 0.10302734375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6338098645210266, "reward_std": 0.20760132372379303, "rewards/accuracy_reward": 0.6644221246242523, "rewards/format_reward": 0.9693877398967743, "step": 5724 }, { "completion_length": 256.1020278930664, "epoch": 0.5761006289308176, "grad_norm": 0.706872820854187, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8687641024589539, "reward_std": 0.22237497568130493, "rewards/accuracy_reward": 0.8993764221668243, "rewards/format_reward": 0.9693877398967743, "step": 5725 }, { "completion_length": 196.4285659790039, "epoch": 0.5762012578616352, "grad_norm": 1.537477731704712, "kl": 0.114013671875, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.8777688145637512, "reward_std": 0.13689318299293518, "rewards/accuracy_reward": 0.8879729807376862, "rewards/format_reward": 0.9897959232330322, "step": 5726 }, { "completion_length": 266.02040100097656, "epoch": 0.5763018867924529, "grad_norm": 0.847308337688446, "kl": 0.057861328125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7923933267593384, "reward_std": 0.24514584243297577, "rewards/accuracy_reward": 0.8128015100955963, "rewards/format_reward": 0.9795918464660645, "step": 5727 }, { "completion_length": 261.56121826171875, "epoch": 0.5764025157232704, "grad_norm": 0.8395518064498901, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7993196249008179, "reward_std": 0.1420964151620865, "rewards/accuracy_reward": 0.8197278678417206, "rewards/format_reward": 0.9795918464660645, "step": 5728 }, { "completion_length": 254.74488830566406, "epoch": 0.576503144654088, "grad_norm": 0.9848540425300598, "kl": 0.101318359375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7707589864730835, "reward_std": 0.21663488447666168, "rewards/accuracy_reward": 0.8013712465763092, "rewards/format_reward": 0.9693877398967743, "step": 5729 }, { "completion_length": 194.16326141357422, "epoch": 0.5766037735849057, "grad_norm": 0.7785369157791138, "kl": 0.093994140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8411874175071716, "reward_std": 0.10952971875667572, "rewards/accuracy_reward": 0.8411873579025269, "rewards/format_reward": 1.0, "step": 5730 }, { "completion_length": 240.97957611083984, "epoch": 0.5767044025157233, "grad_norm": 0.748924732208252, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8176870346069336, "reward_std": 0.13035665079951286, "rewards/accuracy_reward": 0.8278911113739014, "rewards/format_reward": 0.9897959232330322, "step": 5731 }, { "completion_length": 230.80611419677734, "epoch": 0.5768050314465408, "grad_norm": 0.9143170118331909, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8265305757522583, "reward_std": 0.10788732394576073, "rewards/accuracy_reward": 0.8265305757522583, "rewards/format_reward": 1.0, "step": 5732 }, { "completion_length": 230.6836700439453, "epoch": 0.5769056603773585, "grad_norm": 1.4928752183914185, "kl": 0.11865234375, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.8369253873825073, "reward_std": 0.16264241188764572, "rewards/accuracy_reward": 0.8777416944503784, "rewards/format_reward": 0.9591836631298065, "step": 5733 }, { "completion_length": 242.27550506591797, "epoch": 0.5770062893081761, "grad_norm": 0.44179147481918335, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.9285714030265808, "reward_std": 0.08884849399328232, "rewards/accuracy_reward": 0.938775509595871, "rewards/format_reward": 0.9897959232330322, "step": 5734 }, { "completion_length": 226.39795684814453, "epoch": 0.5771069182389937, "grad_norm": 0.8317501544952393, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.9046587944030762, "reward_std": 0.12056005746126175, "rewards/accuracy_reward": 0.9046587944030762, "rewards/format_reward": 1.0, "step": 5735 }, { "completion_length": 188.2244873046875, "epoch": 0.5772075471698114, "grad_norm": 0.9692771434783936, "kl": 0.103271484375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.820779263973236, "reward_std": 0.11713296175003052, "rewards/accuracy_reward": 0.8411874175071716, "rewards/format_reward": 0.9795918464660645, "step": 5736 }, { "completion_length": 169.86734771728516, "epoch": 0.5773081761006289, "grad_norm": 1.230238676071167, "kl": 0.10693359375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.8066789507865906, "reward_std": 0.15591134876012802, "rewards/accuracy_reward": 0.806678980588913, "rewards/format_reward": 1.0, "step": 5737 }, { "completion_length": 194.79591369628906, "epoch": 0.5774088050314465, "grad_norm": 1.1382986307144165, "kl": 0.0625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7823129892349243, "reward_std": 0.16375887393951416, "rewards/accuracy_reward": 0.7823129296302795, "rewards/format_reward": 1.0, "step": 5738 }, { "completion_length": 190.81632232666016, "epoch": 0.5775094339622642, "grad_norm": 0.9764751195907593, "kl": 0.11279296875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.8545366525650024, "reward_std": 0.14356402307748795, "rewards/accuracy_reward": 0.8647406697273254, "rewards/format_reward": 0.9897959232330322, "step": 5739 }, { "completion_length": 245.87754821777344, "epoch": 0.5776100628930818, "grad_norm": 0.7841321229934692, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8019624948501587, "reward_std": 0.14953729882836342, "rewards/accuracy_reward": 0.8223706483840942, "rewards/format_reward": 0.9795918464660645, "step": 5740 }, { "completion_length": 222.7244873046875, "epoch": 0.5777106918238993, "grad_norm": 2.5351600646972656, "kl": 0.0855712890625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7905734181404114, "reward_std": 0.22771260142326355, "rewards/accuracy_reward": 0.800777405500412, "rewards/format_reward": 0.9897959232330322, "step": 5741 }, { "completion_length": 313.53060150146484, "epoch": 0.577811320754717, "grad_norm": 0.7759007811546326, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6573052406311035, "reward_std": 0.2646574601531029, "rewards/accuracy_reward": 0.6879174709320068, "rewards/format_reward": 0.9693877398967743, "step": 5742 }, { "completion_length": 198.64284896850586, "epoch": 0.5779119496855346, "grad_norm": 1.1187714338302612, "kl": 0.0841064453125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.819727897644043, "reward_std": 0.13717086426913738, "rewards/accuracy_reward": 0.8299320042133331, "rewards/format_reward": 0.9897959232330322, "step": 5743 }, { "completion_length": 263.67345428466797, "epoch": 0.5780125786163522, "grad_norm": 2.219564914703369, "kl": 0.23681640625, "learning_rate": 1e-06, "loss": 0.0095, "reward": 1.7143049240112305, "reward_std": 0.09618794173002243, "rewards/accuracy_reward": 0.7245091199874878, "rewards/format_reward": 0.9897959232330322, "step": 5744 }, { "completion_length": 153.21428680419922, "epoch": 0.5781132075471698, "grad_norm": 0.2737914025783539, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.9115646481513977, "reward_std": 0.06517764180898666, "rewards/accuracy_reward": 0.9217686951160431, "rewards/format_reward": 0.9897959232330322, "step": 5745 }, { "completion_length": 258.56121826171875, "epoch": 0.5782138364779874, "grad_norm": 0.5820180177688599, "kl": 0.126953125, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.6364421844482422, "reward_std": 0.09189228340983391, "rewards/accuracy_reward": 0.6466463506221771, "rewards/format_reward": 0.9897959232330322, "step": 5746 }, { "completion_length": 262.6122283935547, "epoch": 0.578314465408805, "grad_norm": 0.3554336130619049, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7612671852111816, "reward_std": 0.09034007415175438, "rewards/accuracy_reward": 0.7714713215827942, "rewards/format_reward": 0.9897959232330322, "step": 5747 }, { "completion_length": 222.03060913085938, "epoch": 0.5784150943396227, "grad_norm": 0.6276743412017822, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7721088528633118, "reward_std": 0.11584595590829849, "rewards/accuracy_reward": 0.7721088826656342, "rewards/format_reward": 1.0, "step": 5748 }, { "completion_length": 209.8163299560547, "epoch": 0.5785157232704402, "grad_norm": 1.028568148612976, "kl": 0.09716796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6782281994819641, "reward_std": 0.2576686590909958, "rewards/accuracy_reward": 0.6884323358535767, "rewards/format_reward": 0.9897959232330322, "step": 5749 }, { "completion_length": 170.4183578491211, "epoch": 0.5786163522012578, "grad_norm": 0.5012839436531067, "kl": 0.0567626953125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7733898758888245, "reward_std": 0.09395762160420418, "rewards/accuracy_reward": 0.7835939228534698, "rewards/format_reward": 0.9897959232330322, "step": 5750 }, { "completion_length": 230.94898223876953, "epoch": 0.5787169811320755, "grad_norm": 1.2729589939117432, "kl": 0.095703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7094782590866089, "reward_std": 0.10846519842743874, "rewards/accuracy_reward": 0.7094783186912537, "rewards/format_reward": 1.0, "step": 5751 }, { "completion_length": 252.2346954345703, "epoch": 0.5788176100628931, "grad_norm": 0.8949875235557556, "kl": 0.0731201171875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7879428267478943, "reward_std": 0.18549029529094696, "rewards/accuracy_reward": 0.7981468737125397, "rewards/format_reward": 0.9897959232330322, "step": 5752 }, { "completion_length": 288.2550964355469, "epoch": 0.5789182389937106, "grad_norm": 0.36748766899108887, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.928571343421936, "reward_std": 0.12137733772397041, "rewards/accuracy_reward": 0.9387754797935486, "rewards/format_reward": 0.9897959232330322, "step": 5753 }, { "completion_length": 266.10203552246094, "epoch": 0.5790188679245283, "grad_norm": 0.5639522671699524, "kl": 0.0885009765625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6871212124824524, "reward_std": 0.15053531527519226, "rewards/accuracy_reward": 0.68712118268013, "rewards/format_reward": 1.0, "step": 5754 }, { "completion_length": 266.5306091308594, "epoch": 0.5791194968553459, "grad_norm": 5.745730400085449, "kl": 0.201904296875, "learning_rate": 1e-06, "loss": 0.0081, "reward": 1.6823128461837769, "reward_std": 0.11229821294546127, "rewards/accuracy_reward": 0.682312935590744, "rewards/format_reward": 1.0, "step": 5755 }, { "completion_length": 267.44898223876953, "epoch": 0.5792201257861636, "grad_norm": 0.8358107805252075, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6360844373703003, "reward_std": 0.24737639725208282, "rewards/accuracy_reward": 0.6666967570781708, "rewards/format_reward": 0.9693877398967743, "step": 5756 }, { "completion_length": 296.4591751098633, "epoch": 0.5793207547169811, "grad_norm": 0.7910974621772766, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5466814041137695, "reward_std": 0.14526228606700897, "rewards/accuracy_reward": 0.5568854808807373, "rewards/format_reward": 0.9897959232330322, "step": 5757 }, { "completion_length": 302.0204086303711, "epoch": 0.5794213836477987, "grad_norm": 0.8346748948097229, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6990488171577454, "reward_std": 0.3031184449791908, "rewards/accuracy_reward": 0.7500692009925842, "rewards/format_reward": 0.9489795863628387, "step": 5758 }, { "completion_length": 253.6734619140625, "epoch": 0.5795220125786164, "grad_norm": 0.7477393746376038, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.522976279258728, "reward_std": 0.23672474920749664, "rewards/accuracy_reward": 0.5535885691642761, "rewards/format_reward": 0.9693877398967743, "step": 5759 }, { "completion_length": 217.9285659790039, "epoch": 0.579622641509434, "grad_norm": 1.0327813625335693, "kl": 0.106689453125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.5626850128173828, "reward_std": 0.21662718802690506, "rewards/accuracy_reward": 0.5830931663513184, "rewards/format_reward": 0.9795918464660645, "step": 5760 }, { "completion_length": 297.9081573486328, "epoch": 0.5797232704402516, "grad_norm": 0.41561129689216614, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6198979020118713, "reward_std": 0.1003322433680296, "rewards/accuracy_reward": 0.6198979318141937, "rewards/format_reward": 1.0, "step": 5761 }, { "completion_length": 249.52039337158203, "epoch": 0.5798238993710692, "grad_norm": 0.8966880440711975, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7244513630867004, "reward_std": 0.1897623911499977, "rewards/accuracy_reward": 0.7346554696559906, "rewards/format_reward": 0.9897959232330322, "step": 5762 }, { "completion_length": 205.7653045654297, "epoch": 0.5799245283018868, "grad_norm": 0.8432127237319946, "kl": 0.0687255859375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7083512544631958, "reward_std": 0.09419146552681923, "rewards/accuracy_reward": 0.718555361032486, "rewards/format_reward": 0.9897959232330322, "step": 5763 }, { "completion_length": 270.55101013183594, "epoch": 0.5800251572327044, "grad_norm": 0.6894816756248474, "kl": 0.125732421875, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.7140983939170837, "reward_std": 0.14494133740663528, "rewards/accuracy_reward": 0.7243025302886963, "rewards/format_reward": 0.9897959232330322, "step": 5764 }, { "completion_length": 177.34693908691406, "epoch": 0.5801257861635221, "grad_norm": 0.539411723613739, "kl": 0.0921630859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.9394557476043701, "reward_std": 0.08861015737056732, "rewards/accuracy_reward": 0.9496598541736603, "rewards/format_reward": 0.9897959232330322, "step": 5765 }, { "completion_length": 289.0408172607422, "epoch": 0.5802264150943396, "grad_norm": 0.7203415632247925, "kl": 0.0848388671875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.4712430834770203, "reward_std": 0.10913673415780067, "rewards/accuracy_reward": 0.48144710063934326, "rewards/format_reward": 0.9897959232330322, "step": 5766 }, { "completion_length": 285.08162689208984, "epoch": 0.5803270440251572, "grad_norm": 0.32224854826927185, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8460348844528198, "reward_std": 0.041252932511270046, "rewards/accuracy_reward": 0.8460349142551422, "rewards/format_reward": 1.0, "step": 5767 }, { "completion_length": 211.13265228271484, "epoch": 0.5804276729559749, "grad_norm": 0.5950627326965332, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8212423920631409, "reward_std": 0.08179031684994698, "rewards/accuracy_reward": 0.8212424218654633, "rewards/format_reward": 1.0, "step": 5768 }, { "completion_length": 180.57142639160156, "epoch": 0.5805283018867925, "grad_norm": 1.2087876796722412, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7261297106742859, "reward_std": 0.2241334468126297, "rewards/accuracy_reward": 0.7261297404766083, "rewards/format_reward": 1.0, "step": 5769 }, { "completion_length": 209.6836700439453, "epoch": 0.58062893081761, "grad_norm": 0.5518460869789124, "kl": 0.10498046875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.724244773387909, "reward_std": 0.060210198163986206, "rewards/accuracy_reward": 0.7242446839809418, "rewards/format_reward": 1.0, "step": 5770 }, { "completion_length": 183.10203552246094, "epoch": 0.5807295597484277, "grad_norm": 0.36757680773735046, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7836734652519226, "reward_std": 0.04790960252285004, "rewards/accuracy_reward": 0.7836734652519226, "rewards/format_reward": 1.0, "step": 5771 }, { "completion_length": 236.4693832397461, "epoch": 0.5808301886792453, "grad_norm": 0.790739119052887, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6982848048210144, "reward_std": 0.1775430254638195, "rewards/accuracy_reward": 0.6982848048210144, "rewards/format_reward": 1.0, "step": 5772 }, { "completion_length": 218.26529693603516, "epoch": 0.5809308176100629, "grad_norm": 1.1527564525604248, "kl": 0.118408203125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7568621039390564, "reward_std": 0.15152157098054886, "rewards/accuracy_reward": 0.767066240310669, "rewards/format_reward": 0.9897959232330322, "step": 5773 }, { "completion_length": 189.41836547851562, "epoch": 0.5810314465408805, "grad_norm": 0.6102477312088013, "kl": 0.156982421875, "learning_rate": 1e-06, "loss": 0.0063, "reward": 1.6900574564933777, "reward_std": 0.1149095818400383, "rewards/accuracy_reward": 0.6900574266910553, "rewards/format_reward": 1.0, "step": 5774 }, { "completion_length": 241.09183502197266, "epoch": 0.5811320754716981, "grad_norm": 0.6789020895957947, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.83666330575943, "reward_std": 0.11883794888854027, "rewards/accuracy_reward": 0.8468674123287201, "rewards/format_reward": 0.9897959232330322, "step": 5775 }, { "completion_length": 244.87754821777344, "epoch": 0.5812327044025157, "grad_norm": 1.0727958679199219, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7203395366668701, "reward_std": 0.2454231008887291, "rewards/accuracy_reward": 0.7407476902008057, "rewards/format_reward": 0.9795918464660645, "step": 5776 }, { "completion_length": 230.41836547851562, "epoch": 0.5813333333333334, "grad_norm": 0.4608798325061798, "kl": 0.100341796875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.827723205089569, "reward_std": 0.16519923508167267, "rewards/accuracy_reward": 0.8379273414611816, "rewards/format_reward": 0.9897959232330322, "step": 5777 }, { "completion_length": 279.04080963134766, "epoch": 0.5814339622641509, "grad_norm": 0.582203209400177, "kl": 0.113525390625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.8747477531433105, "reward_std": 0.18894927948713303, "rewards/accuracy_reward": 0.8951558470726013, "rewards/format_reward": 0.9795918464660645, "step": 5778 }, { "completion_length": 253.5, "epoch": 0.5815345911949685, "grad_norm": 0.9383226037025452, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.5383868217468262, "reward_std": 0.11299064755439758, "rewards/accuracy_reward": 0.5485908389091492, "rewards/format_reward": 0.9897959232330322, "step": 5779 }, { "completion_length": 338.6326446533203, "epoch": 0.5816352201257862, "grad_norm": 0.9425595998764038, "kl": 0.0513916015625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6473920941352844, "reward_std": 0.20854037255048752, "rewards/accuracy_reward": 0.6678002178668976, "rewards/format_reward": 0.9795918166637421, "step": 5780 }, { "completion_length": 212.7040786743164, "epoch": 0.5817358490566038, "grad_norm": 0.4329240620136261, "kl": 0.0614013671875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8061224222183228, "reward_std": 0.09217509627342224, "rewards/accuracy_reward": 0.8265305757522583, "rewards/format_reward": 0.9795918166637421, "step": 5781 }, { "completion_length": 186.85713958740234, "epoch": 0.5818364779874213, "grad_norm": 0.899556577205658, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7506074905395508, "reward_std": 0.14499150216579437, "rewards/accuracy_reward": 0.7710157036781311, "rewards/format_reward": 0.9795918464660645, "step": 5782 }, { "completion_length": 241.0408172607422, "epoch": 0.581937106918239, "grad_norm": 0.47422587871551514, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7225854992866516, "reward_std": 0.09228329360485077, "rewards/accuracy_reward": 0.7327896058559418, "rewards/format_reward": 0.9897959232330322, "step": 5783 }, { "completion_length": 200.1734619140625, "epoch": 0.5820377358490566, "grad_norm": 0.8276288509368896, "kl": 0.128662109375, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.8024118542671204, "reward_std": 0.10655607655644417, "rewards/accuracy_reward": 0.8126159012317657, "rewards/format_reward": 0.9897959232330322, "step": 5784 }, { "completion_length": 278.1734619140625, "epoch": 0.5821383647798742, "grad_norm": 0.7269916534423828, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.738279938697815, "reward_std": 0.15289118140935898, "rewards/accuracy_reward": 0.7484840750694275, "rewards/format_reward": 0.9897959232330322, "step": 5785 }, { "completion_length": 290.7346878051758, "epoch": 0.5822389937106919, "grad_norm": 0.6529971957206726, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.754275918006897, "reward_std": 0.14155322313308716, "rewards/accuracy_reward": 0.7542759776115417, "rewards/format_reward": 1.0, "step": 5786 }, { "completion_length": 249.9591827392578, "epoch": 0.5823396226415094, "grad_norm": 1.2447059154510498, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6347187757492065, "reward_std": 0.0976235456764698, "rewards/accuracy_reward": 0.6449228823184967, "rewards/format_reward": 0.9897959232330322, "step": 5787 }, { "completion_length": 225.9897918701172, "epoch": 0.582440251572327, "grad_norm": 1.1742271184921265, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.812213957309723, "reward_std": 0.15185853093862534, "rewards/accuracy_reward": 0.8428261876106262, "rewards/format_reward": 0.9693877398967743, "step": 5788 }, { "completion_length": 287.6224365234375, "epoch": 0.5825408805031447, "grad_norm": 0.9550445675849915, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7317448258399963, "reward_std": 0.17955777049064636, "rewards/accuracy_reward": 0.7521529793739319, "rewards/format_reward": 0.9795918166637421, "step": 5789 }, { "completion_length": 270.3571319580078, "epoch": 0.5826415094339623, "grad_norm": 1.1999809741973877, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8195507526397705, "reward_std": 0.21916413307189941, "rewards/accuracy_reward": 0.8603670597076416, "rewards/format_reward": 0.9591836631298065, "step": 5790 }, { "completion_length": 269.1836624145508, "epoch": 0.5827421383647798, "grad_norm": 0.6327956318855286, "kl": 0.0738525390625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7842776775360107, "reward_std": 0.18202921003103256, "rewards/accuracy_reward": 0.7944817245006561, "rewards/format_reward": 0.9897959232330322, "step": 5791 }, { "completion_length": 273.5102005004883, "epoch": 0.5828427672955975, "grad_norm": 0.6260830163955688, "kl": 0.0594482421875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7756046652793884, "reward_std": 0.15092657878994942, "rewards/accuracy_reward": 0.7858087420463562, "rewards/format_reward": 0.9897959232330322, "step": 5792 }, { "completion_length": 298.2244873046875, "epoch": 0.5829433962264151, "grad_norm": 176.71072387695312, "kl": 1.8427734375, "learning_rate": 1e-06, "loss": 0.0738, "reward": 1.5084991455078125, "reward_std": 0.21273010224103928, "rewards/accuracy_reward": 0.5289072394371033, "rewards/format_reward": 0.9795918166637421, "step": 5793 }, { "completion_length": 222.60203552246094, "epoch": 0.5830440251572327, "grad_norm": 0.7532332539558411, "kl": 0.1024169921875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7433387637138367, "reward_std": 0.171063132584095, "rewards/accuracy_reward": 0.7535427808761597, "rewards/format_reward": 0.9897959232330322, "step": 5794 }, { "completion_length": 249.29590606689453, "epoch": 0.5831446540880503, "grad_norm": 0.39770975708961487, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8816094398498535, "reward_std": 0.05135997012257576, "rewards/accuracy_reward": 0.8816094696521759, "rewards/format_reward": 1.0, "step": 5795 }, { "completion_length": 261.9387664794922, "epoch": 0.5832452830188679, "grad_norm": 0.49307578802108765, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8038960695266724, "reward_std": 0.08005111292004585, "rewards/accuracy_reward": 0.8038960695266724, "rewards/format_reward": 1.0, "step": 5796 }, { "completion_length": 240.26529693603516, "epoch": 0.5833459119496855, "grad_norm": 0.773222029209137, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7576582431793213, "reward_std": 0.1666794717311859, "rewards/accuracy_reward": 0.7780663371086121, "rewards/format_reward": 0.9795918464660645, "step": 5797 }, { "completion_length": 178.34693145751953, "epoch": 0.5834465408805032, "grad_norm": 0.5160748958587646, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.9081631898880005, "reward_std": 0.16188224405050278, "rewards/accuracy_reward": 0.9489795863628387, "rewards/format_reward": 0.9591836631298065, "step": 5798 }, { "completion_length": 218.64285278320312, "epoch": 0.5835471698113207, "grad_norm": 1.5761650800704956, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.9105928540229797, "reward_std": 0.10760965570807457, "rewards/accuracy_reward": 0.9207969009876251, "rewards/format_reward": 0.9897959232330322, "step": 5799 }, { "completion_length": 323.1326446533203, "epoch": 0.5836477987421383, "grad_norm": 0.7182030081748962, "kl": 0.10302734375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6153244972229004, "reward_std": 0.28424881398677826, "rewards/accuracy_reward": 0.6459366679191589, "rewards/format_reward": 0.9693877398967743, "step": 5800 }, { "completion_length": 184.39795684814453, "epoch": 0.583748427672956, "grad_norm": 0.841766893863678, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7244897484779358, "reward_std": 0.0863465778529644, "rewards/accuracy_reward": 0.7244898080825806, "rewards/format_reward": 1.0, "step": 5801 }, { "completion_length": 329.16326904296875, "epoch": 0.5838490566037736, "grad_norm": 0.5453224182128906, "kl": 0.05810546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.621019184589386, "reward_std": 0.19120758771896362, "rewards/accuracy_reward": 0.6312233209609985, "rewards/format_reward": 0.9897959232330322, "step": 5802 }, { "completion_length": 215.08162689208984, "epoch": 0.5839496855345911, "grad_norm": 0.8040762543678284, "kl": 0.059326171875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.819727897644043, "reward_std": 0.17755581066012383, "rewards/accuracy_reward": 0.8401360213756561, "rewards/format_reward": 0.9795918464660645, "step": 5803 }, { "completion_length": 222.1530532836914, "epoch": 0.5840503144654088, "grad_norm": 0.25365838408470154, "kl": 0.0592041015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.9401360750198364, "reward_std": 0.04499576799571514, "rewards/accuracy_reward": 0.9503401815891266, "rewards/format_reward": 0.9897959232330322, "step": 5804 }, { "completion_length": 319.24488830566406, "epoch": 0.5841509433962264, "grad_norm": 0.6058705449104309, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.812741994857788, "reward_std": 0.17037605494260788, "rewards/accuracy_reward": 0.8433542549610138, "rewards/format_reward": 0.9693877398967743, "step": 5805 }, { "completion_length": 286.27549743652344, "epoch": 0.584251572327044, "grad_norm": 0.8604463934898376, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.70428866147995, "reward_std": 0.22399218380451202, "rewards/accuracy_reward": 0.7246968597173691, "rewards/format_reward": 0.9795918464660645, "step": 5806 }, { "completion_length": 245.86734771728516, "epoch": 0.5843522012578616, "grad_norm": 0.491336464881897, "kl": 0.1005859375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8083255887031555, "reward_std": 0.0776177030056715, "rewards/accuracy_reward": 0.8185296952724457, "rewards/format_reward": 0.9897959232330322, "step": 5807 }, { "completion_length": 234.85714721679688, "epoch": 0.5844528301886792, "grad_norm": 0.5875738263130188, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6645275950431824, "reward_std": 0.10675819497555494, "rewards/accuracy_reward": 0.6645276546478271, "rewards/format_reward": 1.0, "step": 5808 }, { "completion_length": 202.77550506591797, "epoch": 0.5845534591194969, "grad_norm": 1.2657219171524048, "kl": 0.100830078125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7473509907722473, "reward_std": 0.1310431957244873, "rewards/accuracy_reward": 0.7473510801792145, "rewards/format_reward": 1.0, "step": 5809 }, { "completion_length": 248.73468780517578, "epoch": 0.5846540880503145, "grad_norm": 1.8403400182724, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8200958967208862, "reward_std": 0.17357337102293968, "rewards/accuracy_reward": 0.8200959861278534, "rewards/format_reward": 1.0, "step": 5810 }, { "completion_length": 216.32652282714844, "epoch": 0.5847547169811321, "grad_norm": 0.34900832176208496, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.804227352142334, "reward_std": 0.06895832251757383, "rewards/accuracy_reward": 0.8042273819446564, "rewards/format_reward": 1.0, "step": 5811 }, { "completion_length": 234.9795913696289, "epoch": 0.5848553459119497, "grad_norm": 0.9185647964477539, "kl": 0.0565185546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7721088528633118, "reward_std": 0.09247414395213127, "rewards/accuracy_reward": 0.7721088528633118, "rewards/format_reward": 1.0, "step": 5812 }, { "completion_length": 283.04080963134766, "epoch": 0.5849559748427673, "grad_norm": 1.5108329057693481, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7259740233421326, "reward_std": 0.23673727363348007, "rewards/accuracy_reward": 0.725974053144455, "rewards/format_reward": 1.0, "step": 5813 }, { "completion_length": 265.4897766113281, "epoch": 0.5850566037735849, "grad_norm": 0.8788001537322998, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6321685910224915, "reward_std": 0.16368716955184937, "rewards/accuracy_reward": 0.6423726379871368, "rewards/format_reward": 0.9897959232330322, "step": 5814 }, { "completion_length": 272.5612258911133, "epoch": 0.5851572327044026, "grad_norm": 0.5760130286216736, "kl": 0.0625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8237329721450806, "reward_std": 0.16957494616508484, "rewards/accuracy_reward": 0.833937019109726, "rewards/format_reward": 0.9897959232330322, "step": 5815 }, { "completion_length": 257.1530532836914, "epoch": 0.5852578616352201, "grad_norm": 1.707505226135254, "kl": 0.1083984375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.5738381743431091, "reward_std": 0.24360404163599014, "rewards/accuracy_reward": 0.5942464023828506, "rewards/format_reward": 0.9795918464660645, "step": 5816 }, { "completion_length": 216.7448959350586, "epoch": 0.5853584905660377, "grad_norm": 1.2244426012039185, "kl": 0.110107421875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7550813555717468, "reward_std": 0.10268420726060867, "rewards/accuracy_reward": 0.765285462141037, "rewards/format_reward": 0.9897959232330322, "step": 5817 }, { "completion_length": 211.12244415283203, "epoch": 0.5854591194968554, "grad_norm": 0.32789483666419983, "kl": 0.0986328125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8614878058433533, "reward_std": 0.08443121984601021, "rewards/accuracy_reward": 0.8614878356456757, "rewards/format_reward": 1.0, "step": 5818 }, { "completion_length": 190.33673095703125, "epoch": 0.585559748427673, "grad_norm": 1.2727183103561401, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7567027807235718, "reward_std": 0.1502934992313385, "rewards/accuracy_reward": 0.756702721118927, "rewards/format_reward": 1.0, "step": 5819 }, { "completion_length": 285.74488830566406, "epoch": 0.5856603773584905, "grad_norm": 1.4801474809646606, "kl": 0.071533203125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6165120005607605, "reward_std": 0.1731380634009838, "rewards/accuracy_reward": 0.6471242904663086, "rewards/format_reward": 0.9693877398967743, "step": 5820 }, { "completion_length": 204.6836700439453, "epoch": 0.5857610062893082, "grad_norm": 0.8565019369125366, "kl": 0.10693359375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.9068832993507385, "reward_std": 0.0846855416893959, "rewards/accuracy_reward": 0.9170873761177063, "rewards/format_reward": 0.9897959232330322, "step": 5821 }, { "completion_length": 251.69387817382812, "epoch": 0.5858616352201258, "grad_norm": 2.090592384338379, "kl": 0.1142578125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.692518174648285, "reward_std": 0.1931314915418625, "rewards/accuracy_reward": 0.7129262983798981, "rewards/format_reward": 0.9795918464660645, "step": 5822 }, { "completion_length": 243.02040100097656, "epoch": 0.5859622641509434, "grad_norm": 0.3783644437789917, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7971757650375366, "reward_std": 0.1444360911846161, "rewards/accuracy_reward": 0.8073798418045044, "rewards/format_reward": 0.9897959232330322, "step": 5823 }, { "completion_length": 253.551025390625, "epoch": 0.586062893081761, "grad_norm": 0.578148603439331, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.806035339832306, "reward_std": 0.15301598235964775, "rewards/accuracy_reward": 0.8162395358085632, "rewards/format_reward": 0.9897959232330322, "step": 5824 }, { "completion_length": 263.1122360229492, "epoch": 0.5861635220125786, "grad_norm": 0.45257142186164856, "kl": 0.05908203125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8092005252838135, "reward_std": 0.14205794408917427, "rewards/accuracy_reward": 0.8194046914577484, "rewards/format_reward": 0.9897959232330322, "step": 5825 }, { "completion_length": 243.6836700439453, "epoch": 0.5862641509433962, "grad_norm": 0.8125117421150208, "kl": 0.15185546875, "learning_rate": 1e-06, "loss": 0.0061, "reward": 1.7411144375801086, "reward_std": 0.1753433495759964, "rewards/accuracy_reward": 0.761522650718689, "rewards/format_reward": 0.9795918166637421, "step": 5826 }, { "completion_length": 235.54080963134766, "epoch": 0.5863647798742139, "grad_norm": 0.40794727206230164, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8380283117294312, "reward_std": 0.10814865306019783, "rewards/accuracy_reward": 0.8584364950656891, "rewards/format_reward": 0.9795918166637421, "step": 5827 }, { "completion_length": 253.9591827392578, "epoch": 0.5864654088050314, "grad_norm": 0.7425995469093323, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.781678318977356, "reward_std": 0.16657716035842896, "rewards/accuracy_reward": 0.8020865023136139, "rewards/format_reward": 0.9795918166637421, "step": 5828 }, { "completion_length": 233.49999237060547, "epoch": 0.586566037735849, "grad_norm": 0.7117882370948792, "kl": 0.103271484375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7652366161346436, "reward_std": 0.09417666681110859, "rewards/accuracy_reward": 0.7856446802616119, "rewards/format_reward": 0.9795918166637421, "step": 5829 }, { "completion_length": 168.25509643554688, "epoch": 0.5866666666666667, "grad_norm": 0.5399956703186035, "kl": 0.0633544921875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.785203993320465, "reward_std": 0.04715495929121971, "rewards/accuracy_reward": 0.7852040529251099, "rewards/format_reward": 1.0, "step": 5830 }, { "completion_length": 267.44896697998047, "epoch": 0.5867672955974843, "grad_norm": 0.40605437755584717, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7926268577575684, "reward_std": 0.07209511613473296, "rewards/accuracy_reward": 0.7926269173622131, "rewards/format_reward": 1.0, "step": 5831 }, { "completion_length": 213.08162689208984, "epoch": 0.5868679245283018, "grad_norm": 1.6662886142730713, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.874912440776825, "reward_std": 0.10613223165273666, "rewards/accuracy_reward": 0.8851164281368256, "rewards/format_reward": 0.9897959232330322, "step": 5832 }, { "completion_length": 259.40816497802734, "epoch": 0.5869685534591195, "grad_norm": 0.6854477524757385, "kl": 0.0562744140625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6498541831970215, "reward_std": 0.17596327140927315, "rewards/accuracy_reward": 0.6804664731025696, "rewards/format_reward": 0.9693877398967743, "step": 5833 }, { "completion_length": 284.68365478515625, "epoch": 0.5870691823899371, "grad_norm": 0.5171603560447693, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7617129683494568, "reward_std": 0.14985479414463043, "rewards/accuracy_reward": 0.7821211516857147, "rewards/format_reward": 0.9795918464660645, "step": 5834 }, { "completion_length": 207.2448959350586, "epoch": 0.5871698113207547, "grad_norm": 0.5623742938041687, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.881010890007019, "reward_std": 0.06723383581265807, "rewards/accuracy_reward": 0.881010890007019, "rewards/format_reward": 1.0, "step": 5835 }, { "completion_length": 271.6428527832031, "epoch": 0.5872704402515724, "grad_norm": 0.7631319165229797, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.622732400894165, "reward_std": 0.2539285346865654, "rewards/accuracy_reward": 0.643140584230423, "rewards/format_reward": 0.9795918166637421, "step": 5836 }, { "completion_length": 248.60203552246094, "epoch": 0.5873710691823899, "grad_norm": 0.8568571209907532, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7948654890060425, "reward_std": 0.24140427261590958, "rewards/accuracy_reward": 0.8152737021446228, "rewards/format_reward": 0.9795918166637421, "step": 5837 }, { "completion_length": 220.2959213256836, "epoch": 0.5874716981132075, "grad_norm": 0.7355985045433044, "kl": 0.1065673828125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7313781380653381, "reward_std": 0.08460479229688644, "rewards/accuracy_reward": 0.7313781678676605, "rewards/format_reward": 1.0, "step": 5838 }, { "completion_length": 262.9285659790039, "epoch": 0.5875723270440252, "grad_norm": 0.9209502935409546, "kl": 0.0906982421875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.701213300228119, "reward_std": 0.2310558632016182, "rewards/accuracy_reward": 0.7114173471927643, "rewards/format_reward": 0.9897959232330322, "step": 5839 }, { "completion_length": 249.1428451538086, "epoch": 0.5876729559748428, "grad_norm": 0.8154742121696472, "kl": 0.0867919921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6115405559539795, "reward_std": 0.15266824141144753, "rewards/accuracy_reward": 0.6217446625232697, "rewards/format_reward": 0.9897959232330322, "step": 5840 }, { "completion_length": 242.34693908691406, "epoch": 0.5877735849056603, "grad_norm": 1.583936095237732, "kl": 0.10302734375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.716152012348175, "reward_std": 0.05997312953695655, "rewards/accuracy_reward": 0.716152012348175, "rewards/format_reward": 1.0, "step": 5841 }, { "completion_length": 232.11223602294922, "epoch": 0.587874213836478, "grad_norm": 0.8819738030433655, "kl": 0.10205078125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8836734294891357, "reward_std": 0.11792564764618874, "rewards/accuracy_reward": 0.8836734294891357, "rewards/format_reward": 1.0, "step": 5842 }, { "completion_length": 269.89794921875, "epoch": 0.5879748427672956, "grad_norm": 0.31611910462379456, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8243041634559631, "reward_std": 0.07872217148542404, "rewards/accuracy_reward": 0.8243042528629303, "rewards/format_reward": 1.0, "step": 5843 }, { "completion_length": 220.53060150146484, "epoch": 0.5880754716981133, "grad_norm": 0.8847136497497559, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7860276699066162, "reward_std": 0.2237546443939209, "rewards/accuracy_reward": 0.8166399300098419, "rewards/format_reward": 0.9693877398967743, "step": 5844 }, { "completion_length": 216.89795684814453, "epoch": 0.5881761006289308, "grad_norm": 0.8278172016143799, "kl": 0.146484375, "learning_rate": 1e-06, "loss": 0.006, "reward": 1.6871033906936646, "reward_std": 0.16362031549215317, "rewards/accuracy_reward": 0.6973075568675995, "rewards/format_reward": 0.9897959232330322, "step": 5845 }, { "completion_length": 223.23468780517578, "epoch": 0.5882767295597484, "grad_norm": 0.9950799942016602, "kl": 0.097900390625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7728509306907654, "reward_std": 0.14963272213935852, "rewards/accuracy_reward": 0.7728509604930878, "rewards/format_reward": 1.0, "step": 5846 }, { "completion_length": 189.6326446533203, "epoch": 0.588377358490566, "grad_norm": 1.3605939149856567, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8241010308265686, "reward_std": 0.1887981966137886, "rewards/accuracy_reward": 0.8343051373958588, "rewards/format_reward": 0.9897959232330322, "step": 5847 }, { "completion_length": 304.07142639160156, "epoch": 0.5884779874213837, "grad_norm": 0.5787464380264282, "kl": 0.1259765625, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.609723150730133, "reward_std": 0.18180354312062263, "rewards/accuracy_reward": 0.6505395770072937, "rewards/format_reward": 0.9591836631298065, "step": 5848 }, { "completion_length": 182.2040786743164, "epoch": 0.5885786163522012, "grad_norm": 0.4607410132884979, "kl": 0.0875244140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8636363744735718, "reward_std": 0.10167640447616577, "rewards/accuracy_reward": 0.8636363446712494, "rewards/format_reward": 1.0, "step": 5849 }, { "completion_length": 244.9897918701172, "epoch": 0.5886792452830188, "grad_norm": 1.6737176179885864, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7477957606315613, "reward_std": 0.14119670540094376, "rewards/accuracy_reward": 0.7477957904338837, "rewards/format_reward": 1.0, "step": 5850 }, { "completion_length": 288.2040710449219, "epoch": 0.5887798742138365, "grad_norm": 2.168914556503296, "kl": 0.0897216796875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6618189811706543, "reward_std": 0.14494959264993668, "rewards/accuracy_reward": 0.6822271943092346, "rewards/format_reward": 0.9795918464660645, "step": 5851 }, { "completion_length": 254.31632232666016, "epoch": 0.5888805031446541, "grad_norm": 1.1340954303741455, "kl": 0.125244140625, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.724469780921936, "reward_std": 0.18495020270347595, "rewards/accuracy_reward": 0.7346738278865814, "rewards/format_reward": 0.9897959232330322, "step": 5852 }, { "completion_length": 313.6428451538086, "epoch": 0.5889811320754716, "grad_norm": 0.4228968322277069, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6873943209648132, "reward_std": 0.058546917513012886, "rewards/accuracy_reward": 0.6873943209648132, "rewards/format_reward": 1.0, "step": 5853 }, { "completion_length": 167.2448959350586, "epoch": 0.5890817610062893, "grad_norm": 2.1137852668762207, "kl": 0.100341796875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8047581315040588, "reward_std": 0.1333479955792427, "rewards/accuracy_reward": 0.8047581315040588, "rewards/format_reward": 1.0, "step": 5854 }, { "completion_length": 229.45917510986328, "epoch": 0.5891823899371069, "grad_norm": 0.5557217001914978, "kl": 0.0633544921875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8696808218955994, "reward_std": 0.11846371367573738, "rewards/accuracy_reward": 0.8798848986625671, "rewards/format_reward": 0.9897959232330322, "step": 5855 }, { "completion_length": 290.2550964355469, "epoch": 0.5892830188679246, "grad_norm": 0.8522683382034302, "kl": 0.0491943359375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.7130632400512695, "reward_std": 0.17715215310454369, "rewards/accuracy_reward": 0.7436755299568176, "rewards/format_reward": 0.9693877398967743, "step": 5856 }, { "completion_length": 238.9693832397461, "epoch": 0.5893836477987421, "grad_norm": 0.7720435857772827, "kl": 0.146484375, "learning_rate": 1e-06, "loss": 0.0058, "reward": 1.7889809012413025, "reward_std": 0.14437538012862206, "rewards/accuracy_reward": 0.7991849184036255, "rewards/format_reward": 0.9897959232330322, "step": 5857 }, { "completion_length": 300.02039337158203, "epoch": 0.5894842767295597, "grad_norm": 0.548428475856781, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.777091383934021, "reward_std": 0.15321723371744156, "rewards/accuracy_reward": 0.7770914137363434, "rewards/format_reward": 1.0, "step": 5858 }, { "completion_length": 188.4591827392578, "epoch": 0.5895849056603774, "grad_norm": 0.9668177962303162, "kl": 0.1083984375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.8045559525489807, "reward_std": 0.13871153444051743, "rewards/accuracy_reward": 0.8045560121536255, "rewards/format_reward": 1.0, "step": 5859 }, { "completion_length": 243.52040100097656, "epoch": 0.589685534591195, "grad_norm": 1.7831989526748657, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7016845345497131, "reward_std": 0.3113074228167534, "rewards/accuracy_reward": 0.7220925986766815, "rewards/format_reward": 0.9795918464660645, "step": 5860 }, { "completion_length": 265.5918273925781, "epoch": 0.5897861635220126, "grad_norm": 0.9582237601280212, "kl": 0.0767822265625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7536078095436096, "reward_std": 0.14154669642448425, "rewards/accuracy_reward": 0.753607839345932, "rewards/format_reward": 1.0, "step": 5861 }, { "completion_length": 228.27550506591797, "epoch": 0.5898867924528302, "grad_norm": 1.224596381187439, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7266233563423157, "reward_std": 0.21585362404584885, "rewards/accuracy_reward": 0.7368274033069611, "rewards/format_reward": 0.9897959232330322, "step": 5862 }, { "completion_length": 211.0204086303711, "epoch": 0.5899874213836478, "grad_norm": 1.2310739755630493, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8198335766792297, "reward_std": 0.15598461776971817, "rewards/accuracy_reward": 0.8300377428531647, "rewards/format_reward": 0.9897959232330322, "step": 5863 }, { "completion_length": 190.06121826171875, "epoch": 0.5900880503144654, "grad_norm": 0.49076274037361145, "kl": 0.12646484375, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.722787618637085, "reward_std": 0.12931182235479355, "rewards/accuracy_reward": 0.7227876484394073, "rewards/format_reward": 1.0, "step": 5864 }, { "completion_length": 238.4693832397461, "epoch": 0.5901886792452831, "grad_norm": 0.5484110116958618, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.9233235120773315, "reward_std": 0.09976733289659023, "rewards/accuracy_reward": 0.9233235418796539, "rewards/format_reward": 1.0, "step": 5865 }, { "completion_length": 304.4387664794922, "epoch": 0.5902893081761006, "grad_norm": 0.4050944745540619, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7419770956039429, "reward_std": 0.09243917930871248, "rewards/accuracy_reward": 0.7419770956039429, "rewards/format_reward": 1.0, "step": 5866 }, { "completion_length": 224.13265228271484, "epoch": 0.5903899371069182, "grad_norm": 0.621252715587616, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8003401160240173, "reward_std": 0.11794260516762733, "rewards/accuracy_reward": 0.8003401160240173, "rewards/format_reward": 1.0, "step": 5867 }, { "completion_length": 221.53060150146484, "epoch": 0.5904905660377359, "grad_norm": 0.5108212828636169, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6259832978248596, "reward_std": 0.09731848537921906, "rewards/accuracy_reward": 0.6463915407657623, "rewards/format_reward": 0.9795918464660645, "step": 5868 }, { "completion_length": 209.89795684814453, "epoch": 0.5905911949685535, "grad_norm": 0.42686405777931213, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.905455231666565, "reward_std": 0.08603990077972412, "rewards/accuracy_reward": 0.9258634746074677, "rewards/format_reward": 0.9795918464660645, "step": 5869 }, { "completion_length": 216.37754821777344, "epoch": 0.590691823899371, "grad_norm": 0.43355274200439453, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.844704031944275, "reward_std": 0.08033434115350246, "rewards/accuracy_reward": 0.8447040617465973, "rewards/format_reward": 1.0, "step": 5870 }, { "completion_length": 238.04080200195312, "epoch": 0.5907924528301887, "grad_norm": 0.7212082147598267, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6869157552719116, "reward_std": 0.20758773013949394, "rewards/accuracy_reward": 0.7073239386081696, "rewards/format_reward": 0.9795918166637421, "step": 5871 }, { "completion_length": 296.9285583496094, "epoch": 0.5908930817610063, "grad_norm": 0.48076122999191284, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.5557671785354614, "reward_std": 0.11278935521841049, "rewards/accuracy_reward": 0.5761753916740417, "rewards/format_reward": 0.9795918166637421, "step": 5872 }, { "completion_length": 228.9693832397461, "epoch": 0.5909937106918239, "grad_norm": 0.6757431030273438, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.796137809753418, "reward_std": 0.20645222067832947, "rewards/accuracy_reward": 0.8165459930896759, "rewards/format_reward": 0.9795918166637421, "step": 5873 }, { "completion_length": 276.29590606689453, "epoch": 0.5910943396226415, "grad_norm": 0.6730252504348755, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.5857958793640137, "reward_std": 0.1872134804725647, "rewards/accuracy_reward": 0.6164081394672394, "rewards/format_reward": 0.9693877398967743, "step": 5874 }, { "completion_length": 209.81632232666016, "epoch": 0.5911949685534591, "grad_norm": 1.6072373390197754, "kl": 0.11767578125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7291443943977356, "reward_std": 0.1900601014494896, "rewards/accuracy_reward": 0.7393484115600586, "rewards/format_reward": 0.9897959232330322, "step": 5875 }, { "completion_length": 191.56121826171875, "epoch": 0.5912955974842767, "grad_norm": 0.9581142663955688, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7474077939987183, "reward_std": 0.21109945327043533, "rewards/accuracy_reward": 0.767815887928009, "rewards/format_reward": 0.9795918464660645, "step": 5876 }, { "completion_length": 227.28570556640625, "epoch": 0.5913962264150944, "grad_norm": 0.5188745260238647, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7655557990074158, "reward_std": 0.11534896492958069, "rewards/accuracy_reward": 0.7757599353790283, "rewards/format_reward": 0.9897959232330322, "step": 5877 }, { "completion_length": 280.49998474121094, "epoch": 0.5914968553459119, "grad_norm": 2.3677773475646973, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.746254324913025, "reward_std": 0.1777140162885189, "rewards/accuracy_reward": 0.7564584612846375, "rewards/format_reward": 0.9897959232330322, "step": 5878 }, { "completion_length": 273.4795913696289, "epoch": 0.5915974842767295, "grad_norm": 1.2144560813903809, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7500161528587341, "reward_std": 0.1645394004881382, "rewards/accuracy_reward": 0.7602202594280243, "rewards/format_reward": 0.9897959232330322, "step": 5879 }, { "completion_length": 230.78571319580078, "epoch": 0.5916981132075472, "grad_norm": 0.9478929042816162, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.811953365802765, "reward_std": 0.1982073113322258, "rewards/accuracy_reward": 0.8323615193367004, "rewards/format_reward": 0.9795918464660645, "step": 5880 }, { "completion_length": 252.0408172607422, "epoch": 0.5917987421383648, "grad_norm": 5.283713340759277, "kl": 0.0706787109375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6353111863136292, "reward_std": 0.21457157284021378, "rewards/accuracy_reward": 0.6659235060214996, "rewards/format_reward": 0.9693877398967743, "step": 5881 }, { "completion_length": 217.1836700439453, "epoch": 0.5918993710691824, "grad_norm": 0.6569108963012695, "kl": 0.0716552734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8602120876312256, "reward_std": 0.12222596257925034, "rewards/accuracy_reward": 0.8704162240028381, "rewards/format_reward": 0.9897959232330322, "step": 5882 }, { "completion_length": 250.8571319580078, "epoch": 0.592, "grad_norm": 0.7926539778709412, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8591836094856262, "reward_std": 0.1554558426141739, "rewards/accuracy_reward": 0.8693877756595612, "rewards/format_reward": 0.9897959232330322, "step": 5883 }, { "completion_length": 291.4183654785156, "epoch": 0.5921006289308176, "grad_norm": 1.0225780010223389, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6414347887039185, "reward_std": 0.20956937223672867, "rewards/accuracy_reward": 0.672046959400177, "rewards/format_reward": 0.9693877398967743, "step": 5884 }, { "completion_length": 235.12244415283203, "epoch": 0.5922012578616352, "grad_norm": 1.239661693572998, "kl": 0.08740234375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7239215970039368, "reward_std": 0.1979161649942398, "rewards/accuracy_reward": 0.7341257631778717, "rewards/format_reward": 0.9897959232330322, "step": 5885 }, { "completion_length": 264.5306091308594, "epoch": 0.5923018867924529, "grad_norm": 0.9218624234199524, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6612966060638428, "reward_std": 0.24930530786514282, "rewards/accuracy_reward": 0.6919088959693909, "rewards/format_reward": 0.9693877398967743, "step": 5886 }, { "completion_length": 338.9081573486328, "epoch": 0.5924025157232704, "grad_norm": 0.7604973316192627, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5800454020500183, "reward_std": 0.27051951736211777, "rewards/accuracy_reward": 0.6412699818611145, "rewards/format_reward": 0.938775509595871, "step": 5887 }, { "completion_length": 272.8673400878906, "epoch": 0.592503144654088, "grad_norm": 0.6961818933486938, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.5541434288024902, "reward_std": 0.24047963321208954, "rewards/accuracy_reward": 0.5745516270399094, "rewards/format_reward": 0.9795918464660645, "step": 5888 }, { "completion_length": 210.62244415283203, "epoch": 0.5926037735849057, "grad_norm": 0.5954573154449463, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8163264393806458, "reward_std": 0.12817170843482018, "rewards/accuracy_reward": 0.8367346525192261, "rewards/format_reward": 0.9795918464660645, "step": 5889 }, { "completion_length": 269.3673324584961, "epoch": 0.5927044025157233, "grad_norm": 1.420966625213623, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.759136974811554, "reward_std": 0.17449216544628143, "rewards/accuracy_reward": 0.769340991973877, "rewards/format_reward": 0.9897959232330322, "step": 5890 }, { "completion_length": 232.58163452148438, "epoch": 0.5928050314465408, "grad_norm": 0.7635088562965393, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8204081654548645, "reward_std": 0.1511857733130455, "rewards/accuracy_reward": 0.8306122422218323, "rewards/format_reward": 0.9897959232330322, "step": 5891 }, { "completion_length": 222.27550506591797, "epoch": 0.5929056603773585, "grad_norm": 2.512291431427002, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7551182508468628, "reward_std": 0.2089470699429512, "rewards/accuracy_reward": 0.775526374578476, "rewards/format_reward": 0.9795918166637421, "step": 5892 }, { "completion_length": 228.6326446533203, "epoch": 0.5930062893081761, "grad_norm": 0.4480937421321869, "kl": 0.0682373046875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.821574330329895, "reward_std": 0.14568953588604927, "rewards/accuracy_reward": 0.8317784070968628, "rewards/format_reward": 0.9897959232330322, "step": 5893 }, { "completion_length": 266.6122360229492, "epoch": 0.5931069182389938, "grad_norm": 0.43372902274131775, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.915753424167633, "reward_std": 0.1073656789958477, "rewards/accuracy_reward": 0.9157534241676331, "rewards/format_reward": 1.0, "step": 5894 }, { "completion_length": 222.21427154541016, "epoch": 0.5932075471698113, "grad_norm": 0.6266302466392517, "kl": 0.099365234375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.80209082365036, "reward_std": 0.1674887239933014, "rewards/accuracy_reward": 0.8327030837535858, "rewards/format_reward": 0.9693877398967743, "step": 5895 }, { "completion_length": 210.64285278320312, "epoch": 0.5933081761006289, "grad_norm": 1.0503474473953247, "kl": 0.122314453125, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.776319980621338, "reward_std": 0.17296522855758667, "rewards/accuracy_reward": 0.7967281937599182, "rewards/format_reward": 0.9795918166637421, "step": 5896 }, { "completion_length": 201.53060913085938, "epoch": 0.5934088050314466, "grad_norm": 1.1445913314819336, "kl": 0.106689453125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7350292801856995, "reward_std": 0.18640728294849396, "rewards/accuracy_reward": 0.7656415104866028, "rewards/format_reward": 0.9693877398967743, "step": 5897 }, { "completion_length": 308.6938781738281, "epoch": 0.5935094339622642, "grad_norm": 0.6992928385734558, "kl": 0.1181640625, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7056379318237305, "reward_std": 0.2115214839577675, "rewards/accuracy_reward": 0.7260462045669556, "rewards/format_reward": 0.9795918166637421, "step": 5898 }, { "completion_length": 270.56121826171875, "epoch": 0.5936100628930817, "grad_norm": 0.4956836402416229, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8547722697257996, "reward_std": 0.19041300937533379, "rewards/accuracy_reward": 0.8649762868881226, "rewards/format_reward": 0.9897959232330322, "step": 5899 }, { "completion_length": 159.57142639160156, "epoch": 0.5937106918238994, "grad_norm": 0.7703999280929565, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7829087972640991, "reward_std": 0.1219082809984684, "rewards/accuracy_reward": 0.7931129634380341, "rewards/format_reward": 0.9897959232330322, "step": 5900 }, { "completion_length": 240.9897918701172, "epoch": 0.593811320754717, "grad_norm": 2.3974435329437256, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8360720872879028, "reward_std": 0.18486355245113373, "rewards/accuracy_reward": 0.8462761044502258, "rewards/format_reward": 0.9897959232330322, "step": 5901 }, { "completion_length": 219.52040100097656, "epoch": 0.5939119496855346, "grad_norm": 0.47285348176956177, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8913102746009827, "reward_std": 0.10684037581086159, "rewards/accuracy_reward": 0.9015144109725952, "rewards/format_reward": 0.9897959232330322, "step": 5902 }, { "completion_length": 221.2959213256836, "epoch": 0.5940125786163521, "grad_norm": 1.4621058702468872, "kl": 0.095458984375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8637836575508118, "reward_std": 0.13051199913024902, "rewards/accuracy_reward": 0.8739877045154572, "rewards/format_reward": 0.9897959232330322, "step": 5903 }, { "completion_length": 232.92857360839844, "epoch": 0.5941132075471698, "grad_norm": 3.9516310691833496, "kl": 0.09716796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6972825527191162, "reward_std": 0.17988641560077667, "rewards/accuracy_reward": 0.7176907062530518, "rewards/format_reward": 0.9795918166637421, "step": 5904 }, { "completion_length": 245.82653045654297, "epoch": 0.5942138364779874, "grad_norm": 0.43730428814888, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7511594891548157, "reward_std": 0.06898536905646324, "rewards/accuracy_reward": 0.7511595487594604, "rewards/format_reward": 1.0, "step": 5905 }, { "completion_length": 281.28570556640625, "epoch": 0.5943144654088051, "grad_norm": 3.3744254112243652, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6218217015266418, "reward_std": 0.2647062838077545, "rewards/accuracy_reward": 0.6218217313289642, "rewards/format_reward": 1.0, "step": 5906 }, { "completion_length": 142.6020393371582, "epoch": 0.5944150943396227, "grad_norm": 0.4013388454914093, "kl": 0.09033203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.9250099658966064, "reward_std": 0.03954796493053436, "rewards/accuracy_reward": 0.9250099658966064, "rewards/format_reward": 1.0, "step": 5907 }, { "completion_length": 282.8877410888672, "epoch": 0.5945157232704402, "grad_norm": 0.551507830619812, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.675589680671692, "reward_std": 0.17954255267977715, "rewards/accuracy_reward": 0.6959978640079498, "rewards/format_reward": 0.9795918464660645, "step": 5908 }, { "completion_length": 312.0816345214844, "epoch": 0.5946163522012579, "grad_norm": 0.8912561535835266, "kl": 0.09326171875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.772124171257019, "reward_std": 0.24192054569721222, "rewards/accuracy_reward": 0.772124171257019, "rewards/format_reward": 1.0, "step": 5909 }, { "completion_length": 264.948974609375, "epoch": 0.5947169811320755, "grad_norm": 0.5232675075531006, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6736741662025452, "reward_std": 0.12198301404714584, "rewards/accuracy_reward": 0.6940823197364807, "rewards/format_reward": 0.9795918166637421, "step": 5910 }, { "completion_length": 192.59183502197266, "epoch": 0.5948176100628931, "grad_norm": 0.6175476312637329, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.778014898300171, "reward_std": 0.14775512367486954, "rewards/accuracy_reward": 0.7780148386955261, "rewards/format_reward": 1.0, "step": 5911 }, { "completion_length": 205.75509643554688, "epoch": 0.5949182389937107, "grad_norm": 0.7336249351501465, "kl": 0.108154296875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7444856762886047, "reward_std": 0.13800623640418053, "rewards/accuracy_reward": 0.7444856464862823, "rewards/format_reward": 1.0, "step": 5912 }, { "completion_length": 209.4285659790039, "epoch": 0.5950188679245283, "grad_norm": 0.8254851698875427, "kl": 0.097900390625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7057961821556091, "reward_std": 0.11762268748134375, "rewards/accuracy_reward": 0.7160003185272217, "rewards/format_reward": 0.9897959232330322, "step": 5913 }, { "completion_length": 308.02040100097656, "epoch": 0.5951194968553459, "grad_norm": 0.7141166925430298, "kl": 0.1064453125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.6024118065834045, "reward_std": 0.13133515045046806, "rewards/accuracy_reward": 0.6024118661880493, "rewards/format_reward": 1.0, "step": 5914 }, { "completion_length": 227.2244873046875, "epoch": 0.5952201257861636, "grad_norm": 0.4053989350795746, "kl": 0.0631103515625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6869407892227173, "reward_std": 0.12888728827238083, "rewards/accuracy_reward": 0.7073489725589752, "rewards/format_reward": 0.9795918166637421, "step": 5915 }, { "completion_length": 254.52040100097656, "epoch": 0.5953207547169811, "grad_norm": 0.6768544316291809, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7011131048202515, "reward_std": 0.17545662447810173, "rewards/accuracy_reward": 0.7317253947257996, "rewards/format_reward": 0.9693877398967743, "step": 5916 }, { "completion_length": 228.27550506591797, "epoch": 0.5954213836477987, "grad_norm": 0.38777783513069153, "kl": 0.103515625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.9123459458351135, "reward_std": 0.06541236117482185, "rewards/accuracy_reward": 0.9123459458351135, "rewards/format_reward": 1.0, "step": 5917 }, { "completion_length": 236.38774871826172, "epoch": 0.5955220125786164, "grad_norm": 0.643230140209198, "kl": 0.102783203125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.5505362749099731, "reward_std": 0.1646198108792305, "rewards/accuracy_reward": 0.570944532752037, "rewards/format_reward": 0.9795918166637421, "step": 5918 }, { "completion_length": 239.82653045654297, "epoch": 0.595622641509434, "grad_norm": 0.9083629250526428, "kl": 0.11572265625, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.8095502853393555, "reward_std": 0.16165827959775925, "rewards/accuracy_reward": 0.8299584686756134, "rewards/format_reward": 0.9795918166637421, "step": 5919 }, { "completion_length": 215.448974609375, "epoch": 0.5957232704402515, "grad_norm": 0.8416833877563477, "kl": 0.11474609375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.7543657422065735, "reward_std": 0.13255042769014835, "rewards/accuracy_reward": 0.7543658316135406, "rewards/format_reward": 1.0, "step": 5920 }, { "completion_length": 250.20408630371094, "epoch": 0.5958238993710692, "grad_norm": 0.38284415006637573, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7650444507598877, "reward_std": 0.08093808405101299, "rewards/accuracy_reward": 0.7752485275268555, "rewards/format_reward": 0.9897959232330322, "step": 5921 }, { "completion_length": 184.88774871826172, "epoch": 0.5959245283018868, "grad_norm": 0.33284151554107666, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6541382670402527, "reward_std": 0.06207273853942752, "rewards/accuracy_reward": 0.6643423736095428, "rewards/format_reward": 0.9897959232330322, "step": 5922 }, { "completion_length": 199.16326141357422, "epoch": 0.5960251572327044, "grad_norm": 0.43388742208480835, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.942486047744751, "reward_std": 0.08194069936871529, "rewards/accuracy_reward": 0.9628942310810089, "rewards/format_reward": 0.9795918464660645, "step": 5923 }, { "completion_length": 231.39795684814453, "epoch": 0.596125786163522, "grad_norm": 0.6461248397827148, "kl": 0.062744140625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7393526434898376, "reward_std": 0.08504353277385235, "rewards/accuracy_reward": 0.7393527626991272, "rewards/format_reward": 1.0, "step": 5924 }, { "completion_length": 150.9285659790039, "epoch": 0.5962264150943396, "grad_norm": 1.1929484605789185, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.9693877696990967, "reward_std": 0.08099238574504852, "rewards/accuracy_reward": 0.9693877398967743, "rewards/format_reward": 1.0, "step": 5925 }, { "completion_length": 240.7448959350586, "epoch": 0.5963270440251572, "grad_norm": 0.6428646445274353, "kl": 0.09814453125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7997651100158691, "reward_std": 0.19027438759803772, "rewards/accuracy_reward": 0.8099692165851593, "rewards/format_reward": 0.9897959232330322, "step": 5926 }, { "completion_length": 180.7448959350586, "epoch": 0.5964276729559749, "grad_norm": 0.5359202027320862, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8251559138298035, "reward_std": 0.09117643907666206, "rewards/accuracy_reward": 0.835360050201416, "rewards/format_reward": 0.9897959232330322, "step": 5927 }, { "completion_length": 296.8571472167969, "epoch": 0.5965283018867924, "grad_norm": 1.1620137691497803, "kl": 0.130126953125, "learning_rate": 1e-06, "loss": 0.0052, "reward": 1.736337125301361, "reward_std": 0.2227136790752411, "rewards/accuracy_reward": 0.7771534621715546, "rewards/format_reward": 0.9591836333274841, "step": 5928 }, { "completion_length": 212.60203552246094, "epoch": 0.59662893081761, "grad_norm": 2.7391550540924072, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7758546471595764, "reward_std": 0.12180249579250813, "rewards/accuracy_reward": 0.7860588133335114, "rewards/format_reward": 0.9897959232330322, "step": 5929 }, { "completion_length": 232.4387664794922, "epoch": 0.5967295597484277, "grad_norm": 0.6549849510192871, "kl": 0.15380859375, "learning_rate": 1e-06, "loss": 0.0062, "reward": 1.6528058648109436, "reward_std": 0.1568671390414238, "rewards/accuracy_reward": 0.6630099415779114, "rewards/format_reward": 0.9897959232330322, "step": 5930 }, { "completion_length": 217.2040786743164, "epoch": 0.5968301886792453, "grad_norm": 0.901228666305542, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7687074542045593, "reward_std": 0.200294628739357, "rewards/accuracy_reward": 0.7993197441101074, "rewards/format_reward": 0.9693877398967743, "step": 5931 }, { "completion_length": 215.90816497802734, "epoch": 0.596930817610063, "grad_norm": 0.36289697885513306, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8463890552520752, "reward_std": 0.08644931018352509, "rewards/accuracy_reward": 0.8565931618213654, "rewards/format_reward": 0.9897959232330322, "step": 5932 }, { "completion_length": 272.22447967529297, "epoch": 0.5970314465408805, "grad_norm": 0.36718225479125977, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7755101919174194, "reward_std": 0.11584595590829849, "rewards/accuracy_reward": 0.8061224520206451, "rewards/format_reward": 0.9693877398967743, "step": 5933 }, { "completion_length": 229.78570556640625, "epoch": 0.5971320754716981, "grad_norm": 1.0186853408813477, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7127270698547363, "reward_std": 0.1382637619972229, "rewards/accuracy_reward": 0.7229311466217041, "rewards/format_reward": 0.9897959232330322, "step": 5934 }, { "completion_length": 194.05101776123047, "epoch": 0.5972327044025157, "grad_norm": 0.3130396902561188, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8075801134109497, "reward_std": 0.06893216818571091, "rewards/accuracy_reward": 0.8177842497825623, "rewards/format_reward": 0.9897959232330322, "step": 5935 }, { "completion_length": 196.64285278320312, "epoch": 0.5973333333333334, "grad_norm": 1.3478809595108032, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6249879002571106, "reward_std": 0.2464655414223671, "rewards/accuracy_reward": 0.6760082840919495, "rewards/format_reward": 0.9489795863628387, "step": 5936 }, { "completion_length": 254.89794921875, "epoch": 0.5974339622641509, "grad_norm": 2.8314995765686035, "kl": 0.123779296875, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.5298758149147034, "reward_std": 0.2085125669836998, "rewards/accuracy_reward": 0.5808962881565094, "rewards/format_reward": 0.9489795565605164, "step": 5937 }, { "completion_length": 207.41836547851562, "epoch": 0.5975345911949685, "grad_norm": 0.7632133364677429, "kl": 0.1318359375, "learning_rate": 1e-06, "loss": 0.0053, "reward": 1.7504332065582275, "reward_std": 0.16352254152297974, "rewards/accuracy_reward": 0.7606371641159058, "rewards/format_reward": 0.9897959232330322, "step": 5938 }, { "completion_length": 262.09183502197266, "epoch": 0.5976352201257862, "grad_norm": 4.423191070556641, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7800270318984985, "reward_std": 0.1475866436958313, "rewards/accuracy_reward": 0.8004352152347565, "rewards/format_reward": 0.9795918464660645, "step": 5939 }, { "completion_length": 237.38775634765625, "epoch": 0.5977358490566038, "grad_norm": 0.45923009514808655, "kl": 0.0731201171875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.9183672666549683, "reward_std": 0.09670460596680641, "rewards/accuracy_reward": 0.918367326259613, "rewards/format_reward": 1.0, "step": 5940 }, { "completion_length": 275.80611419677734, "epoch": 0.5978364779874213, "grad_norm": 0.5905246138572693, "kl": 0.10400390625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.6810759902000427, "reward_std": 0.2132079415023327, "rewards/accuracy_reward": 0.7116883099079132, "rewards/format_reward": 0.9693877398967743, "step": 5941 }, { "completion_length": 218.23468780517578, "epoch": 0.597937106918239, "grad_norm": 0.9218011498451233, "kl": 0.1396484375, "learning_rate": 1e-06, "loss": 0.0056, "reward": 1.682786464691162, "reward_std": 0.2389931008219719, "rewards/accuracy_reward": 0.7236028611660004, "rewards/format_reward": 0.9591836333274841, "step": 5942 }, { "completion_length": 277.33673095703125, "epoch": 0.5980377358490566, "grad_norm": 1.069922924041748, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.5538033246994019, "reward_std": 0.18592774868011475, "rewards/accuracy_reward": 0.5946196615695953, "rewards/format_reward": 0.9591836631298065, "step": 5943 }, { "completion_length": 197.448974609375, "epoch": 0.5981383647798743, "grad_norm": 1.0377016067504883, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7776644229888916, "reward_std": 0.2389235496520996, "rewards/accuracy_reward": 0.7980725467205048, "rewards/format_reward": 0.9795918464660645, "step": 5944 }, { "completion_length": 216.14285278320312, "epoch": 0.5982389937106918, "grad_norm": 0.5846149921417236, "kl": 0.14794921875, "learning_rate": 1e-06, "loss": 0.0059, "reward": 1.5287804007530212, "reward_std": 0.14758775755763054, "rewards/accuracy_reward": 0.538984477519989, "rewards/format_reward": 0.9897959232330322, "step": 5945 }, { "completion_length": 185.30611419677734, "epoch": 0.5983396226415094, "grad_norm": 0.5491967797279358, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.9145124554634094, "reward_std": 0.06877214834094048, "rewards/accuracy_reward": 0.9247165620326996, "rewards/format_reward": 0.9897959232330322, "step": 5946 }, { "completion_length": 188.12244415283203, "epoch": 0.598440251572327, "grad_norm": 1.4615094661712646, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7821891903877258, "reward_std": 0.1671786978840828, "rewards/accuracy_reward": 0.8230055570602417, "rewards/format_reward": 0.9591836631298065, "step": 5947 }, { "completion_length": 183.77550506591797, "epoch": 0.5985408805031447, "grad_norm": 0.8211162686347961, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7626673579216003, "reward_std": 0.087130943313241, "rewards/accuracy_reward": 0.7626674175262451, "rewards/format_reward": 1.0, "step": 5948 }, { "completion_length": 271.5408172607422, "epoch": 0.5986415094339622, "grad_norm": 0.8530097603797913, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6197521090507507, "reward_std": 0.23989400640130043, "rewards/accuracy_reward": 0.6503643989562988, "rewards/format_reward": 0.9693877398967743, "step": 5949 }, { "completion_length": 345.0408020019531, "epoch": 0.5987421383647799, "grad_norm": 0.5784646272659302, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.589542031288147, "reward_std": 0.21135064214468002, "rewards/accuracy_reward": 0.6303583383560181, "rewards/format_reward": 0.9591836333274841, "step": 5950 }, { "completion_length": 296.65306091308594, "epoch": 0.5988427672955975, "grad_norm": 1.2863954305648804, "kl": 0.0672607421875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5497894287109375, "reward_std": 0.26753632724285126, "rewards/accuracy_reward": 0.60080985724926, "rewards/format_reward": 0.9489795863628387, "step": 5951 }, { "completion_length": 268.8061065673828, "epoch": 0.5989433962264151, "grad_norm": 0.6092501282691956, "kl": 0.0743408203125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8541578650474548, "reward_std": 0.1256532222032547, "rewards/accuracy_reward": 0.874565988779068, "rewards/format_reward": 0.9795918464660645, "step": 5952 }, { "completion_length": 213.88774871826172, "epoch": 0.5990440251572327, "grad_norm": 0.49150750041007996, "kl": 0.1025390625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8479548692703247, "reward_std": 0.19323484599590302, "rewards/accuracy_reward": 0.8683629631996155, "rewards/format_reward": 0.9795918166637421, "step": 5953 }, { "completion_length": 259.34693908691406, "epoch": 0.5991446540880503, "grad_norm": 1.2682366371154785, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6784406304359436, "reward_std": 0.20436934381723404, "rewards/accuracy_reward": 0.7090528607368469, "rewards/format_reward": 0.9693877398967743, "step": 5954 }, { "completion_length": 311.0918273925781, "epoch": 0.5992452830188679, "grad_norm": 0.5267773270606995, "kl": 0.05078125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.8792517185211182, "reward_std": 0.15468762814998627, "rewards/accuracy_reward": 0.8894557654857635, "rewards/format_reward": 0.9897959232330322, "step": 5955 }, { "completion_length": 325.2346954345703, "epoch": 0.5993459119496856, "grad_norm": 1.0611226558685303, "kl": 0.064697265625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6641420125961304, "reward_std": 0.26711611449718475, "rewards/accuracy_reward": 0.6947542577981949, "rewards/format_reward": 0.9693877398967743, "step": 5956 }, { "completion_length": 265.3673400878906, "epoch": 0.5994465408805032, "grad_norm": 0.6299037337303162, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.5969309210777283, "reward_std": 0.25326912850141525, "rewards/accuracy_reward": 0.6581553220748901, "rewards/format_reward": 0.9387754797935486, "step": 5957 }, { "completion_length": 246.49999237060547, "epoch": 0.5995471698113207, "grad_norm": 0.49299493432044983, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6510504484176636, "reward_std": 0.15616083517670631, "rewards/accuracy_reward": 0.6612545549869537, "rewards/format_reward": 0.9897959232330322, "step": 5958 }, { "completion_length": 260.28570556640625, "epoch": 0.5996477987421384, "grad_norm": 8.824994087219238, "kl": 0.824462890625, "learning_rate": 1e-06, "loss": 0.0332, "reward": 1.7789143919944763, "reward_std": 0.2045304849743843, "rewards/accuracy_reward": 0.8095267117023468, "rewards/format_reward": 0.9693877398967743, "step": 5959 }, { "completion_length": 230.72447967529297, "epoch": 0.599748427672956, "grad_norm": 4.113089084625244, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8272108435630798, "reward_std": 0.23946811258792877, "rewards/accuracy_reward": 0.8884353339672089, "rewards/format_reward": 0.938775509595871, "step": 5960 }, { "completion_length": 205.7551040649414, "epoch": 0.5998490566037736, "grad_norm": 2.426013469696045, "kl": 0.111572265625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7323153614997864, "reward_std": 0.21523544192314148, "rewards/accuracy_reward": 0.7629275321960449, "rewards/format_reward": 0.9693877398967743, "step": 5961 }, { "completion_length": 372.7142639160156, "epoch": 0.5999496855345912, "grad_norm": 0.5074877142906189, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6705699563026428, "reward_std": 0.26934149861335754, "rewards/accuracy_reward": 0.7317944169044495, "rewards/format_reward": 0.9387754797935486, "step": 5962 }, { "completion_length": 274.83673095703125, "epoch": 0.6000503144654088, "grad_norm": 1.2644590139389038, "kl": 0.102294921875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.639452040195465, "reward_std": 0.17553190886974335, "rewards/accuracy_reward": 0.6802683919668198, "rewards/format_reward": 0.9591836631298065, "step": 5963 }, { "completion_length": 292.7040710449219, "epoch": 0.6001509433962264, "grad_norm": 0.6009355783462524, "kl": 0.106201171875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.849020779132843, "reward_std": 0.20685965567827225, "rewards/accuracy_reward": 0.8796330392360687, "rewards/format_reward": 0.9693877398967743, "step": 5964 }, { "completion_length": 251.65306091308594, "epoch": 0.6002515723270441, "grad_norm": 0.48975393176078796, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.9387754797935486, "reward_std": 0.11917255818843842, "rewards/accuracy_reward": 0.9489795863628387, "rewards/format_reward": 0.9897959232330322, "step": 5965 }, { "completion_length": 197.9693832397461, "epoch": 0.6003522012578616, "grad_norm": 0.8693367838859558, "kl": 0.1396484375, "learning_rate": 1e-06, "loss": 0.0056, "reward": 1.7718994617462158, "reward_std": 0.1741984561085701, "rewards/accuracy_reward": 0.8025117516517639, "rewards/format_reward": 0.9693877398967743, "step": 5966 }, { "completion_length": 262.3673400878906, "epoch": 0.6004528301886792, "grad_norm": 0.98127681016922, "kl": 0.0543212890625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7069764733314514, "reward_std": 0.08033622428774834, "rewards/accuracy_reward": 0.7069764733314514, "rewards/format_reward": 1.0, "step": 5967 }, { "completion_length": 257.36734771728516, "epoch": 0.6005534591194969, "grad_norm": 0.6113758087158203, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7364983558654785, "reward_std": 0.19259290490299463, "rewards/accuracy_reward": 0.7671105861663818, "rewards/format_reward": 0.9693877398967743, "step": 5968 }, { "completion_length": 275.1632537841797, "epoch": 0.6006540880503145, "grad_norm": 0.4529170095920563, "kl": 0.06689453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7747406363487244, "reward_std": 0.07187181152403355, "rewards/accuracy_reward": 0.7849446833133698, "rewards/format_reward": 0.9897959232330322, "step": 5969 }, { "completion_length": 270.5102005004883, "epoch": 0.600754716981132, "grad_norm": 0.810586154460907, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7419824600219727, "reward_std": 0.20849674195051193, "rewards/accuracy_reward": 0.7930029034614563, "rewards/format_reward": 0.9489795565605164, "step": 5970 }, { "completion_length": 298.76529693603516, "epoch": 0.6008553459119497, "grad_norm": 0.6256667375564575, "kl": 0.0748291015625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5219306349754333, "reward_std": 0.21197445690631866, "rewards/accuracy_reward": 0.5321347564458847, "rewards/format_reward": 0.9897959232330322, "step": 5971 }, { "completion_length": 293.74488830566406, "epoch": 0.6009559748427673, "grad_norm": 0.6254622340202332, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7762053608894348, "reward_std": 0.20049359649419785, "rewards/accuracy_reward": 0.8170217275619507, "rewards/format_reward": 0.9591836631298065, "step": 5972 }, { "completion_length": 217.57141876220703, "epoch": 0.6010566037735849, "grad_norm": 0.9235109090805054, "kl": 0.119873046875, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.717313289642334, "reward_std": 0.2018050104379654, "rewards/accuracy_reward": 0.7581295967102051, "rewards/format_reward": 0.9591836333274841, "step": 5973 }, { "completion_length": 272.78570556640625, "epoch": 0.6011572327044025, "grad_norm": 1.911923885345459, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7444119453430176, "reward_std": 0.15316841006278992, "rewards/accuracy_reward": 0.7648202478885651, "rewards/format_reward": 0.9795918166637421, "step": 5974 }, { "completion_length": 218.38774871826172, "epoch": 0.6012578616352201, "grad_norm": 0.790766716003418, "kl": 0.10205078125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8700897097587585, "reward_std": 0.09827693272382021, "rewards/accuracy_reward": 0.8700898289680481, "rewards/format_reward": 1.0, "step": 5975 }, { "completion_length": 230.82653045654297, "epoch": 0.6013584905660377, "grad_norm": 0.5867413878440857, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.835988461971283, "reward_std": 0.19265442341566086, "rewards/accuracy_reward": 0.8563966751098633, "rewards/format_reward": 0.9795918166637421, "step": 5976 }, { "completion_length": 280.8163146972656, "epoch": 0.6014591194968554, "grad_norm": 0.5299528241157532, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7331698536872864, "reward_std": 0.22230540961027145, "rewards/accuracy_reward": 0.7535780072212219, "rewards/format_reward": 0.9795918166637421, "step": 5977 }, { "completion_length": 256.30611419677734, "epoch": 0.6015597484276729, "grad_norm": 0.6549657583236694, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6449288725852966, "reward_std": 0.25938301533460617, "rewards/accuracy_reward": 0.6959493458271027, "rewards/format_reward": 0.9489795863628387, "step": 5978 }, { "completion_length": 220.58162689208984, "epoch": 0.6016603773584905, "grad_norm": 0.8453904390335083, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6812438368797302, "reward_std": 0.10587617382407188, "rewards/accuracy_reward": 0.6914480030536652, "rewards/format_reward": 0.9897959232330322, "step": 5979 }, { "completion_length": 257.35713958740234, "epoch": 0.6017610062893082, "grad_norm": 0.6214697957038879, "kl": 0.0614013671875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.810774028301239, "reward_std": 0.22878200560808182, "rewards/accuracy_reward": 0.8311822712421417, "rewards/format_reward": 0.9795918166637421, "step": 5980 }, { "completion_length": 233.4183578491211, "epoch": 0.6018616352201258, "grad_norm": 0.3845329284667969, "kl": 0.099853515625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.668318748474121, "reward_std": 0.09787981025874615, "rewards/accuracy_reward": 0.6887269020080566, "rewards/format_reward": 0.9795918166637421, "step": 5981 }, { "completion_length": 248.85713958740234, "epoch": 0.6019622641509434, "grad_norm": 0.7617606520652771, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8407732844352722, "reward_std": 0.1465139165520668, "rewards/accuracy_reward": 0.8611814379692078, "rewards/format_reward": 0.9795918464660645, "step": 5982 }, { "completion_length": 219.70407104492188, "epoch": 0.602062893081761, "grad_norm": 0.9772137403488159, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.761904776096344, "reward_std": 0.13270121812820435, "rewards/accuracy_reward": 0.8027210533618927, "rewards/format_reward": 0.9591836333274841, "step": 5983 }, { "completion_length": 209.23468780517578, "epoch": 0.6021635220125786, "grad_norm": 0.7463628649711609, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8983516097068787, "reward_std": 0.1803625300526619, "rewards/accuracy_reward": 0.9289638698101044, "rewards/format_reward": 0.9693877398967743, "step": 5984 }, { "completion_length": 269.1836700439453, "epoch": 0.6022641509433962, "grad_norm": 0.6010444164276123, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6966271996498108, "reward_std": 0.14828507043421268, "rewards/accuracy_reward": 0.7068313360214233, "rewards/format_reward": 0.9897959232330322, "step": 5985 }, { "completion_length": 224.23468780517578, "epoch": 0.6023647798742139, "grad_norm": 0.8922208547592163, "kl": 0.097412109375, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.883116900920868, "reward_std": 0.1201944500207901, "rewards/accuracy_reward": 0.9137291312217712, "rewards/format_reward": 0.9693877398967743, "step": 5986 }, { "completion_length": 203.30612182617188, "epoch": 0.6024654088050314, "grad_norm": 0.9045203924179077, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.9591836333274841, "reward_std": 0.1079898476600647, "rewards/accuracy_reward": 0.9795918166637421, "rewards/format_reward": 0.9795918166637421, "step": 5987 }, { "completion_length": 200.41836547851562, "epoch": 0.602566037735849, "grad_norm": 0.3892296850681305, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7746878266334534, "reward_std": 0.09666703827679157, "rewards/accuracy_reward": 0.784891813993454, "rewards/format_reward": 0.9897959232330322, "step": 5988 }, { "completion_length": 248.27550506591797, "epoch": 0.6026666666666667, "grad_norm": 1.9588044881820679, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.734458327293396, "reward_std": 0.3279924541711807, "rewards/accuracy_reward": 0.7752746939659119, "rewards/format_reward": 0.9591836631298065, "step": 5989 }, { "completion_length": 228.1734619140625, "epoch": 0.6027672955974843, "grad_norm": 0.8735046982765198, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.644686758518219, "reward_std": 0.17784901335835457, "rewards/accuracy_reward": 0.6752991676330566, "rewards/format_reward": 0.9693877398967743, "step": 5990 }, { "completion_length": 242.8673439025879, "epoch": 0.6028679245283018, "grad_norm": 1.748355746269226, "kl": 0.127685546875, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.744547963142395, "reward_std": 0.2514600530266762, "rewards/accuracy_reward": 0.795568436384201, "rewards/format_reward": 0.9489795565605164, "step": 5991 }, { "completion_length": 299.55101776123047, "epoch": 0.6029685534591195, "grad_norm": 0.8121449947357178, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7594751715660095, "reward_std": 0.17494549602270126, "rewards/accuracy_reward": 0.7798833847045898, "rewards/format_reward": 0.9795918464660645, "step": 5992 }, { "completion_length": 240.06121826171875, "epoch": 0.6030691823899371, "grad_norm": 5.986726760864258, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.82211172580719, "reward_std": 0.17226728051900864, "rewards/accuracy_reward": 0.8425199389457703, "rewards/format_reward": 0.9795918464660645, "step": 5993 }, { "completion_length": 200.7653045654297, "epoch": 0.6031698113207548, "grad_norm": 1.0999090671539307, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7109462022781372, "reward_std": 0.1787765622138977, "rewards/accuracy_reward": 0.7109462022781372, "rewards/format_reward": 1.0, "step": 5994 }, { "completion_length": 217.7244873046875, "epoch": 0.6032704402515723, "grad_norm": 1.803330898284912, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8673468828201294, "reward_std": 0.13821138441562653, "rewards/accuracy_reward": 0.8775509893894196, "rewards/format_reward": 0.9897959232330322, "step": 5995 }, { "completion_length": 169.16326141357422, "epoch": 0.6033710691823899, "grad_norm": 1.3056361675262451, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8156906366348267, "reward_std": 0.15971661359071732, "rewards/accuracy_reward": 0.8156906068325043, "rewards/format_reward": 1.0, "step": 5996 }, { "completion_length": 292.6326446533203, "epoch": 0.6034716981132076, "grad_norm": 0.8955829739570618, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8195011019706726, "reward_std": 0.2647539898753166, "rewards/accuracy_reward": 0.8399092853069305, "rewards/format_reward": 0.9795918464660645, "step": 5997 }, { "completion_length": 209.13265228271484, "epoch": 0.6035723270440252, "grad_norm": 0.471613347530365, "kl": 0.066162109375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8731777667999268, "reward_std": 0.14406796917319298, "rewards/accuracy_reward": 0.8833818733692169, "rewards/format_reward": 0.9897959232330322, "step": 5998 }, { "completion_length": 207.21428680419922, "epoch": 0.6036729559748427, "grad_norm": 0.6138791441917419, "kl": 0.1175537109375, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7275614738464355, "reward_std": 0.14159894734621048, "rewards/accuracy_reward": 0.7479696571826935, "rewards/format_reward": 0.9795918166637421, "step": 5999 }, { "completion_length": 292.61224365234375, "epoch": 0.6037735849056604, "grad_norm": 0.5625991821289062, "kl": 0.0716552734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6908285021781921, "reward_std": 0.1368604078888893, "rewards/accuracy_reward": 0.7010325491428375, "rewards/format_reward": 0.9897959232330322, "step": 6000 }, { "completion_length": 182.02040100097656, "epoch": 0.603874213836478, "grad_norm": 4.912395477294922, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6783097386360168, "reward_std": 0.1571008376777172, "rewards/accuracy_reward": 0.6885138154029846, "rewards/format_reward": 0.9897959232330322, "step": 6001 }, { "completion_length": 285.86734771728516, "epoch": 0.6039748427672956, "grad_norm": 0.4653298556804657, "kl": 0.0765380859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.9522778987884521, "reward_std": 0.10112594813108444, "rewards/accuracy_reward": 0.9522778689861298, "rewards/format_reward": 1.0, "step": 6002 }, { "completion_length": 224.05101776123047, "epoch": 0.6040754716981132, "grad_norm": 0.47652196884155273, "kl": 0.0643310546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7866469621658325, "reward_std": 0.1552523598074913, "rewards/accuracy_reward": 0.7968510687351227, "rewards/format_reward": 0.9897959232330322, "step": 6003 }, { "completion_length": 237.1836700439453, "epoch": 0.6041761006289308, "grad_norm": 1.9769220352172852, "kl": 0.118408203125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.740886926651001, "reward_std": 0.1835760772228241, "rewards/accuracy_reward": 0.740886926651001, "rewards/format_reward": 1.0, "step": 6004 }, { "completion_length": 202.52040100097656, "epoch": 0.6042767295597484, "grad_norm": 0.3598327934741974, "kl": 0.1142578125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.8512678146362305, "reward_std": 0.09618107974529266, "rewards/accuracy_reward": 0.8716759085655212, "rewards/format_reward": 0.9795918166637421, "step": 6005 }, { "completion_length": 189.67346954345703, "epoch": 0.6043773584905661, "grad_norm": 0.6950569152832031, "kl": 0.14892578125, "learning_rate": 1e-06, "loss": 0.006, "reward": 1.8077922463417053, "reward_std": 0.07138044387102127, "rewards/accuracy_reward": 0.8077922463417053, "rewards/format_reward": 1.0, "step": 6006 }, { "completion_length": 275.74488830566406, "epoch": 0.6044779874213837, "grad_norm": 0.9423937797546387, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7582736015319824, "reward_std": 0.2089947983622551, "rewards/accuracy_reward": 0.7684777975082397, "rewards/format_reward": 0.9897959232330322, "step": 6007 }, { "completion_length": 262.04080963134766, "epoch": 0.6045786163522012, "grad_norm": 0.3708828091621399, "kl": 0.09521484375, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7236239910125732, "reward_std": 0.04484115540981293, "rewards/accuracy_reward": 0.7236239910125732, "rewards/format_reward": 1.0, "step": 6008 }, { "completion_length": 214.49999237060547, "epoch": 0.6046792452830189, "grad_norm": 1.4585397243499756, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8571428060531616, "reward_std": 0.21920375525951385, "rewards/accuracy_reward": 0.8877550959587097, "rewards/format_reward": 0.9693877398967743, "step": 6009 }, { "completion_length": 228.35713958740234, "epoch": 0.6047798742138365, "grad_norm": 0.7503620982170105, "kl": 0.0706787109375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6848540306091309, "reward_std": 0.22508274763822556, "rewards/accuracy_reward": 0.7052621990442276, "rewards/format_reward": 0.9795918464660645, "step": 6010 }, { "completion_length": 189.9081573486328, "epoch": 0.6048805031446541, "grad_norm": 1.3417599201202393, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7719500064849854, "reward_std": 0.13758005946874619, "rewards/accuracy_reward": 0.7719500064849854, "rewards/format_reward": 1.0, "step": 6011 }, { "completion_length": 273.4795913696289, "epoch": 0.6049811320754717, "grad_norm": 0.5912744402885437, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8948412537574768, "reward_std": 0.1333853378891945, "rewards/accuracy_reward": 0.9152494072914124, "rewards/format_reward": 0.9795918166637421, "step": 6012 }, { "completion_length": 298.7653045654297, "epoch": 0.6050817610062893, "grad_norm": 0.8902661204338074, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6336037516593933, "reward_std": 0.22903793305158615, "rewards/accuracy_reward": 0.6336038708686829, "rewards/format_reward": 1.0, "step": 6013 }, { "completion_length": 231.88775634765625, "epoch": 0.6051823899371069, "grad_norm": 0.5474696159362793, "kl": 0.0762939453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.836734652519226, "reward_std": 0.18555310368537903, "rewards/accuracy_reward": 0.8469387590885162, "rewards/format_reward": 0.9897959232330322, "step": 6014 }, { "completion_length": 149.12244415283203, "epoch": 0.6052830188679246, "grad_norm": 0.5622692108154297, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.857823133468628, "reward_std": 0.10786810517311096, "rewards/accuracy_reward": 0.8578231036663055, "rewards/format_reward": 1.0, "step": 6015 }, { "completion_length": 280.56121826171875, "epoch": 0.6053836477987421, "grad_norm": 1.08342707157135, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.694557785987854, "reward_std": 0.12951361387968063, "rewards/accuracy_reward": 0.7047618925571442, "rewards/format_reward": 0.9897959232330322, "step": 6016 }, { "completion_length": 261.4183578491211, "epoch": 0.6054842767295597, "grad_norm": 0.6584013104438782, "kl": 0.10693359375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.769495666027069, "reward_std": 0.13342096284031868, "rewards/accuracy_reward": 0.7796997129917145, "rewards/format_reward": 0.9897959232330322, "step": 6017 }, { "completion_length": 235.80611419677734, "epoch": 0.6055849056603774, "grad_norm": 0.9916936755180359, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6861888766288757, "reward_std": 0.16599568352103233, "rewards/accuracy_reward": 0.6861888766288757, "rewards/format_reward": 1.0, "step": 6018 }, { "completion_length": 205.6836700439453, "epoch": 0.605685534591195, "grad_norm": 0.704799234867096, "kl": 0.1240234375, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.6552875638008118, "reward_std": 0.1412128284573555, "rewards/accuracy_reward": 0.6654915809631348, "rewards/format_reward": 0.9897959232330322, "step": 6019 }, { "completion_length": 264.11224365234375, "epoch": 0.6057861635220125, "grad_norm": 0.8707624673843384, "kl": 0.115478515625, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.6154322624206543, "reward_std": 0.24243973195552826, "rewards/accuracy_reward": 0.6562486290931702, "rewards/format_reward": 0.9591836631298065, "step": 6020 }, { "completion_length": 271.3673400878906, "epoch": 0.6058867924528302, "grad_norm": 0.532130241394043, "kl": 0.0606689453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.691428542137146, "reward_std": 0.12717891857028008, "rewards/accuracy_reward": 0.691428542137146, "rewards/format_reward": 1.0, "step": 6021 }, { "completion_length": 193.78570556640625, "epoch": 0.6059874213836478, "grad_norm": 0.7831104397773743, "kl": 0.0650634765625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8666666746139526, "reward_std": 0.040585883893072605, "rewards/accuracy_reward": 0.8666666448116302, "rewards/format_reward": 1.0, "step": 6022 }, { "completion_length": 244.52040100097656, "epoch": 0.6060880503144654, "grad_norm": 0.8616753816604614, "kl": 0.1103515625, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8251700401306152, "reward_std": 0.20376460254192352, "rewards/accuracy_reward": 0.8455782234668732, "rewards/format_reward": 0.9795918166637421, "step": 6023 }, { "completion_length": 182.53060913085938, "epoch": 0.606188679245283, "grad_norm": 1.00408136844635, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8596938252449036, "reward_std": 0.07303375005722046, "rewards/accuracy_reward": 0.8698979616165161, "rewards/format_reward": 0.9897959232330322, "step": 6024 }, { "completion_length": 252.17346954345703, "epoch": 0.6062893081761006, "grad_norm": 0.9782642126083374, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7066892981529236, "reward_std": 0.2943544387817383, "rewards/accuracy_reward": 0.7577097117900848, "rewards/format_reward": 0.9489795565605164, "step": 6025 }, { "completion_length": 231.02040100097656, "epoch": 0.6063899371069182, "grad_norm": 0.5947766304016113, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7180765271186829, "reward_std": 0.1581847257912159, "rewards/accuracy_reward": 0.7282806634902954, "rewards/format_reward": 0.9897959232330322, "step": 6026 }, { "completion_length": 261.1938781738281, "epoch": 0.6064905660377359, "grad_norm": 1.326460599899292, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7320725321769714, "reward_std": 0.2133791223168373, "rewards/accuracy_reward": 0.7320726215839386, "rewards/format_reward": 1.0, "step": 6027 }, { "completion_length": 237.9897918701172, "epoch": 0.6065911949685534, "grad_norm": 1.131155014038086, "kl": 0.10791015625, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.5264084935188293, "reward_std": 0.267286766320467, "rewards/accuracy_reward": 0.5468166619539261, "rewards/format_reward": 0.9795918464660645, "step": 6028 }, { "completion_length": 203.37754821777344, "epoch": 0.606691823899371, "grad_norm": 4.44467830657959, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8265305757522583, "reward_std": 0.08099238574504852, "rewards/accuracy_reward": 0.8265305757522583, "rewards/format_reward": 1.0, "step": 6029 }, { "completion_length": 204.1326446533203, "epoch": 0.6067924528301887, "grad_norm": 63.06987762451172, "kl": 1.761962890625, "learning_rate": 1e-06, "loss": 0.0707, "reward": 1.651311993598938, "reward_std": 0.1388365477323532, "rewards/accuracy_reward": 0.6717200875282288, "rewards/format_reward": 0.9795918464660645, "step": 6030 }, { "completion_length": 215.77550506591797, "epoch": 0.6068930817610063, "grad_norm": 1.022857427597046, "kl": 0.13623046875, "learning_rate": 1e-06, "loss": 0.0054, "reward": 1.71816748380661, "reward_std": 0.1979491338133812, "rewards/accuracy_reward": 0.748779833316803, "rewards/format_reward": 0.9693877398967743, "step": 6031 }, { "completion_length": 262.33673095703125, "epoch": 0.606993710691824, "grad_norm": 0.39041751623153687, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6683363914489746, "reward_std": 0.07623200863599777, "rewards/accuracy_reward": 0.668336421251297, "rewards/format_reward": 1.0, "step": 6032 }, { "completion_length": 230.2346954345703, "epoch": 0.6070943396226415, "grad_norm": 2.29557204246521, "kl": 0.126708984375, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.7508897185325623, "reward_std": 0.17558977007865906, "rewards/accuracy_reward": 0.7508897483348846, "rewards/format_reward": 1.0, "step": 6033 }, { "completion_length": 219.77550506591797, "epoch": 0.6071949685534591, "grad_norm": 0.8654146790504456, "kl": 0.096923828125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6591836214065552, "reward_std": 0.1515774317085743, "rewards/accuracy_reward": 0.6591836512088776, "rewards/format_reward": 1.0, "step": 6034 }, { "completion_length": 212.4897918701172, "epoch": 0.6072955974842768, "grad_norm": 1.1617618799209595, "kl": 0.12451171875, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.7223926186561584, "reward_std": 0.2687901332974434, "rewards/accuracy_reward": 0.7734130918979645, "rewards/format_reward": 0.9489795565605164, "step": 6035 }, { "completion_length": 208.03060913085938, "epoch": 0.6073962264150944, "grad_norm": 0.39461445808410645, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.5910395979881287, "reward_std": 0.07496274914592505, "rewards/accuracy_reward": 0.601243644952774, "rewards/format_reward": 0.9897959232330322, "step": 6036 }, { "completion_length": 287.37754821777344, "epoch": 0.6074968553459119, "grad_norm": 0.5013609528541565, "kl": 0.0755615234375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.686300277709961, "reward_std": 0.08556608110666275, "rewards/accuracy_reward": 0.6863002181053162, "rewards/format_reward": 1.0, "step": 6037 }, { "completion_length": 195.1326446533203, "epoch": 0.6075974842767295, "grad_norm": 4.90398645401001, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8102040886878967, "reward_std": 0.1718180552124977, "rewards/accuracy_reward": 0.8306122422218323, "rewards/format_reward": 0.9795918166637421, "step": 6038 }, { "completion_length": 248.82653045654297, "epoch": 0.6076981132075472, "grad_norm": 0.7739532589912415, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7625083923339844, "reward_std": 0.1489960364997387, "rewards/accuracy_reward": 0.7727124691009521, "rewards/format_reward": 0.9897959232330322, "step": 6039 }, { "completion_length": 246.82652282714844, "epoch": 0.6077987421383648, "grad_norm": 0.5926805734634399, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8326530456542969, "reward_std": 0.0827974621206522, "rewards/accuracy_reward": 0.8428571224212646, "rewards/format_reward": 0.9897959232330322, "step": 6040 }, { "completion_length": 177.28571319580078, "epoch": 0.6078993710691823, "grad_norm": 1.0431480407714844, "kl": 0.186279296875, "learning_rate": 1e-06, "loss": 0.0075, "reward": 1.8080174922943115, "reward_std": 0.19555514305830002, "rewards/accuracy_reward": 0.8284256458282471, "rewards/format_reward": 0.9795918464660645, "step": 6041 }, { "completion_length": 281.2142791748047, "epoch": 0.608, "grad_norm": 0.827853262424469, "kl": 0.100830078125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6682971119880676, "reward_std": 0.1650717929005623, "rewards/accuracy_reward": 0.6989094018936157, "rewards/format_reward": 0.9693877398967743, "step": 6042 }, { "completion_length": 312.95916748046875, "epoch": 0.6081006289308176, "grad_norm": 0.4381825625896454, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7329160571098328, "reward_std": 0.10517457872629166, "rewards/accuracy_reward": 0.7329160571098328, "rewards/format_reward": 1.0, "step": 6043 }, { "completion_length": 220.11223602294922, "epoch": 0.6082012578616353, "grad_norm": 1.2199227809906006, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7176251411437988, "reward_std": 0.1491300342604518, "rewards/accuracy_reward": 0.7176252007484436, "rewards/format_reward": 1.0, "step": 6044 }, { "completion_length": 186.69387435913086, "epoch": 0.6083018867924528, "grad_norm": 0.2536238431930542, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8367164134979248, "reward_std": 0.03373571112751961, "rewards/accuracy_reward": 0.8367164433002472, "rewards/format_reward": 1.0, "step": 6045 }, { "completion_length": 220.9285659790039, "epoch": 0.6084025157232704, "grad_norm": 2.979707717895508, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7532467246055603, "reward_std": 0.2285320684313774, "rewards/accuracy_reward": 0.7736549079418182, "rewards/format_reward": 0.9795918166637421, "step": 6046 }, { "completion_length": 292.88775634765625, "epoch": 0.6085031446540881, "grad_norm": 0.3506110906600952, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8210967779159546, "reward_std": 0.16024912148714066, "rewards/accuracy_reward": 0.8619131743907928, "rewards/format_reward": 0.9591836631298065, "step": 6047 }, { "completion_length": 176.33673095703125, "epoch": 0.6086037735849057, "grad_norm": 0.7171962857246399, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7662800550460815, "reward_std": 0.11778145655989647, "rewards/accuracy_reward": 0.7662800848484039, "rewards/format_reward": 1.0, "step": 6048 }, { "completion_length": 282.81632232666016, "epoch": 0.6087044025157232, "grad_norm": 0.5494193434715271, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7351660132408142, "reward_std": 0.13885286077857018, "rewards/accuracy_reward": 0.7351659536361694, "rewards/format_reward": 1.0, "step": 6049 }, { "completion_length": 200.86734008789062, "epoch": 0.6088050314465409, "grad_norm": 0.6460564136505127, "kl": 0.0594482421875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7799216508865356, "reward_std": 0.16047640144824982, "rewards/accuracy_reward": 0.8003298044204712, "rewards/format_reward": 0.9795918166637421, "step": 6050 }, { "completion_length": 186.58162689208984, "epoch": 0.6089056603773585, "grad_norm": 0.5680237412452698, "kl": 0.0631103515625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7972601652145386, "reward_std": 0.07948786625638604, "rewards/accuracy_reward": 0.797260195016861, "rewards/format_reward": 1.0, "step": 6051 }, { "completion_length": 200.25509643554688, "epoch": 0.6090062893081761, "grad_norm": 0.8068789839744568, "kl": 0.094970703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7305415272712708, "reward_std": 0.13109403103590012, "rewards/accuracy_reward": 0.7407456636428833, "rewards/format_reward": 0.9897959232330322, "step": 6052 }, { "completion_length": 230.6938705444336, "epoch": 0.6091069182389937, "grad_norm": 0.5874688625335693, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7022616267204285, "reward_std": 0.12164409831166267, "rewards/accuracy_reward": 0.7124657332897186, "rewards/format_reward": 0.9897959232330322, "step": 6053 }, { "completion_length": 240.69387817382812, "epoch": 0.6092075471698113, "grad_norm": 0.7880886793136597, "kl": 0.0987548828125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7399210333824158, "reward_std": 0.0632191002368927, "rewards/accuracy_reward": 0.7399210929870605, "rewards/format_reward": 1.0, "step": 6054 }, { "completion_length": 246.91836547851562, "epoch": 0.6093081761006289, "grad_norm": 0.7768402099609375, "kl": 0.09912109375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7878066301345825, "reward_std": 0.14910965040326118, "rewards/accuracy_reward": 0.7980107069015503, "rewards/format_reward": 0.9897959232330322, "step": 6055 }, { "completion_length": 238.68366241455078, "epoch": 0.6094088050314466, "grad_norm": 0.7601077556610107, "kl": 0.12939453125, "learning_rate": 1e-06, "loss": 0.0052, "reward": 1.7123486995697021, "reward_std": 0.23602311313152313, "rewards/accuracy_reward": 0.7225527763366699, "rewards/format_reward": 0.9897959232330322, "step": 6056 }, { "completion_length": 165.6836700439453, "epoch": 0.6095094339622642, "grad_norm": 2.868062734603882, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8129251599311829, "reward_std": 0.17549104988574982, "rewards/accuracy_reward": 0.8435373902320862, "rewards/format_reward": 0.9693877398967743, "step": 6057 }, { "completion_length": 251.2448959350586, "epoch": 0.6096100628930817, "grad_norm": 0.5056284666061401, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.780612289905548, "reward_std": 0.06495761126279831, "rewards/accuracy_reward": 0.7806122303009033, "rewards/format_reward": 1.0, "step": 6058 }, { "completion_length": 215.2959213256836, "epoch": 0.6097106918238994, "grad_norm": 1.1353555917739868, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.765127182006836, "reward_std": 0.14608231745660305, "rewards/accuracy_reward": 0.7753312885761261, "rewards/format_reward": 0.9897959232330322, "step": 6059 }, { "completion_length": 253.18367767333984, "epoch": 0.609811320754717, "grad_norm": 0.8666407465934753, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7234644293785095, "reward_std": 0.1843279004096985, "rewards/accuracy_reward": 0.764280766248703, "rewards/format_reward": 0.9591836631298065, "step": 6060 }, { "completion_length": 241.39794921875, "epoch": 0.6099119496855346, "grad_norm": 0.6151794195175171, "kl": 0.09375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.5842307209968567, "reward_std": 0.12797271460294724, "rewards/accuracy_reward": 0.5842308402061462, "rewards/format_reward": 1.0, "step": 6061 }, { "completion_length": 249.06122589111328, "epoch": 0.6100125786163522, "grad_norm": 0.4929678440093994, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.935374140739441, "reward_std": 0.04499576613306999, "rewards/accuracy_reward": 0.9455782175064087, "rewards/format_reward": 0.9897959232330322, "step": 6062 }, { "completion_length": 223.7040786743164, "epoch": 0.6101132075471698, "grad_norm": 0.5670861005783081, "kl": 0.113525390625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7958255410194397, "reward_std": 0.15129338204860687, "rewards/accuracy_reward": 0.8060296773910522, "rewards/format_reward": 0.9897959232330322, "step": 6063 }, { "completion_length": 251.4183578491211, "epoch": 0.6102138364779874, "grad_norm": 2.6839683055877686, "kl": 0.0606689453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.869180977344513, "reward_std": 0.13701802492141724, "rewards/accuracy_reward": 0.8691810369491577, "rewards/format_reward": 1.0, "step": 6064 }, { "completion_length": 265.2550964355469, "epoch": 0.6103144654088051, "grad_norm": 0.7163942456245422, "kl": 0.0694580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7346938252449036, "reward_std": 0.20802105963230133, "rewards/accuracy_reward": 0.7448979616165161, "rewards/format_reward": 0.9897959232330322, "step": 6065 }, { "completion_length": 243.2959213256836, "epoch": 0.6104150943396226, "grad_norm": 3.2432003021240234, "kl": 0.107177734375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.6085209846496582, "reward_std": 0.16041939705610275, "rewards/accuracy_reward": 0.6289292573928833, "rewards/format_reward": 0.9795918166637421, "step": 6066 }, { "completion_length": 272.9081573486328, "epoch": 0.6105157232704402, "grad_norm": 0.6861068606376648, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8311617374420166, "reward_std": 0.1985357105731964, "rewards/accuracy_reward": 0.8515698313713074, "rewards/format_reward": 0.9795918464660645, "step": 6067 }, { "completion_length": 212.09183502197266, "epoch": 0.6106163522012579, "grad_norm": 0.9336445331573486, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8184807300567627, "reward_std": 0.09905437380075455, "rewards/accuracy_reward": 0.8286848068237305, "rewards/format_reward": 0.9897959232330322, "step": 6068 }, { "completion_length": 299.4285583496094, "epoch": 0.6107169811320755, "grad_norm": 0.4544377326965332, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.667573630809784, "reward_std": 0.12351282313466072, "rewards/accuracy_reward": 0.6675736904144287, "rewards/format_reward": 1.0, "step": 6069 }, { "completion_length": 249.6530532836914, "epoch": 0.610817610062893, "grad_norm": 0.4826226532459259, "kl": 0.0689697265625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.70982164144516, "reward_std": 0.11745380982756615, "rewards/accuracy_reward": 0.7098216116428375, "rewards/format_reward": 1.0, "step": 6070 }, { "completion_length": 249.85714721679688, "epoch": 0.6109182389937107, "grad_norm": 0.717035174369812, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.780842125415802, "reward_std": 0.1523466594517231, "rewards/accuracy_reward": 0.7910462021827698, "rewards/format_reward": 0.9897959232330322, "step": 6071 }, { "completion_length": 206.08162689208984, "epoch": 0.6110188679245283, "grad_norm": 0.790810763835907, "kl": 0.0963134765625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7348882555961609, "reward_std": 0.12577837705612183, "rewards/accuracy_reward": 0.7348882555961609, "rewards/format_reward": 1.0, "step": 6072 }, { "completion_length": 239.448974609375, "epoch": 0.611119496855346, "grad_norm": 0.6340769529342651, "kl": 0.0614013671875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.731709897518158, "reward_std": 0.12168751284480095, "rewards/accuracy_reward": 0.731709897518158, "rewards/format_reward": 1.0, "step": 6073 }, { "completion_length": 226.87754821777344, "epoch": 0.6112201257861635, "grad_norm": 1.2435054779052734, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6159642934799194, "reward_std": 0.1447855681180954, "rewards/accuracy_reward": 0.6159642934799194, "rewards/format_reward": 1.0, "step": 6074 }, { "completion_length": 192.9897918701172, "epoch": 0.6113207547169811, "grad_norm": 0.8602598309516907, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8292136192321777, "reward_std": 0.15595686808228493, "rewards/accuracy_reward": 0.8700299263000488, "rewards/format_reward": 0.9591836631298065, "step": 6075 }, { "completion_length": 189.4591827392578, "epoch": 0.6114213836477987, "grad_norm": 0.9703094363212585, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8337219953536987, "reward_std": 0.17432914674282074, "rewards/accuracy_reward": 0.8541301786899567, "rewards/format_reward": 0.9795918464660645, "step": 6076 }, { "completion_length": 253.33673095703125, "epoch": 0.6115220125786164, "grad_norm": 1.4271141290664673, "kl": 0.0775146484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.594104290008545, "reward_std": 0.13075865805149078, "rewards/accuracy_reward": 0.6043083816766739, "rewards/format_reward": 0.9897959232330322, "step": 6077 }, { "completion_length": 255.44898223876953, "epoch": 0.611622641509434, "grad_norm": 0.7576304078102112, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.759510099887848, "reward_std": 0.14219869300723076, "rewards/accuracy_reward": 0.7595101594924927, "rewards/format_reward": 1.0, "step": 6078 }, { "completion_length": 198.29591369628906, "epoch": 0.6117232704402515, "grad_norm": 1.6000789403915405, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8775509595870972, "reward_std": 0.18435019254684448, "rewards/accuracy_reward": 0.9081632494926453, "rewards/format_reward": 0.9693877398967743, "step": 6079 }, { "completion_length": 253.97958374023438, "epoch": 0.6118238993710692, "grad_norm": 0.7566680312156677, "kl": 0.116943359375, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7113890647888184, "reward_std": 0.16287941858172417, "rewards/accuracy_reward": 0.7420012950897217, "rewards/format_reward": 0.9693877398967743, "step": 6080 }, { "completion_length": 239.7653045654297, "epoch": 0.6119245283018868, "grad_norm": 1.0956631898880005, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.575551450252533, "reward_std": 0.14838356897234917, "rewards/accuracy_reward": 0.5755514204502106, "rewards/format_reward": 1.0, "step": 6081 }, { "completion_length": 206.67346954345703, "epoch": 0.6120251572327045, "grad_norm": 0.8533883690834045, "kl": 0.058837890625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6774672269821167, "reward_std": 0.195403590798378, "rewards/accuracy_reward": 0.6876713335514069, "rewards/format_reward": 0.9897959232330322, "step": 6082 }, { "completion_length": 293.0918197631836, "epoch": 0.612125786163522, "grad_norm": 0.4485761225223541, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7480552792549133, "reward_std": 0.19181759282946587, "rewards/accuracy_reward": 0.7684633731842041, "rewards/format_reward": 0.9795918464660645, "step": 6083 }, { "completion_length": 217.32652282714844, "epoch": 0.6122264150943396, "grad_norm": 0.8861916065216064, "kl": 0.093994140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8720468878746033, "reward_std": 0.17375333607196808, "rewards/accuracy_reward": 0.8822510838508606, "rewards/format_reward": 0.9897959232330322, "step": 6084 }, { "completion_length": 228.51020050048828, "epoch": 0.6123270440251573, "grad_norm": 2.778019905090332, "kl": 0.0965576171875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8676521182060242, "reward_std": 0.10059547610580921, "rewards/accuracy_reward": 0.867652177810669, "rewards/format_reward": 1.0, "step": 6085 }, { "completion_length": 197.64285278320312, "epoch": 0.6124276729559749, "grad_norm": 0.9735540151596069, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6009476780891418, "reward_std": 0.09902568534016609, "rewards/accuracy_reward": 0.6213558614253998, "rewards/format_reward": 0.9795918464660645, "step": 6086 }, { "completion_length": 254.49999237060547, "epoch": 0.6125283018867924, "grad_norm": 0.7329344749450684, "kl": 0.0986328125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.705786943435669, "reward_std": 0.22036480531096458, "rewards/accuracy_reward": 0.7466033101081848, "rewards/format_reward": 0.9591836631298065, "step": 6087 }, { "completion_length": 225.03060913085938, "epoch": 0.61262893081761, "grad_norm": 0.6631953716278076, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.9428570866584778, "reward_std": 0.09807625412940979, "rewards/accuracy_reward": 0.9428571462631226, "rewards/format_reward": 1.0, "step": 6088 }, { "completion_length": 193.65306091308594, "epoch": 0.6127295597484277, "grad_norm": 0.9196004867553711, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8680115342140198, "reward_std": 0.1714090332388878, "rewards/accuracy_reward": 0.8986237943172455, "rewards/format_reward": 0.9693877398967743, "step": 6089 }, { "completion_length": 232.67346954345703, "epoch": 0.6128301886792453, "grad_norm": 0.923777163028717, "kl": 0.108642578125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.767369270324707, "reward_std": 0.25804493576288223, "rewards/accuracy_reward": 0.7673693597316742, "rewards/format_reward": 1.0, "step": 6090 }, { "completion_length": 228.68366241455078, "epoch": 0.6129308176100629, "grad_norm": 0.6007865071296692, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.693173587322235, "reward_std": 0.17961487546563148, "rewards/accuracy_reward": 0.7135818302631378, "rewards/format_reward": 0.9795918166637421, "step": 6091 }, { "completion_length": 193.77550506591797, "epoch": 0.6130314465408805, "grad_norm": 0.5606977343559265, "kl": 0.119140625, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.781908392906189, "reward_std": 0.15699896961450577, "rewards/accuracy_reward": 0.7921124994754791, "rewards/format_reward": 0.9897959232330322, "step": 6092 }, { "completion_length": 312.2958984375, "epoch": 0.6131320754716981, "grad_norm": 0.5366261005401611, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6838451027870178, "reward_std": 0.10901672765612602, "rewards/accuracy_reward": 0.6940492391586304, "rewards/format_reward": 0.9897959232330322, "step": 6093 }, { "completion_length": 199.6734619140625, "epoch": 0.6132327044025158, "grad_norm": 0.6409168839454651, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7161409854888916, "reward_std": 0.09851211309432983, "rewards/accuracy_reward": 0.726345032453537, "rewards/format_reward": 0.9897959232330322, "step": 6094 }, { "completion_length": 243.97958374023438, "epoch": 0.6133333333333333, "grad_norm": 0.5947122573852539, "kl": 0.094970703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7522915601730347, "reward_std": 0.09454722702503204, "rewards/accuracy_reward": 0.7624956965446472, "rewards/format_reward": 0.9897959232330322, "step": 6095 }, { "completion_length": 249.6734619140625, "epoch": 0.6134339622641509, "grad_norm": 0.7871359586715698, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6873445510864258, "reward_std": 0.12212162837386131, "rewards/accuracy_reward": 0.6873446106910706, "rewards/format_reward": 1.0, "step": 6096 }, { "completion_length": 240.73468780517578, "epoch": 0.6135345911949686, "grad_norm": 0.8083881139755249, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.811250627040863, "reward_std": 0.18261459469795227, "rewards/accuracy_reward": 0.8112506866455078, "rewards/format_reward": 1.0, "step": 6097 }, { "completion_length": 234.23468780517578, "epoch": 0.6136352201257862, "grad_norm": 0.6771810054779053, "kl": 0.104736328125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7874541282653809, "reward_std": 0.0908166877925396, "rewards/accuracy_reward": 0.7874541878700256, "rewards/format_reward": 1.0, "step": 6098 }, { "completion_length": 259.27550506591797, "epoch": 0.6137358490566037, "grad_norm": 0.3136073052883148, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8567275404930115, "reward_std": 0.05419532302767038, "rewards/accuracy_reward": 0.8669316470623016, "rewards/format_reward": 0.9897959232330322, "step": 6099 }, { "completion_length": 196.55101776123047, "epoch": 0.6138364779874214, "grad_norm": 0.7010274529457092, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7570048570632935, "reward_std": 0.12262232229113579, "rewards/accuracy_reward": 0.7672089338302612, "rewards/format_reward": 0.9897959232330322, "step": 6100 }, { "completion_length": 358.27549743652344, "epoch": 0.613937106918239, "grad_norm": 0.7982661128044128, "kl": 0.099365234375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.572050392627716, "reward_std": 0.15570640936493874, "rewards/accuracy_reward": 0.5720505118370056, "rewards/format_reward": 1.0, "step": 6101 }, { "completion_length": 274.05101013183594, "epoch": 0.6140377358490566, "grad_norm": 0.5919872522354126, "kl": 0.065185546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.736145257949829, "reward_std": 0.11071596294641495, "rewards/accuracy_reward": 0.7463494539260864, "rewards/format_reward": 0.9897959232330322, "step": 6102 }, { "completion_length": 255.40816497802734, "epoch": 0.6141383647798743, "grad_norm": 0.47560635209083557, "kl": 0.0635986328125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.843537449836731, "reward_std": 0.08999153971672058, "rewards/accuracy_reward": 0.8435374200344086, "rewards/format_reward": 1.0, "step": 6103 }, { "completion_length": 309.04080963134766, "epoch": 0.6142389937106918, "grad_norm": 0.9098798632621765, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.664251983165741, "reward_std": 0.23875973373651505, "rewards/accuracy_reward": 0.6846600770950317, "rewards/format_reward": 0.9795918464660645, "step": 6104 }, { "completion_length": 291.90816497802734, "epoch": 0.6143396226415094, "grad_norm": 0.38894331455230713, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7659245729446411, "reward_std": 0.05489705689251423, "rewards/accuracy_reward": 0.7761286199092865, "rewards/format_reward": 0.9897959232330322, "step": 6105 }, { "completion_length": 231.2551040649414, "epoch": 0.6144402515723271, "grad_norm": 0.698146402835846, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8250728845596313, "reward_std": 0.15349595248699188, "rewards/accuracy_reward": 0.8556851148605347, "rewards/format_reward": 0.9693877398967743, "step": 6106 }, { "completion_length": 269.3673400878906, "epoch": 0.6145408805031447, "grad_norm": 1.1426100730895996, "kl": 0.10205078125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7035583853721619, "reward_std": 0.13503178395330906, "rewards/accuracy_reward": 0.7137624323368073, "rewards/format_reward": 0.9897959232330322, "step": 6107 }, { "completion_length": 277.1530532836914, "epoch": 0.6146415094339622, "grad_norm": 2.4978277683258057, "kl": 0.204833984375, "learning_rate": 1e-06, "loss": 0.0082, "reward": 1.8571428060531616, "reward_std": 0.16984088718891144, "rewards/accuracy_reward": 0.8775510191917419, "rewards/format_reward": 0.9795918166637421, "step": 6108 }, { "completion_length": 193.1938705444336, "epoch": 0.6147421383647799, "grad_norm": 0.7937909364700317, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8810182213783264, "reward_std": 0.16130591928958893, "rewards/accuracy_reward": 0.8912222981452942, "rewards/format_reward": 0.9897959232330322, "step": 6109 }, { "completion_length": 201.02040100097656, "epoch": 0.6148427672955975, "grad_norm": 0.7139734029769897, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7230321168899536, "reward_std": 0.124809380620718, "rewards/accuracy_reward": 0.7332361340522766, "rewards/format_reward": 0.9897959232330322, "step": 6110 }, { "completion_length": 289.06121826171875, "epoch": 0.6149433962264151, "grad_norm": 0.8338938355445862, "kl": 0.10986328125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.4957833290100098, "reward_std": 0.15449311584234238, "rewards/accuracy_reward": 0.5161915421485901, "rewards/format_reward": 0.9795918166637421, "step": 6111 }, { "completion_length": 319.10203552246094, "epoch": 0.6150440251572327, "grad_norm": 0.5396265983581543, "kl": 0.10302734375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.5287753343582153, "reward_std": 0.12056906148791313, "rewards/accuracy_reward": 0.5491835176944733, "rewards/format_reward": 0.9795918464660645, "step": 6112 }, { "completion_length": 274.2550964355469, "epoch": 0.6151446540880503, "grad_norm": 1.3154348134994507, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6793293952941895, "reward_std": 0.22366258967667818, "rewards/accuracy_reward": 0.7099417150020599, "rewards/format_reward": 0.9693877398967743, "step": 6113 }, { "completion_length": 194.22447967529297, "epoch": 0.6152452830188679, "grad_norm": 1.1776294708251953, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8858762383460999, "reward_std": 0.14840587973594666, "rewards/accuracy_reward": 0.9164884686470032, "rewards/format_reward": 0.9693877398967743, "step": 6114 }, { "completion_length": 280.1224365234375, "epoch": 0.6153459119496856, "grad_norm": 0.33114200830459595, "kl": 0.093994140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.5587713718414307, "reward_std": 0.07037174981087446, "rewards/accuracy_reward": 0.579179510474205, "rewards/format_reward": 0.9795918166637421, "step": 6115 }, { "completion_length": 310.051025390625, "epoch": 0.6154465408805031, "grad_norm": 0.7796745300292969, "kl": 0.0723876953125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.692374050617218, "reward_std": 0.19801421463489532, "rewards/accuracy_reward": 0.702578216791153, "rewards/format_reward": 0.9897959232330322, "step": 6116 }, { "completion_length": 284.74488830566406, "epoch": 0.6155471698113207, "grad_norm": 0.8617724776268005, "kl": 0.09130859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6465126872062683, "reward_std": 0.19289042055606842, "rewards/accuracy_reward": 0.6669208109378815, "rewards/format_reward": 0.9795918464660645, "step": 6117 }, { "completion_length": 275.02040100097656, "epoch": 0.6156477987421384, "grad_norm": 0.6462401151657104, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7327784299850464, "reward_std": 0.12576564028859138, "rewards/accuracy_reward": 0.7327784597873688, "rewards/format_reward": 1.0, "step": 6118 }, { "completion_length": 296.83673095703125, "epoch": 0.615748427672956, "grad_norm": 5.581773281097412, "kl": 0.22900390625, "learning_rate": 1e-06, "loss": 0.0092, "reward": 1.593062937259674, "reward_std": 0.1871999427676201, "rewards/accuracy_reward": 0.6236751973628998, "rewards/format_reward": 0.9693877398967743, "step": 6119 }, { "completion_length": 233.94898223876953, "epoch": 0.6158490566037735, "grad_norm": 0.5624784231185913, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8803475499153137, "reward_std": 0.1251269429922104, "rewards/accuracy_reward": 0.8905516564846039, "rewards/format_reward": 0.9897959232330322, "step": 6120 }, { "completion_length": 246.90816497802734, "epoch": 0.6159496855345912, "grad_norm": 0.7005864381790161, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8482215404510498, "reward_std": 0.11042826436460018, "rewards/accuracy_reward": 0.8584256768226624, "rewards/format_reward": 0.9897959232330322, "step": 6121 }, { "completion_length": 278.03060150146484, "epoch": 0.6160503144654088, "grad_norm": 1.1695926189422607, "kl": 0.151611328125, "learning_rate": 1e-06, "loss": 0.006, "reward": 1.7247969508171082, "reward_std": 0.19479241967201233, "rewards/accuracy_reward": 0.755409300327301, "rewards/format_reward": 0.9693877398967743, "step": 6122 }, { "completion_length": 313.7346954345703, "epoch": 0.6161509433962264, "grad_norm": 0.9506562352180481, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8673468828201294, "reward_std": 0.14284341782331467, "rewards/accuracy_reward": 0.8877550661563873, "rewards/format_reward": 0.9795918464660645, "step": 6123 }, { "completion_length": 325.0918273925781, "epoch": 0.616251572327044, "grad_norm": 1.965539813041687, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7354667782783508, "reward_std": 0.22154151648283005, "rewards/accuracy_reward": 0.7864873111248016, "rewards/format_reward": 0.9489795565605164, "step": 6124 }, { "completion_length": 210.74488830566406, "epoch": 0.6163522012578616, "grad_norm": 1.0080081224441528, "kl": 0.1083984375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.8726609945297241, "reward_std": 0.22666635736823082, "rewards/accuracy_reward": 0.9134773015975952, "rewards/format_reward": 0.9591836631298065, "step": 6125 }, { "completion_length": 179.1836700439453, "epoch": 0.6164528301886792, "grad_norm": 1.3769899606704712, "kl": 0.117431640625, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7437704801559448, "reward_std": 0.20855876430869102, "rewards/accuracy_reward": 0.7743827402591705, "rewards/format_reward": 0.9693877398967743, "step": 6126 }, { "completion_length": 223.51019287109375, "epoch": 0.6165534591194969, "grad_norm": 1.2933659553527832, "kl": 0.12109375, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.7834793329238892, "reward_std": 0.1646108515560627, "rewards/accuracy_reward": 0.8140916526317596, "rewards/format_reward": 0.9693877398967743, "step": 6127 }, { "completion_length": 312.4795837402344, "epoch": 0.6166540880503145, "grad_norm": 0.6951369047164917, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.5739478468894958, "reward_std": 0.2695978209376335, "rewards/accuracy_reward": 0.594356045126915, "rewards/format_reward": 0.9795918166637421, "step": 6128 }, { "completion_length": 248.84693908691406, "epoch": 0.616754716981132, "grad_norm": 0.8687657713890076, "kl": 0.139404296875, "learning_rate": 1e-06, "loss": 0.0056, "reward": 1.8138372302055359, "reward_std": 0.21167828142642975, "rewards/accuracy_reward": 0.8240412771701813, "rewards/format_reward": 0.9897959232330322, "step": 6129 }, { "completion_length": 306.6632537841797, "epoch": 0.6168553459119497, "grad_norm": 0.5077134966850281, "kl": 0.09521484375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6942917704582214, "reward_std": 0.2083912268280983, "rewards/accuracy_reward": 0.7146999835968018, "rewards/format_reward": 0.9795918166637421, "step": 6130 }, { "completion_length": 250.6836700439453, "epoch": 0.6169559748427673, "grad_norm": 1.3430287837982178, "kl": 0.09375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7193877696990967, "reward_std": 0.2283373512327671, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.9693877398967743, "step": 6131 }, { "completion_length": 300.12244415283203, "epoch": 0.617056603773585, "grad_norm": 0.607085108757019, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7115204334259033, "reward_std": 0.18482567742466927, "rewards/accuracy_reward": 0.7421326637268066, "rewards/format_reward": 0.9693877398967743, "step": 6132 }, { "completion_length": 288.11224365234375, "epoch": 0.6171572327044025, "grad_norm": 1.196326494216919, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7629428505897522, "reward_std": 0.18928361684083939, "rewards/accuracy_reward": 0.7731468975543976, "rewards/format_reward": 0.9897959232330322, "step": 6133 }, { "completion_length": 244.25509643554688, "epoch": 0.6172578616352201, "grad_norm": 4.259077548980713, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.687531292438507, "reward_std": 0.1834767758846283, "rewards/accuracy_reward": 0.6875313222408295, "rewards/format_reward": 1.0, "step": 6134 }, { "completion_length": 279.14286041259766, "epoch": 0.6173584905660378, "grad_norm": 1.2645201683044434, "kl": 0.11669921875, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.714288055896759, "reward_std": 0.21126887202262878, "rewards/accuracy_reward": 0.7755125463008881, "rewards/format_reward": 0.938775509595871, "step": 6135 }, { "completion_length": 244.7755126953125, "epoch": 0.6174591194968554, "grad_norm": 0.532463550567627, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8696144819259644, "reward_std": 0.13930781185626984, "rewards/accuracy_reward": 0.8900226354598999, "rewards/format_reward": 0.9795918166637421, "step": 6136 }, { "completion_length": 296.3877410888672, "epoch": 0.6175597484276729, "grad_norm": 0.6917830109596252, "kl": 0.110107421875, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7946252226829529, "reward_std": 0.12692764773964882, "rewards/accuracy_reward": 0.7946252524852753, "rewards/format_reward": 1.0, "step": 6137 }, { "completion_length": 335.62245178222656, "epoch": 0.6176603773584906, "grad_norm": 0.6512343287467957, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7531960606575012, "reward_std": 0.19484828412532806, "rewards/accuracy_reward": 0.7736042737960815, "rewards/format_reward": 0.9795918166637421, "step": 6138 }, { "completion_length": 274.2755126953125, "epoch": 0.6177610062893082, "grad_norm": 0.912609875202179, "kl": 0.0706787109375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8147173523902893, "reward_std": 0.14798756688833237, "rewards/accuracy_reward": 0.8249214589595795, "rewards/format_reward": 0.9897959232330322, "step": 6139 }, { "completion_length": 318.4795837402344, "epoch": 0.6178616352201258, "grad_norm": 0.6689526438713074, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6175509095191956, "reward_std": 0.12911519408226013, "rewards/accuracy_reward": 0.6277550756931305, "rewards/format_reward": 0.9897959232330322, "step": 6140 }, { "completion_length": 231.22447967529297, "epoch": 0.6179622641509434, "grad_norm": 1.9797954559326172, "kl": 0.126953125, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.7568026781082153, "reward_std": 0.1978788673877716, "rewards/accuracy_reward": 0.7772108614444733, "rewards/format_reward": 0.9795918166637421, "step": 6141 }, { "completion_length": 234.4591827392578, "epoch": 0.618062893081761, "grad_norm": 0.5750674605369568, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.754514455795288, "reward_std": 0.1769612431526184, "rewards/accuracy_reward": 0.7647185921669006, "rewards/format_reward": 0.9897959232330322, "step": 6142 }, { "completion_length": 314.051025390625, "epoch": 0.6181635220125786, "grad_norm": 0.9540138244628906, "kl": 0.091552734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.691005527973175, "reward_std": 0.1699007749557495, "rewards/accuracy_reward": 0.7012096047401428, "rewards/format_reward": 0.9897959232330322, "step": 6143 }, { "completion_length": 251.29591369628906, "epoch": 0.6182641509433963, "grad_norm": 1.8207682371139526, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7465470433235168, "reward_std": 0.18534059450030327, "rewards/accuracy_reward": 0.7465470731258392, "rewards/format_reward": 1.0, "step": 6144 }, { "completion_length": 303.39794921875, "epoch": 0.6183647798742138, "grad_norm": 0.3856446146965027, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7890331745147705, "reward_std": 0.17182143591344357, "rewards/accuracy_reward": 0.819645494222641, "rewards/format_reward": 0.9693877398967743, "step": 6145 }, { "completion_length": 240.87754821777344, "epoch": 0.6184654088050314, "grad_norm": 1.3405308723449707, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6472634673118591, "reward_std": 0.17786074429750443, "rewards/accuracy_reward": 0.6778756380081177, "rewards/format_reward": 0.9693877398967743, "step": 6146 }, { "completion_length": 283.61224365234375, "epoch": 0.6185660377358491, "grad_norm": 0.849786102771759, "kl": 0.106201171875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7065410614013672, "reward_std": 0.20530364848673344, "rewards/accuracy_reward": 0.7269492447376251, "rewards/format_reward": 0.9795918166637421, "step": 6147 }, { "completion_length": 320.57142639160156, "epoch": 0.6186666666666667, "grad_norm": 0.7853444814682007, "kl": 0.1171875, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.6878220438957214, "reward_std": 0.1976412832736969, "rewards/accuracy_reward": 0.7184343338012695, "rewards/format_reward": 0.9693877398967743, "step": 6148 }, { "completion_length": 257.17346954345703, "epoch": 0.6187672955974842, "grad_norm": 0.6514701247215271, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7038862109184265, "reward_std": 0.10682432353496552, "rewards/accuracy_reward": 0.7140903174877167, "rewards/format_reward": 0.9897959232330322, "step": 6149 }, { "completion_length": 285.6530532836914, "epoch": 0.6188679245283019, "grad_norm": 0.85642409324646, "kl": 0.05810546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8140437006950378, "reward_std": 0.19210770726203918, "rewards/accuracy_reward": 0.8548599779605865, "rewards/format_reward": 0.9591836333274841, "step": 6150 }, { "completion_length": 194.61223602294922, "epoch": 0.6189685534591195, "grad_norm": 0.3839455842971802, "kl": 0.116455078125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7155091762542725, "reward_std": 0.09383785538375378, "rewards/accuracy_reward": 0.7155091464519501, "rewards/format_reward": 1.0, "step": 6151 }, { "completion_length": 380.2857208251953, "epoch": 0.6190691823899371, "grad_norm": 0.6671134233474731, "kl": 0.0552978515625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6308530569076538, "reward_std": 0.2954903393983841, "rewards/accuracy_reward": 0.6512612402439117, "rewards/format_reward": 0.9795918464660645, "step": 6152 }, { "completion_length": 171.31632614135742, "epoch": 0.6191698113207548, "grad_norm": 1.0225657224655151, "kl": 0.122802734375, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.7778324484825134, "reward_std": 0.15640271082520485, "rewards/accuracy_reward": 0.7880365550518036, "rewards/format_reward": 0.9897959232330322, "step": 6153 }, { "completion_length": 325.7346954345703, "epoch": 0.6192704402515723, "grad_norm": 0.4669840931892395, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6185101866722107, "reward_std": 0.13817359134554863, "rewards/accuracy_reward": 0.649122416973114, "rewards/format_reward": 0.9693877398967743, "step": 6154 }, { "completion_length": 327.77550506591797, "epoch": 0.6193710691823899, "grad_norm": 0.7123196721076965, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.814285695552826, "reward_std": 0.15635140240192413, "rewards/accuracy_reward": 0.8244897723197937, "rewards/format_reward": 0.9897959232330322, "step": 6155 }, { "completion_length": 300.95916748046875, "epoch": 0.6194716981132076, "grad_norm": 0.35375162959098816, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8353741765022278, "reward_std": 0.14796380698680878, "rewards/accuracy_reward": 0.8455782234668732, "rewards/format_reward": 0.9897959232330322, "step": 6156 }, { "completion_length": 275.1938705444336, "epoch": 0.6195723270440252, "grad_norm": 1.2679020166397095, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8180392384529114, "reward_std": 0.10450635850429535, "rewards/accuracy_reward": 0.8384473919868469, "rewards/format_reward": 0.9795918166637421, "step": 6157 }, { "completion_length": 257.35713958740234, "epoch": 0.6196729559748427, "grad_norm": 1.1248112916946411, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6855928301811218, "reward_std": 0.22533808648586273, "rewards/accuracy_reward": 0.706000953912735, "rewards/format_reward": 0.9795918464660645, "step": 6158 }, { "completion_length": 279.24488830566406, "epoch": 0.6197735849056604, "grad_norm": 0.3866994082927704, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8402107954025269, "reward_std": 0.07819395139813423, "rewards/accuracy_reward": 0.8402107656002045, "rewards/format_reward": 1.0, "step": 6159 }, { "completion_length": 288.6122360229492, "epoch": 0.619874213836478, "grad_norm": 0.45662039518356323, "kl": 0.0556640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7466707229614258, "reward_std": 0.14497053623199463, "rewards/accuracy_reward": 0.7568748295307159, "rewards/format_reward": 0.9897959232330322, "step": 6160 }, { "completion_length": 221.32652282714844, "epoch": 0.6199748427672956, "grad_norm": 1.3429276943206787, "kl": 0.1025390625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8086734414100647, "reward_std": 0.15857867896556854, "rewards/accuracy_reward": 0.8188775479793549, "rewards/format_reward": 0.9897959232330322, "step": 6161 }, { "completion_length": 267.61224365234375, "epoch": 0.6200754716981132, "grad_norm": 0.35441839694976807, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.821088433265686, "reward_std": 0.032396955881267786, "rewards/accuracy_reward": 0.8312925100326538, "rewards/format_reward": 0.9897959232330322, "step": 6162 }, { "completion_length": 263.6836624145508, "epoch": 0.6201761006289308, "grad_norm": 0.6781159043312073, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.836734652519226, "reward_std": 0.20016492903232574, "rewards/accuracy_reward": 0.8469387590885162, "rewards/format_reward": 0.9897959232330322, "step": 6163 }, { "completion_length": 308.20408630371094, "epoch": 0.6202767295597484, "grad_norm": 0.39941173791885376, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.682918131351471, "reward_std": 0.13665951788425446, "rewards/accuracy_reward": 0.7237344086170197, "rewards/format_reward": 0.9591836333274841, "step": 6164 }, { "completion_length": 286.0, "epoch": 0.6203773584905661, "grad_norm": 1.0236477851867676, "kl": 0.0799560546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.579625427722931, "reward_std": 0.18147262930870056, "rewards/accuracy_reward": 0.5898294448852539, "rewards/format_reward": 0.9897959232330322, "step": 6165 }, { "completion_length": 286.8571319580078, "epoch": 0.6204779874213836, "grad_norm": 1.1447241306304932, "kl": 0.11865234375, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.4592618346214294, "reward_std": 0.2848232835531235, "rewards/accuracy_reward": 0.48987412452697754, "rewards/format_reward": 0.9693877398967743, "step": 6166 }, { "completion_length": 270.4081573486328, "epoch": 0.6205786163522012, "grad_norm": 0.6201446056365967, "kl": 0.0582275390625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7542759776115417, "reward_std": 0.17013990692794323, "rewards/accuracy_reward": 0.7644800543785095, "rewards/format_reward": 0.9897959232330322, "step": 6167 }, { "completion_length": 284.89794921875, "epoch": 0.6206792452830189, "grad_norm": 1.0250954627990723, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.5267374515533447, "reward_std": 0.13433174788951874, "rewards/accuracy_reward": 0.5471455752849579, "rewards/format_reward": 0.9795918166637421, "step": 6168 }, { "completion_length": 308.74488830566406, "epoch": 0.6207798742138365, "grad_norm": 1.2531570196151733, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8250324130058289, "reward_std": 0.20400040596723557, "rewards/accuracy_reward": 0.8250323534011841, "rewards/format_reward": 1.0, "step": 6169 }, { "completion_length": 310.0918273925781, "epoch": 0.620880503144654, "grad_norm": 0.7426708936691284, "kl": 0.106689453125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.791527509689331, "reward_std": 0.2020939290523529, "rewards/accuracy_reward": 0.8119356632232666, "rewards/format_reward": 0.9795918464660645, "step": 6170 }, { "completion_length": 352.5714111328125, "epoch": 0.6209811320754717, "grad_norm": 0.7785139679908752, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6335630416870117, "reward_std": 0.15296848863363266, "rewards/accuracy_reward": 0.6641752868890762, "rewards/format_reward": 0.9693877398967743, "step": 6171 }, { "completion_length": 199.6530532836914, "epoch": 0.6210817610062893, "grad_norm": 1.019769549369812, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7834647297859192, "reward_std": 0.22862707823514938, "rewards/accuracy_reward": 0.8140769600868225, "rewards/format_reward": 0.9693877398967743, "step": 6172 }, { "completion_length": 331.7142791748047, "epoch": 0.621182389937107, "grad_norm": 0.6682958602905273, "kl": 0.0513916015625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7411146759986877, "reward_std": 0.1635301187634468, "rewards/accuracy_reward": 0.7411146759986877, "rewards/format_reward": 1.0, "step": 6173 }, { "completion_length": 235.06121826171875, "epoch": 0.6212830188679245, "grad_norm": 0.9106759428977966, "kl": 0.0704345703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.478965401649475, "reward_std": 0.2307976856827736, "rewards/accuracy_reward": 0.48916955292224884, "rewards/format_reward": 0.9897959232330322, "step": 6174 }, { "completion_length": 345.52040100097656, "epoch": 0.6213836477987421, "grad_norm": 0.9398826360702515, "kl": 0.0704345703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7004310488700867, "reward_std": 0.18353860825300217, "rewards/accuracy_reward": 0.7106351554393768, "rewards/format_reward": 0.9897959232330322, "step": 6175 }, { "completion_length": 322.4897918701172, "epoch": 0.6214842767295597, "grad_norm": 1.03495454788208, "kl": 0.05029296875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.822995901107788, "reward_std": 0.15641789883375168, "rewards/accuracy_reward": 0.8434040546417236, "rewards/format_reward": 0.9795918464660645, "step": 6176 }, { "completion_length": 260.2244873046875, "epoch": 0.6215849056603774, "grad_norm": 1.3547881841659546, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8275024890899658, "reward_std": 0.16385158151388168, "rewards/accuracy_reward": 0.8275023996829987, "rewards/format_reward": 1.0, "step": 6177 }, { "completion_length": 297.3673400878906, "epoch": 0.621685534591195, "grad_norm": 0.2978518605232239, "kl": 0.059326171875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7733463644981384, "reward_std": 0.08884280920028687, "rewards/accuracy_reward": 0.7733464241027832, "rewards/format_reward": 1.0, "step": 6178 }, { "completion_length": 361.6938781738281, "epoch": 0.6217861635220125, "grad_norm": 0.5941013693809509, "kl": 0.0726318359375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.739593505859375, "reward_std": 0.1362767368555069, "rewards/accuracy_reward": 0.7497974932193756, "rewards/format_reward": 0.9897959232330322, "step": 6179 }, { "completion_length": 211.7653045654297, "epoch": 0.6218867924528302, "grad_norm": 1.0620890855789185, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7285714149475098, "reward_std": 0.13042442500591278, "rewards/accuracy_reward": 0.7387754917144775, "rewards/format_reward": 0.9897959232330322, "step": 6180 }, { "completion_length": 319.6428527832031, "epoch": 0.6219874213836478, "grad_norm": 0.6353914737701416, "kl": 0.099609375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8053533434867859, "reward_std": 0.2316158413887024, "rewards/accuracy_reward": 0.8563736975193024, "rewards/format_reward": 0.9489795565605164, "step": 6181 }, { "completion_length": 394.6836700439453, "epoch": 0.6220880503144655, "grad_norm": 0.9471622109413147, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7623271942138672, "reward_std": 0.2569410055875778, "rewards/accuracy_reward": 0.7929393947124481, "rewards/format_reward": 0.9693877398967743, "step": 6182 }, { "completion_length": 253.63265228271484, "epoch": 0.622188679245283, "grad_norm": 0.539763331413269, "kl": 0.0819091796875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7726117968559265, "reward_std": 0.12497252225875854, "rewards/accuracy_reward": 0.7726118564605713, "rewards/format_reward": 1.0, "step": 6183 }, { "completion_length": 281.79591369628906, "epoch": 0.6222893081761006, "grad_norm": 1.261160135269165, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7396798729896545, "reward_std": 0.28684188425540924, "rewards/accuracy_reward": 0.770292192697525, "rewards/format_reward": 0.9693877398967743, "step": 6184 }, { "completion_length": 260.6632537841797, "epoch": 0.6223899371069183, "grad_norm": 0.5338612794876099, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8091647028923035, "reward_std": 0.19760479032993317, "rewards/accuracy_reward": 0.8193688690662384, "rewards/format_reward": 0.9897959232330322, "step": 6185 }, { "completion_length": 274.0408172607422, "epoch": 0.6224905660377359, "grad_norm": 0.571727991104126, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7799805998802185, "reward_std": 0.14308025315403938, "rewards/accuracy_reward": 0.800388753414154, "rewards/format_reward": 0.9795918166637421, "step": 6186 }, { "completion_length": 291.2244873046875, "epoch": 0.6225911949685534, "grad_norm": 1.2035884857177734, "kl": 0.113037109375, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.671643614768982, "reward_std": 0.2781974971294403, "rewards/accuracy_reward": 0.7124598920345306, "rewards/format_reward": 0.9591836631298065, "step": 6187 }, { "completion_length": 292.9795837402344, "epoch": 0.622691823899371, "grad_norm": 0.5966708064079285, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6240477561950684, "reward_std": 0.12064234539866447, "rewards/accuracy_reward": 0.6444560289382935, "rewards/format_reward": 0.9795918166637421, "step": 6188 }, { "completion_length": 233.1326446533203, "epoch": 0.6227924528301887, "grad_norm": 1.3470134735107422, "kl": 0.1025390625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7085293531417847, "reward_std": 0.12056417763233185, "rewards/accuracy_reward": 0.7187334597110748, "rewards/format_reward": 0.9897959232330322, "step": 6189 }, { "completion_length": 285.87754821777344, "epoch": 0.6228930817610063, "grad_norm": 1.0812915563583374, "kl": 0.136962890625, "learning_rate": 1e-06, "loss": 0.0055, "reward": 1.604024887084961, "reward_std": 0.13687613233923912, "rewards/accuracy_reward": 0.6142289340496063, "rewards/format_reward": 0.9897959232330322, "step": 6190 }, { "completion_length": 238.16326141357422, "epoch": 0.6229937106918239, "grad_norm": 1.1694667339324951, "kl": 0.100830078125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7903069257736206, "reward_std": 0.16395311057567596, "rewards/accuracy_reward": 0.8005110621452332, "rewards/format_reward": 0.9897959232330322, "step": 6191 }, { "completion_length": 230.1938705444336, "epoch": 0.6230943396226415, "grad_norm": 0.3436437249183655, "kl": 0.0576171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8674092292785645, "reward_std": 0.1192527562379837, "rewards/accuracy_reward": 0.8980214297771454, "rewards/format_reward": 0.9693877398967743, "step": 6192 }, { "completion_length": 250.52039337158203, "epoch": 0.6231949685534591, "grad_norm": 0.46710991859436035, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8212004899978638, "reward_std": 0.14510880410671234, "rewards/accuracy_reward": 0.8416086733341217, "rewards/format_reward": 0.9795918464660645, "step": 6193 }, { "completion_length": 202.61223602294922, "epoch": 0.6232955974842768, "grad_norm": 0.6594690680503845, "kl": 0.09033203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7488986253738403, "reward_std": 0.08736064098775387, "rewards/accuracy_reward": 0.7591026127338409, "rewards/format_reward": 0.9897959232330322, "step": 6194 }, { "completion_length": 349.1428527832031, "epoch": 0.6233962264150943, "grad_norm": 0.5297206044197083, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8245184421539307, "reward_std": 0.18543102592229843, "rewards/accuracy_reward": 0.8653348088264465, "rewards/format_reward": 0.9591836333274841, "step": 6195 }, { "completion_length": 233.4285659790039, "epoch": 0.6234968553459119, "grad_norm": 1.3282172679901123, "kl": 0.1192626953125, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.6261072754859924, "reward_std": 0.24032088369131088, "rewards/accuracy_reward": 0.6261073052883148, "rewards/format_reward": 1.0, "step": 6196 }, { "completion_length": 271.61224365234375, "epoch": 0.6235974842767296, "grad_norm": 1.1412044763565063, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8189504146575928, "reward_std": 0.18216800689697266, "rewards/accuracy_reward": 0.8495627343654633, "rewards/format_reward": 0.9693877398967743, "step": 6197 }, { "completion_length": 310.1224365234375, "epoch": 0.6236981132075472, "grad_norm": 0.4115549623966217, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8169506192207336, "reward_std": 0.17289013043045998, "rewards/accuracy_reward": 0.8271546959877014, "rewards/format_reward": 0.9897959232330322, "step": 6198 }, { "completion_length": 224.9897918701172, "epoch": 0.6237987421383647, "grad_norm": 0.5739237666130066, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7951897978782654, "reward_std": 0.17164117097854614, "rewards/accuracy_reward": 0.8155980408191681, "rewards/format_reward": 0.9795918464660645, "step": 6199 }, { "completion_length": 218.5, "epoch": 0.6238993710691824, "grad_norm": 0.7871516346931458, "kl": 0.128173828125, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.8388401865959167, "reward_std": 0.20575136691331863, "rewards/accuracy_reward": 0.8490443527698517, "rewards/format_reward": 0.9897959232330322, "step": 6200 }, { "completion_length": 255.45917510986328, "epoch": 0.624, "grad_norm": 0.28349408507347107, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8673469424247742, "reward_std": 0.06185103580355644, "rewards/accuracy_reward": 0.8775510191917419, "rewards/format_reward": 0.9897959232330322, "step": 6201 }, { "completion_length": 172.27550506591797, "epoch": 0.6241006289308176, "grad_norm": 2.442366123199463, "kl": 0.094970703125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8086734414100647, "reward_std": 0.2024570032954216, "rewards/accuracy_reward": 0.8290816247463226, "rewards/format_reward": 0.9795918166637421, "step": 6202 }, { "completion_length": 329.1734619140625, "epoch": 0.6242012578616353, "grad_norm": 0.778534471988678, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7859137058258057, "reward_std": 0.22784829884767532, "rewards/accuracy_reward": 0.7961178421974182, "rewards/format_reward": 0.9897959232330322, "step": 6203 }, { "completion_length": 219.61223602294922, "epoch": 0.6243018867924528, "grad_norm": 0.5895749926567078, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8108842968940735, "reward_std": 0.05348071362823248, "rewards/accuracy_reward": 0.8108842968940735, "rewards/format_reward": 1.0, "step": 6204 }, { "completion_length": 331.51019287109375, "epoch": 0.6244025157232704, "grad_norm": 0.7012062072753906, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8082706332206726, "reward_std": 0.22156084328889847, "rewards/accuracy_reward": 0.8592910468578339, "rewards/format_reward": 0.9489795863628387, "step": 6205 }, { "completion_length": 178.82653045654297, "epoch": 0.6245031446540881, "grad_norm": 0.5792148113250732, "kl": 0.0794677734375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.818738341331482, "reward_std": 0.0760253295302391, "rewards/accuracy_reward": 0.8187383711338043, "rewards/format_reward": 1.0, "step": 6206 }, { "completion_length": 237.49999237060547, "epoch": 0.6246037735849057, "grad_norm": 0.46234267950057983, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6845893859863281, "reward_std": 0.11234182864427567, "rewards/accuracy_reward": 0.7152015566825867, "rewards/format_reward": 0.9693877398967743, "step": 6207 }, { "completion_length": 325.4387664794922, "epoch": 0.6247044025157232, "grad_norm": 0.9632988572120667, "kl": 0.103271484375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8508874773979187, "reward_std": 0.1580875851213932, "rewards/accuracy_reward": 0.8610915839672089, "rewards/format_reward": 0.9897959232330322, "step": 6208 }, { "completion_length": 285.4285583496094, "epoch": 0.6248050314465409, "grad_norm": 0.4172682762145996, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.744515061378479, "reward_std": 0.050067681819200516, "rewards/accuracy_reward": 0.7445150911808014, "rewards/format_reward": 1.0, "step": 6209 }, { "completion_length": 264.61224365234375, "epoch": 0.6249056603773585, "grad_norm": 0.8169181942939758, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7701297998428345, "reward_std": 0.13755563274025917, "rewards/accuracy_reward": 0.7905379831790924, "rewards/format_reward": 0.9795918166637421, "step": 6210 }, { "completion_length": 269.52040100097656, "epoch": 0.6250062893081761, "grad_norm": 0.7134021520614624, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7874690890312195, "reward_std": 0.14827392995357513, "rewards/accuracy_reward": 0.7976731956005096, "rewards/format_reward": 0.9897959232330322, "step": 6211 }, { "completion_length": 274.7244873046875, "epoch": 0.6251069182389937, "grad_norm": 0.6120365858078003, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8226397633552551, "reward_std": 0.1647917404770851, "rewards/accuracy_reward": 0.8328438699245453, "rewards/format_reward": 0.9897959232330322, "step": 6212 }, { "completion_length": 205.82652282714844, "epoch": 0.6252075471698113, "grad_norm": 0.3599050045013428, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.766172170639038, "reward_std": 0.06326381443068385, "rewards/accuracy_reward": 0.7865803241729736, "rewards/format_reward": 0.9795918166637421, "step": 6213 }, { "completion_length": 244.59183502197266, "epoch": 0.6253081761006289, "grad_norm": 0.463655561208725, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7234734296798706, "reward_std": 0.09925159439444542, "rewards/accuracy_reward": 0.7336775362491608, "rewards/format_reward": 0.9897959232330322, "step": 6214 }, { "completion_length": 221.55101776123047, "epoch": 0.6254088050314466, "grad_norm": 0.2951294779777527, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.9741982221603394, "reward_std": 0.0594448447227478, "rewards/accuracy_reward": 0.9741982519626617, "rewards/format_reward": 1.0, "step": 6215 }, { "completion_length": 279.5102081298828, "epoch": 0.6255094339622641, "grad_norm": 0.322536438703537, "kl": 0.0694580078125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.833951711654663, "reward_std": 0.08510442823171616, "rewards/accuracy_reward": 0.8339517414569855, "rewards/format_reward": 1.0, "step": 6216 }, { "completion_length": 233.77550506591797, "epoch": 0.6256100628930817, "grad_norm": 0.5509617924690247, "kl": 0.0841064453125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.827380895614624, "reward_std": 0.09147784113883972, "rewards/accuracy_reward": 0.8375850021839142, "rewards/format_reward": 0.9897959232330322, "step": 6217 }, { "completion_length": 238.5, "epoch": 0.6257106918238994, "grad_norm": 0.6292356252670288, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.711789608001709, "reward_std": 0.0896459799259901, "rewards/accuracy_reward": 0.7321978807449341, "rewards/format_reward": 0.9795918166637421, "step": 6218 }, { "completion_length": 243.5, "epoch": 0.625811320754717, "grad_norm": 0.763103187084198, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7401551604270935, "reward_std": 0.28610433638095856, "rewards/accuracy_reward": 0.7707674205303192, "rewards/format_reward": 0.9693877398967743, "step": 6219 }, { "completion_length": 219.02040100097656, "epoch": 0.6259119496855345, "grad_norm": 0.5471899509429932, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.818027138710022, "reward_std": 0.12558822706341743, "rewards/accuracy_reward": 0.8384353220462799, "rewards/format_reward": 0.9795918464660645, "step": 6220 }, { "completion_length": 242.06122589111328, "epoch": 0.6260125786163522, "grad_norm": 1.889548659324646, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8013731837272644, "reward_std": 0.14823006093502045, "rewards/accuracy_reward": 0.8013732135295868, "rewards/format_reward": 1.0, "step": 6221 }, { "completion_length": 153.2040786743164, "epoch": 0.6261132075471698, "grad_norm": 0.6837208271026611, "kl": 0.113037109375, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.8444305062294006, "reward_std": 0.13973563443869352, "rewards/accuracy_reward": 0.8750427961349487, "rewards/format_reward": 0.9693877398967743, "step": 6222 }, { "completion_length": 182.7346954345703, "epoch": 0.6262138364779875, "grad_norm": 0.930703341960907, "kl": 0.134521484375, "learning_rate": 1e-06, "loss": 0.0054, "reward": 1.7819727063179016, "reward_std": 0.10097558423876762, "rewards/accuracy_reward": 0.7921768724918365, "rewards/format_reward": 0.9897959232330322, "step": 6223 }, { "completion_length": 220.28571319580078, "epoch": 0.626314465408805, "grad_norm": 0.6011991500854492, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.932827651500702, "reward_std": 0.1217556856572628, "rewards/accuracy_reward": 0.9532358348369598, "rewards/format_reward": 0.9795918464660645, "step": 6224 }, { "completion_length": 321.7653045654297, "epoch": 0.6264150943396226, "grad_norm": 0.8341443538665771, "kl": 0.11865234375, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7223562002182007, "reward_std": 0.21114759147167206, "rewards/accuracy_reward": 0.7427643835544586, "rewards/format_reward": 0.9795918166637421, "step": 6225 }, { "completion_length": 257.37755584716797, "epoch": 0.6265157232704403, "grad_norm": 0.8277786374092102, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7459668517112732, "reward_std": 0.15370840206742287, "rewards/accuracy_reward": 0.7561709582805634, "rewards/format_reward": 0.9897959232330322, "step": 6226 }, { "completion_length": 289.3775405883789, "epoch": 0.6266163522012579, "grad_norm": 1.3328191041946411, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6900999546051025, "reward_std": 0.2903855890035629, "rewards/accuracy_reward": 0.7207121849060059, "rewards/format_reward": 0.9693877398967743, "step": 6227 }, { "completion_length": 200.64285278320312, "epoch": 0.6267169811320755, "grad_norm": 0.6810321807861328, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8316094279289246, "reward_std": 0.0740269310772419, "rewards/accuracy_reward": 0.8316094875335693, "rewards/format_reward": 1.0, "step": 6228 }, { "completion_length": 244.45917510986328, "epoch": 0.626817610062893, "grad_norm": 0.9036087989807129, "kl": 0.071533203125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.827097475528717, "reward_std": 0.19280403852462769, "rewards/accuracy_reward": 0.8577097356319427, "rewards/format_reward": 0.9693877398967743, "step": 6229 }, { "completion_length": 299.2244873046875, "epoch": 0.6269182389937107, "grad_norm": 0.8608013391494751, "kl": 0.11376953125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.798469364643097, "reward_std": 0.17373458296060562, "rewards/accuracy_reward": 0.8290815949440002, "rewards/format_reward": 0.9693877398967743, "step": 6230 }, { "completion_length": 207.01020050048828, "epoch": 0.6270188679245283, "grad_norm": 0.826012909412384, "kl": 0.0743408203125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8652901649475098, "reward_std": 0.08026999607682228, "rewards/accuracy_reward": 0.865290105342865, "rewards/format_reward": 1.0, "step": 6231 }, { "completion_length": 206.78570556640625, "epoch": 0.627119496855346, "grad_norm": 0.3935307562351227, "kl": 0.0986328125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7740039229393005, "reward_std": 0.04827569331973791, "rewards/accuracy_reward": 0.7740038931369781, "rewards/format_reward": 1.0, "step": 6232 }, { "completion_length": 235.4183578491211, "epoch": 0.6272201257861635, "grad_norm": 0.9157724976539612, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.83157080411911, "reward_std": 0.17159824073314667, "rewards/accuracy_reward": 0.8417749106884003, "rewards/format_reward": 0.9897959232330322, "step": 6233 }, { "completion_length": 219.56121826171875, "epoch": 0.6273207547169811, "grad_norm": 1.0206795930862427, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.684023141860962, "reward_std": 0.2822266295552254, "rewards/accuracy_reward": 0.7044313251972198, "rewards/format_reward": 0.9795918166637421, "step": 6234 }, { "completion_length": 168.2653045654297, "epoch": 0.6274213836477988, "grad_norm": 10.861696243286133, "kl": 0.7911376953125, "learning_rate": 1e-06, "loss": 0.0317, "reward": 1.7643784284591675, "reward_std": 0.11019476130604744, "rewards/accuracy_reward": 0.7847866415977478, "rewards/format_reward": 0.9795918464660645, "step": 6235 }, { "completion_length": 247.77550506591797, "epoch": 0.6275220125786164, "grad_norm": 0.8357857465744019, "kl": 0.0775146484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6506856083869934, "reward_std": 0.18968546390533447, "rewards/accuracy_reward": 0.6812978982925415, "rewards/format_reward": 0.9693877398967743, "step": 6236 }, { "completion_length": 228.08162689208984, "epoch": 0.6276226415094339, "grad_norm": 1.2830971479415894, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7504637837409973, "reward_std": 0.25052257627248764, "rewards/accuracy_reward": 0.7708719968795776, "rewards/format_reward": 0.9795918464660645, "step": 6237 }, { "completion_length": 246.45917510986328, "epoch": 0.6277232704402516, "grad_norm": 1.081076741218567, "kl": 0.097412109375, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7811711430549622, "reward_std": 0.28470686078071594, "rewards/accuracy_reward": 0.8117834329605103, "rewards/format_reward": 0.9693877398967743, "step": 6238 }, { "completion_length": 335.8571319580078, "epoch": 0.6278238993710692, "grad_norm": 0.6991215348243713, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.5645921230316162, "reward_std": 0.16712533310055733, "rewards/accuracy_reward": 0.5747962594032288, "rewards/format_reward": 0.9897959232330322, "step": 6239 }, { "completion_length": 216.08162689208984, "epoch": 0.6279245283018868, "grad_norm": 0.4876317083835602, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8594303131103516, "reward_std": 0.07648310717195272, "rewards/accuracy_reward": 0.8594303131103516, "rewards/format_reward": 1.0, "step": 6240 }, { "completion_length": 258.9591827392578, "epoch": 0.6280251572327044, "grad_norm": 0.9635222554206848, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8433772921562195, "reward_std": 0.26196947880089283, "rewards/accuracy_reward": 0.8637854754924774, "rewards/format_reward": 0.9795918166637421, "step": 6241 }, { "completion_length": 270.2550964355469, "epoch": 0.628125786163522, "grad_norm": 0.5671475529670715, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8224490284919739, "reward_std": 0.20534638315439224, "rewards/accuracy_reward": 0.8326530754566193, "rewards/format_reward": 0.9897959232330322, "step": 6242 }, { "completion_length": 293.4591827392578, "epoch": 0.6282264150943396, "grad_norm": 0.7048658132553101, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6240673661231995, "reward_std": 0.22427204996347427, "rewards/accuracy_reward": 0.6444755494594574, "rewards/format_reward": 0.9795918464660645, "step": 6243 }, { "completion_length": 205.11223602294922, "epoch": 0.6283270440251573, "grad_norm": 1.0770155191421509, "kl": 0.059814453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8546249866485596, "reward_std": 0.1996517889201641, "rewards/accuracy_reward": 0.8750330805778503, "rewards/format_reward": 0.9795918166637421, "step": 6244 }, { "completion_length": 216.61224365234375, "epoch": 0.6284276729559748, "grad_norm": 2.2604146003723145, "kl": 0.211669921875, "learning_rate": 1e-06, "loss": 0.0085, "reward": 1.748463749885559, "reward_std": 0.09823350980877876, "rewards/accuracy_reward": 0.768871933221817, "rewards/format_reward": 0.9795918166637421, "step": 6245 }, { "completion_length": 233.89795684814453, "epoch": 0.6285283018867924, "grad_norm": 3.4094934463500977, "kl": 0.182861328125, "learning_rate": 1e-06, "loss": 0.0073, "reward": 1.8775509595870972, "reward_std": 0.11584595590829849, "rewards/accuracy_reward": 0.8877550959587097, "rewards/format_reward": 0.9897959232330322, "step": 6246 }, { "completion_length": 242.43877410888672, "epoch": 0.6286289308176101, "grad_norm": 1.098642349243164, "kl": 0.071533203125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7368622422218323, "reward_std": 0.14120444655418396, "rewards/accuracy_reward": 0.7368622124195099, "rewards/format_reward": 1.0, "step": 6247 }, { "completion_length": 195.1938705444336, "epoch": 0.6287295597484277, "grad_norm": 0.5663955211639404, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7424036264419556, "reward_std": 0.15879974514245987, "rewards/accuracy_reward": 0.7424035966396332, "rewards/format_reward": 1.0, "step": 6248 }, { "completion_length": 332.82652282714844, "epoch": 0.6288301886792452, "grad_norm": 0.5782132744789124, "kl": 0.102294921875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.825494647026062, "reward_std": 0.15986815840005875, "rewards/accuracy_reward": 0.8459028601646423, "rewards/format_reward": 0.9795918166637421, "step": 6249 }, { "completion_length": 239.29591369628906, "epoch": 0.6289308176100629, "grad_norm": 0.8006384372711182, "kl": 0.0665283203125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6394566297531128, "reward_std": 0.18495844304561615, "rewards/accuracy_reward": 0.6496607959270477, "rewards/format_reward": 0.9897959232330322, "step": 6250 }, { "completion_length": 222.89795684814453, "epoch": 0.6290314465408805, "grad_norm": 0.5420014262199402, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8348130583763123, "reward_std": 0.124970693141222, "rewards/accuracy_reward": 0.8450171947479248, "rewards/format_reward": 0.9897959232330322, "step": 6251 }, { "completion_length": 194.9693832397461, "epoch": 0.6291320754716981, "grad_norm": 1.3132411241531372, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7875850200653076, "reward_std": 0.172259159386158, "rewards/accuracy_reward": 0.8079932034015656, "rewards/format_reward": 0.9795918464660645, "step": 6252 }, { "completion_length": 244.99999237060547, "epoch": 0.6292327044025158, "grad_norm": 0.6919333934783936, "kl": 0.1044921875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7300359010696411, "reward_std": 0.21038948744535446, "rewards/accuracy_reward": 0.750444084405899, "rewards/format_reward": 0.9795918166637421, "step": 6253 }, { "completion_length": 257.8775405883789, "epoch": 0.6293333333333333, "grad_norm": 2.4816508293151855, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7639852166175842, "reward_std": 0.2734082564711571, "rewards/accuracy_reward": 0.8048016428947449, "rewards/format_reward": 0.9591836631298065, "step": 6254 }, { "completion_length": 257.7550964355469, "epoch": 0.6294339622641509, "grad_norm": 0.5383927226066589, "kl": 0.0821533203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8129359483718872, "reward_std": 0.21864968538284302, "rewards/accuracy_reward": 0.8537521958351135, "rewards/format_reward": 0.9591836631298065, "step": 6255 }, { "completion_length": 314.4795837402344, "epoch": 0.6295345911949686, "grad_norm": 0.5723728537559509, "kl": 0.098388671875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.5731168389320374, "reward_std": 0.1961914263665676, "rewards/accuracy_reward": 0.6445455253124237, "rewards/format_reward": 0.9285714328289032, "step": 6256 }, { "completion_length": 223.30611419677734, "epoch": 0.6296352201257862, "grad_norm": 0.5205971598625183, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6330474615097046, "reward_std": 0.2001720666885376, "rewards/accuracy_reward": 0.6738637387752533, "rewards/format_reward": 0.9591836333274841, "step": 6257 }, { "completion_length": 258.5102005004883, "epoch": 0.6297358490566037, "grad_norm": 0.794258177280426, "kl": 0.122802734375, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.7515339255332947, "reward_std": 0.20658541470766068, "rewards/accuracy_reward": 0.782146155834198, "rewards/format_reward": 0.9693877398967743, "step": 6258 }, { "completion_length": 315.8061218261719, "epoch": 0.6298364779874214, "grad_norm": 5.896450519561768, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7452425360679626, "reward_std": 0.20509828627109528, "rewards/accuracy_reward": 0.7758547067642212, "rewards/format_reward": 0.9693877398967743, "step": 6259 }, { "completion_length": 247.8877410888672, "epoch": 0.629937106918239, "grad_norm": 1.084134817123413, "kl": 0.101806640625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7466912865638733, "reward_std": 0.3021989017724991, "rewards/accuracy_reward": 0.787507563829422, "rewards/format_reward": 0.9591836333274841, "step": 6260 }, { "completion_length": 209.54080963134766, "epoch": 0.6300377358490566, "grad_norm": 0.9691505432128906, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8269943594932556, "reward_std": 0.13669894263148308, "rewards/accuracy_reward": 0.8269944190979004, "rewards/format_reward": 1.0, "step": 6261 }, { "completion_length": 174.41836547851562, "epoch": 0.6301383647798742, "grad_norm": 0.5047475695610046, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8025009036064148, "reward_std": 0.0665961429476738, "rewards/accuracy_reward": 0.8229090571403503, "rewards/format_reward": 0.9795918166637421, "step": 6262 }, { "completion_length": 250.09182739257812, "epoch": 0.6302389937106918, "grad_norm": 1.1258807182312012, "kl": 0.0694580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8575551509857178, "reward_std": 0.2273348867893219, "rewards/accuracy_reward": 0.8779633045196533, "rewards/format_reward": 0.9795918166637421, "step": 6263 }, { "completion_length": 339.5714111328125, "epoch": 0.6303396226415094, "grad_norm": 0.5068485140800476, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7199206352233887, "reward_std": 0.21917413175106049, "rewards/accuracy_reward": 0.7505328953266144, "rewards/format_reward": 0.9693877398967743, "step": 6264 }, { "completion_length": 251.44896697998047, "epoch": 0.6304402515723271, "grad_norm": 1.5365495681762695, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7871962785720825, "reward_std": 0.20384668558835983, "rewards/accuracy_reward": 0.8382167220115662, "rewards/format_reward": 0.9489795863628387, "step": 6265 }, { "completion_length": 247.17346954345703, "epoch": 0.6305408805031446, "grad_norm": 1.3981090784072876, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7161321640014648, "reward_std": 0.19031555205583572, "rewards/accuracy_reward": 0.736540287733078, "rewards/format_reward": 0.9795918166637421, "step": 6266 }, { "completion_length": 295.7653045654297, "epoch": 0.6306415094339622, "grad_norm": 0.6367557048797607, "kl": 0.0615234375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7935941219329834, "reward_std": 0.278762087225914, "rewards/accuracy_reward": 0.814002275466919, "rewards/format_reward": 0.9795918166637421, "step": 6267 }, { "completion_length": 209.23468780517578, "epoch": 0.6307421383647799, "grad_norm": 0.5742791891098022, "kl": 0.102294921875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7064051032066345, "reward_std": 0.13568277657032013, "rewards/accuracy_reward": 0.7166092395782471, "rewards/format_reward": 0.9897959232330322, "step": 6268 }, { "completion_length": 235.29590606689453, "epoch": 0.6308427672955975, "grad_norm": 1.5082894563674927, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.810314953327179, "reward_std": 0.11552035808563232, "rewards/accuracy_reward": 0.8205191195011139, "rewards/format_reward": 0.9897959232330322, "step": 6269 }, { "completion_length": 234.33673095703125, "epoch": 0.630943396226415, "grad_norm": 0.8687869310379028, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8342621326446533, "reward_std": 0.17124946415424347, "rewards/accuracy_reward": 0.8648744225502014, "rewards/format_reward": 0.9693877398967743, "step": 6270 }, { "completion_length": 219.38774871826172, "epoch": 0.6310440251572327, "grad_norm": 1.6776106357574463, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8231292963027954, "reward_std": 0.14398645609617233, "rewards/accuracy_reward": 0.8231292366981506, "rewards/format_reward": 1.0, "step": 6271 }, { "completion_length": 241.48979949951172, "epoch": 0.6311446540880503, "grad_norm": 0.38949230313301086, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.706060528755188, "reward_std": 0.10756934806704521, "rewards/accuracy_reward": 0.7060605883598328, "rewards/format_reward": 1.0, "step": 6272 }, { "completion_length": 164.15306091308594, "epoch": 0.631245283018868, "grad_norm": 2.9013373851776123, "kl": 0.091552734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.870262324810028, "reward_std": 0.11145899817347527, "rewards/accuracy_reward": 0.890670508146286, "rewards/format_reward": 0.9795918166637421, "step": 6273 }, { "completion_length": 287.1428527832031, "epoch": 0.6313459119496856, "grad_norm": 0.8395682573318481, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8334522247314453, "reward_std": 0.22502310574054718, "rewards/accuracy_reward": 0.8640645146369934, "rewards/format_reward": 0.9693877398967743, "step": 6274 }, { "completion_length": 179.2346954345703, "epoch": 0.6314465408805031, "grad_norm": 2.717135429382324, "kl": 0.089111328125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8338692784309387, "reward_std": 0.12022528052330017, "rewards/accuracy_reward": 0.8440732955932617, "rewards/format_reward": 0.9897959232330322, "step": 6275 }, { "completion_length": 275.8673400878906, "epoch": 0.6315471698113208, "grad_norm": 1.0928444862365723, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.768311321735382, "reward_std": 0.1988542377948761, "rewards/accuracy_reward": 0.7989236414432526, "rewards/format_reward": 0.9693877398967743, "step": 6276 }, { "completion_length": 267.3877487182617, "epoch": 0.6316477987421384, "grad_norm": 0.8171178698539734, "kl": 0.111572265625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7811585068702698, "reward_std": 0.23981840163469315, "rewards/accuracy_reward": 0.8219748437404633, "rewards/format_reward": 0.9591836333274841, "step": 6277 }, { "completion_length": 235.99999237060547, "epoch": 0.631748427672956, "grad_norm": 0.991531491279602, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8433372378349304, "reward_std": 0.11593920364975929, "rewards/accuracy_reward": 0.8433372974395752, "rewards/format_reward": 1.0, "step": 6278 }, { "completion_length": 253.65306091308594, "epoch": 0.6318490566037736, "grad_norm": 0.7181685566902161, "kl": 0.0657958984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6331526041030884, "reward_std": 0.17569660395383835, "rewards/accuracy_reward": 0.6535607278347015, "rewards/format_reward": 0.9795918166637421, "step": 6279 }, { "completion_length": 215.3571319580078, "epoch": 0.6319496855345912, "grad_norm": 0.5991817116737366, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.763115108013153, "reward_std": 0.0888976939022541, "rewards/accuracy_reward": 0.7835232317447662, "rewards/format_reward": 0.9795918166637421, "step": 6280 }, { "completion_length": 299.94896697998047, "epoch": 0.6320503144654088, "grad_norm": 0.9684491753578186, "kl": 0.0789794921875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7147716283798218, "reward_std": 0.22037795931100845, "rewards/accuracy_reward": 0.7351797819137573, "rewards/format_reward": 0.9795918166637421, "step": 6281 }, { "completion_length": 229.93877410888672, "epoch": 0.6321509433962265, "grad_norm": 0.7754892706871033, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7979590892791748, "reward_std": 0.11190363392233849, "rewards/accuracy_reward": 0.8081632554531097, "rewards/format_reward": 0.9897959232330322, "step": 6282 }, { "completion_length": 256.0, "epoch": 0.632251572327044, "grad_norm": 0.9025571942329407, "kl": 0.114501953125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.7575315237045288, "reward_std": 0.18795495480298996, "rewards/accuracy_reward": 0.7881438136100769, "rewards/format_reward": 0.9693877398967743, "step": 6283 }, { "completion_length": 173.84693908691406, "epoch": 0.6323522012578616, "grad_norm": 1.2621991634368896, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.9657434225082397, "reward_std": 0.09063433669507504, "rewards/accuracy_reward": 0.9759474992752075, "rewards/format_reward": 0.9897959232330322, "step": 6284 }, { "completion_length": 227.15306091308594, "epoch": 0.6324528301886793, "grad_norm": 0.28104549646377563, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.9525665044784546, "reward_std": 0.013698311522603035, "rewards/accuracy_reward": 0.9525664746761322, "rewards/format_reward": 1.0, "step": 6285 }, { "completion_length": 273.31632232666016, "epoch": 0.6325534591194969, "grad_norm": 0.601249098777771, "kl": 0.0516357421875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.759520709514618, "reward_std": 0.24159547686576843, "rewards/accuracy_reward": 0.7697249054908752, "rewards/format_reward": 0.9897959232330322, "step": 6286 }, { "completion_length": 245.09183502197266, "epoch": 0.6326540880503144, "grad_norm": 1.34843111038208, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.799546480178833, "reward_std": 0.17655355483293533, "rewards/accuracy_reward": 0.8403628170490265, "rewards/format_reward": 0.9591836333274841, "step": 6287 }, { "completion_length": 326.1938781738281, "epoch": 0.6327547169811321, "grad_norm": 0.4978955388069153, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6280044317245483, "reward_std": 0.17029961198568344, "rewards/accuracy_reward": 0.6586167812347412, "rewards/format_reward": 0.9693877398967743, "step": 6288 }, { "completion_length": 225.70407104492188, "epoch": 0.6328553459119497, "grad_norm": 0.7190239429473877, "kl": 0.1005859375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7150468826293945, "reward_std": 0.133608378469944, "rewards/accuracy_reward": 0.7354551255702972, "rewards/format_reward": 0.9795918166637421, "step": 6289 }, { "completion_length": 265.85713958740234, "epoch": 0.6329559748427673, "grad_norm": 0.901599109172821, "kl": 0.0826416015625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.9033395648002625, "reward_std": 0.16434891894459724, "rewards/accuracy_reward": 0.9237476289272308, "rewards/format_reward": 0.9795918464660645, "step": 6290 }, { "completion_length": 258.63265228271484, "epoch": 0.6330566037735849, "grad_norm": 0.6470006108283997, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6584061980247498, "reward_std": 0.16778474301099777, "rewards/accuracy_reward": 0.6584062278270721, "rewards/format_reward": 1.0, "step": 6291 }, { "completion_length": 233.84693908691406, "epoch": 0.6331572327044025, "grad_norm": 0.9357128143310547, "kl": 0.11279296875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.9387754201889038, "reward_std": 0.11917255446314812, "rewards/accuracy_reward": 0.9591836631298065, "rewards/format_reward": 0.9795918166637421, "step": 6292 }, { "completion_length": 205.01020050048828, "epoch": 0.6332578616352201, "grad_norm": 1.325598120689392, "kl": 0.091552734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.707967460155487, "reward_std": 0.18848565220832825, "rewards/accuracy_reward": 0.7079674005508423, "rewards/format_reward": 1.0, "step": 6293 }, { "completion_length": 179.9081573486328, "epoch": 0.6333584905660378, "grad_norm": 0.8014036417007446, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.811723530292511, "reward_std": 0.08541209250688553, "rewards/accuracy_reward": 0.8117235600948334, "rewards/format_reward": 1.0, "step": 6294 }, { "completion_length": 206.60204315185547, "epoch": 0.6334591194968553, "grad_norm": 0.6723420023918152, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.877551019191742, "reward_std": 0.08884849399328232, "rewards/accuracy_reward": 0.8877550959587097, "rewards/format_reward": 0.9897959232330322, "step": 6295 }, { "completion_length": 257.0408020019531, "epoch": 0.6335597484276729, "grad_norm": 1.1265349388122559, "kl": 0.0587158203125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8272594213485718, "reward_std": 0.15833256021142006, "rewards/accuracy_reward": 0.8578716814517975, "rewards/format_reward": 0.9693877398967743, "step": 6296 }, { "completion_length": 286.9387664794922, "epoch": 0.6336603773584906, "grad_norm": 1.3643561601638794, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.712291181087494, "reward_std": 0.18883835896849632, "rewards/accuracy_reward": 0.7326994240283966, "rewards/format_reward": 0.9795918166637421, "step": 6297 }, { "completion_length": 191.9591827392578, "epoch": 0.6337610062893082, "grad_norm": 0.7141749858856201, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8668610453605652, "reward_std": 0.132558673620224, "rewards/accuracy_reward": 0.8872691690921783, "rewards/format_reward": 0.9795918166637421, "step": 6298 }, { "completion_length": 176.448974609375, "epoch": 0.6338616352201258, "grad_norm": 1.0282716751098633, "kl": 0.09423828125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8998762369155884, "reward_std": 0.1843697726726532, "rewards/accuracy_reward": 0.9202844202518463, "rewards/format_reward": 0.9795918166637421, "step": 6299 }, { "completion_length": 182.82652282714844, "epoch": 0.6339622641509434, "grad_norm": 0.49378064274787903, "kl": 0.09765625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7115424871444702, "reward_std": 0.11022086394950747, "rewards/accuracy_reward": 0.7319506406784058, "rewards/format_reward": 0.9795918166637421, "step": 6300 }, { "completion_length": 269.8367233276367, "epoch": 0.634062893081761, "grad_norm": 1.3917378187179565, "kl": 0.09375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7682571411132812, "reward_std": 0.21563326939940453, "rewards/accuracy_reward": 0.7988693714141846, "rewards/format_reward": 0.9693877398967743, "step": 6301 }, { "completion_length": 276.8571319580078, "epoch": 0.6341635220125786, "grad_norm": 1.1386051177978516, "kl": 0.057373046875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7929081320762634, "reward_std": 0.15195056796073914, "rewards/accuracy_reward": 0.8235204815864563, "rewards/format_reward": 0.9693877398967743, "step": 6302 }, { "completion_length": 198.5408172607422, "epoch": 0.6342641509433963, "grad_norm": 0.8551666736602783, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.787523090839386, "reward_std": 0.12784013524651527, "rewards/accuracy_reward": 0.7977272570133209, "rewards/format_reward": 0.9897959232330322, "step": 6303 }, { "completion_length": 272.61224365234375, "epoch": 0.6343647798742138, "grad_norm": 1.1893662214279175, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6554169654846191, "reward_std": 0.2189578413963318, "rewards/accuracy_reward": 0.6758251637220383, "rewards/format_reward": 0.9795918166637421, "step": 6304 }, { "completion_length": 222.05101013183594, "epoch": 0.6344654088050314, "grad_norm": 0.5037321448326111, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.666771948337555, "reward_std": 0.1100449487566948, "rewards/accuracy_reward": 0.6871800720691681, "rewards/format_reward": 0.9795918166637421, "step": 6305 }, { "completion_length": 277.448974609375, "epoch": 0.6345660377358491, "grad_norm": 0.4442984461784363, "kl": 0.0546875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8799519538879395, "reward_std": 0.15336519479751587, "rewards/accuracy_reward": 0.9105642139911652, "rewards/format_reward": 0.9693877398967743, "step": 6306 }, { "completion_length": 249.77549743652344, "epoch": 0.6346666666666667, "grad_norm": 0.48915332555770874, "kl": 0.0787353515625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8547813296318054, "reward_std": 0.14333830773830414, "rewards/accuracy_reward": 0.8853935599327087, "rewards/format_reward": 0.9693877398967743, "step": 6307 }, { "completion_length": 202.38775634765625, "epoch": 0.6347672955974842, "grad_norm": 0.8000394105911255, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8228200674057007, "reward_std": 0.12071944773197174, "rewards/accuracy_reward": 0.8228200376033783, "rewards/format_reward": 1.0, "step": 6308 }, { "completion_length": 216.30611419677734, "epoch": 0.6348679245283019, "grad_norm": 0.8873796463012695, "kl": 0.0606689453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.753644347190857, "reward_std": 0.10035287216305733, "rewards/accuracy_reward": 0.7638483643531799, "rewards/format_reward": 0.9897959232330322, "step": 6309 }, { "completion_length": 238.1836700439453, "epoch": 0.6349685534591195, "grad_norm": 0.7631006836891174, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.880868375301361, "reward_std": 0.13301536440849304, "rewards/accuracy_reward": 0.8910725116729736, "rewards/format_reward": 0.9897959232330322, "step": 6310 }, { "completion_length": 276.8571319580078, "epoch": 0.6350691823899371, "grad_norm": 0.7250856757164001, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8316327333450317, "reward_std": 0.059919172897934914, "rewards/accuracy_reward": 0.84183669090271, "rewards/format_reward": 0.9897959232330322, "step": 6311 }, { "completion_length": 238.59183502197266, "epoch": 0.6351698113207547, "grad_norm": 0.5788034796714783, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7938774824142456, "reward_std": 0.1295878179371357, "rewards/accuracy_reward": 0.8040815889835358, "rewards/format_reward": 0.9897959232330322, "step": 6312 }, { "completion_length": 240.57142639160156, "epoch": 0.6352704402515723, "grad_norm": 0.9251255393028259, "kl": 0.110107421875, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8617368340492249, "reward_std": 0.13632016256451607, "rewards/accuracy_reward": 0.871940940618515, "rewards/format_reward": 0.9897959232330322, "step": 6313 }, { "completion_length": 278.0204086303711, "epoch": 0.63537106918239, "grad_norm": 0.8477271199226379, "kl": 0.09912109375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.638950228691101, "reward_std": 0.2961210608482361, "rewards/accuracy_reward": 0.6695624887943268, "rewards/format_reward": 0.9693877398967743, "step": 6314 }, { "completion_length": 326.3367233276367, "epoch": 0.6354716981132076, "grad_norm": 0.6147031188011169, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7960832118988037, "reward_std": 0.2366950884461403, "rewards/accuracy_reward": 0.836899608373642, "rewards/format_reward": 0.9591836333274841, "step": 6315 }, { "completion_length": 328.0408020019531, "epoch": 0.6355723270440251, "grad_norm": 0.5271419286727905, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7907860279083252, "reward_std": 0.10719754174351692, "rewards/accuracy_reward": 0.8009901642799377, "rewards/format_reward": 0.9897959232330322, "step": 6316 }, { "completion_length": 253.88774871826172, "epoch": 0.6356729559748427, "grad_norm": 0.8637370467185974, "kl": 0.103515625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7209647297859192, "reward_std": 0.17755411565303802, "rewards/accuracy_reward": 0.7311688363552094, "rewards/format_reward": 0.9897959232330322, "step": 6317 }, { "completion_length": 278.8571472167969, "epoch": 0.6357735849056604, "grad_norm": 0.6558905243873596, "kl": 0.0595703125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8897080421447754, "reward_std": 0.18063940107822418, "rewards/accuracy_reward": 0.8999121487140656, "rewards/format_reward": 0.9897959232330322, "step": 6318 }, { "completion_length": 265.40816497802734, "epoch": 0.635874213836478, "grad_norm": 1.0068089962005615, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7222398519515991, "reward_std": 0.15335090458393097, "rewards/accuracy_reward": 0.7324439585208893, "rewards/format_reward": 0.9897959232330322, "step": 6319 }, { "completion_length": 265.04080963134766, "epoch": 0.6359748427672955, "grad_norm": 1.1470131874084473, "kl": 0.0797119140625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6185661554336548, "reward_std": 0.2109791785478592, "rewards/accuracy_reward": 0.6287702322006226, "rewards/format_reward": 0.9897959232330322, "step": 6320 }, { "completion_length": 328.08162689208984, "epoch": 0.6360754716981132, "grad_norm": 0.5136885643005371, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8306906819343567, "reward_std": 0.1610138602554798, "rewards/accuracy_reward": 0.8510990142822266, "rewards/format_reward": 0.9795918166637421, "step": 6321 }, { "completion_length": 257.9183578491211, "epoch": 0.6361761006289308, "grad_norm": 0.6271573305130005, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8348499536514282, "reward_std": 0.17103884369134903, "rewards/accuracy_reward": 0.8450540602207184, "rewards/format_reward": 0.9897959232330322, "step": 6322 }, { "completion_length": 335.24488830566406, "epoch": 0.6362767295597485, "grad_norm": 0.7024686932563782, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.726237952709198, "reward_std": 0.16108747199177742, "rewards/accuracy_reward": 0.7262379229068756, "rewards/format_reward": 1.0, "step": 6323 }, { "completion_length": 307.17347717285156, "epoch": 0.6363773584905661, "grad_norm": 0.9321584105491638, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.707583725452423, "reward_std": 0.18657580763101578, "rewards/accuracy_reward": 0.727991908788681, "rewards/format_reward": 0.9795918464660645, "step": 6324 }, { "completion_length": 231.59183502197266, "epoch": 0.6364779874213836, "grad_norm": 1.1481854915618896, "kl": 0.10693359375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.8386260271072388, "reward_std": 0.1857355758547783, "rewards/accuracy_reward": 0.8488301634788513, "rewards/format_reward": 0.9897959232330322, "step": 6325 }, { "completion_length": 275.07142639160156, "epoch": 0.6365786163522013, "grad_norm": 2.1942245960235596, "kl": 0.12060546875, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.7653060555458069, "reward_std": 0.18655496090650558, "rewards/accuracy_reward": 0.7857142686843872, "rewards/format_reward": 0.9795918166637421, "step": 6326 }, { "completion_length": 314.88775634765625, "epoch": 0.6366792452830189, "grad_norm": 0.6812071204185486, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6039106249809265, "reward_std": 0.19404520094394684, "rewards/accuracy_reward": 0.6141147315502167, "rewards/format_reward": 0.9897959232330322, "step": 6327 }, { "completion_length": 235.89795684814453, "epoch": 0.6367798742138365, "grad_norm": 0.5454522967338562, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.9285714030265808, "reward_std": 0.08884849399328232, "rewards/accuracy_reward": 0.9285714328289032, "rewards/format_reward": 1.0, "step": 6328 }, { "completion_length": 222.44896697998047, "epoch": 0.636880503144654, "grad_norm": 0.692807137966156, "kl": 0.098876953125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.9057166576385498, "reward_std": 0.15516456961631775, "rewards/accuracy_reward": 0.9159208238124847, "rewards/format_reward": 0.9897959232330322, "step": 6329 }, { "completion_length": 255.77549743652344, "epoch": 0.6369811320754717, "grad_norm": 0.6511245965957642, "kl": 0.10400390625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.6511971950531006, "reward_std": 0.1505654901266098, "rewards/accuracy_reward": 0.6614013612270355, "rewards/format_reward": 0.9897959232330322, "step": 6330 }, { "completion_length": 290.2346954345703, "epoch": 0.6370817610062893, "grad_norm": 0.4589141607284546, "kl": 0.1025390625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6687546968460083, "reward_std": 0.1559756062924862, "rewards/accuracy_reward": 0.6789587140083313, "rewards/format_reward": 0.9897959232330322, "step": 6331 }, { "completion_length": 284.6530532836914, "epoch": 0.637182389937107, "grad_norm": 0.8588778972625732, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7414966225624084, "reward_std": 0.15457381308078766, "rewards/accuracy_reward": 0.7517006695270538, "rewards/format_reward": 0.9897959232330322, "step": 6332 }, { "completion_length": 251.56121826171875, "epoch": 0.6372830188679245, "grad_norm": 1.4810134172439575, "kl": 0.096923828125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8557897210121155, "reward_std": 0.1182529367506504, "rewards/accuracy_reward": 0.8557898998260498, "rewards/format_reward": 1.0, "step": 6333 }, { "completion_length": 313.28570556640625, "epoch": 0.6373836477987421, "grad_norm": 0.5388757586479187, "kl": 0.10205078125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7012715339660645, "reward_std": 0.17458901554346085, "rewards/accuracy_reward": 0.7012714147567749, "rewards/format_reward": 1.0, "step": 6334 }, { "completion_length": 216.86734008789062, "epoch": 0.6374842767295598, "grad_norm": 0.39952903985977173, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8775509595870972, "reward_std": 0.09670460596680641, "rewards/accuracy_reward": 0.8775509893894196, "rewards/format_reward": 1.0, "step": 6335 }, { "completion_length": 270.9591827392578, "epoch": 0.6375849056603774, "grad_norm": 1.1379224061965942, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7418285012245178, "reward_std": 0.23571527749300003, "rewards/accuracy_reward": 0.7724407911300659, "rewards/format_reward": 0.9693877398967743, "step": 6336 }, { "completion_length": 275.7755126953125, "epoch": 0.6376855345911949, "grad_norm": 1.8705741167068481, "kl": 0.0789794921875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6956267356872559, "reward_std": 0.26211413741111755, "rewards/accuracy_reward": 0.7262390553951263, "rewards/format_reward": 0.9693877398967743, "step": 6337 }, { "completion_length": 295.7449035644531, "epoch": 0.6377861635220126, "grad_norm": 0.8932895660400391, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6482945680618286, "reward_std": 0.20596421509981155, "rewards/accuracy_reward": 0.658498615026474, "rewards/format_reward": 0.9897959232330322, "step": 6338 }, { "completion_length": 270.55101013183594, "epoch": 0.6378867924528302, "grad_norm": 1.2738821506500244, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7463408708572388, "reward_std": 0.16246122866868973, "rewards/accuracy_reward": 0.7565450370311737, "rewards/format_reward": 0.9897959232330322, "step": 6339 }, { "completion_length": 228.6734619140625, "epoch": 0.6379874213836478, "grad_norm": 0.5977081060409546, "kl": 0.126953125, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.6327469944953918, "reward_std": 0.1083941850811243, "rewards/accuracy_reward": 0.6327469944953918, "rewards/format_reward": 1.0, "step": 6340 }, { "completion_length": 310.63265228271484, "epoch": 0.6380880503144654, "grad_norm": 1.2483428716659546, "kl": 0.0655517578125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7240492701530457, "reward_std": 0.26479000598192215, "rewards/accuracy_reward": 0.7648655772209167, "rewards/format_reward": 0.9591836333274841, "step": 6341 }, { "completion_length": 260.6632614135742, "epoch": 0.638188679245283, "grad_norm": 0.7024317383766174, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7134199142456055, "reward_std": 0.1936669424176216, "rewards/accuracy_reward": 0.7134198844432831, "rewards/format_reward": 1.0, "step": 6342 }, { "completion_length": 312.846923828125, "epoch": 0.6382893081761006, "grad_norm": 1.1210354566574097, "kl": 0.0697021484375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.822157382965088, "reward_std": 0.12741629127413034, "rewards/accuracy_reward": 0.8323614597320557, "rewards/format_reward": 0.9897959232330322, "step": 6343 }, { "completion_length": 336.10203552246094, "epoch": 0.6383899371069183, "grad_norm": 0.7884590029716492, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7013810276985168, "reward_std": 0.2192179411649704, "rewards/accuracy_reward": 0.7217892110347748, "rewards/format_reward": 0.9795918464660645, "step": 6344 }, { "completion_length": 257.5408020019531, "epoch": 0.6384905660377358, "grad_norm": 0.7395963072776794, "kl": 0.0677490234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7259474396705627, "reward_std": 0.18668635189533234, "rewards/accuracy_reward": 0.7259474992752075, "rewards/format_reward": 1.0, "step": 6345 }, { "completion_length": 307.5408172607422, "epoch": 0.6385911949685534, "grad_norm": 1.0406371355056763, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.673469364643097, "reward_std": 0.18887971341609955, "rewards/accuracy_reward": 0.6938775181770325, "rewards/format_reward": 0.9795918464660645, "step": 6346 }, { "completion_length": 434.2550964355469, "epoch": 0.6386918238993711, "grad_norm": 0.5828366875648499, "kl": 0.0662841796875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5956916213035583, "reward_std": 0.22134914994239807, "rewards/accuracy_reward": 0.6263039112091064, "rewards/format_reward": 0.9693877398967743, "step": 6347 }, { "completion_length": 256.0612258911133, "epoch": 0.6387924528301887, "grad_norm": 0.865968644618988, "kl": 0.100341796875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6860853433609009, "reward_std": 0.262533001601696, "rewards/accuracy_reward": 0.7166976034641266, "rewards/format_reward": 0.9693877398967743, "step": 6348 }, { "completion_length": 238.87754821777344, "epoch": 0.6388930817610063, "grad_norm": 0.881745457649231, "kl": 0.0986328125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6718144416809082, "reward_std": 0.15596619993448257, "rewards/accuracy_reward": 0.6820184588432312, "rewards/format_reward": 0.9897959232330322, "step": 6349 }, { "completion_length": 235.80612182617188, "epoch": 0.6389937106918239, "grad_norm": 0.9318445920944214, "kl": 0.052978515625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.6423712372779846, "reward_std": 0.21514929085969925, "rewards/accuracy_reward": 0.6729834675788879, "rewards/format_reward": 0.9693877398967743, "step": 6350 }, { "completion_length": 344.1836700439453, "epoch": 0.6390943396226415, "grad_norm": 0.5929917693138123, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6350326538085938, "reward_std": 0.23769991844892502, "rewards/accuracy_reward": 0.6656450629234314, "rewards/format_reward": 0.9693877398967743, "step": 6351 }, { "completion_length": 223.88774871826172, "epoch": 0.6391949685534591, "grad_norm": 0.7387136816978455, "kl": 0.10498046875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7624613642692566, "reward_std": 0.17971771582961082, "rewards/accuracy_reward": 0.7828695178031921, "rewards/format_reward": 0.9795918166637421, "step": 6352 }, { "completion_length": 178.30611419677734, "epoch": 0.6392955974842768, "grad_norm": 1.5370204448699951, "kl": 0.10009765625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7398709654808044, "reward_std": 0.2879873216152191, "rewards/accuracy_reward": 0.7602790594100952, "rewards/format_reward": 0.9795918464660645, "step": 6353 }, { "completion_length": 187.28571319580078, "epoch": 0.6393962264150943, "grad_norm": 1.2538323402404785, "kl": 0.097412109375, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7934635877609253, "reward_std": 0.11430306173861027, "rewards/accuracy_reward": 0.8036677539348602, "rewards/format_reward": 0.9897959232330322, "step": 6354 }, { "completion_length": 198.1836700439453, "epoch": 0.6394968553459119, "grad_norm": 1.2570363283157349, "kl": 0.094970703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8469387292861938, "reward_std": 0.20802104473114014, "rewards/accuracy_reward": 0.8571428656578064, "rewards/format_reward": 0.9897959232330322, "step": 6355 }, { "completion_length": 314.0204086303711, "epoch": 0.6395974842767296, "grad_norm": 0.7222033739089966, "kl": 0.0762939453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7346659898757935, "reward_std": 0.1145280972123146, "rewards/accuracy_reward": 0.7448700964450836, "rewards/format_reward": 0.9897959232330322, "step": 6356 }, { "completion_length": 244.48978424072266, "epoch": 0.6396981132075472, "grad_norm": 124.07321166992188, "kl": 6.152587890625, "learning_rate": 1e-06, "loss": 0.2466, "reward": 1.7588125467300415, "reward_std": 0.19389083981513977, "rewards/accuracy_reward": 0.7792207598686218, "rewards/format_reward": 0.9795918166637421, "step": 6357 }, { "completion_length": 208.89794921875, "epoch": 0.6397987421383647, "grad_norm": 0.6902351379394531, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8213374614715576, "reward_std": 0.10473252832889557, "rewards/accuracy_reward": 0.8315415382385254, "rewards/format_reward": 0.9897959232330322, "step": 6358 }, { "completion_length": 294.32652282714844, "epoch": 0.6398993710691824, "grad_norm": 0.7548151612281799, "kl": 0.089111328125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.721622884273529, "reward_std": 0.1360358614474535, "rewards/accuracy_reward": 0.7318270206451416, "rewards/format_reward": 0.9897959232330322, "step": 6359 }, { "completion_length": 213.67346954345703, "epoch": 0.64, "grad_norm": 0.7870808839797974, "kl": 0.1025390625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7663213610649109, "reward_std": 0.08887943252921104, "rewards/accuracy_reward": 0.7765254080295563, "rewards/format_reward": 0.9897959232330322, "step": 6360 }, { "completion_length": 212.4183578491211, "epoch": 0.6401006289308177, "grad_norm": 1.2069950103759766, "kl": 0.09814453125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8664332032203674, "reward_std": 0.2170662134885788, "rewards/accuracy_reward": 0.8868414163589478, "rewards/format_reward": 0.9795918464660645, "step": 6361 }, { "completion_length": 192.07142639160156, "epoch": 0.6402012578616352, "grad_norm": 0.5114263296127319, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7906705737113953, "reward_std": 0.11529674753546715, "rewards/accuracy_reward": 0.7906705439090729, "rewards/format_reward": 1.0, "step": 6362 }, { "completion_length": 225.0408172607422, "epoch": 0.6403018867924528, "grad_norm": 2.5067360401153564, "kl": 0.095703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8165938258171082, "reward_std": 0.19530795514583588, "rewards/accuracy_reward": 0.8267977833747864, "rewards/format_reward": 0.9897959232330322, "step": 6363 }, { "completion_length": 226.27550506591797, "epoch": 0.6404025157232704, "grad_norm": 1.1540355682373047, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.795918345451355, "reward_std": 0.15728024020791054, "rewards/accuracy_reward": 0.8265306055545807, "rewards/format_reward": 0.9693877398967743, "step": 6364 }, { "completion_length": 262.8163299560547, "epoch": 0.6405031446540881, "grad_norm": 0.5458514094352722, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8093382716178894, "reward_std": 0.15526384767144918, "rewards/accuracy_reward": 0.8195423781871796, "rewards/format_reward": 0.9897959232330322, "step": 6365 }, { "completion_length": 273.1428451538086, "epoch": 0.6406037735849056, "grad_norm": 0.5449805855751038, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8882410526275635, "reward_std": 0.0954190120100975, "rewards/accuracy_reward": 0.8882409930229187, "rewards/format_reward": 1.0, "step": 6366 }, { "completion_length": 263.1428527832031, "epoch": 0.6407044025157232, "grad_norm": 0.6129102110862732, "kl": 0.109130859375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7789189219474792, "reward_std": 0.12010776251554489, "rewards/accuracy_reward": 0.7891231179237366, "rewards/format_reward": 0.9897959232330322, "step": 6367 }, { "completion_length": 310.2346954345703, "epoch": 0.6408050314465409, "grad_norm": 0.7256045937538147, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7212974429130554, "reward_std": 0.1735493764281273, "rewards/accuracy_reward": 0.7315014898777008, "rewards/format_reward": 0.9897959232330322, "step": 6368 }, { "completion_length": 318.051025390625, "epoch": 0.6409056603773585, "grad_norm": 4.3577494621276855, "kl": 0.101318359375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6903778314590454, "reward_std": 0.17909271270036697, "rewards/accuracy_reward": 0.7107860147953033, "rewards/format_reward": 0.9795918166637421, "step": 6369 }, { "completion_length": 176.4081573486328, "epoch": 0.641006289308176, "grad_norm": 0.8321431875228882, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7320335507392883, "reward_std": 0.13046350702643394, "rewards/accuracy_reward": 0.7320334613323212, "rewards/format_reward": 1.0, "step": 6370 }, { "completion_length": 237.9387664794922, "epoch": 0.6411069182389937, "grad_norm": 0.39384061098098755, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8925170302391052, "reward_std": 0.07693550828844309, "rewards/accuracy_reward": 0.8925170004367828, "rewards/format_reward": 1.0, "step": 6371 }, { "completion_length": 210.64285278320312, "epoch": 0.6412075471698113, "grad_norm": 0.9669945240020752, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.866406798362732, "reward_std": 0.15291932970285416, "rewards/accuracy_reward": 0.8766108453273773, "rewards/format_reward": 0.9897959232330322, "step": 6372 }, { "completion_length": 248.51020050048828, "epoch": 0.641308176100629, "grad_norm": 0.9229684472084045, "kl": 0.11767578125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.8132652640342712, "reward_std": 0.17381005734205246, "rewards/accuracy_reward": 0.8336734473705292, "rewards/format_reward": 0.9795918166637421, "step": 6373 }, { "completion_length": 229.97958374023438, "epoch": 0.6414088050314466, "grad_norm": 0.7278345227241516, "kl": 0.0738525390625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7452262043952942, "reward_std": 0.12802886869758368, "rewards/accuracy_reward": 0.7554303705692291, "rewards/format_reward": 0.9897959232330322, "step": 6374 }, { "completion_length": 194.09183502197266, "epoch": 0.6415094339622641, "grad_norm": 0.9983042478561401, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8050622940063477, "reward_std": 0.19143402576446533, "rewards/accuracy_reward": 0.8254703879356384, "rewards/format_reward": 0.9795918464660645, "step": 6375 }, { "completion_length": 344.02040100097656, "epoch": 0.6416100628930818, "grad_norm": 0.5630277395248413, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6537166237831116, "reward_std": 0.26968781650066376, "rewards/accuracy_reward": 0.6741248071193695, "rewards/format_reward": 0.9795918166637421, "step": 6376 }, { "completion_length": 179.59183502197266, "epoch": 0.6417106918238994, "grad_norm": 1.2855157852172852, "kl": 0.116455078125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.8579931855201721, "reward_std": 0.16225114837288857, "rewards/accuracy_reward": 0.8579931855201721, "rewards/format_reward": 1.0, "step": 6377 }, { "completion_length": 230.2142791748047, "epoch": 0.641811320754717, "grad_norm": 1.9457974433898926, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8148040771484375, "reward_std": 0.20278601348400116, "rewards/accuracy_reward": 0.8352121412754059, "rewards/format_reward": 0.9795918166637421, "step": 6378 }, { "completion_length": 286.07142639160156, "epoch": 0.6419119496855346, "grad_norm": 1.052313208580017, "kl": 0.11279296875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.721993863582611, "reward_std": 0.08922970667481422, "rewards/accuracy_reward": 0.7219938635826111, "rewards/format_reward": 1.0, "step": 6379 }, { "completion_length": 254.6428451538086, "epoch": 0.6420125786163522, "grad_norm": 2.4348270893096924, "kl": 0.105712890625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.808277428150177, "reward_std": 0.23205973953008652, "rewards/accuracy_reward": 0.8286855816841125, "rewards/format_reward": 0.9795918166637421, "step": 6380 }, { "completion_length": 200.0714340209961, "epoch": 0.6421132075471698, "grad_norm": 0.9925779104232788, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.9190476536750793, "reward_std": 0.14961913228034973, "rewards/accuracy_reward": 0.9394557774066925, "rewards/format_reward": 0.9795918464660645, "step": 6381 }, { "completion_length": 340.15306091308594, "epoch": 0.6422138364779875, "grad_norm": 0.731145441532135, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7304245829582214, "reward_std": 0.07079045474529266, "rewards/accuracy_reward": 0.7304245829582214, "rewards/format_reward": 1.0, "step": 6382 }, { "completion_length": 325.9285583496094, "epoch": 0.642314465408805, "grad_norm": 0.3788386285305023, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.795918345451355, "reward_std": 0.14284341782331467, "rewards/accuracy_reward": 0.8265305757522583, "rewards/format_reward": 0.9693877398967743, "step": 6383 }, { "completion_length": 289.0102081298828, "epoch": 0.6424150943396226, "grad_norm": 0.6700594425201416, "kl": 0.0772705078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6382021307945251, "reward_std": 0.17290784418582916, "rewards/accuracy_reward": 0.6484062671661377, "rewards/format_reward": 0.9897959232330322, "step": 6384 }, { "completion_length": 234.2959213256836, "epoch": 0.6425157232704403, "grad_norm": 1.6278131008148193, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.830055594444275, "reward_std": 0.143303994089365, "rewards/accuracy_reward": 0.8402597308158875, "rewards/format_reward": 0.9897959232330322, "step": 6385 }, { "completion_length": 270.8367233276367, "epoch": 0.6426163522012579, "grad_norm": 1.086198091506958, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.863759994506836, "reward_std": 0.07801567483693361, "rewards/accuracy_reward": 0.8637600243091583, "rewards/format_reward": 1.0, "step": 6386 }, { "completion_length": 273.7550964355469, "epoch": 0.6427169811320754, "grad_norm": 0.6541959643363953, "kl": 0.102294921875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8795917630195618, "reward_std": 0.12271210551261902, "rewards/accuracy_reward": 0.8795918524265289, "rewards/format_reward": 1.0, "step": 6387 }, { "completion_length": 237.7448959350586, "epoch": 0.6428176100628931, "grad_norm": 0.39019855856895447, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7431641817092896, "reward_std": 0.12998705822974443, "rewards/accuracy_reward": 0.7431641817092896, "rewards/format_reward": 1.0, "step": 6388 }, { "completion_length": 217.86734008789062, "epoch": 0.6429182389937107, "grad_norm": 0.6584498286247253, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.770728588104248, "reward_std": 0.12067096680402756, "rewards/accuracy_reward": 0.7809326648712158, "rewards/format_reward": 0.9897959232330322, "step": 6389 }, { "completion_length": 293.3673400878906, "epoch": 0.6430188679245283, "grad_norm": 0.6766665577888489, "kl": 0.115234375, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7419922351837158, "reward_std": 0.11546853929758072, "rewards/accuracy_reward": 0.7521963119506836, "rewards/format_reward": 0.9897959232330322, "step": 6390 }, { "completion_length": 145.96939086914062, "epoch": 0.6431194968553459, "grad_norm": 0.41092702746391296, "kl": 0.0762939453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.836734652519226, "reward_std": 0.07303375005722046, "rewards/accuracy_reward": 0.8367346823215485, "rewards/format_reward": 1.0, "step": 6391 }, { "completion_length": 325.2449035644531, "epoch": 0.6432201257861635, "grad_norm": 0.7192663550376892, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6865925192832947, "reward_std": 0.266655758023262, "rewards/accuracy_reward": 0.7274088561534882, "rewards/format_reward": 0.9591836333274841, "step": 6392 }, { "completion_length": 196.66326141357422, "epoch": 0.6433207547169811, "grad_norm": 0.603727400302887, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7607534527778625, "reward_std": 0.06310386769473553, "rewards/accuracy_reward": 0.7607535123825073, "rewards/format_reward": 1.0, "step": 6393 }, { "completion_length": 187.86734008789062, "epoch": 0.6434213836477988, "grad_norm": 0.48972147703170776, "kl": 0.106201171875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.856988251209259, "reward_std": 0.06435130164027214, "rewards/accuracy_reward": 0.8773964047431946, "rewards/format_reward": 0.9795918464660645, "step": 6394 }, { "completion_length": 220.3571319580078, "epoch": 0.6435220125786163, "grad_norm": 0.9156787991523743, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7493615746498108, "reward_std": 0.1640089713037014, "rewards/accuracy_reward": 0.7595657110214233, "rewards/format_reward": 0.9897959232330322, "step": 6395 }, { "completion_length": 215.7346954345703, "epoch": 0.6436226415094339, "grad_norm": 0.528198778629303, "kl": 0.102294921875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.9169095754623413, "reward_std": 0.13410980254411697, "rewards/accuracy_reward": 0.947521835565567, "rewards/format_reward": 0.9693877398967743, "step": 6396 }, { "completion_length": 309.15306091308594, "epoch": 0.6437232704402516, "grad_norm": 0.5703982710838318, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6883612871170044, "reward_std": 0.0970882959663868, "rewards/accuracy_reward": 0.6883613765239716, "rewards/format_reward": 1.0, "step": 6397 }, { "completion_length": 209.43877410888672, "epoch": 0.6438238993710692, "grad_norm": 2.674140453338623, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.753744900226593, "reward_std": 0.07260525925084949, "rewards/accuracy_reward": 0.7537449896335602, "rewards/format_reward": 1.0, "step": 6398 }, { "completion_length": 325.4387741088867, "epoch": 0.6439245283018868, "grad_norm": 0.5810615420341492, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.60356867313385, "reward_std": 0.22172366827726364, "rewards/accuracy_reward": 0.6137727648019791, "rewards/format_reward": 0.9897959232330322, "step": 6399 }, { "completion_length": 268.4183578491211, "epoch": 0.6440251572327044, "grad_norm": 0.6850205063819885, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8081779479980469, "reward_std": 0.08972753398120403, "rewards/accuracy_reward": 0.8081779479980469, "rewards/format_reward": 1.0, "step": 6400 }, { "completion_length": 213.85713958740234, "epoch": 0.644125786163522, "grad_norm": 0.6350905299186707, "kl": 0.0841064453125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7110861539840698, "reward_std": 0.11044998839497566, "rewards/accuracy_reward": 0.7110861539840698, "rewards/format_reward": 1.0, "step": 6401 }, { "completion_length": 284.55101776123047, "epoch": 0.6442264150943396, "grad_norm": 0.4866100251674652, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.781358778476715, "reward_std": 0.12349788099527359, "rewards/accuracy_reward": 0.7915628850460052, "rewards/format_reward": 0.9897959232330322, "step": 6402 }, { "completion_length": 230.89794921875, "epoch": 0.6443270440251573, "grad_norm": 0.46787378191947937, "kl": 0.097412109375, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.9403790831565857, "reward_std": 0.07501243986189365, "rewards/accuracy_reward": 0.9403789937496185, "rewards/format_reward": 1.0, "step": 6403 }, { "completion_length": 288.8367233276367, "epoch": 0.6444276729559748, "grad_norm": 0.8503333926200867, "kl": 0.093017578125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6660927534103394, "reward_std": 0.20739784836769104, "rewards/accuracy_reward": 0.6762968897819519, "rewards/format_reward": 0.9897959232330322, "step": 6404 }, { "completion_length": 210.97958374023438, "epoch": 0.6445283018867924, "grad_norm": 0.632830798625946, "kl": 0.093017578125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8101114630699158, "reward_std": 0.10487237945199013, "rewards/accuracy_reward": 0.8203155696392059, "rewards/format_reward": 0.9897959232330322, "step": 6405 }, { "completion_length": 233.19387817382812, "epoch": 0.6446289308176101, "grad_norm": 1.0744208097457886, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8086844682693481, "reward_std": 0.19768879190087318, "rewards/accuracy_reward": 0.8290926814079285, "rewards/format_reward": 0.9795918166637421, "step": 6406 }, { "completion_length": 250.1020278930664, "epoch": 0.6447295597484277, "grad_norm": 0.6167861819267273, "kl": 0.0872802734375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8720359802246094, "reward_std": 0.15703540295362473, "rewards/accuracy_reward": 0.8822399973869324, "rewards/format_reward": 0.9897959232330322, "step": 6407 }, { "completion_length": 209.4693832397461, "epoch": 0.6448301886792452, "grad_norm": 0.5251312255859375, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.899999976158142, "reward_std": 0.11955629289150238, "rewards/accuracy_reward": 0.9102040827274323, "rewards/format_reward": 0.9897959232330322, "step": 6408 }, { "completion_length": 226.30612182617188, "epoch": 0.6449308176100629, "grad_norm": 1.3361858129501343, "kl": 0.09814453125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8705214262008667, "reward_std": 0.10625210404396057, "rewards/accuracy_reward": 0.8705214560031891, "rewards/format_reward": 1.0, "step": 6409 }, { "completion_length": 260.29591369628906, "epoch": 0.6450314465408805, "grad_norm": 1.1100118160247803, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8054885864257812, "reward_std": 0.1629699319601059, "rewards/accuracy_reward": 0.8054885566234589, "rewards/format_reward": 1.0, "step": 6410 }, { "completion_length": 273.80611419677734, "epoch": 0.6451320754716982, "grad_norm": 0.45259833335876465, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7778586745262146, "reward_std": 0.09188833087682724, "rewards/accuracy_reward": 0.7880628407001495, "rewards/format_reward": 0.9897959232330322, "step": 6411 }, { "completion_length": 220.62244415283203, "epoch": 0.6452327044025157, "grad_norm": 1.1136174201965332, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7888611555099487, "reward_std": 0.10238853469491005, "rewards/accuracy_reward": 0.7990652024745941, "rewards/format_reward": 0.9897959232330322, "step": 6412 }, { "completion_length": 210.80612182617188, "epoch": 0.6453333333333333, "grad_norm": 0.43473899364471436, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8769164085388184, "reward_std": 0.10419183224439621, "rewards/accuracy_reward": 0.8769164383411407, "rewards/format_reward": 1.0, "step": 6413 }, { "completion_length": 251.99999237060547, "epoch": 0.645433962264151, "grad_norm": 0.48998865485191345, "kl": 0.0684814453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7429718971252441, "reward_std": 0.1336149200797081, "rewards/accuracy_reward": 0.7735841274261475, "rewards/format_reward": 0.9693877398967743, "step": 6414 }, { "completion_length": 282.7346878051758, "epoch": 0.6455345911949686, "grad_norm": 0.5770426988601685, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8090908527374268, "reward_std": 0.14394830167293549, "rewards/accuracy_reward": 0.8397031426429749, "rewards/format_reward": 0.9693877398967743, "step": 6415 }, { "completion_length": 256.4387664794922, "epoch": 0.6456352201257861, "grad_norm": 0.8354294896125793, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.575773000717163, "reward_std": 0.15173307061195374, "rewards/accuracy_reward": 0.60638527572155, "rewards/format_reward": 0.9693877398967743, "step": 6416 }, { "completion_length": 270.3061218261719, "epoch": 0.6457358490566038, "grad_norm": 0.6203488111495972, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7112605571746826, "reward_std": 0.14529035985469818, "rewards/accuracy_reward": 0.7214646339416504, "rewards/format_reward": 0.9897959232330322, "step": 6417 }, { "completion_length": 169.14285278320312, "epoch": 0.6458364779874214, "grad_norm": 0.36679309606552124, "kl": 0.100830078125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.84225332736969, "reward_std": 0.09211403131484985, "rewards/accuracy_reward": 0.8524575531482697, "rewards/format_reward": 0.9897959232330322, "step": 6418 }, { "completion_length": 264.22447967529297, "epoch": 0.645937106918239, "grad_norm": 0.36607182025909424, "kl": 0.09765625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7747964262962341, "reward_std": 0.09065062925219536, "rewards/accuracy_reward": 0.7850005030632019, "rewards/format_reward": 0.9897959232330322, "step": 6419 }, { "completion_length": 195.4081573486328, "epoch": 0.6460377358490565, "grad_norm": 0.4639524519443512, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8206592202186584, "reward_std": 0.058938175439834595, "rewards/accuracy_reward": 0.8206592202186584, "rewards/format_reward": 1.0, "step": 6420 }, { "completion_length": 258.5408172607422, "epoch": 0.6461383647798742, "grad_norm": 0.8831503391265869, "kl": 0.0914306640625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6701393723487854, "reward_std": 0.14489243179559708, "rewards/accuracy_reward": 0.6803434789180756, "rewards/format_reward": 0.9897959232330322, "step": 6421 }, { "completion_length": 269.29591369628906, "epoch": 0.6462389937106918, "grad_norm": 0.40093594789505005, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8714285492897034, "reward_std": 0.07019339501857758, "rewards/accuracy_reward": 0.8714285492897034, "rewards/format_reward": 1.0, "step": 6422 }, { "completion_length": 251.9693832397461, "epoch": 0.6463396226415095, "grad_norm": 0.7995231747627258, "kl": 0.14306640625, "learning_rate": 1e-06, "loss": 0.0057, "reward": 1.7331528067588806, "reward_std": 0.1587376780807972, "rewards/accuracy_reward": 0.7331527471542358, "rewards/format_reward": 1.0, "step": 6423 }, { "completion_length": 258.27549743652344, "epoch": 0.6464402515723271, "grad_norm": 0.5790473818778992, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8228457570075989, "reward_std": 0.17090142518281937, "rewards/accuracy_reward": 0.8432539701461792, "rewards/format_reward": 0.9795918464660645, "step": 6424 }, { "completion_length": 360.24488830566406, "epoch": 0.6465408805031446, "grad_norm": 0.7387128472328186, "kl": 0.09130859375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6598625779151917, "reward_std": 0.18043962866067886, "rewards/accuracy_reward": 0.6598625779151917, "rewards/format_reward": 1.0, "step": 6425 }, { "completion_length": 250.22447967529297, "epoch": 0.6466415094339623, "grad_norm": 1.9590036869049072, "kl": 0.21337890625, "learning_rate": 1e-06, "loss": 0.0085, "reward": 1.6642239093780518, "reward_std": 0.1873643919825554, "rewards/accuracy_reward": 0.6744279265403748, "rewards/format_reward": 0.9897959232330322, "step": 6426 }, { "completion_length": 246.30612182617188, "epoch": 0.6467421383647799, "grad_norm": 1.046788215637207, "kl": 0.0767822265625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7648239135742188, "reward_std": 0.1888149380683899, "rewards/accuracy_reward": 0.7750279605388641, "rewards/format_reward": 0.9897959232330322, "step": 6427 }, { "completion_length": 203.86734771728516, "epoch": 0.6468427672955975, "grad_norm": 0.8146345615386963, "kl": 0.099853515625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.819557785987854, "reward_std": 0.12362705171108246, "rewards/accuracy_reward": 0.8297618925571442, "rewards/format_reward": 0.9897959232330322, "step": 6428 }, { "completion_length": 203.37754821777344, "epoch": 0.6469433962264151, "grad_norm": 1.5199410915374756, "kl": 0.065185546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.728664219379425, "reward_std": 0.1142568551003933, "rewards/accuracy_reward": 0.7286641597747803, "rewards/format_reward": 1.0, "step": 6429 }, { "completion_length": 283.62244415283203, "epoch": 0.6470440251572327, "grad_norm": 0.7620401382446289, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7374613285064697, "reward_std": 0.12058060988783836, "rewards/accuracy_reward": 0.7476654350757599, "rewards/format_reward": 0.9897959232330322, "step": 6430 }, { "completion_length": 279.07142639160156, "epoch": 0.6471446540880503, "grad_norm": 0.75578373670578, "kl": 0.10693359375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.5929322242736816, "reward_std": 0.1871037259697914, "rewards/accuracy_reward": 0.6031363010406494, "rewards/format_reward": 0.9897959232330322, "step": 6431 }, { "completion_length": 245.2551040649414, "epoch": 0.647245283018868, "grad_norm": 0.7976146340370178, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.887239694595337, "reward_std": 0.10542029328644276, "rewards/accuracy_reward": 0.8872397243976593, "rewards/format_reward": 1.0, "step": 6432 }, { "completion_length": 266.4795837402344, "epoch": 0.6473459119496855, "grad_norm": 1.5297685861587524, "kl": 0.099365234375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6820148229599, "reward_std": 0.1953427493572235, "rewards/accuracy_reward": 0.6820148527622223, "rewards/format_reward": 1.0, "step": 6433 }, { "completion_length": 217.28570556640625, "epoch": 0.6474465408805031, "grad_norm": 0.5029420852661133, "kl": 0.1072998046875, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7666008472442627, "reward_std": 0.07506525702774525, "rewards/accuracy_reward": 0.7666008770465851, "rewards/format_reward": 1.0, "step": 6434 }, { "completion_length": 213.9285659790039, "epoch": 0.6475471698113208, "grad_norm": 0.99466472864151, "kl": 0.10009765625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6782013177871704, "reward_std": 0.15051999688148499, "rewards/accuracy_reward": 0.6986095309257507, "rewards/format_reward": 0.9795918166637421, "step": 6435 }, { "completion_length": 245.81631469726562, "epoch": 0.6476477987421384, "grad_norm": 2.852888584136963, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.700215995311737, "reward_std": 0.1717616803944111, "rewards/accuracy_reward": 0.7206241488456726, "rewards/format_reward": 0.9795918166637421, "step": 6436 }, { "completion_length": 256.2550964355469, "epoch": 0.6477484276729559, "grad_norm": 1.3116703033447266, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7913832068443298, "reward_std": 0.2798802927136421, "rewards/accuracy_reward": 0.8117913901805878, "rewards/format_reward": 0.9795918166637421, "step": 6437 }, { "completion_length": 229.4897918701172, "epoch": 0.6478490566037736, "grad_norm": 0.5352261066436768, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7076677680015564, "reward_std": 0.14645925350487232, "rewards/accuracy_reward": 0.7178717851638794, "rewards/format_reward": 0.9897959232330322, "step": 6438 }, { "completion_length": 227.66326141357422, "epoch": 0.6479496855345912, "grad_norm": 0.8500181436538696, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7200031876564026, "reward_std": 0.13813599199056625, "rewards/accuracy_reward": 0.7200031876564026, "rewards/format_reward": 1.0, "step": 6439 }, { "completion_length": 220.91836547851562, "epoch": 0.6480503144654088, "grad_norm": 0.42973411083221436, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.914285659790039, "reward_std": 0.064793910831213, "rewards/accuracy_reward": 0.9244897663593292, "rewards/format_reward": 0.9897959232330322, "step": 6440 }, { "completion_length": 284.58162689208984, "epoch": 0.6481509433962264, "grad_norm": 0.6142435669898987, "kl": 0.097900390625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.4849948287010193, "reward_std": 0.13075167685747147, "rewards/accuracy_reward": 0.4951989948749542, "rewards/format_reward": 0.9897959232330322, "step": 6441 }, { "completion_length": 279.7550964355469, "epoch": 0.648251572327044, "grad_norm": 0.6115662455558777, "kl": 0.10107421875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6696025729179382, "reward_std": 0.17675847187638283, "rewards/accuracy_reward": 0.669602632522583, "rewards/format_reward": 1.0, "step": 6442 }, { "completion_length": 266.39796447753906, "epoch": 0.6483522012578616, "grad_norm": 0.7119870185852051, "kl": 0.09912109375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7092734575271606, "reward_std": 0.21268822252750397, "rewards/accuracy_reward": 0.7092735171318054, "rewards/format_reward": 1.0, "step": 6443 }, { "completion_length": 245.6734619140625, "epoch": 0.6484528301886793, "grad_norm": 0.24317428469657898, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7979590892791748, "reward_std": 0.0266350656747818, "rewards/accuracy_reward": 0.797959178686142, "rewards/format_reward": 1.0, "step": 6444 }, { "completion_length": 255.07141876220703, "epoch": 0.6485534591194968, "grad_norm": 0.5590606927871704, "kl": 0.09912109375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8179184198379517, "reward_std": 0.1529550962150097, "rewards/accuracy_reward": 0.8281225860118866, "rewards/format_reward": 0.9897959232330322, "step": 6445 }, { "completion_length": 208.82653045654297, "epoch": 0.6486540880503144, "grad_norm": 1.2348157167434692, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6221573948860168, "reward_std": 0.31379416584968567, "rewards/accuracy_reward": 0.6629737615585327, "rewards/format_reward": 0.9591836631298065, "step": 6446 }, { "completion_length": 261.80611419677734, "epoch": 0.6487547169811321, "grad_norm": 0.45457810163497925, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7718820571899414, "reward_std": 0.09958920255303383, "rewards/accuracy_reward": 0.7922902405261993, "rewards/format_reward": 0.9795918464660645, "step": 6447 }, { "completion_length": 321.1632537841797, "epoch": 0.6488553459119497, "grad_norm": 0.492076575756073, "kl": 0.0733642578125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.739473819732666, "reward_std": 0.19412457197904587, "rewards/accuracy_reward": 0.7700860798358917, "rewards/format_reward": 0.9693877398967743, "step": 6448 }, { "completion_length": 238.1734619140625, "epoch": 0.6489559748427673, "grad_norm": 0.6380415558815002, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8680272102355957, "reward_std": 0.16731463372707367, "rewards/accuracy_reward": 0.9088435173034668, "rewards/format_reward": 0.9591836631298065, "step": 6449 }, { "completion_length": 286.0102081298828, "epoch": 0.6490566037735849, "grad_norm": 0.5976930856704712, "kl": 0.106689453125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7077612280845642, "reward_std": 0.1723974049091339, "rewards/accuracy_reward": 0.7281694412231445, "rewards/format_reward": 0.9795918464660645, "step": 6450 }, { "completion_length": 272.65306091308594, "epoch": 0.6491572327044025, "grad_norm": 1.3317128419876099, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7161615490913391, "reward_std": 0.23063670843839645, "rewards/accuracy_reward": 0.726365715265274, "rewards/format_reward": 0.9897959232330322, "step": 6451 }, { "completion_length": 189.12244415283203, "epoch": 0.6492578616352201, "grad_norm": 0.664165198802948, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.865160346031189, "reward_std": 0.1471341997385025, "rewards/accuracy_reward": 0.8855684995651245, "rewards/format_reward": 0.9795918464660645, "step": 6452 }, { "completion_length": 215.81632232666016, "epoch": 0.6493584905660378, "grad_norm": 0.5519504547119141, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7135789394378662, "reward_std": 0.15347164869308472, "rewards/accuracy_reward": 0.7135789096355438, "rewards/format_reward": 1.0, "step": 6453 }, { "completion_length": 271.6428527832031, "epoch": 0.6494591194968553, "grad_norm": 1.089640498161316, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6575989723205566, "reward_std": 0.28413496911525726, "rewards/accuracy_reward": 0.6984153687953949, "rewards/format_reward": 0.9591836333274841, "step": 6454 }, { "completion_length": 191.52040100097656, "epoch": 0.649559748427673, "grad_norm": 0.7248626351356506, "kl": 0.100830078125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6794747114181519, "reward_std": 0.14077387005090714, "rewards/accuracy_reward": 0.6794747710227966, "rewards/format_reward": 1.0, "step": 6455 }, { "completion_length": 213.29591369628906, "epoch": 0.6496603773584906, "grad_norm": 0.4853418171405792, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.805565893650055, "reward_std": 0.08886146172881126, "rewards/accuracy_reward": 0.8259740173816681, "rewards/format_reward": 0.9795918464660645, "step": 6456 }, { "completion_length": 278.77550506591797, "epoch": 0.6497610062893082, "grad_norm": 1.6583172082901, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6950437426567078, "reward_std": 0.26688186824321747, "rewards/accuracy_reward": 0.7358600497245789, "rewards/format_reward": 0.9591836333274841, "step": 6457 }, { "completion_length": 279.93878173828125, "epoch": 0.6498616352201257, "grad_norm": 0.870557427406311, "kl": 0.093994140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.5609887838363647, "reward_std": 0.20248573273420334, "rewards/accuracy_reward": 0.5711929202079773, "rewards/format_reward": 0.9897959232330322, "step": 6458 }, { "completion_length": 297.3163146972656, "epoch": 0.6499622641509434, "grad_norm": 0.4818290174007416, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6831477880477905, "reward_std": 0.19340825825929642, "rewards/accuracy_reward": 0.6933518946170807, "rewards/format_reward": 0.9897959232330322, "step": 6459 }, { "completion_length": 196.62245178222656, "epoch": 0.650062893081761, "grad_norm": 0.6745466589927673, "kl": 0.099365234375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.743674635887146, "reward_std": 0.13180558755993843, "rewards/accuracy_reward": 0.7538787722587585, "rewards/format_reward": 0.9897959232330322, "step": 6460 }, { "completion_length": 183.03060913085938, "epoch": 0.6501635220125787, "grad_norm": 1.4195194244384766, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.819766640663147, "reward_std": 0.15786796063184738, "rewards/accuracy_reward": 0.8197666704654694, "rewards/format_reward": 1.0, "step": 6461 }, { "completion_length": 219.948974609375, "epoch": 0.6502641509433962, "grad_norm": 0.9582673907279968, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8324509859085083, "reward_std": 0.12682702392339706, "rewards/accuracy_reward": 0.8426550328731537, "rewards/format_reward": 0.9897959232330322, "step": 6462 }, { "completion_length": 260.56121826171875, "epoch": 0.6503647798742138, "grad_norm": 0.5806747674942017, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5605326294898987, "reward_std": 0.12763824313879013, "rewards/accuracy_reward": 0.5809408575296402, "rewards/format_reward": 0.9795918166637421, "step": 6463 }, { "completion_length": 246.77549743652344, "epoch": 0.6504654088050315, "grad_norm": 0.7589105367660522, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.734213650226593, "reward_std": 0.13972566649317741, "rewards/accuracy_reward": 0.7546218931674957, "rewards/format_reward": 0.9795918464660645, "step": 6464 }, { "completion_length": 220.1326446533203, "epoch": 0.6505660377358491, "grad_norm": 0.6500086188316345, "kl": 0.0966796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7285364866256714, "reward_std": 0.17708436399698257, "rewards/accuracy_reward": 0.7387405931949615, "rewards/format_reward": 0.9897959232330322, "step": 6465 }, { "completion_length": 257.1326370239258, "epoch": 0.6506666666666666, "grad_norm": 0.7129762172698975, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8182024955749512, "reward_std": 0.07116217911243439, "rewards/accuracy_reward": 0.8182023763656616, "rewards/format_reward": 1.0, "step": 6466 }, { "completion_length": 202.0, "epoch": 0.6507672955974843, "grad_norm": 1.006649136543274, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7393895983695984, "reward_std": 0.1688362881541252, "rewards/accuracy_reward": 0.7393894791603088, "rewards/format_reward": 1.0, "step": 6467 }, { "completion_length": 280.05101013183594, "epoch": 0.6508679245283019, "grad_norm": 0.7754718065261841, "kl": 0.1024169921875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.565596103668213, "reward_std": 0.2052503600716591, "rewards/accuracy_reward": 0.5860042572021484, "rewards/format_reward": 0.9795918166637421, "step": 6468 }, { "completion_length": 278.52040100097656, "epoch": 0.6509685534591195, "grad_norm": 0.4042278826236725, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8575072288513184, "reward_std": 0.09119413048028946, "rewards/accuracy_reward": 0.8677113354206085, "rewards/format_reward": 0.9897959232330322, "step": 6469 }, { "completion_length": 249.65306091308594, "epoch": 0.6510691823899372, "grad_norm": 0.9024044275283813, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8043899536132812, "reward_std": 0.1816004514694214, "rewards/accuracy_reward": 0.8145941495895386, "rewards/format_reward": 0.9897959232330322, "step": 6470 }, { "completion_length": 210.4081573486328, "epoch": 0.6511698113207547, "grad_norm": 0.4514019191265106, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8795581459999084, "reward_std": 0.13198741525411606, "rewards/accuracy_reward": 0.8795581459999084, "rewards/format_reward": 1.0, "step": 6471 }, { "completion_length": 248.11223602294922, "epoch": 0.6512704402515723, "grad_norm": 1.7820312976837158, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.9214285016059875, "reward_std": 0.11416742205619812, "rewards/accuracy_reward": 0.9316325783729553, "rewards/format_reward": 0.9897959232330322, "step": 6472 }, { "completion_length": 240.57142639160156, "epoch": 0.65137106918239, "grad_norm": 0.8409026861190796, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7513281106948853, "reward_std": 0.14436346665024757, "rewards/accuracy_reward": 0.7513281106948853, "rewards/format_reward": 1.0, "step": 6473 }, { "completion_length": 187.87754821777344, "epoch": 0.6514716981132076, "grad_norm": 0.7825977802276611, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.863190472126007, "reward_std": 0.1524517759680748, "rewards/accuracy_reward": 0.8631904721260071, "rewards/format_reward": 1.0, "step": 6474 }, { "completion_length": 237.38775634765625, "epoch": 0.6515723270440251, "grad_norm": 2.3914542198181152, "kl": 0.062744140625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7607407569885254, "reward_std": 0.17411492764949799, "rewards/accuracy_reward": 0.7811489701271057, "rewards/format_reward": 0.9795918166637421, "step": 6475 }, { "completion_length": 283.55101776123047, "epoch": 0.6516729559748428, "grad_norm": 0.6042402386665344, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7627140283584595, "reward_std": 0.1212791483849287, "rewards/accuracy_reward": 0.7831222116947174, "rewards/format_reward": 0.9795918166637421, "step": 6476 }, { "completion_length": 299.82652282714844, "epoch": 0.6517735849056604, "grad_norm": 0.6073042154312134, "kl": 0.094970703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6747033596038818, "reward_std": 0.14328207448124886, "rewards/accuracy_reward": 0.684907466173172, "rewards/format_reward": 0.9897959232330322, "step": 6477 }, { "completion_length": 323.3571319580078, "epoch": 0.651874213836478, "grad_norm": 0.6944120526313782, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.460292100906372, "reward_std": 0.1989811658859253, "rewards/accuracy_reward": 0.47049616277217865, "rewards/format_reward": 0.9897959232330322, "step": 6478 }, { "completion_length": 263.32653045654297, "epoch": 0.6519748427672956, "grad_norm": 0.6247734427452087, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7151868343353271, "reward_std": 0.17508909106254578, "rewards/accuracy_reward": 0.7457990646362305, "rewards/format_reward": 0.9693877398967743, "step": 6479 }, { "completion_length": 266.15306091308594, "epoch": 0.6520754716981132, "grad_norm": 0.435056209564209, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7544491291046143, "reward_std": 0.11311712861061096, "rewards/accuracy_reward": 0.754449188709259, "rewards/format_reward": 1.0, "step": 6480 }, { "completion_length": 266.948974609375, "epoch": 0.6521761006289308, "grad_norm": 0.3628235161304474, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7038547992706299, "reward_std": 0.04946327582001686, "rewards/accuracy_reward": 0.703854888677597, "rewards/format_reward": 1.0, "step": 6481 }, { "completion_length": 262.7346954345703, "epoch": 0.6522767295597485, "grad_norm": 0.5135690569877625, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7551020383834839, "reward_std": 0.12395289912819862, "rewards/accuracy_reward": 0.7653061151504517, "rewards/format_reward": 0.9897959232330322, "step": 6482 }, { "completion_length": 180.9693832397461, "epoch": 0.652377358490566, "grad_norm": 0.7862600088119507, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.884353756904602, "reward_std": 0.11917255818843842, "rewards/accuracy_reward": 0.8945578336715698, "rewards/format_reward": 0.9897959232330322, "step": 6483 }, { "completion_length": 270.2142791748047, "epoch": 0.6524779874213836, "grad_norm": 4.9452691078186035, "kl": 0.14404296875, "learning_rate": 1e-06, "loss": 0.0058, "reward": 1.7615646719932556, "reward_std": 0.19857919588685036, "rewards/accuracy_reward": 0.771768718957901, "rewards/format_reward": 0.9897959232330322, "step": 6484 }, { "completion_length": 344.4081573486328, "epoch": 0.6525786163522013, "grad_norm": 0.7725825905799866, "kl": 0.09716796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7210719585418701, "reward_std": 0.14388969540596008, "rewards/accuracy_reward": 0.7312761545181274, "rewards/format_reward": 0.9897959232330322, "step": 6485 }, { "completion_length": 225.12244415283203, "epoch": 0.6526792452830189, "grad_norm": 0.7918111085891724, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7454556822776794, "reward_std": 0.14230354502797127, "rewards/accuracy_reward": 0.7454557418823242, "rewards/format_reward": 1.0, "step": 6486 }, { "completion_length": 352.5306091308594, "epoch": 0.6527798742138364, "grad_norm": 0.9866006374359131, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6832155585289001, "reward_std": 0.21444693207740784, "rewards/accuracy_reward": 0.7036237120628357, "rewards/format_reward": 0.9795918464660645, "step": 6487 }, { "completion_length": 249.51020050048828, "epoch": 0.6528805031446541, "grad_norm": 0.962058961391449, "kl": 0.064697265625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7489795684814453, "reward_std": 0.22581296414136887, "rewards/accuracy_reward": 0.779591828584671, "rewards/format_reward": 0.9693877398967743, "step": 6488 }, { "completion_length": 253.34693145751953, "epoch": 0.6529811320754717, "grad_norm": 0.944897472858429, "kl": 0.123291015625, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.8878787755966187, "reward_std": 0.15211902558803558, "rewards/accuracy_reward": 0.9082869589328766, "rewards/format_reward": 0.9795918464660645, "step": 6489 }, { "completion_length": 251.38774871826172, "epoch": 0.6530817610062893, "grad_norm": 0.3489089012145996, "kl": 0.058837890625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8569808602333069, "reward_std": 0.10374543815851212, "rewards/accuracy_reward": 0.8671849370002747, "rewards/format_reward": 0.9897959232330322, "step": 6490 }, { "completion_length": 221.84693145751953, "epoch": 0.6531823899371069, "grad_norm": 2.06790828704834, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8433105945587158, "reward_std": 0.16762005537748337, "rewards/accuracy_reward": 0.8535147309303284, "rewards/format_reward": 0.9897959232330322, "step": 6491 }, { "completion_length": 257.51019287109375, "epoch": 0.6532830188679245, "grad_norm": 0.5546865463256836, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8622890710830688, "reward_std": 0.15117328613996506, "rewards/accuracy_reward": 0.8724931180477142, "rewards/format_reward": 0.9897959232330322, "step": 6492 }, { "completion_length": 243.14286041259766, "epoch": 0.6533836477987421, "grad_norm": 0.597707986831665, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7450923323631287, "reward_std": 0.16398613154888153, "rewards/accuracy_reward": 0.7655004858970642, "rewards/format_reward": 0.9795918166637421, "step": 6493 }, { "completion_length": 317.3571472167969, "epoch": 0.6534842767295598, "grad_norm": 0.40791910886764526, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6216135025024414, "reward_std": 0.16143879666924477, "rewards/accuracy_reward": 0.6420216858386993, "rewards/format_reward": 0.9795918166637421, "step": 6494 }, { "completion_length": 264.5612258911133, "epoch": 0.6535849056603774, "grad_norm": 0.8322098851203918, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7132750153541565, "reward_std": 0.12671935185790062, "rewards/accuracy_reward": 0.7336831390857697, "rewards/format_reward": 0.9795918464660645, "step": 6495 }, { "completion_length": 329.52040100097656, "epoch": 0.6536855345911949, "grad_norm": 0.439198762178421, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7518718838691711, "reward_std": 0.06545657850801945, "rewards/accuracy_reward": 0.7518719136714935, "rewards/format_reward": 1.0, "step": 6496 }, { "completion_length": 340.2550964355469, "epoch": 0.6537861635220126, "grad_norm": 0.8449898362159729, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6124558448791504, "reward_std": 0.17358484119176865, "rewards/accuracy_reward": 0.6124559044837952, "rewards/format_reward": 1.0, "step": 6497 }, { "completion_length": 336.4897918701172, "epoch": 0.6538867924528302, "grad_norm": 0.7214037179946899, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.65154230594635, "reward_std": 0.2597115486860275, "rewards/accuracy_reward": 0.6821545660495758, "rewards/format_reward": 0.9693877398967743, "step": 6498 }, { "completion_length": 380.87754821777344, "epoch": 0.6539874213836478, "grad_norm": 0.6579660177230835, "kl": 0.0548095703125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.6328960061073303, "reward_std": 0.21711456775665283, "rewards/accuracy_reward": 0.6533041894435883, "rewards/format_reward": 0.9795918464660645, "step": 6499 }, { "completion_length": 338.2653045654297, "epoch": 0.6540880503144654, "grad_norm": 0.5870208144187927, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.848018765449524, "reward_std": 0.17357878386974335, "rewards/accuracy_reward": 0.8582228422164917, "rewards/format_reward": 0.9897959232330322, "step": 6500 }, { "completion_length": 344.3163146972656, "epoch": 0.654188679245283, "grad_norm": 0.4953734874725342, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7028196454048157, "reward_std": 0.10712437704205513, "rewards/accuracy_reward": 0.7130236625671387, "rewards/format_reward": 0.9897959232330322, "step": 6501 }, { "completion_length": 201.58162689208984, "epoch": 0.6542893081761006, "grad_norm": 1.196067452430725, "kl": 0.117919921875, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.791884958744049, "reward_std": 0.07540987618267536, "rewards/accuracy_reward": 0.7918848693370819, "rewards/format_reward": 1.0, "step": 6502 }, { "completion_length": 310.6734619140625, "epoch": 0.6543899371069183, "grad_norm": 0.5659863352775574, "kl": 0.1064453125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.708738386631012, "reward_std": 0.19093843549489975, "rewards/accuracy_reward": 0.7189424932003021, "rewards/format_reward": 0.9897959232330322, "step": 6503 }, { "completion_length": 253.37755584716797, "epoch": 0.6544905660377358, "grad_norm": 1.459540605545044, "kl": 0.115234375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.834614336490631, "reward_std": 0.14423393830657005, "rewards/accuracy_reward": 0.8346142172813416, "rewards/format_reward": 1.0, "step": 6504 }, { "completion_length": 357.448974609375, "epoch": 0.6545911949685534, "grad_norm": 0.5524146556854248, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.742103934288025, "reward_std": 0.1828780397772789, "rewards/accuracy_reward": 0.7523080408573151, "rewards/format_reward": 0.9897959232330322, "step": 6505 }, { "completion_length": 316.4897918701172, "epoch": 0.6546918238993711, "grad_norm": 0.6817290186882019, "kl": 0.089111328125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6377261281013489, "reward_std": 0.1624366044998169, "rewards/accuracy_reward": 0.6377261281013489, "rewards/format_reward": 1.0, "step": 6506 }, { "completion_length": 250.78571319580078, "epoch": 0.6547924528301887, "grad_norm": 1.062880516052246, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.866676926612854, "reward_std": 0.16629644483327866, "rewards/accuracy_reward": 0.8666769564151764, "rewards/format_reward": 1.0, "step": 6507 }, { "completion_length": 262.8367233276367, "epoch": 0.6548930817610062, "grad_norm": 0.5491281151771545, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7801412343978882, "reward_std": 0.15211093425750732, "rewards/accuracy_reward": 0.790345311164856, "rewards/format_reward": 0.9897959232330322, "step": 6508 }, { "completion_length": 278.55101013183594, "epoch": 0.6549937106918239, "grad_norm": 1.8389062881469727, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7074844241142273, "reward_std": 0.20987670868635178, "rewards/accuracy_reward": 0.7380967736244202, "rewards/format_reward": 0.9693877398967743, "step": 6509 }, { "completion_length": 264.12244415283203, "epoch": 0.6550943396226415, "grad_norm": 0.5657184720039368, "kl": 0.0535888671875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.843537449836731, "reward_std": 0.17205794155597687, "rewards/accuracy_reward": 0.8435373604297638, "rewards/format_reward": 1.0, "step": 6510 }, { "completion_length": 255.85713958740234, "epoch": 0.6551949685534592, "grad_norm": 0.5383690595626831, "kl": 0.0753173828125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8074538111686707, "reward_std": 0.17236275970935822, "rewards/accuracy_reward": 0.8278620541095734, "rewards/format_reward": 0.9795918464660645, "step": 6511 }, { "completion_length": 254.5, "epoch": 0.6552955974842767, "grad_norm": 1.3173437118530273, "kl": 0.11328125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.6428571343421936, "reward_std": 0.09437987208366394, "rewards/accuracy_reward": 0.6632652878761292, "rewards/format_reward": 0.9795918166637421, "step": 6512 }, { "completion_length": 217.2040786743164, "epoch": 0.6553962264150943, "grad_norm": 0.3808048367500305, "kl": 0.1162109375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.821415901184082, "reward_std": 0.07215129490941763, "rewards/accuracy_reward": 0.8316200077533722, "rewards/format_reward": 0.9897959232330322, "step": 6513 }, { "completion_length": 379.9387664794922, "epoch": 0.655496855345912, "grad_norm": 0.9018296003341675, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7711193561553955, "reward_std": 0.210576593875885, "rewards/accuracy_reward": 0.8119356334209442, "rewards/format_reward": 0.9591836631298065, "step": 6514 }, { "completion_length": 264.29590606689453, "epoch": 0.6555974842767296, "grad_norm": 0.4686782658100128, "kl": 0.0819091796875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.848461627960205, "reward_std": 0.08012242242693901, "rewards/accuracy_reward": 0.8484615981578827, "rewards/format_reward": 1.0, "step": 6515 }, { "completion_length": 215.66326141357422, "epoch": 0.6556981132075471, "grad_norm": 0.8193197250366211, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7429532408714294, "reward_std": 0.15197789669036865, "rewards/accuracy_reward": 0.7531574070453644, "rewards/format_reward": 0.9897959232330322, "step": 6516 }, { "completion_length": 248.09183502197266, "epoch": 0.6557987421383648, "grad_norm": 0.579962432384491, "kl": 0.131103515625, "learning_rate": 1e-06, "loss": 0.0052, "reward": 1.873397171497345, "reward_std": 0.09795853681862354, "rewards/accuracy_reward": 0.8836013078689575, "rewards/format_reward": 0.9897959232330322, "step": 6517 }, { "completion_length": 223.55101776123047, "epoch": 0.6558993710691824, "grad_norm": 0.6154466271400452, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7397959232330322, "reward_std": 0.1444554589688778, "rewards/accuracy_reward": 0.7602040767669678, "rewards/format_reward": 0.9795918166637421, "step": 6518 }, { "completion_length": 169.32653045654297, "epoch": 0.656, "grad_norm": 1.0238144397735596, "kl": 0.1005859375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7466500997543335, "reward_std": 0.13167144358158112, "rewards/accuracy_reward": 0.7670583128929138, "rewards/format_reward": 0.9795918464660645, "step": 6519 }, { "completion_length": 311.66326904296875, "epoch": 0.6561006289308177, "grad_norm": 0.7568957209587097, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5406218767166138, "reward_std": 0.1576196812093258, "rewards/accuracy_reward": 0.5610301196575165, "rewards/format_reward": 0.9795918464660645, "step": 6520 }, { "completion_length": 288.26529693603516, "epoch": 0.6562012578616352, "grad_norm": 0.5338272452354431, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.723987638950348, "reward_std": 0.17752864211797714, "rewards/accuracy_reward": 0.7341917753219604, "rewards/format_reward": 0.9897959232330322, "step": 6521 }, { "completion_length": 281.05101776123047, "epoch": 0.6563018867924528, "grad_norm": 1.2361245155334473, "kl": 0.14111328125, "learning_rate": 1e-06, "loss": 0.0056, "reward": 1.70954030752182, "reward_std": 0.30215445160865784, "rewards/accuracy_reward": 0.7401524484157562, "rewards/format_reward": 0.9693877398967743, "step": 6522 }, { "completion_length": 266.2346954345703, "epoch": 0.6564025157232705, "grad_norm": 0.3620169460773468, "kl": 0.0594482421875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7471655011177063, "reward_std": 0.0904175154864788, "rewards/accuracy_reward": 0.7471655309200287, "rewards/format_reward": 1.0, "step": 6523 }, { "completion_length": 210.29591369628906, "epoch": 0.6565031446540881, "grad_norm": 1.301998496055603, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7551019787788391, "reward_std": 0.2580472305417061, "rewards/accuracy_reward": 0.7959183752536774, "rewards/format_reward": 0.9591836631298065, "step": 6524 }, { "completion_length": 245.10203552246094, "epoch": 0.6566037735849056, "grad_norm": 0.5142315626144409, "kl": 0.1015625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.9310510158538818, "reward_std": 0.08661281876266003, "rewards/accuracy_reward": 0.9310509860515594, "rewards/format_reward": 1.0, "step": 6525 }, { "completion_length": 270.6428527832031, "epoch": 0.6567044025157233, "grad_norm": 0.6068621873855591, "kl": 0.12109375, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.7120741605758667, "reward_std": 0.0783140417188406, "rewards/accuracy_reward": 0.7120741307735443, "rewards/format_reward": 1.0, "step": 6526 }, { "completion_length": 272.7653045654297, "epoch": 0.6568050314465409, "grad_norm": 0.6783110499382019, "kl": 0.0682373046875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6550322771072388, "reward_std": 0.18253595754504204, "rewards/accuracy_reward": 0.655032217502594, "rewards/format_reward": 1.0, "step": 6527 }, { "completion_length": 242.27549743652344, "epoch": 0.6569056603773585, "grad_norm": 0.7204439640045166, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7347069382667542, "reward_std": 0.13985558599233627, "rewards/accuracy_reward": 0.7347069382667542, "rewards/format_reward": 1.0, "step": 6528 }, { "completion_length": 226.6530532836914, "epoch": 0.6570062893081761, "grad_norm": 0.5574820041656494, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8304477334022522, "reward_std": 0.14106076210737228, "rewards/accuracy_reward": 0.8508559465408325, "rewards/format_reward": 0.9795918464660645, "step": 6529 }, { "completion_length": 292.448974609375, "epoch": 0.6571069182389937, "grad_norm": 0.4667949676513672, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7313567399978638, "reward_std": 0.09940250217914581, "rewards/accuracy_reward": 0.7313567399978638, "rewards/format_reward": 1.0, "step": 6530 }, { "completion_length": 197.9795913696289, "epoch": 0.6572075471698113, "grad_norm": 0.6469074487686157, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8596420288085938, "reward_std": 0.16222495585680008, "rewards/accuracy_reward": 0.8698460459709167, "rewards/format_reward": 0.9897959232330322, "step": 6531 }, { "completion_length": 251.73468017578125, "epoch": 0.657308176100629, "grad_norm": 0.49159690737724304, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8283465504646301, "reward_std": 0.14388269558548927, "rewards/accuracy_reward": 0.8385506272315979, "rewards/format_reward": 0.9897959232330322, "step": 6532 }, { "completion_length": 215.5204086303711, "epoch": 0.6574088050314465, "grad_norm": 0.7463185787200928, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.93265300989151, "reward_std": 0.1590418852865696, "rewards/accuracy_reward": 0.9530612230300903, "rewards/format_reward": 0.9795918166637421, "step": 6533 }, { "completion_length": 314.6428527832031, "epoch": 0.6575094339622641, "grad_norm": 1.2242904901504517, "kl": 0.10205078125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7231740951538086, "reward_std": 0.1825430616736412, "rewards/accuracy_reward": 0.7333781719207764, "rewards/format_reward": 0.9897959232330322, "step": 6534 }, { "completion_length": 228.61224365234375, "epoch": 0.6576100628930818, "grad_norm": 0.5259366631507874, "kl": 0.0982666015625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7797508835792542, "reward_std": 0.1482444442808628, "rewards/accuracy_reward": 0.7899549603462219, "rewards/format_reward": 0.9897959232330322, "step": 6535 }, { "completion_length": 169.08162689208984, "epoch": 0.6577106918238994, "grad_norm": 0.5091679692268372, "kl": 0.1025390625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.90705007314682, "reward_std": 0.06989090144634247, "rewards/accuracy_reward": 0.9070500731468201, "rewards/format_reward": 1.0, "step": 6536 }, { "completion_length": 189.64285278320312, "epoch": 0.6578113207547169, "grad_norm": 0.9465368986129761, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7879170775413513, "reward_std": 0.22559714317321777, "rewards/accuracy_reward": 0.8083252012729645, "rewards/format_reward": 0.9795918464660645, "step": 6537 }, { "completion_length": 256.74488830566406, "epoch": 0.6579119496855346, "grad_norm": 0.6073492169380188, "kl": 0.0777587890625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7399504780769348, "reward_std": 0.18929435312747955, "rewards/accuracy_reward": 0.7603586614131927, "rewards/format_reward": 0.9795918464660645, "step": 6538 }, { "completion_length": 255.55101776123047, "epoch": 0.6580125786163522, "grad_norm": 0.598511278629303, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8545454144477844, "reward_std": 0.1432083547115326, "rewards/accuracy_reward": 0.87495356798172, "rewards/format_reward": 0.9795918464660645, "step": 6539 }, { "completion_length": 203.9693832397461, "epoch": 0.6581132075471698, "grad_norm": 0.31785663962364197, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.800000011920929, "reward_std": 0.08700969815254211, "rewards/accuracy_reward": 0.7999999523162842, "rewards/format_reward": 1.0, "step": 6540 }, { "completion_length": 262.57141876220703, "epoch": 0.6582138364779874, "grad_norm": 0.596589982509613, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.758467674255371, "reward_std": 0.24401142075657845, "rewards/accuracy_reward": 0.7788758277893066, "rewards/format_reward": 0.9795918166637421, "step": 6541 }, { "completion_length": 265.37754821777344, "epoch": 0.658314465408805, "grad_norm": 0.5042358636856079, "kl": 0.0882568359375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8775509595870972, "reward_std": 0.15193557180464268, "rewards/accuracy_reward": 0.918367326259613, "rewards/format_reward": 0.9591836631298065, "step": 6542 }, { "completion_length": 289.1224365234375, "epoch": 0.6584150943396226, "grad_norm": 0.4475109279155731, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.795918345451355, "reward_std": 0.08016148954629898, "rewards/accuracy_reward": 0.8163265287876129, "rewards/format_reward": 0.9795918166637421, "step": 6543 }, { "completion_length": 190.79591369628906, "epoch": 0.6585157232704403, "grad_norm": 0.7680304646492004, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6781402230262756, "reward_std": 0.17006570100784302, "rewards/accuracy_reward": 0.6781402826309204, "rewards/format_reward": 1.0, "step": 6544 }, { "completion_length": 225.72449493408203, "epoch": 0.6586163522012579, "grad_norm": 0.6626322865486145, "kl": 0.130126953125, "learning_rate": 1e-06, "loss": 0.0052, "reward": 1.7789115905761719, "reward_std": 0.10309034585952759, "rewards/accuracy_reward": 0.8095238208770752, "rewards/format_reward": 0.9693877398967743, "step": 6545 }, { "completion_length": 198.7142791748047, "epoch": 0.6587169811320754, "grad_norm": 1.0660865306854248, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7392290234565735, "reward_std": 0.18822817504405975, "rewards/accuracy_reward": 0.7494331002235413, "rewards/format_reward": 0.9897959232330322, "step": 6546 }, { "completion_length": 205.30612182617188, "epoch": 0.6588176100628931, "grad_norm": 0.6863428354263306, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8377771973609924, "reward_std": 0.12432360276579857, "rewards/accuracy_reward": 0.8479812443256378, "rewards/format_reward": 0.9897959232330322, "step": 6547 }, { "completion_length": 220.30611419677734, "epoch": 0.6589182389937107, "grad_norm": 0.407912015914917, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8756957650184631, "reward_std": 0.05399492383003235, "rewards/accuracy_reward": 0.8961039185523987, "rewards/format_reward": 0.9795918166637421, "step": 6548 }, { "completion_length": 170.87754821777344, "epoch": 0.6590188679245284, "grad_norm": 5.213278293609619, "kl": 0.10888671875, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8131239414215088, "reward_std": 0.13157396018505096, "rewards/accuracy_reward": 0.8335320949554443, "rewards/format_reward": 0.9795918464660645, "step": 6549 }, { "completion_length": 361.39794921875, "epoch": 0.6591194968553459, "grad_norm": 0.8182289004325867, "kl": 0.0706787109375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6431045532226562, "reward_std": 0.21741227805614471, "rewards/accuracy_reward": 0.683920830488205, "rewards/format_reward": 0.9591836631298065, "step": 6550 }, { "completion_length": 200.4897918701172, "epoch": 0.6592201257861635, "grad_norm": 0.812978208065033, "kl": 0.0587158203125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8189369440078735, "reward_std": 0.16968997567892075, "rewards/accuracy_reward": 0.8291410207748413, "rewards/format_reward": 0.9897959232330322, "step": 6551 }, { "completion_length": 201.6326446533203, "epoch": 0.6593207547169811, "grad_norm": 2.427719831466675, "kl": 0.102294921875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8804664611816406, "reward_std": 0.12727374583482742, "rewards/accuracy_reward": 0.890670508146286, "rewards/format_reward": 0.9897959232330322, "step": 6552 }, { "completion_length": 223.4693832397461, "epoch": 0.6594213836477988, "grad_norm": 0.5644420385360718, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7958959341049194, "reward_std": 0.0871461033821106, "rewards/accuracy_reward": 0.7958959341049194, "rewards/format_reward": 1.0, "step": 6553 }, { "completion_length": 252.72447967529297, "epoch": 0.6595220125786163, "grad_norm": 0.5789003372192383, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6663751006126404, "reward_std": 0.1553320437669754, "rewards/accuracy_reward": 0.666375145316124, "rewards/format_reward": 1.0, "step": 6554 }, { "completion_length": 273.6734619140625, "epoch": 0.659622641509434, "grad_norm": 0.6907994747161865, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8885332942008972, "reward_std": 0.1279209852218628, "rewards/accuracy_reward": 0.9089414477348328, "rewards/format_reward": 0.9795918166637421, "step": 6555 }, { "completion_length": 269.1836624145508, "epoch": 0.6597232704402516, "grad_norm": 0.6743392944335938, "kl": 0.0941162109375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.5928439497947693, "reward_std": 0.19885768741369247, "rewards/accuracy_reward": 0.5928438901901245, "rewards/format_reward": 1.0, "step": 6556 }, { "completion_length": 251.2346954345703, "epoch": 0.6598238993710692, "grad_norm": 0.6548418998718262, "kl": 0.0667724609375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8090415596961975, "reward_std": 0.19941747188568115, "rewards/accuracy_reward": 0.8498579561710358, "rewards/format_reward": 0.9591836631298065, "step": 6557 }, { "completion_length": 249.69386291503906, "epoch": 0.6599245283018867, "grad_norm": 0.37517210841178894, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8098477721214294, "reward_std": 0.050984304398298264, "rewards/accuracy_reward": 0.8200518488883972, "rewards/format_reward": 0.9897959232330322, "step": 6558 }, { "completion_length": 175.9795913696289, "epoch": 0.6600251572327044, "grad_norm": 0.702714204788208, "kl": 0.10693359375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.847398579120636, "reward_std": 0.12821850180625916, "rewards/accuracy_reward": 0.8678067624568939, "rewards/format_reward": 0.9795918464660645, "step": 6559 }, { "completion_length": 221.2040786743164, "epoch": 0.660125786163522, "grad_norm": 0.49589458107948303, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.814630150794983, "reward_std": 0.10730156674981117, "rewards/accuracy_reward": 0.8350382745265961, "rewards/format_reward": 0.9795918464660645, "step": 6560 }, { "completion_length": 224.05101776123047, "epoch": 0.6602264150943397, "grad_norm": 0.5545485019683838, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7496598958969116, "reward_std": 0.15584539622068405, "rewards/accuracy_reward": 0.7700680196285248, "rewards/format_reward": 0.9795918464660645, "step": 6561 }, { "completion_length": 249.76529693603516, "epoch": 0.6603270440251572, "grad_norm": 1.224410057067871, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8218923807144165, "reward_std": 0.1666109412908554, "rewards/accuracy_reward": 0.8320964574813843, "rewards/format_reward": 0.9897959232330322, "step": 6562 }, { "completion_length": 182.2448959350586, "epoch": 0.6604276729559748, "grad_norm": 0.6147088408470154, "kl": 0.1053466796875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8504328727722168, "reward_std": 0.08997277542948723, "rewards/accuracy_reward": 0.8606369495391846, "rewards/format_reward": 0.9897959232330322, "step": 6563 }, { "completion_length": 234.66326141357422, "epoch": 0.6605283018867925, "grad_norm": 1.14491605758667, "kl": 0.0802001953125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7348353266716003, "reward_std": 0.1415671668946743, "rewards/accuracy_reward": 0.745039314031601, "rewards/format_reward": 0.9897959232330322, "step": 6564 }, { "completion_length": 208.80611419677734, "epoch": 0.6606289308176101, "grad_norm": 1.1769342422485352, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.796841561794281, "reward_std": 0.11262838914990425, "rewards/accuracy_reward": 0.8070456683635712, "rewards/format_reward": 0.9897959232330322, "step": 6565 }, { "completion_length": 271.31632232666016, "epoch": 0.6607295597484276, "grad_norm": 0.46858251094818115, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6942896842956543, "reward_std": 0.08685238659381866, "rewards/accuracy_reward": 0.6942898333072662, "rewards/format_reward": 1.0, "step": 6566 }, { "completion_length": 153.7653045654297, "epoch": 0.6608301886792453, "grad_norm": 1.3148224353790283, "kl": 0.118408203125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.8359315395355225, "reward_std": 0.06863855943083763, "rewards/accuracy_reward": 0.8461356461048126, "rewards/format_reward": 0.9897959232330322, "step": 6567 }, { "completion_length": 249.81632232666016, "epoch": 0.6609308176100629, "grad_norm": 1.026412844657898, "kl": 0.10107421875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7003400921821594, "reward_std": 0.19844643399119377, "rewards/accuracy_reward": 0.7105441689491272, "rewards/format_reward": 0.9897959232330322, "step": 6568 }, { "completion_length": 260.4081573486328, "epoch": 0.6610314465408805, "grad_norm": 0.6079325079917908, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7167233228683472, "reward_std": 0.1797201707959175, "rewards/accuracy_reward": 0.7167233824729919, "rewards/format_reward": 1.0, "step": 6569 }, { "completion_length": 192.14285278320312, "epoch": 0.6611320754716982, "grad_norm": 0.8235225677490234, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7735027074813843, "reward_std": 0.1631704457104206, "rewards/accuracy_reward": 0.7837068736553192, "rewards/format_reward": 0.9897959232330322, "step": 6570 }, { "completion_length": 181.16325759887695, "epoch": 0.6612327044025157, "grad_norm": 0.6956577301025391, "kl": 0.1123046875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7546524405479431, "reward_std": 0.10058419778943062, "rewards/accuracy_reward": 0.775060623884201, "rewards/format_reward": 0.9795918464660645, "step": 6571 }, { "completion_length": 145.89795684814453, "epoch": 0.6613333333333333, "grad_norm": 0.38951775431632996, "kl": 0.099853515625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.9070810675621033, "reward_std": 0.056096408516168594, "rewards/accuracy_reward": 0.9070809781551361, "rewards/format_reward": 1.0, "step": 6572 }, { "completion_length": 204.38774871826172, "epoch": 0.661433962264151, "grad_norm": 1.1099960803985596, "kl": 0.10888671875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.5983502864837646, "reward_std": 0.1710757166147232, "rewards/accuracy_reward": 0.6187584400177002, "rewards/format_reward": 0.9795918464660645, "step": 6573 }, { "completion_length": 232.66326904296875, "epoch": 0.6615345911949686, "grad_norm": 0.9337470531463623, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7722918391227722, "reward_std": 0.25676096230745316, "rewards/accuracy_reward": 0.7824959754943848, "rewards/format_reward": 0.9897959232330322, "step": 6574 }, { "completion_length": 248.73468780517578, "epoch": 0.6616352201257861, "grad_norm": 0.6427657008171082, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.764547348022461, "reward_std": 0.15555666387081146, "rewards/accuracy_reward": 0.7645472884178162, "rewards/format_reward": 1.0, "step": 6575 }, { "completion_length": 229.77550506591797, "epoch": 0.6617358490566038, "grad_norm": 0.5692445635795593, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7654452919960022, "reward_std": 0.13533557578921318, "rewards/accuracy_reward": 0.785853385925293, "rewards/format_reward": 0.9795918166637421, "step": 6576 }, { "completion_length": 195.51020050048828, "epoch": 0.6618364779874214, "grad_norm": 0.7959848046302795, "kl": 0.0743408203125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8489795923233032, "reward_std": 0.13053559139370918, "rewards/accuracy_reward": 0.8489795923233032, "rewards/format_reward": 1.0, "step": 6577 }, { "completion_length": 250.6734619140625, "epoch": 0.661937106918239, "grad_norm": 0.7213318347930908, "kl": 0.1142578125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.7535197734832764, "reward_std": 0.15421555191278458, "rewards/accuracy_reward": 0.7637238204479218, "rewards/format_reward": 0.9897959232330322, "step": 6578 }, { "completion_length": 233.10203552246094, "epoch": 0.6620377358490566, "grad_norm": 0.41916322708129883, "kl": 0.0877685546875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8356278538703918, "reward_std": 0.02437134040519595, "rewards/accuracy_reward": 0.8356278836727142, "rewards/format_reward": 1.0, "step": 6579 }, { "completion_length": 249.0408172607422, "epoch": 0.6621383647798742, "grad_norm": 0.8204989433288574, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6635989546775818, "reward_std": 0.26753222197294235, "rewards/accuracy_reward": 0.6738030016422272, "rewards/format_reward": 0.9897959232330322, "step": 6580 }, { "completion_length": 184.15306091308594, "epoch": 0.6622389937106918, "grad_norm": 0.786055862903595, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8817150592803955, "reward_std": 0.16166307777166367, "rewards/accuracy_reward": 0.8817150890827179, "rewards/format_reward": 1.0, "step": 6581 }, { "completion_length": 282.6224365234375, "epoch": 0.6623396226415095, "grad_norm": 0.5024241805076599, "kl": 0.09765625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.5449016094207764, "reward_std": 0.147605299949646, "rewards/accuracy_reward": 0.5551057010889053, "rewards/format_reward": 0.9897959232330322, "step": 6582 }, { "completion_length": 235.6530532836914, "epoch": 0.662440251572327, "grad_norm": 3.231659412384033, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7447521090507507, "reward_std": 0.16301575303077698, "rewards/accuracy_reward": 0.7447521686553955, "rewards/format_reward": 1.0, "step": 6583 }, { "completion_length": 179.74488830566406, "epoch": 0.6625408805031446, "grad_norm": 5.460433006286621, "kl": 0.11279296875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7440571188926697, "reward_std": 0.12783575057983398, "rewards/accuracy_reward": 0.7440571486949921, "rewards/format_reward": 1.0, "step": 6584 }, { "completion_length": 206.47958374023438, "epoch": 0.6626415094339623, "grad_norm": 1.3828881978988647, "kl": 0.12646484375, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.8437075018882751, "reward_std": 0.1473664678633213, "rewards/accuracy_reward": 0.8539115786552429, "rewards/format_reward": 0.9897959232330322, "step": 6585 }, { "completion_length": 237.90816497802734, "epoch": 0.6627421383647799, "grad_norm": 0.5712831020355225, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7448861598968506, "reward_std": 0.10239867120981216, "rewards/accuracy_reward": 0.744886189699173, "rewards/format_reward": 1.0, "step": 6586 }, { "completion_length": 201.39795684814453, "epoch": 0.6628427672955974, "grad_norm": 0.7676007151603699, "kl": 0.1259765625, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.8609076738357544, "reward_std": 0.11167335137724876, "rewards/accuracy_reward": 0.8609077334403992, "rewards/format_reward": 1.0, "step": 6587 }, { "completion_length": 216.0408172607422, "epoch": 0.6629433962264151, "grad_norm": 0.4769030809402466, "kl": 0.0665283203125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.9013605117797852, "reward_std": 0.05684427544474602, "rewards/accuracy_reward": 0.9115646183490753, "rewards/format_reward": 0.9897959232330322, "step": 6588 }, { "completion_length": 202.15306091308594, "epoch": 0.6630440251572327, "grad_norm": 1.2943981885910034, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7328237891197205, "reward_std": 0.26425474882125854, "rewards/accuracy_reward": 0.7736401557922363, "rewards/format_reward": 0.9591836631298065, "step": 6589 }, { "completion_length": 203.79591369628906, "epoch": 0.6631446540880503, "grad_norm": 0.9866037964820862, "kl": 0.1396484375, "learning_rate": 1e-06, "loss": 0.0056, "reward": 1.7559218406677246, "reward_std": 0.17176298052072525, "rewards/accuracy_reward": 0.7661259174346924, "rewards/format_reward": 0.9897959232330322, "step": 6590 }, { "completion_length": 218.73468780517578, "epoch": 0.6632452830188679, "grad_norm": 0.862582802772522, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.82552570104599, "reward_std": 0.22524303942918777, "rewards/accuracy_reward": 0.8459337949752808, "rewards/format_reward": 0.9795918464660645, "step": 6591 }, { "completion_length": 274.2755126953125, "epoch": 0.6633459119496855, "grad_norm": 0.7690967321395874, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.65425306558609, "reward_std": 0.17237158864736557, "rewards/accuracy_reward": 0.6542531549930573, "rewards/format_reward": 1.0, "step": 6592 }, { "completion_length": 205.81632232666016, "epoch": 0.6634465408805031, "grad_norm": 0.8417600393295288, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7754454016685486, "reward_std": 0.11230863258242607, "rewards/accuracy_reward": 0.7754454016685486, "rewards/format_reward": 1.0, "step": 6593 }, { "completion_length": 249.58163452148438, "epoch": 0.6635471698113208, "grad_norm": 1.0209770202636719, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8709338307380676, "reward_std": 0.1668248549103737, "rewards/accuracy_reward": 0.8709338009357452, "rewards/format_reward": 1.0, "step": 6594 }, { "completion_length": 278.5612335205078, "epoch": 0.6636477987421384, "grad_norm": 0.3882410228252411, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7761595845222473, "reward_std": 0.1028229296207428, "rewards/accuracy_reward": 0.7761595249176025, "rewards/format_reward": 1.0, "step": 6595 }, { "completion_length": 192.04081344604492, "epoch": 0.6637484276729559, "grad_norm": 0.8854731321334839, "kl": 0.109130859375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8925169706344604, "reward_std": 0.15346548706293106, "rewards/accuracy_reward": 0.9129251539707184, "rewards/format_reward": 0.9795918166637421, "step": 6596 }, { "completion_length": 250.01020050048828, "epoch": 0.6638490566037736, "grad_norm": 1.1268563270568848, "kl": 0.0721435546875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7887755036354065, "reward_std": 0.15851136296987534, "rewards/accuracy_reward": 0.809183657169342, "rewards/format_reward": 0.9795918166637421, "step": 6597 }, { "completion_length": 216.7244873046875, "epoch": 0.6639496855345912, "grad_norm": 0.5279861092567444, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7661939859390259, "reward_std": 0.1289951428771019, "rewards/accuracy_reward": 0.7661939859390259, "rewards/format_reward": 1.0, "step": 6598 }, { "completion_length": 257.06121826171875, "epoch": 0.6640503144654089, "grad_norm": 0.5135689973831177, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6256478428840637, "reward_std": 0.11633413471281528, "rewards/accuracy_reward": 0.6358519792556763, "rewards/format_reward": 0.9897959232330322, "step": 6599 }, { "completion_length": 274.3163146972656, "epoch": 0.6641509433962264, "grad_norm": 0.6049805879592896, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8015769720077515, "reward_std": 0.11659090593457222, "rewards/accuracy_reward": 0.8015769720077515, "rewards/format_reward": 1.0, "step": 6600 }, { "completion_length": 266.5816345214844, "epoch": 0.664251572327044, "grad_norm": 0.6049551963806152, "kl": 0.09521484375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7857142686843872, "reward_std": 0.10490182042121887, "rewards/accuracy_reward": 0.7857142686843872, "rewards/format_reward": 1.0, "step": 6601 }, { "completion_length": 214.948974609375, "epoch": 0.6643522012578617, "grad_norm": 1.0890560150146484, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7208410501480103, "reward_std": 0.20648684352636337, "rewards/accuracy_reward": 0.7310450971126556, "rewards/format_reward": 0.9897959232330322, "step": 6602 }, { "completion_length": 179.448974609375, "epoch": 0.6644528301886793, "grad_norm": 0.5678979754447937, "kl": 0.1005859375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.9173468947410583, "reward_std": 0.11836857162415981, "rewards/accuracy_reward": 0.9173469245433807, "rewards/format_reward": 1.0, "step": 6603 }, { "completion_length": 180.6938705444336, "epoch": 0.6645534591194968, "grad_norm": 0.8577497601509094, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8363164067268372, "reward_std": 0.18488717824220657, "rewards/accuracy_reward": 0.8363164663314819, "rewards/format_reward": 1.0, "step": 6604 }, { "completion_length": 256.28570556640625, "epoch": 0.6646540880503145, "grad_norm": 0.725050687789917, "kl": 0.058837890625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.86929053068161, "reward_std": 0.13367466069757938, "rewards/accuracy_reward": 0.8794946074485779, "rewards/format_reward": 0.9897959232330322, "step": 6605 }, { "completion_length": 270.98978424072266, "epoch": 0.6647547169811321, "grad_norm": 0.7018144726753235, "kl": 0.056640625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7250728607177734, "reward_std": 0.2233010157942772, "rewards/accuracy_reward": 0.745481014251709, "rewards/format_reward": 0.9795918464660645, "step": 6606 }, { "completion_length": 236.14285278320312, "epoch": 0.6648553459119497, "grad_norm": 0.5442066192626953, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7721089124679565, "reward_std": 0.12541663646697998, "rewards/accuracy_reward": 0.7721088230609894, "rewards/format_reward": 1.0, "step": 6607 }, { "completion_length": 230.59183502197266, "epoch": 0.6649559748427673, "grad_norm": 0.711072564125061, "kl": 0.095458984375, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7991777658462524, "reward_std": 0.14942077547311783, "rewards/accuracy_reward": 0.8195859491825104, "rewards/format_reward": 0.9795918464660645, "step": 6608 }, { "completion_length": 302.4387664794922, "epoch": 0.6650566037735849, "grad_norm": 10.654563903808594, "kl": 0.0762939453125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5835349559783936, "reward_std": 0.13630487397313118, "rewards/accuracy_reward": 0.5835349857807159, "rewards/format_reward": 1.0, "step": 6609 }, { "completion_length": 250.9591827392578, "epoch": 0.6651572327044025, "grad_norm": 0.9283419251441956, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6878423690795898, "reward_std": 0.29756657779216766, "rewards/accuracy_reward": 0.7082505524158478, "rewards/format_reward": 0.9795918464660645, "step": 6610 }, { "completion_length": 221.4081573486328, "epoch": 0.6652578616352202, "grad_norm": 1.921728253364563, "kl": 0.116455078125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.828613817691803, "reward_std": 0.09907831624150276, "rewards/accuracy_reward": 0.8388178944587708, "rewards/format_reward": 0.9897959232330322, "step": 6611 }, { "completion_length": 202.32652282714844, "epoch": 0.6653584905660377, "grad_norm": 0.60077303647995, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8982831239700317, "reward_std": 0.11158495023846626, "rewards/accuracy_reward": 0.8982830941677094, "rewards/format_reward": 1.0, "step": 6612 }, { "completion_length": 257.9081573486328, "epoch": 0.6654591194968553, "grad_norm": 0.27157384157180786, "kl": 0.0565185546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8414965867996216, "reward_std": 0.08507254347205162, "rewards/accuracy_reward": 0.8517006635665894, "rewards/format_reward": 0.9897959232330322, "step": 6613 }, { "completion_length": 265.27550506591797, "epoch": 0.665559748427673, "grad_norm": 0.592155933380127, "kl": 0.099853515625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.825100064277649, "reward_std": 0.2029627375304699, "rewards/accuracy_reward": 0.8455082774162292, "rewards/format_reward": 0.9795918464660645, "step": 6614 }, { "completion_length": 241.61224365234375, "epoch": 0.6656603773584906, "grad_norm": 0.548262894153595, "kl": 0.0777587890625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.845700442790985, "reward_std": 0.1515939086675644, "rewards/accuracy_reward": 0.8559045493602753, "rewards/format_reward": 0.9897959232330322, "step": 6615 }, { "completion_length": 249.43877410888672, "epoch": 0.6657610062893081, "grad_norm": 0.8246836066246033, "kl": 0.110107421875, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.841292381286621, "reward_std": 0.166177149862051, "rewards/accuracy_reward": 0.8719046115875244, "rewards/format_reward": 0.9693877398967743, "step": 6616 }, { "completion_length": 273.6836700439453, "epoch": 0.6658616352201258, "grad_norm": 0.7319405674934387, "kl": 0.0615234375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6654566526412964, "reward_std": 0.14292478561401367, "rewards/accuracy_reward": 0.675660640001297, "rewards/format_reward": 0.9897959232330322, "step": 6617 }, { "completion_length": 251.6020278930664, "epoch": 0.6659622641509434, "grad_norm": 1.4031879901885986, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.633237600326538, "reward_std": 0.15699095278978348, "rewards/accuracy_reward": 0.6434417963027954, "rewards/format_reward": 0.9897959232330322, "step": 6618 }, { "completion_length": 244.75509643554688, "epoch": 0.666062893081761, "grad_norm": 1.2575165033340454, "kl": 0.107421875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.9346938729286194, "reward_std": 0.10553880035877228, "rewards/accuracy_reward": 0.934693843126297, "rewards/format_reward": 1.0, "step": 6619 }, { "completion_length": 287.32653045654297, "epoch": 0.6661635220125787, "grad_norm": 0.5434849262237549, "kl": 0.0762939453125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7910592555999756, "reward_std": 0.06390703469514847, "rewards/accuracy_reward": 0.791059285402298, "rewards/format_reward": 1.0, "step": 6620 }, { "completion_length": 207.9693832397461, "epoch": 0.6662641509433962, "grad_norm": 0.6153391003608704, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8714285492897034, "reward_std": 0.07507972698658705, "rewards/accuracy_reward": 0.8714285790920258, "rewards/format_reward": 1.0, "step": 6621 }, { "completion_length": 235.91836547851562, "epoch": 0.6663647798742138, "grad_norm": 0.3965597450733185, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8425862193107605, "reward_std": 0.0802600122988224, "rewards/accuracy_reward": 0.8527902662754059, "rewards/format_reward": 0.9897959232330322, "step": 6622 }, { "completion_length": 202.11223602294922, "epoch": 0.6664654088050315, "grad_norm": 0.5619210600852966, "kl": 0.09716796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8659882545471191, "reward_std": 0.12206529825925827, "rewards/accuracy_reward": 0.8761922419071198, "rewards/format_reward": 0.9897959232330322, "step": 6623 }, { "completion_length": 224.37754821777344, "epoch": 0.6665660377358491, "grad_norm": 0.38754451274871826, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.9323129057884216, "reward_std": 0.0515330545604229, "rewards/accuracy_reward": 0.9425170123577118, "rewards/format_reward": 0.9897959232330322, "step": 6624 }, { "completion_length": 252.51020050048828, "epoch": 0.6666666666666666, "grad_norm": 0.3739687502384186, "kl": 0.054931640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.9736394882202148, "reward_std": 0.06430474855005741, "rewards/accuracy_reward": 0.994047611951828, "rewards/format_reward": 0.9795918166637421, "step": 6625 }, { "completion_length": 211.1734619140625, "epoch": 0.6667672955974843, "grad_norm": 1.1211950778961182, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8905285000801086, "reward_std": 0.06774890795350075, "rewards/accuracy_reward": 0.890528529882431, "rewards/format_reward": 1.0, "step": 6626 }, { "completion_length": 266.81632232666016, "epoch": 0.6668679245283019, "grad_norm": 2.874624490737915, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7391220331192017, "reward_std": 0.13631677627563477, "rewards/accuracy_reward": 0.7493261098861694, "rewards/format_reward": 0.9897959232330322, "step": 6627 }, { "completion_length": 223.85713958740234, "epoch": 0.6669685534591195, "grad_norm": 0.9335833191871643, "kl": 0.11083984375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8769173622131348, "reward_std": 0.19546016305685043, "rewards/accuracy_reward": 0.8973254859447479, "rewards/format_reward": 0.9795918166637421, "step": 6628 }, { "completion_length": 247.4591827392578, "epoch": 0.6670691823899371, "grad_norm": 0.32011979818344116, "kl": 0.0596923828125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8265305757522583, "reward_std": 0.08099238574504852, "rewards/accuracy_reward": 0.8265305757522583, "rewards/format_reward": 1.0, "step": 6629 }, { "completion_length": 263.8877487182617, "epoch": 0.6671698113207547, "grad_norm": 0.8363995552062988, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7776376008987427, "reward_std": 0.1850365847349167, "rewards/accuracy_reward": 0.7878417074680328, "rewards/format_reward": 0.9897959232330322, "step": 6630 }, { "completion_length": 199.43877410888672, "epoch": 0.6672704402515723, "grad_norm": 0.7487041354179382, "kl": 0.109130859375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7171394228935242, "reward_std": 0.11132236570119858, "rewards/accuracy_reward": 0.7273434102535248, "rewards/format_reward": 0.9897959232330322, "step": 6631 }, { "completion_length": 224.99999237060547, "epoch": 0.66737106918239, "grad_norm": 0.673419177532196, "kl": 0.0697021484375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8038960099220276, "reward_std": 0.16765587031841278, "rewards/accuracy_reward": 0.8141001760959625, "rewards/format_reward": 0.9897959232330322, "step": 6632 }, { "completion_length": 254.0, "epoch": 0.6674716981132075, "grad_norm": 0.8379478454589844, "kl": 0.093017578125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.5304896831512451, "reward_std": 0.2512616813182831, "rewards/accuracy_reward": 0.5713060200214386, "rewards/format_reward": 0.9591836631298065, "step": 6633 }, { "completion_length": 258.9183578491211, "epoch": 0.6675723270440251, "grad_norm": 0.639039158821106, "kl": 0.0986328125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.5612114667892456, "reward_std": 0.13548027910292149, "rewards/accuracy_reward": 0.5714154988527298, "rewards/format_reward": 0.9897959232330322, "step": 6634 }, { "completion_length": 173.57141876220703, "epoch": 0.6676729559748428, "grad_norm": 0.9741620421409607, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8508655428886414, "reward_std": 0.13454869017004967, "rewards/accuracy_reward": 0.8712736964225769, "rewards/format_reward": 0.9795918464660645, "step": 6635 }, { "completion_length": 211.14285278320312, "epoch": 0.6677735849056604, "grad_norm": 0.6647586822509766, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6931928396224976, "reward_std": 0.19442682713270187, "rewards/accuracy_reward": 0.7136009931564331, "rewards/format_reward": 0.9795918166637421, "step": 6636 }, { "completion_length": 245.79591369628906, "epoch": 0.6678742138364779, "grad_norm": 0.3754861652851105, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8525046706199646, "reward_std": 0.10420266166329384, "rewards/accuracy_reward": 0.8729128241539001, "rewards/format_reward": 0.9795918464660645, "step": 6637 }, { "completion_length": 207.86734008789062, "epoch": 0.6679748427672956, "grad_norm": 1.0378741025924683, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7397958636283875, "reward_std": 0.25410860776901245, "rewards/accuracy_reward": 0.7602040767669678, "rewards/format_reward": 0.9795918166637421, "step": 6638 }, { "completion_length": 244.81632232666016, "epoch": 0.6680754716981132, "grad_norm": 0.5774000287055969, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7214969992637634, "reward_std": 0.10260594636201859, "rewards/accuracy_reward": 0.7214970290660858, "rewards/format_reward": 1.0, "step": 6639 }, { "completion_length": 230.09183502197266, "epoch": 0.6681761006289308, "grad_norm": 0.60205078125, "kl": 0.0596923828125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8046647310256958, "reward_std": 0.06130511686205864, "rewards/accuracy_reward": 0.8148687481880188, "rewards/format_reward": 0.9897959232330322, "step": 6640 }, { "completion_length": 237.5, "epoch": 0.6682767295597484, "grad_norm": 1.0962011814117432, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7436021566390991, "reward_std": 0.2001640722155571, "rewards/accuracy_reward": 0.7640103399753571, "rewards/format_reward": 0.9795918464660645, "step": 6641 }, { "completion_length": 305.5918273925781, "epoch": 0.668377358490566, "grad_norm": 0.6715831160545349, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7043797373771667, "reward_std": 0.21121463924646378, "rewards/accuracy_reward": 0.7145838439464569, "rewards/format_reward": 0.9897959232330322, "step": 6642 }, { "completion_length": 215.76529693603516, "epoch": 0.6684779874213836, "grad_norm": 2.4351367950439453, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.627696692943573, "reward_std": 0.2445562146604061, "rewards/accuracy_reward": 0.6481049507856369, "rewards/format_reward": 0.9795918464660645, "step": 6643 }, { "completion_length": 226.01019287109375, "epoch": 0.6685786163522013, "grad_norm": 0.3742770254611969, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7494986057281494, "reward_std": 0.02394854463636875, "rewards/accuracy_reward": 0.7494986355304718, "rewards/format_reward": 1.0, "step": 6644 }, { "completion_length": 200.9591827392578, "epoch": 0.6686792452830189, "grad_norm": 0.567581057548523, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.795120358467102, "reward_std": 0.06265763938426971, "rewards/accuracy_reward": 0.7951203286647797, "rewards/format_reward": 1.0, "step": 6645 }, { "completion_length": 236.08162689208984, "epoch": 0.6687798742138364, "grad_norm": 0.5446937680244446, "kl": 0.055908203125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.836734652519226, "reward_std": 0.12370206788182259, "rewards/accuracy_reward": 0.8367346823215485, "rewards/format_reward": 1.0, "step": 6646 }, { "completion_length": 247.83673095703125, "epoch": 0.6688805031446541, "grad_norm": 0.7654412388801575, "kl": 0.0565185546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8861438632011414, "reward_std": 0.12125175446271896, "rewards/accuracy_reward": 0.8861439228057861, "rewards/format_reward": 1.0, "step": 6647 }, { "completion_length": 191.36734008789062, "epoch": 0.6689811320754717, "grad_norm": 0.7404459714889526, "kl": 0.114501953125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.863722026348114, "reward_std": 0.1414753757417202, "rewards/accuracy_reward": 0.863722026348114, "rewards/format_reward": 1.0, "step": 6648 }, { "completion_length": 232.02040100097656, "epoch": 0.6690817610062894, "grad_norm": 0.5625020265579224, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7761337161064148, "reward_std": 0.14392846450209618, "rewards/accuracy_reward": 0.7965419292449951, "rewards/format_reward": 0.9795918464660645, "step": 6649 }, { "completion_length": 303.7857208251953, "epoch": 0.6691823899371069, "grad_norm": 0.6578230857849121, "kl": 0.0745849609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7382572889328003, "reward_std": 0.16004251316189766, "rewards/accuracy_reward": 0.7484612762928009, "rewards/format_reward": 0.9897959232330322, "step": 6650 }, { "completion_length": 227.78570556640625, "epoch": 0.6692830188679245, "grad_norm": 0.7894550561904907, "kl": 0.091064453125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7928872108459473, "reward_std": 0.09150435589253902, "rewards/accuracy_reward": 0.792887270450592, "rewards/format_reward": 1.0, "step": 6651 }, { "completion_length": 178.67346954345703, "epoch": 0.6693836477987422, "grad_norm": 0.7891771793365479, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.9743602275848389, "reward_std": 0.03909852355718613, "rewards/accuracy_reward": 0.9743602573871613, "rewards/format_reward": 1.0, "step": 6652 }, { "completion_length": 231.02040100097656, "epoch": 0.6694842767295598, "grad_norm": 0.5957646369934082, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.750850260257721, "reward_std": 0.08181390911340714, "rewards/accuracy_reward": 0.7610544562339783, "rewards/format_reward": 0.9897959232330322, "step": 6653 }, { "completion_length": 272.3673400878906, "epoch": 0.6695849056603773, "grad_norm": 1.1527267694473267, "kl": 0.0758056640625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7963151335716248, "reward_std": 0.11353373900055885, "rewards/accuracy_reward": 0.8065192103385925, "rewards/format_reward": 0.9897959232330322, "step": 6654 }, { "completion_length": 214.7040786743164, "epoch": 0.669685534591195, "grad_norm": 1.0658646821975708, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8109815120697021, "reward_std": 0.16846679151058197, "rewards/accuracy_reward": 0.8211856484413147, "rewards/format_reward": 0.9897959232330322, "step": 6655 }, { "completion_length": 218.88774871826172, "epoch": 0.6697861635220126, "grad_norm": 0.4581063687801361, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8904547691345215, "reward_std": 0.0697201918810606, "rewards/accuracy_reward": 0.8904547691345215, "rewards/format_reward": 1.0, "step": 6656 }, { "completion_length": 249.2755126953125, "epoch": 0.6698867924528302, "grad_norm": 0.5629197359085083, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5979672074317932, "reward_std": 0.17533235251903534, "rewards/accuracy_reward": 0.6183754503726959, "rewards/format_reward": 0.9795918166637421, "step": 6657 }, { "completion_length": 195.2142791748047, "epoch": 0.6699874213836478, "grad_norm": 1.3753238916397095, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7704081535339355, "reward_std": 0.14165958017110825, "rewards/accuracy_reward": 0.7704081535339355, "rewards/format_reward": 1.0, "step": 6658 }, { "completion_length": 223.61224365234375, "epoch": 0.6700880503144654, "grad_norm": 0.5695688128471375, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7342146039009094, "reward_std": 0.11918096989393234, "rewards/accuracy_reward": 0.734214574098587, "rewards/format_reward": 1.0, "step": 6659 }, { "completion_length": 246.81632232666016, "epoch": 0.670188679245283, "grad_norm": 0.602332353591919, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8525046110153198, "reward_std": 0.08283235877752304, "rewards/accuracy_reward": 0.86270871758461, "rewards/format_reward": 0.9897959232330322, "step": 6660 }, { "completion_length": 203.79591369628906, "epoch": 0.6702893081761007, "grad_norm": 1.1426047086715698, "kl": 0.0582275390625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8115718960762024, "reward_std": 0.07858635112643242, "rewards/accuracy_reward": 0.8217760324478149, "rewards/format_reward": 0.9897959232330322, "step": 6661 }, { "completion_length": 183.33673095703125, "epoch": 0.6703899371069182, "grad_norm": 2.378317356109619, "kl": 0.12548828125, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.7948932647705078, "reward_std": 0.18153589218854904, "rewards/accuracy_reward": 0.7948933243751526, "rewards/format_reward": 1.0, "step": 6662 }, { "completion_length": 298.4591751098633, "epoch": 0.6704905660377358, "grad_norm": 1.09751296043396, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.790120542049408, "reward_std": 0.1222404520958662, "rewards/accuracy_reward": 0.8003245890140533, "rewards/format_reward": 0.9897959232330322, "step": 6663 }, { "completion_length": 304.2142791748047, "epoch": 0.6705911949685535, "grad_norm": 0.4648325741291046, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7979592084884644, "reward_std": 0.1568838767707348, "rewards/accuracy_reward": 0.8183673322200775, "rewards/format_reward": 0.9795918166637421, "step": 6664 }, { "completion_length": 335.19386291503906, "epoch": 0.6706918238993711, "grad_norm": 1.2505781650543213, "kl": 0.0582275390625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7676940560340881, "reward_std": 0.25826120376586914, "rewards/accuracy_reward": 0.7983062565326691, "rewards/format_reward": 0.9693877398967743, "step": 6665 }, { "completion_length": 236.06121826171875, "epoch": 0.6707924528301887, "grad_norm": 0.36683547496795654, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8736637234687805, "reward_std": 0.04128750041127205, "rewards/accuracy_reward": 0.8736637532711029, "rewards/format_reward": 1.0, "step": 6666 }, { "completion_length": 242.9795913696289, "epoch": 0.6708930817610063, "grad_norm": 1.1484894752502441, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7165470123291016, "reward_std": 0.17774659395217896, "rewards/accuracy_reward": 0.7267511188983917, "rewards/format_reward": 0.9897959232330322, "step": 6667 }, { "completion_length": 303.8877487182617, "epoch": 0.6709937106918239, "grad_norm": 1.0245164632797241, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.723873257637024, "reward_std": 0.1738975066691637, "rewards/accuracy_reward": 0.7340773046016693, "rewards/format_reward": 0.9897959232330322, "step": 6668 }, { "completion_length": 306.8673400878906, "epoch": 0.6710943396226415, "grad_norm": 7.021929740905762, "kl": 0.148193359375, "learning_rate": 1e-06, "loss": 0.0059, "reward": 1.647562325000763, "reward_std": 0.1374327652156353, "rewards/accuracy_reward": 0.6679705083370209, "rewards/format_reward": 0.9795918166637421, "step": 6669 }, { "completion_length": 259.76529693603516, "epoch": 0.6711949685534592, "grad_norm": 0.6188436150550842, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.829446017742157, "reward_std": 0.11415939033031464, "rewards/accuracy_reward": 0.8396501243114471, "rewards/format_reward": 0.9897959232330322, "step": 6670 }, { "completion_length": 240.63265228271484, "epoch": 0.6712955974842767, "grad_norm": 0.5534709095954895, "kl": 0.0870361328125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8064532279968262, "reward_std": 0.11062419041991234, "rewards/accuracy_reward": 0.8064531981945038, "rewards/format_reward": 1.0, "step": 6671 }, { "completion_length": 185.43877410888672, "epoch": 0.6713962264150943, "grad_norm": 0.9070342183113098, "kl": 0.065185546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8163264989852905, "reward_std": 0.1079898476600647, "rewards/accuracy_reward": 0.8265306055545807, "rewards/format_reward": 0.9897959232330322, "step": 6672 }, { "completion_length": 295.7040710449219, "epoch": 0.671496855345912, "grad_norm": 2.817601203918457, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6305806636810303, "reward_std": 0.12199905328452587, "rewards/accuracy_reward": 0.6611928343772888, "rewards/format_reward": 0.9693877398967743, "step": 6673 }, { "completion_length": 256.9591827392578, "epoch": 0.6715974842767296, "grad_norm": 0.7303600907325745, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6908946633338928, "reward_std": 0.20301024615764618, "rewards/accuracy_reward": 0.7215069234371185, "rewards/format_reward": 0.9693877398967743, "step": 6674 }, { "completion_length": 239.77550506591797, "epoch": 0.6716981132075471, "grad_norm": 1.7659187316894531, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7857239842414856, "reward_std": 0.20466865599155426, "rewards/accuracy_reward": 0.8061321675777435, "rewards/format_reward": 0.9795918464660645, "step": 6675 }, { "completion_length": 267.5306091308594, "epoch": 0.6717987421383648, "grad_norm": 0.42313647270202637, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8833259344100952, "reward_std": 0.13306792825460434, "rewards/accuracy_reward": 0.9037341177463531, "rewards/format_reward": 0.9795918464660645, "step": 6676 }, { "completion_length": 272.4285659790039, "epoch": 0.6718993710691824, "grad_norm": 1.4546778202056885, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7855842113494873, "reward_std": 0.2529866173863411, "rewards/accuracy_reward": 0.7957884073257446, "rewards/format_reward": 0.9897959232330322, "step": 6677 }, { "completion_length": 223.9285659790039, "epoch": 0.672, "grad_norm": 1.6908109188079834, "kl": 0.11767578125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.6904422640800476, "reward_std": 0.24305851757526398, "rewards/accuracy_reward": 0.7108505070209503, "rewards/format_reward": 0.9795918166637421, "step": 6678 }, { "completion_length": 248.1836700439453, "epoch": 0.6721006289308176, "grad_norm": 1.533218264579773, "kl": 0.0966796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7931766510009766, "reward_std": 0.20021165162324905, "rewards/accuracy_reward": 0.8135848045349121, "rewards/format_reward": 0.9795918464660645, "step": 6679 }, { "completion_length": 202.75509643554688, "epoch": 0.6722012578616352, "grad_norm": 5.986477375030518, "kl": 0.1240234375, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.7851769924163818, "reward_std": 0.10001462697982788, "rewards/accuracy_reward": 0.8055850565433502, "rewards/format_reward": 0.9795918166637421, "step": 6680 }, { "completion_length": 271.2142868041992, "epoch": 0.6723018867924528, "grad_norm": 0.5482429265975952, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8351991772651672, "reward_std": 0.1522037722170353, "rewards/accuracy_reward": 0.8556073009967804, "rewards/format_reward": 0.9795918166637421, "step": 6681 }, { "completion_length": 195.4897918701172, "epoch": 0.6724025157232705, "grad_norm": 0.6363087892532349, "kl": 0.1005859375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7635670900344849, "reward_std": 0.08630107343196869, "rewards/accuracy_reward": 0.7635671496391296, "rewards/format_reward": 1.0, "step": 6682 }, { "completion_length": 258.75508880615234, "epoch": 0.672503144654088, "grad_norm": 0.7685080766677856, "kl": 0.1337890625, "learning_rate": 1e-06, "loss": 0.0054, "reward": 1.6394739151000977, "reward_std": 0.11659643054008484, "rewards/accuracy_reward": 0.6394740641117096, "rewards/format_reward": 1.0, "step": 6683 }, { "completion_length": 271.11224365234375, "epoch": 0.6726037735849056, "grad_norm": 0.27763664722442627, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7663883566856384, "reward_std": 0.054152440279722214, "rewards/accuracy_reward": 0.7663883566856384, "rewards/format_reward": 1.0, "step": 6684 }, { "completion_length": 269.55101776123047, "epoch": 0.6727044025157233, "grad_norm": 0.7021828889846802, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7952248454093933, "reward_std": 0.09930987842381, "rewards/accuracy_reward": 0.7952248156070709, "rewards/format_reward": 1.0, "step": 6685 }, { "completion_length": 203.41836547851562, "epoch": 0.6728050314465409, "grad_norm": 0.5830265283584595, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.731539785861969, "reward_std": 0.12216119840741158, "rewards/accuracy_reward": 0.7519480586051941, "rewards/format_reward": 0.9795918464660645, "step": 6686 }, { "completion_length": 254.75509643554688, "epoch": 0.6729056603773584, "grad_norm": 0.457964152097702, "kl": 0.0543212890625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8072562217712402, "reward_std": 0.11116048134863377, "rewards/accuracy_reward": 0.8072562217712402, "rewards/format_reward": 1.0, "step": 6687 }, { "completion_length": 295.4285583496094, "epoch": 0.6730062893081761, "grad_norm": 0.5192583799362183, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.805557370185852, "reward_std": 0.1404729187488556, "rewards/accuracy_reward": 0.82596555352211, "rewards/format_reward": 0.9795918464660645, "step": 6688 }, { "completion_length": 278.9591827392578, "epoch": 0.6731069182389937, "grad_norm": 0.718982994556427, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7846209406852722, "reward_std": 0.15756122022867203, "rewards/accuracy_reward": 0.8050291240215302, "rewards/format_reward": 0.9795918166637421, "step": 6689 }, { "completion_length": 280.5408020019531, "epoch": 0.6732075471698113, "grad_norm": 0.6162552833557129, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6569761037826538, "reward_std": 0.16534697636961937, "rewards/accuracy_reward": 0.6671801507472992, "rewards/format_reward": 0.9897959232330322, "step": 6690 }, { "completion_length": 315.12245178222656, "epoch": 0.673308176100629, "grad_norm": 0.8733510971069336, "kl": 0.0615234375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6665370464324951, "reward_std": 0.2029930055141449, "rewards/accuracy_reward": 0.6767411530017853, "rewards/format_reward": 0.9897959232330322, "step": 6691 }, { "completion_length": 329.6938781738281, "epoch": 0.6734088050314465, "grad_norm": 0.6341254711151123, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.5836736559867859, "reward_std": 0.2261323556303978, "rewards/accuracy_reward": 0.61428602039814, "rewards/format_reward": 0.9693877398967743, "step": 6692 }, { "completion_length": 212.9795913696289, "epoch": 0.6735094339622641, "grad_norm": 0.667275071144104, "kl": 0.10302734375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7870126366615295, "reward_std": 0.12727763503789902, "rewards/accuracy_reward": 0.7972167432308197, "rewards/format_reward": 0.9897959232330322, "step": 6693 }, { "completion_length": 260.9795913696289, "epoch": 0.6736100628930818, "grad_norm": 0.6910128593444824, "kl": 0.111328125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8463916778564453, "reward_std": 0.11962802335619926, "rewards/accuracy_reward": 0.8565958738327026, "rewards/format_reward": 0.9897959232330322, "step": 6694 }, { "completion_length": 254.02040100097656, "epoch": 0.6737106918238994, "grad_norm": 0.43412312865257263, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7856623530387878, "reward_std": 0.10319872945547104, "rewards/accuracy_reward": 0.785662442445755, "rewards/format_reward": 1.0, "step": 6695 }, { "completion_length": 280.7448959350586, "epoch": 0.673811320754717, "grad_norm": 0.5531964302062988, "kl": 0.0606689453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7408736944198608, "reward_std": 0.08209188655018806, "rewards/accuracy_reward": 0.7408736944198608, "rewards/format_reward": 1.0, "step": 6696 }, { "completion_length": 270.29591369628906, "epoch": 0.6739119496855346, "grad_norm": 0.5425513386726379, "kl": 0.0631103515625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8619047403335571, "reward_std": 0.1881280653178692, "rewards/accuracy_reward": 0.8721087872982025, "rewards/format_reward": 0.9897959232330322, "step": 6697 }, { "completion_length": 278.05101013183594, "epoch": 0.6740125786163522, "grad_norm": 0.6613531708717346, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6799849271774292, "reward_std": 0.12872076034545898, "rewards/accuracy_reward": 0.6901890337467194, "rewards/format_reward": 0.9897959232330322, "step": 6698 }, { "completion_length": 215.40816497802734, "epoch": 0.6741132075471699, "grad_norm": 1.0478099584579468, "kl": 0.107666015625, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.727807879447937, "reward_std": 0.14561722055077553, "rewards/accuracy_reward": 0.7278079092502594, "rewards/format_reward": 1.0, "step": 6699 }, { "completion_length": 246.77550506591797, "epoch": 0.6742138364779874, "grad_norm": 0.5606674551963806, "kl": 0.107177734375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7356992363929749, "reward_std": 0.15109731629490852, "rewards/accuracy_reward": 0.7356992065906525, "rewards/format_reward": 1.0, "step": 6700 }, { "completion_length": 213.07142639160156, "epoch": 0.674314465408805, "grad_norm": 0.9587053060531616, "kl": 0.1259765625, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.777339518070221, "reward_std": 0.130808275192976, "rewards/accuracy_reward": 0.7977476716041565, "rewards/format_reward": 0.9795918464660645, "step": 6701 }, { "completion_length": 189.7551040649414, "epoch": 0.6744150943396227, "grad_norm": 1.056257724761963, "kl": 0.0604248046875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.787657916545868, "reward_std": 0.07377313077449799, "rewards/accuracy_reward": 0.7876579165458679, "rewards/format_reward": 1.0, "step": 6702 }, { "completion_length": 308.7550964355469, "epoch": 0.6745157232704403, "grad_norm": 0.507653534412384, "kl": 0.058837890625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7064657807350159, "reward_std": 0.13230548053979874, "rewards/accuracy_reward": 0.7166699171066284, "rewards/format_reward": 0.9897959232330322, "step": 6703 }, { "completion_length": 301.8571319580078, "epoch": 0.6746163522012578, "grad_norm": 0.999130368232727, "kl": 0.0692138671875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7490329146385193, "reward_std": 0.16870933771133423, "rewards/accuracy_reward": 0.7694410979747772, "rewards/format_reward": 0.9795918166637421, "step": 6704 }, { "completion_length": 297.12245178222656, "epoch": 0.6747169811320755, "grad_norm": 0.4186384677886963, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7020407915115356, "reward_std": 0.12124544009566307, "rewards/accuracy_reward": 0.7224489748477936, "rewards/format_reward": 0.9795918166637421, "step": 6705 }, { "completion_length": 236.9081573486328, "epoch": 0.6748176100628931, "grad_norm": 0.6269952654838562, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.783048391342163, "reward_std": 0.14016642048954964, "rewards/accuracy_reward": 0.7830485105514526, "rewards/format_reward": 1.0, "step": 6706 }, { "completion_length": 293.1632537841797, "epoch": 0.6749182389937107, "grad_norm": 1.6546040773391724, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6780113577842712, "reward_std": 0.27433306723833084, "rewards/accuracy_reward": 0.7086236476898193, "rewards/format_reward": 0.9693877398967743, "step": 6707 }, { "completion_length": 268.39796447753906, "epoch": 0.6750188679245283, "grad_norm": 0.9232499599456787, "kl": 0.057373046875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.763192355632782, "reward_std": 0.22258498519659042, "rewards/accuracy_reward": 0.7733964920043945, "rewards/format_reward": 0.9897959232330322, "step": 6708 }, { "completion_length": 192.80611419677734, "epoch": 0.6751194968553459, "grad_norm": 0.9962837100028992, "kl": 0.096923828125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8095688819885254, "reward_std": 0.12586896121501923, "rewards/accuracy_reward": 0.8299770057201385, "rewards/format_reward": 0.9795918464660645, "step": 6709 }, { "completion_length": 292.10203552246094, "epoch": 0.6752201257861635, "grad_norm": 0.5210533738136292, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.795918345451355, "reward_std": 0.23373909294605255, "rewards/accuracy_reward": 0.8469387590885162, "rewards/format_reward": 0.9489795863628387, "step": 6710 }, { "completion_length": 318.51019287109375, "epoch": 0.6753207547169812, "grad_norm": 0.44498956203460693, "kl": 0.0736083984375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.725117027759552, "reward_std": 0.14442498236894608, "rewards/accuracy_reward": 0.7251170575618744, "rewards/format_reward": 1.0, "step": 6711 }, { "completion_length": 285.06121826171875, "epoch": 0.6754213836477987, "grad_norm": 0.6834045052528381, "kl": 0.0587158203125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6476190090179443, "reward_std": 0.17923010140657425, "rewards/accuracy_reward": 0.6680271923542023, "rewards/format_reward": 0.9795918464660645, "step": 6712 }, { "completion_length": 273.23468017578125, "epoch": 0.6755220125786163, "grad_norm": 1.2550389766693115, "kl": 0.09130859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8116698861122131, "reward_std": 0.28466950356960297, "rewards/accuracy_reward": 0.8524862229824066, "rewards/format_reward": 0.9591836631298065, "step": 6713 }, { "completion_length": 317.9591827392578, "epoch": 0.675622641509434, "grad_norm": 0.6266700625419617, "kl": 0.0601806640625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.748717486858368, "reward_std": 0.1764105185866356, "rewards/accuracy_reward": 0.7793296873569489, "rewards/format_reward": 0.9693877398967743, "step": 6714 }, { "completion_length": 319.46937561035156, "epoch": 0.6757232704402516, "grad_norm": 1.7382935285568237, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8239795565605164, "reward_std": 0.16810067743062973, "rewards/accuracy_reward": 0.8341836631298065, "rewards/format_reward": 0.9897959232330322, "step": 6715 }, { "completion_length": 183.81632232666016, "epoch": 0.6758238993710692, "grad_norm": 1.1890156269073486, "kl": 0.13427734375, "learning_rate": 1e-06, "loss": 0.0054, "reward": 1.857416808605194, "reward_std": 0.1580227129161358, "rewards/accuracy_reward": 0.8778249323368073, "rewards/format_reward": 0.9795918166637421, "step": 6716 }, { "completion_length": 181.29591369628906, "epoch": 0.6759245283018868, "grad_norm": 1.0710597038269043, "kl": 0.124755859375, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.753617763519287, "reward_std": 0.04582291888073087, "rewards/accuracy_reward": 0.7536177933216095, "rewards/format_reward": 1.0, "step": 6717 }, { "completion_length": 295.4183654785156, "epoch": 0.6760251572327044, "grad_norm": 0.4019322693347931, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6602110266685486, "reward_std": 0.10805843956768513, "rewards/accuracy_reward": 0.6704151332378387, "rewards/format_reward": 0.9897959232330322, "step": 6718 }, { "completion_length": 286.57142639160156, "epoch": 0.676125786163522, "grad_norm": 0.9439589381217957, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7162026762962341, "reward_std": 0.1365415472537279, "rewards/accuracy_reward": 0.7366110384464264, "rewards/format_reward": 0.9795918166637421, "step": 6719 }, { "completion_length": 248.0408172607422, "epoch": 0.6762264150943397, "grad_norm": 0.6126861572265625, "kl": 0.110107421875, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8588435053825378, "reward_std": 0.10687560215592384, "rewards/accuracy_reward": 0.8588435053825378, "rewards/format_reward": 1.0, "step": 6720 }, { "completion_length": 231.20407104492188, "epoch": 0.6763270440251572, "grad_norm": 0.5247771739959717, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.846530556678772, "reward_std": 0.16215763241052628, "rewards/accuracy_reward": 0.8669387698173523, "rewards/format_reward": 0.9795918464660645, "step": 6721 }, { "completion_length": 284.79590606689453, "epoch": 0.6764276729559748, "grad_norm": 0.5364428162574768, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8013655543327332, "reward_std": 0.1446991115808487, "rewards/accuracy_reward": 0.8115697503089905, "rewards/format_reward": 0.9897959232330322, "step": 6722 }, { "completion_length": 295.6428527832031, "epoch": 0.6765283018867925, "grad_norm": 1.145029067993164, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7100340127944946, "reward_std": 0.19900383800268173, "rewards/accuracy_reward": 0.7202380895614624, "rewards/format_reward": 0.9897959232330322, "step": 6723 }, { "completion_length": 277.20408630371094, "epoch": 0.6766289308176101, "grad_norm": 0.8791751861572266, "kl": 0.112060546875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.635866641998291, "reward_std": 0.20326674729585648, "rewards/accuracy_reward": 0.6460707485675812, "rewards/format_reward": 0.9897959232330322, "step": 6724 }, { "completion_length": 294.93875885009766, "epoch": 0.6767295597484276, "grad_norm": 0.5618229508399963, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8172119855880737, "reward_std": 0.13254191726446152, "rewards/accuracy_reward": 0.8376201689243317, "rewards/format_reward": 0.9795918464660645, "step": 6725 }, { "completion_length": 189.82653045654297, "epoch": 0.6768301886792453, "grad_norm": 0.5422241687774658, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.9009482264518738, "reward_std": 0.13180746510624886, "rewards/accuracy_reward": 0.9213564097881317, "rewards/format_reward": 0.9795918166637421, "step": 6726 }, { "completion_length": 233.2346954345703, "epoch": 0.6769308176100629, "grad_norm": 0.5421875715255737, "kl": 0.115478515625, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.848891258239746, "reward_std": 0.10861014574766159, "rewards/accuracy_reward": 0.8692993521690369, "rewards/format_reward": 0.9795918166637421, "step": 6727 }, { "completion_length": 343.56121826171875, "epoch": 0.6770314465408805, "grad_norm": 0.39403006434440613, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7515626549720764, "reward_std": 0.11982444580644369, "rewards/accuracy_reward": 0.7617667615413666, "rewards/format_reward": 0.9897959232330322, "step": 6728 }, { "completion_length": 285.1734619140625, "epoch": 0.6771320754716981, "grad_norm": 0.6670039296150208, "kl": 0.09716796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7980498671531677, "reward_std": 0.12492987141013145, "rewards/accuracy_reward": 0.7980498969554901, "rewards/format_reward": 1.0, "step": 6729 }, { "completion_length": 231.91836547851562, "epoch": 0.6772327044025157, "grad_norm": 4.811575412750244, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.725664734840393, "reward_std": 0.13314105942845345, "rewards/accuracy_reward": 0.7358689308166504, "rewards/format_reward": 0.9897959232330322, "step": 6730 }, { "completion_length": 278.83673095703125, "epoch": 0.6773333333333333, "grad_norm": 0.6980561017990112, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7569573521614075, "reward_std": 0.15173207968473434, "rewards/accuracy_reward": 0.7671613693237305, "rewards/format_reward": 0.9897959232330322, "step": 6731 }, { "completion_length": 306.52040100097656, "epoch": 0.677433962264151, "grad_norm": 0.8832533359527588, "kl": 0.0743408203125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.811688244342804, "reward_std": 0.3194694146513939, "rewards/accuracy_reward": 0.842300534248352, "rewards/format_reward": 0.9693877398967743, "step": 6732 }, { "completion_length": 242.6224365234375, "epoch": 0.6775345911949685, "grad_norm": 0.5878891348838806, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7187280058860779, "reward_std": 0.13190508633852005, "rewards/accuracy_reward": 0.7187281250953674, "rewards/format_reward": 1.0, "step": 6733 }, { "completion_length": 196.9897918701172, "epoch": 0.6776352201257861, "grad_norm": 0.8071401715278625, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.9489795565605164, "reward_std": 0.06517763435840607, "rewards/accuracy_reward": 0.9693877398967743, "rewards/format_reward": 0.9795918166637421, "step": 6734 }, { "completion_length": 268.87754821777344, "epoch": 0.6777358490566038, "grad_norm": 2.4892470836639404, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8109512329101562, "reward_std": 0.0810624286532402, "rewards/accuracy_reward": 0.8211552500724792, "rewards/format_reward": 0.9897959232330322, "step": 6735 }, { "completion_length": 283.6428527832031, "epoch": 0.6778364779874214, "grad_norm": 0.6103280782699585, "kl": 0.101806640625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7243042588233948, "reward_std": 0.16181910037994385, "rewards/accuracy_reward": 0.7447124421596527, "rewards/format_reward": 0.9795918166637421, "step": 6736 }, { "completion_length": 237.11224365234375, "epoch": 0.6779371069182389, "grad_norm": 1.975391149520874, "kl": 0.0662841796875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7755101919174194, "reward_std": 0.13035526871681213, "rewards/accuracy_reward": 0.7857142686843872, "rewards/format_reward": 0.9897959232330322, "step": 6737 }, { "completion_length": 300.30611419677734, "epoch": 0.6780377358490566, "grad_norm": 0.36823752522468567, "kl": 0.08740234375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8163883090019226, "reward_std": 0.13907874189317226, "rewards/accuracy_reward": 0.8470006585121155, "rewards/format_reward": 0.9693877398967743, "step": 6738 }, { "completion_length": 237.8163299560547, "epoch": 0.6781383647798742, "grad_norm": 0.510169506072998, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8533909916877747, "reward_std": 0.11282718181610107, "rewards/accuracy_reward": 0.8635950982570648, "rewards/format_reward": 0.9897959232330322, "step": 6739 }, { "completion_length": 259.78570556640625, "epoch": 0.6782389937106919, "grad_norm": 0.2267264574766159, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.9325085878372192, "reward_std": 0.029425138141959906, "rewards/accuracy_reward": 0.9325085878372192, "rewards/format_reward": 1.0, "step": 6740 }, { "completion_length": 240.38775634765625, "epoch": 0.6783396226415095, "grad_norm": 0.5523471832275391, "kl": 0.0562744140625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8066917657852173, "reward_std": 0.08522481471300125, "rewards/accuracy_reward": 0.8066917359828949, "rewards/format_reward": 1.0, "step": 6741 }, { "completion_length": 340.29590606689453, "epoch": 0.678440251572327, "grad_norm": 0.5707544088363647, "kl": 0.0716552734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.591431975364685, "reward_std": 0.18519281595945358, "rewards/accuracy_reward": 0.6220442056655884, "rewards/format_reward": 0.9693877398967743, "step": 6742 }, { "completion_length": 161.93877410888672, "epoch": 0.6785408805031446, "grad_norm": 0.5932420492172241, "kl": 0.12646484375, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.8631725311279297, "reward_std": 0.0577996876090765, "rewards/accuracy_reward": 0.8733766376972198, "rewards/format_reward": 0.9897959232330322, "step": 6743 }, { "completion_length": 227.07142639160156, "epoch": 0.6786415094339623, "grad_norm": 1.7544264793395996, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7741341590881348, "reward_std": 0.19062313437461853, "rewards/accuracy_reward": 0.7945423424243927, "rewards/format_reward": 0.9795918464660645, "step": 6744 }, { "completion_length": 322.4795837402344, "epoch": 0.6787421383647799, "grad_norm": 1.4784038066864014, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6250033378601074, "reward_std": 0.15649202838540077, "rewards/accuracy_reward": 0.6454115211963654, "rewards/format_reward": 0.9795918166637421, "step": 6745 }, { "completion_length": 316.1428527832031, "epoch": 0.6788427672955974, "grad_norm": 0.6406766176223755, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.812131404876709, "reward_std": 0.20408396422863007, "rewards/accuracy_reward": 0.8427436947822571, "rewards/format_reward": 0.9693877398967743, "step": 6746 }, { "completion_length": 257.0, "epoch": 0.6789433962264151, "grad_norm": 1.5236968994140625, "kl": 0.137451171875, "learning_rate": 1e-06, "loss": 0.0055, "reward": 1.6978084444999695, "reward_std": 0.12656420841813087, "rewards/accuracy_reward": 0.7080126702785492, "rewards/format_reward": 0.9897959232330322, "step": 6747 }, { "completion_length": 246.25509643554688, "epoch": 0.6790440251572327, "grad_norm": 0.6590423583984375, "kl": 0.0762939453125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7605684399604797, "reward_std": 0.19824448972940445, "rewards/accuracy_reward": 0.7605684697628021, "rewards/format_reward": 1.0, "step": 6748 }, { "completion_length": 281.3163299560547, "epoch": 0.6791446540880504, "grad_norm": 0.7123144268989563, "kl": 0.105712890625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.6881908178329468, "reward_std": 0.1870282143354416, "rewards/accuracy_reward": 0.7085990309715271, "rewards/format_reward": 0.9795918464660645, "step": 6749 }, { "completion_length": 224.01020050048828, "epoch": 0.6792452830188679, "grad_norm": 0.41925373673439026, "kl": 0.093017578125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.9160040616989136, "reward_std": 0.07096568681299686, "rewards/accuracy_reward": 0.9160040020942688, "rewards/format_reward": 1.0, "step": 6750 }, { "completion_length": 293.1020278930664, "epoch": 0.6793459119496855, "grad_norm": 0.5827377438545227, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7022112011909485, "reward_std": 0.149881973862648, "rewards/accuracy_reward": 0.7124153077602386, "rewards/format_reward": 0.9897959232330322, "step": 6751 }, { "completion_length": 230.18366241455078, "epoch": 0.6794465408805032, "grad_norm": 1.9411671161651611, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7732531428337097, "reward_std": 0.2538967654109001, "rewards/accuracy_reward": 0.7936612665653229, "rewards/format_reward": 0.9795918464660645, "step": 6752 }, { "completion_length": 241.86734008789062, "epoch": 0.6795471698113208, "grad_norm": 2.2166757583618164, "kl": 0.0750732421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7043235301971436, "reward_std": 0.06965433433651924, "rewards/accuracy_reward": 0.7043235898017883, "rewards/format_reward": 1.0, "step": 6753 }, { "completion_length": 319.1632537841797, "epoch": 0.6796477987421383, "grad_norm": 0.66994309425354, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6071452498435974, "reward_std": 0.19431446120142937, "rewards/accuracy_reward": 0.617349311709404, "rewards/format_reward": 0.9897959232330322, "step": 6754 }, { "completion_length": 274.21427154541016, "epoch": 0.679748427672956, "grad_norm": 0.5548255443572998, "kl": 0.111328125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7519104480743408, "reward_std": 0.15966198220849037, "rewards/accuracy_reward": 0.751910537481308, "rewards/format_reward": 1.0, "step": 6755 }, { "completion_length": 235.65306091308594, "epoch": 0.6798490566037736, "grad_norm": 1.1863901615142822, "kl": 0.113525390625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.6196449398994446, "reward_std": 0.2653956562280655, "rewards/accuracy_reward": 0.6502571702003479, "rewards/format_reward": 0.9693877398967743, "step": 6756 }, { "completion_length": 245.63265228271484, "epoch": 0.6799496855345912, "grad_norm": 0.19007596373558044, "kl": 0.115234375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.9081632494926453, "reward_std": 0.03485357388854027, "rewards/accuracy_reward": 0.918367326259613, "rewards/format_reward": 0.9897959232330322, "step": 6757 }, { "completion_length": 205.73468780517578, "epoch": 0.6800503144654088, "grad_norm": 0.3456554114818573, "kl": 0.099609375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8149334788322449, "reward_std": 0.06427008472383022, "rewards/accuracy_reward": 0.8251376152038574, "rewards/format_reward": 0.9897959232330322, "step": 6758 }, { "completion_length": 239.4081573486328, "epoch": 0.6801509433962264, "grad_norm": 1.1045798063278198, "kl": 0.099609375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7928791642189026, "reward_std": 0.08395208790898323, "rewards/accuracy_reward": 0.8030832707881927, "rewards/format_reward": 0.9897959232330322, "step": 6759 }, { "completion_length": 241.35713958740234, "epoch": 0.680251572327044, "grad_norm": 0.5003445148468018, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7781547904014587, "reward_std": 0.07295482978224754, "rewards/accuracy_reward": 0.7781548202037811, "rewards/format_reward": 1.0, "step": 6760 }, { "completion_length": 253.7040786743164, "epoch": 0.6803522012578617, "grad_norm": 0.8502867817878723, "kl": 0.113525390625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.8207380175590515, "reward_std": 0.17359097301959991, "rewards/accuracy_reward": 0.8309420645236969, "rewards/format_reward": 0.9897959232330322, "step": 6761 }, { "completion_length": 208.89795684814453, "epoch": 0.6804528301886792, "grad_norm": 0.6608180403709412, "kl": 0.10986328125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7405709624290466, "reward_std": 0.1211351528763771, "rewards/accuracy_reward": 0.750775009393692, "rewards/format_reward": 0.9897959232330322, "step": 6762 }, { "completion_length": 186.66326141357422, "epoch": 0.6805534591194968, "grad_norm": 0.7825878262519836, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7015371918678284, "reward_std": 0.05476026609539986, "rewards/accuracy_reward": 0.7015372514724731, "rewards/format_reward": 1.0, "step": 6763 }, { "completion_length": 258.0, "epoch": 0.6806540880503145, "grad_norm": 1.128116250038147, "kl": 0.094970703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7272109389305115, "reward_std": 0.16130079329013824, "rewards/accuracy_reward": 0.7476190328598022, "rewards/format_reward": 0.9795918464660645, "step": 6764 }, { "completion_length": 189.66326141357422, "epoch": 0.6807547169811321, "grad_norm": 0.5185973048210144, "kl": 0.10498046875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.801546037197113, "reward_std": 0.028376154601573944, "rewards/accuracy_reward": 0.8117501139640808, "rewards/format_reward": 0.9897959232330322, "step": 6765 }, { "completion_length": 200.36734008789062, "epoch": 0.6808553459119497, "grad_norm": 0.6998487114906311, "kl": 0.12109375, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.796976089477539, "reward_std": 0.1275608353316784, "rewards/accuracy_reward": 0.8071802854537964, "rewards/format_reward": 0.9897959232330322, "step": 6766 }, { "completion_length": 257.3571319580078, "epoch": 0.6809559748427673, "grad_norm": 1.6618212461471558, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7105317115783691, "reward_std": 0.1872611865401268, "rewards/accuracy_reward": 0.7105316817760468, "rewards/format_reward": 1.0, "step": 6767 }, { "completion_length": 194.38775634765625, "epoch": 0.6810566037735849, "grad_norm": 0.5729688405990601, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8624716401100159, "reward_std": 0.09714530408382416, "rewards/accuracy_reward": 0.8624716699123383, "rewards/format_reward": 1.0, "step": 6768 }, { "completion_length": 201.4693832397461, "epoch": 0.6811572327044025, "grad_norm": 0.28236034512519836, "kl": 0.1033935546875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.9203875064849854, "reward_std": 0.07307783886790276, "rewards/accuracy_reward": 0.9203875660896301, "rewards/format_reward": 1.0, "step": 6769 }, { "completion_length": 307.6428527832031, "epoch": 0.6812578616352202, "grad_norm": 0.9945938587188721, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5071914792060852, "reward_std": 0.17996331304311752, "rewards/accuracy_reward": 0.527599573135376, "rewards/format_reward": 0.9795918166637421, "step": 6770 }, { "completion_length": 246.4795913696289, "epoch": 0.6813584905660377, "grad_norm": 1.192880630493164, "kl": 0.10107421875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7107521295547485, "reward_std": 0.1277632750570774, "rewards/accuracy_reward": 0.7413643598556519, "rewards/format_reward": 0.9693877398967743, "step": 6771 }, { "completion_length": 271.27550506591797, "epoch": 0.6814591194968553, "grad_norm": 0.8333380818367004, "kl": 0.1259765625, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.662407398223877, "reward_std": 0.17041685432195663, "rewards/accuracy_reward": 0.6726114749908447, "rewards/format_reward": 0.9897959232330322, "step": 6772 }, { "completion_length": 241.77550506591797, "epoch": 0.681559748427673, "grad_norm": 0.7097369432449341, "kl": 0.095458984375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.714857280254364, "reward_std": 0.19828187301754951, "rewards/accuracy_reward": 0.7454695701599121, "rewards/format_reward": 0.9693877398967743, "step": 6773 }, { "completion_length": 283.948974609375, "epoch": 0.6816603773584906, "grad_norm": 0.5854558944702148, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7461127042770386, "reward_std": 0.18123720586299896, "rewards/accuracy_reward": 0.7869290411472321, "rewards/format_reward": 0.9591836631298065, "step": 6774 }, { "completion_length": 259.42857360839844, "epoch": 0.6817610062893081, "grad_norm": 0.8558329939842224, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7648937702178955, "reward_std": 0.19786541908979416, "rewards/accuracy_reward": 0.7750978767871857, "rewards/format_reward": 0.9897959232330322, "step": 6775 }, { "completion_length": 194.1734619140625, "epoch": 0.6818616352201258, "grad_norm": 0.13464729487895966, "kl": 0.09765625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.9241982102394104, "reward_std": 0.011570342816412449, "rewards/accuracy_reward": 0.9241982102394104, "rewards/format_reward": 1.0, "step": 6776 }, { "completion_length": 202.24488830566406, "epoch": 0.6819622641509434, "grad_norm": 0.7732370495796204, "kl": 0.117919921875, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.8090142607688904, "reward_std": 0.15252558887004852, "rewards/accuracy_reward": 0.8090143203735352, "rewards/format_reward": 1.0, "step": 6777 }, { "completion_length": 166.13265228271484, "epoch": 0.682062893081761, "grad_norm": 1.9760226011276245, "kl": 0.1064453125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.6954896450042725, "reward_std": 0.19741395115852356, "rewards/accuracy_reward": 0.7056937515735626, "rewards/format_reward": 0.9897959232330322, "step": 6778 }, { "completion_length": 199.4693832397461, "epoch": 0.6821635220125786, "grad_norm": 0.6554648280143738, "kl": 0.1138916015625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.827678918838501, "reward_std": 0.10023099184036255, "rewards/accuracy_reward": 0.8378830254077911, "rewards/format_reward": 0.9897959232330322, "step": 6779 }, { "completion_length": 253.7755126953125, "epoch": 0.6822641509433962, "grad_norm": 1.1208559274673462, "kl": 0.0784912109375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7317028045654297, "reward_std": 0.16917918622493744, "rewards/accuracy_reward": 0.7419069111347198, "rewards/format_reward": 0.9897959232330322, "step": 6780 }, { "completion_length": 254.34693908691406, "epoch": 0.6823647798742138, "grad_norm": 0.8506542444229126, "kl": 0.100830078125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7332559823989868, "reward_std": 0.2620515450835228, "rewards/accuracy_reward": 0.7638682425022125, "rewards/format_reward": 0.9693877398967743, "step": 6781 }, { "completion_length": 208.87754821777344, "epoch": 0.6824654088050315, "grad_norm": 0.9169580340385437, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.746355652809143, "reward_std": 0.16151238977909088, "rewards/accuracy_reward": 0.766763836145401, "rewards/format_reward": 0.9795918464660645, "step": 6782 }, { "completion_length": 240.62244415283203, "epoch": 0.682566037735849, "grad_norm": 0.4301142990589142, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7667081356048584, "reward_std": 0.06652279943227768, "rewards/accuracy_reward": 0.787116289138794, "rewards/format_reward": 0.9795918166637421, "step": 6783 }, { "completion_length": 235.39794921875, "epoch": 0.6826666666666666, "grad_norm": 0.7370247840881348, "kl": 0.126708984375, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.69685560464859, "reward_std": 0.1657116338610649, "rewards/accuracy_reward": 0.7070597112178802, "rewards/format_reward": 0.9897959232330322, "step": 6784 }, { "completion_length": 250.09182739257812, "epoch": 0.6827672955974843, "grad_norm": 0.5218515992164612, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7656813859939575, "reward_std": 0.1374020166695118, "rewards/accuracy_reward": 0.7758854627609253, "rewards/format_reward": 0.9897959232330322, "step": 6785 }, { "completion_length": 215.55101776123047, "epoch": 0.6828679245283019, "grad_norm": 0.6724594831466675, "kl": 0.1484375, "learning_rate": 1e-06, "loss": 0.006, "reward": 1.7133844494819641, "reward_std": 0.14598765224218369, "rewards/accuracy_reward": 0.7133845686912537, "rewards/format_reward": 1.0, "step": 6786 }, { "completion_length": 277.8571319580078, "epoch": 0.6829685534591194, "grad_norm": 6.703578948974609, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8352769017219543, "reward_std": 0.15653802081942558, "rewards/accuracy_reward": 0.876093327999115, "rewards/format_reward": 0.9591836631298065, "step": 6787 }, { "completion_length": 307.2142791748047, "epoch": 0.6830691823899371, "grad_norm": 0.6112971901893616, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.494886875152588, "reward_std": 0.17674307897686958, "rewards/accuracy_reward": 0.5050910115242004, "rewards/format_reward": 0.9897959232330322, "step": 6788 }, { "completion_length": 175.05101776123047, "epoch": 0.6831698113207547, "grad_norm": 0.5703505873680115, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8570502400398254, "reward_std": 0.08406605944037437, "rewards/accuracy_reward": 0.8570502698421478, "rewards/format_reward": 1.0, "step": 6789 }, { "completion_length": 184.05101776123047, "epoch": 0.6832704402515724, "grad_norm": 0.9154426455497742, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.852769672870636, "reward_std": 0.05840015783905983, "rewards/accuracy_reward": 0.8527696430683136, "rewards/format_reward": 1.0, "step": 6790 }, { "completion_length": 198.60203552246094, "epoch": 0.68337106918239, "grad_norm": 0.8915449380874634, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6974290013313293, "reward_std": 0.16449443250894547, "rewards/accuracy_reward": 0.6974290609359741, "rewards/format_reward": 1.0, "step": 6791 }, { "completion_length": 294.9183578491211, "epoch": 0.6834716981132075, "grad_norm": 0.7929893732070923, "kl": 0.10888671875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.5728999376296997, "reward_std": 0.24675142019987106, "rewards/accuracy_reward": 0.5831040591001511, "rewards/format_reward": 0.9897959232330322, "step": 6792 }, { "completion_length": 278.3265380859375, "epoch": 0.6835723270440252, "grad_norm": 0.7499387860298157, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.747483253479004, "reward_std": 0.1854601800441742, "rewards/accuracy_reward": 0.7678913176059723, "rewards/format_reward": 0.9795918166637421, "step": 6793 }, { "completion_length": 189.2346954345703, "epoch": 0.6836729559748428, "grad_norm": 1.0810296535491943, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7842565774917603, "reward_std": 0.1491200104355812, "rewards/accuracy_reward": 0.7842565476894379, "rewards/format_reward": 1.0, "step": 6794 }, { "completion_length": 234.84693145751953, "epoch": 0.6837735849056604, "grad_norm": 0.9790921211242676, "kl": 0.108154296875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.722449004650116, "reward_std": 0.22637860476970673, "rewards/accuracy_reward": 0.7530612051486969, "rewards/format_reward": 0.9693877398967743, "step": 6795 }, { "completion_length": 198.5, "epoch": 0.683874213836478, "grad_norm": 0.879320502281189, "kl": 0.0682373046875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8060669302940369, "reward_std": 0.15338389575481415, "rewards/accuracy_reward": 0.8264751136302948, "rewards/format_reward": 0.9795918464660645, "step": 6796 }, { "completion_length": 224.6428451538086, "epoch": 0.6839748427672956, "grad_norm": 1.0306915044784546, "kl": 0.129638671875, "learning_rate": 1e-06, "loss": 0.0053, "reward": 1.6615488529205322, "reward_std": 0.21888232976198196, "rewards/accuracy_reward": 0.6921611130237579, "rewards/format_reward": 0.9693877398967743, "step": 6797 }, { "completion_length": 241.36734008789062, "epoch": 0.6840754716981132, "grad_norm": 0.5552818179130554, "kl": 0.091552734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7997946739196777, "reward_std": 0.11080068349838257, "rewards/accuracy_reward": 0.7997947633266449, "rewards/format_reward": 1.0, "step": 6798 }, { "completion_length": 216.87754821777344, "epoch": 0.6841761006289309, "grad_norm": 0.7617443799972534, "kl": 0.120361328125, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.7530962228775024, "reward_std": 0.16258443146944046, "rewards/accuracy_reward": 0.7633002996444702, "rewards/format_reward": 0.9897959232330322, "step": 6799 }, { "completion_length": 235.3571319580078, "epoch": 0.6842767295597484, "grad_norm": 0.7163278460502625, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8608843088150024, "reward_std": 0.1655549332499504, "rewards/accuracy_reward": 0.8812924921512604, "rewards/format_reward": 0.9795918464660645, "step": 6800 }, { "completion_length": 197.448974609375, "epoch": 0.684377358490566, "grad_norm": 1.2271919250488281, "kl": 0.0694580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8168367147445679, "reward_std": 0.23984336107969284, "rewards/accuracy_reward": 0.8168367147445679, "rewards/format_reward": 1.0, "step": 6801 }, { "completion_length": 255.92855834960938, "epoch": 0.6844779874213837, "grad_norm": 0.7668208479881287, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.816628873348236, "reward_std": 0.15811002627015114, "rewards/accuracy_reward": 0.8166288435459137, "rewards/format_reward": 1.0, "step": 6802 }, { "completion_length": 255.95917510986328, "epoch": 0.6845786163522013, "grad_norm": 0.5402372479438782, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7403029203414917, "reward_std": 0.13340186700224876, "rewards/accuracy_reward": 0.7505070269107819, "rewards/format_reward": 0.9897959232330322, "step": 6803 }, { "completion_length": 211.13265228271484, "epoch": 0.6846792452830188, "grad_norm": 0.9436323046684265, "kl": 0.08740234375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6159992218017578, "reward_std": 0.19801947474479675, "rewards/accuracy_reward": 0.6159992516040802, "rewards/format_reward": 1.0, "step": 6804 }, { "completion_length": 226.7448959350586, "epoch": 0.6847798742138365, "grad_norm": 0.6928024291992188, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8046777248382568, "reward_std": 0.10965510085225105, "rewards/accuracy_reward": 0.804677814245224, "rewards/format_reward": 1.0, "step": 6805 }, { "completion_length": 241.1530532836914, "epoch": 0.6848805031446541, "grad_norm": 0.586996853351593, "kl": 0.10595703125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.9122305512428284, "reward_std": 0.11388813331723213, "rewards/accuracy_reward": 0.9122306406497955, "rewards/format_reward": 1.0, "step": 6806 }, { "completion_length": 237.55101013183594, "epoch": 0.6849811320754717, "grad_norm": 0.7435793876647949, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8061224222183228, "reward_std": 0.15069952607154846, "rewards/accuracy_reward": 0.8265306055545807, "rewards/format_reward": 0.9795918166637421, "step": 6807 }, { "completion_length": 281.2755126953125, "epoch": 0.6850817610062893, "grad_norm": 0.7253326773643494, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7667954564094543, "reward_std": 0.16482602059841156, "rewards/accuracy_reward": 0.7974077463150024, "rewards/format_reward": 0.9693877398967743, "step": 6808 }, { "completion_length": 272.1326446533203, "epoch": 0.6851823899371069, "grad_norm": 0.8781453967094421, "kl": 0.119384765625, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.649949073791504, "reward_std": 0.21407339721918106, "rewards/accuracy_reward": 0.6703572869300842, "rewards/format_reward": 0.9795918166637421, "step": 6809 }, { "completion_length": 231.31631469726562, "epoch": 0.6852830188679245, "grad_norm": 0.5765876173973083, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8895716667175293, "reward_std": 0.10672533884644508, "rewards/accuracy_reward": 0.8997757732868195, "rewards/format_reward": 0.9897959232330322, "step": 6810 }, { "completion_length": 309.6938781738281, "epoch": 0.6853836477987422, "grad_norm": 0.5552103519439697, "kl": 0.0780029296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.762629210948944, "reward_std": 0.1427537016570568, "rewards/accuracy_reward": 0.7728332579135895, "rewards/format_reward": 0.9897959232330322, "step": 6811 }, { "completion_length": 229.04080963134766, "epoch": 0.6854842767295597, "grad_norm": 1.3254773616790771, "kl": 0.100341796875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.562040627002716, "reward_std": 0.130258247256279, "rewards/accuracy_reward": 0.5722447633743286, "rewards/format_reward": 0.9897959232330322, "step": 6812 }, { "completion_length": 239.38775634765625, "epoch": 0.6855849056603773, "grad_norm": 0.9887751340866089, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.5902615785598755, "reward_std": 0.11940514668822289, "rewards/accuracy_reward": 0.600465714931488, "rewards/format_reward": 0.9897959232330322, "step": 6813 }, { "completion_length": 219.9081573486328, "epoch": 0.685685534591195, "grad_norm": 0.6971067786216736, "kl": 0.101318359375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6361935138702393, "reward_std": 0.0791652649641037, "rewards/accuracy_reward": 0.6668058186769485, "rewards/format_reward": 0.9693877398967743, "step": 6814 }, { "completion_length": 189.32652282714844, "epoch": 0.6857861635220126, "grad_norm": 0.7917124629020691, "kl": 0.09521484375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7806411385536194, "reward_std": 0.20023195445537567, "rewards/accuracy_reward": 0.7908452153205872, "rewards/format_reward": 0.9897959232330322, "step": 6815 }, { "completion_length": 193.52040100097656, "epoch": 0.6858867924528302, "grad_norm": 0.4211457669734955, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.787426769733429, "reward_std": 0.13750357180833817, "rewards/accuracy_reward": 0.8078348934650421, "rewards/format_reward": 0.9795918166637421, "step": 6816 }, { "completion_length": 181.64285278320312, "epoch": 0.6859874213836478, "grad_norm": 0.728684663772583, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8650793433189392, "reward_std": 0.12402895465493202, "rewards/accuracy_reward": 0.8650793433189392, "rewards/format_reward": 1.0, "step": 6817 }, { "completion_length": 238.7448959350586, "epoch": 0.6860880503144654, "grad_norm": 1.1913968324661255, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8571428060531616, "reward_std": 0.146170012652874, "rewards/accuracy_reward": 0.8877550661563873, "rewards/format_reward": 0.9693877398967743, "step": 6818 }, { "completion_length": 228.38774871826172, "epoch": 0.686188679245283, "grad_norm": 1.0598289966583252, "kl": 0.128173828125, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.5947338938713074, "reward_std": 0.17325299605727196, "rewards/accuracy_reward": 0.6049380004405975, "rewards/format_reward": 0.9897959232330322, "step": 6819 }, { "completion_length": 257.79590606689453, "epoch": 0.6862893081761007, "grad_norm": 0.3888552486896515, "kl": 0.135009765625, "learning_rate": 1e-06, "loss": 0.0054, "reward": 1.67909175157547, "reward_std": 0.09866811335086823, "rewards/accuracy_reward": 0.6790917217731476, "rewards/format_reward": 1.0, "step": 6820 }, { "completion_length": 249.6836700439453, "epoch": 0.6863899371069182, "grad_norm": 0.7906296849250793, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8265305757522583, "reward_std": 0.16984088718891144, "rewards/accuracy_reward": 0.8367346823215485, "rewards/format_reward": 0.9897959232330322, "step": 6821 }, { "completion_length": 206.25509643554688, "epoch": 0.6864905660377358, "grad_norm": 0.6002498269081116, "kl": 0.06103515625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.735548973083496, "reward_std": 0.1858682706952095, "rewards/accuracy_reward": 0.755957156419754, "rewards/format_reward": 0.9795918166637421, "step": 6822 }, { "completion_length": 217.9183578491211, "epoch": 0.6865911949685535, "grad_norm": 1.7787668704986572, "kl": 0.108154296875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.663403570652008, "reward_std": 0.2722727209329605, "rewards/accuracy_reward": 0.6940158605575562, "rewards/format_reward": 0.9693877398967743, "step": 6823 }, { "completion_length": 220.29591369628906, "epoch": 0.6866918238993711, "grad_norm": 0.44102737307548523, "kl": 0.1142578125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.8062195777893066, "reward_std": 0.08134188503026962, "rewards/accuracy_reward": 0.8062195777893066, "rewards/format_reward": 1.0, "step": 6824 }, { "completion_length": 261.55101776123047, "epoch": 0.6867924528301886, "grad_norm": 0.4706028997898102, "kl": 0.100830078125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8269977569580078, "reward_std": 0.14691338315606117, "rewards/accuracy_reward": 0.8372018337249756, "rewards/format_reward": 0.9897959232330322, "step": 6825 }, { "completion_length": 203.38775634765625, "epoch": 0.6868930817610063, "grad_norm": 0.6737871170043945, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7638012170791626, "reward_std": 0.15442460030317307, "rewards/accuracy_reward": 0.7740053236484528, "rewards/format_reward": 0.9897959232330322, "step": 6826 }, { "completion_length": 273.2653045654297, "epoch": 0.6869937106918239, "grad_norm": 0.8002402782440186, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7653303146362305, "reward_std": 0.27253517508506775, "rewards/accuracy_reward": 0.8061467111110687, "rewards/format_reward": 0.9591836631298065, "step": 6827 }, { "completion_length": 186.16326141357422, "epoch": 0.6870943396226415, "grad_norm": 1.0364353656768799, "kl": 0.123779296875, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.6473053097724915, "reward_std": 0.11165430024266243, "rewards/accuracy_reward": 0.6473053991794586, "rewards/format_reward": 1.0, "step": 6828 }, { "completion_length": 193.448974609375, "epoch": 0.6871949685534591, "grad_norm": 0.6491318345069885, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8252627849578857, "reward_std": 0.11002867296338081, "rewards/accuracy_reward": 0.8558750152587891, "rewards/format_reward": 0.9693877398967743, "step": 6829 }, { "completion_length": 243.76529693603516, "epoch": 0.6872955974842767, "grad_norm": 0.884227991104126, "kl": 0.11181640625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.702631950378418, "reward_std": 0.3259834945201874, "rewards/accuracy_reward": 0.7434483170509338, "rewards/format_reward": 0.9591836631298065, "step": 6830 }, { "completion_length": 217.4081573486328, "epoch": 0.6873962264150943, "grad_norm": 0.9892213940620422, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8104308247566223, "reward_std": 0.19656437821686268, "rewards/accuracy_reward": 0.8308389782905579, "rewards/format_reward": 0.9795918166637421, "step": 6831 }, { "completion_length": 253.2448959350586, "epoch": 0.687496855345912, "grad_norm": 0.7229545712471008, "kl": 0.112060546875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.770605981349945, "reward_std": 0.15631911903619766, "rewards/accuracy_reward": 0.7910140752792358, "rewards/format_reward": 0.9795918464660645, "step": 6832 }, { "completion_length": 177.2448959350586, "epoch": 0.6875974842767295, "grad_norm": 0.8680943846702576, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.805636465549469, "reward_std": 0.17155035585165024, "rewards/accuracy_reward": 0.8056364953517914, "rewards/format_reward": 1.0, "step": 6833 }, { "completion_length": 199.7244873046875, "epoch": 0.6876981132075471, "grad_norm": 1.0913341045379639, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.768098771572113, "reward_std": 0.15001579374074936, "rewards/accuracy_reward": 0.7885069847106934, "rewards/format_reward": 0.9795918464660645, "step": 6834 }, { "completion_length": 242.30612182617188, "epoch": 0.6877987421383648, "grad_norm": 0.34558433294296265, "kl": 0.0753173828125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.668407380580902, "reward_std": 0.10516506806015968, "rewards/accuracy_reward": 0.6990196406841278, "rewards/format_reward": 0.9693877398967743, "step": 6835 }, { "completion_length": 280.7958984375, "epoch": 0.6878993710691824, "grad_norm": 0.4132179021835327, "kl": 0.0572509765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8095237016677856, "reward_std": 0.13417114317417145, "rewards/accuracy_reward": 0.8401360809803009, "rewards/format_reward": 0.9693877398967743, "step": 6836 }, { "completion_length": 345.4183654785156, "epoch": 0.688, "grad_norm": 0.8716685771942139, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7738094925880432, "reward_std": 0.14081589132547379, "rewards/accuracy_reward": 0.784013569355011, "rewards/format_reward": 0.9897959232330322, "step": 6837 }, { "completion_length": 261.84693908691406, "epoch": 0.6881006289308176, "grad_norm": 0.6621041297912598, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7312843799591064, "reward_std": 0.16752362251281738, "rewards/accuracy_reward": 0.7414884567260742, "rewards/format_reward": 0.9897959232330322, "step": 6838 }, { "completion_length": 259.28570556640625, "epoch": 0.6882012578616352, "grad_norm": 0.7784952521324158, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.673469364643097, "reward_std": 0.25229230523109436, "rewards/accuracy_reward": 0.6938775181770325, "rewards/format_reward": 0.9795918166637421, "step": 6839 }, { "completion_length": 246.29591369628906, "epoch": 0.6883018867924529, "grad_norm": 0.8633591532707214, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7075539231300354, "reward_std": 0.16068000346422195, "rewards/accuracy_reward": 0.7279620468616486, "rewards/format_reward": 0.9795918464660645, "step": 6840 }, { "completion_length": 216.73468017578125, "epoch": 0.6884025157232705, "grad_norm": 1.007272481918335, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7821304202079773, "reward_std": 0.23403286188840866, "rewards/accuracy_reward": 0.8229467272758484, "rewards/format_reward": 0.9591836333274841, "step": 6841 }, { "completion_length": 197.35713958740234, "epoch": 0.688503144654088, "grad_norm": 0.5301870703697205, "kl": 0.0787353515625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.942176878452301, "reward_std": 0.0950244590640068, "rewards/accuracy_reward": 0.942176878452301, "rewards/format_reward": 1.0, "step": 6842 }, { "completion_length": 205.05101013183594, "epoch": 0.6886037735849057, "grad_norm": 0.5858825445175171, "kl": 0.10302734375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.842574417591095, "reward_std": 0.10889612138271332, "rewards/accuracy_reward": 0.8629826009273529, "rewards/format_reward": 0.9795918166637421, "step": 6843 }, { "completion_length": 206.61223602294922, "epoch": 0.6887044025157233, "grad_norm": 0.4944264590740204, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7883425951004028, "reward_std": 0.15004462003707886, "rewards/accuracy_reward": 0.7883425951004028, "rewards/format_reward": 1.0, "step": 6844 }, { "completion_length": 300.4897918701172, "epoch": 0.6888050314465409, "grad_norm": 0.37485140562057495, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7538264989852905, "reward_std": 0.1530945599079132, "rewards/accuracy_reward": 0.7742346823215485, "rewards/format_reward": 0.9795918464660645, "step": 6845 }, { "completion_length": 311.5408020019531, "epoch": 0.6889056603773585, "grad_norm": 0.4144272208213806, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6229406595230103, "reward_std": 0.1381811574101448, "rewards/accuracy_reward": 0.6433488130569458, "rewards/format_reward": 0.9795918166637421, "step": 6846 }, { "completion_length": 290.51019287109375, "epoch": 0.6890062893081761, "grad_norm": 0.9781574606895447, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7569484114646912, "reward_std": 0.23634088039398193, "rewards/accuracy_reward": 0.7875607013702393, "rewards/format_reward": 0.9693877398967743, "step": 6847 }, { "completion_length": 293.06121826171875, "epoch": 0.6891069182389937, "grad_norm": 0.6267758011817932, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.5924741625785828, "reward_std": 0.1768053025007248, "rewards/accuracy_reward": 0.6332904696464539, "rewards/format_reward": 0.9591836333274841, "step": 6848 }, { "completion_length": 274.551025390625, "epoch": 0.6892075471698114, "grad_norm": 0.9143728613853455, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6794688701629639, "reward_std": 0.1919749677181244, "rewards/accuracy_reward": 0.7100811302661896, "rewards/format_reward": 0.9693877398967743, "step": 6849 }, { "completion_length": 186.04080963134766, "epoch": 0.6893081761006289, "grad_norm": 0.8388480544090271, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8691450953483582, "reward_std": 0.11839396506547928, "rewards/accuracy_reward": 0.8793492615222931, "rewards/format_reward": 0.9897959232330322, "step": 6850 }, { "completion_length": 222.2142791748047, "epoch": 0.6894088050314465, "grad_norm": 1.0763168334960938, "kl": 0.091064453125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8385963439941406, "reward_std": 0.10226782411336899, "rewards/accuracy_reward": 0.848800390958786, "rewards/format_reward": 0.9897959232330322, "step": 6851 }, { "completion_length": 219.34693145751953, "epoch": 0.6895094339622642, "grad_norm": 1.8048313856124878, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.748311161994934, "reward_std": 0.1399782858788967, "rewards/accuracy_reward": 0.7483111619949341, "rewards/format_reward": 1.0, "step": 6852 }, { "completion_length": 226.82653045654297, "epoch": 0.6896100628930818, "grad_norm": 0.8959035277366638, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.691446840763092, "reward_std": 0.15170998126268387, "rewards/accuracy_reward": 0.71185502409935, "rewards/format_reward": 0.9795918464660645, "step": 6853 }, { "completion_length": 222.75509643554688, "epoch": 0.6897106918238993, "grad_norm": 0.6685446500778198, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.822562336921692, "reward_std": 0.2234789878129959, "rewards/accuracy_reward": 0.8531745672225952, "rewards/format_reward": 0.9693877398967743, "step": 6854 }, { "completion_length": 245.29591369628906, "epoch": 0.689811320754717, "grad_norm": 0.750098466873169, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.792759895324707, "reward_std": 0.21072281152009964, "rewards/accuracy_reward": 0.8233721852302551, "rewards/format_reward": 0.9693877398967743, "step": 6855 }, { "completion_length": 196.38775634765625, "epoch": 0.6899119496855346, "grad_norm": 0.6756132245063782, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7567391395568848, "reward_std": 0.15139839053153992, "rewards/accuracy_reward": 0.7771473228931427, "rewards/format_reward": 0.9795918464660645, "step": 6856 }, { "completion_length": 236.448974609375, "epoch": 0.6900125786163522, "grad_norm": 6.248440742492676, "kl": 0.29150390625, "learning_rate": 1e-06, "loss": 0.0116, "reward": 1.6576123237609863, "reward_std": 0.19218652695417404, "rewards/accuracy_reward": 0.6678164899349213, "rewards/format_reward": 0.9897959232330322, "step": 6857 }, { "completion_length": 238.84693145751953, "epoch": 0.6901132075471698, "grad_norm": 0.7318693399429321, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6789763569831848, "reward_std": 0.14852388948202133, "rewards/accuracy_reward": 0.6891804337501526, "rewards/format_reward": 0.9897959232330322, "step": 6858 }, { "completion_length": 202.30611419677734, "epoch": 0.6902138364779874, "grad_norm": 0.472753643989563, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.835374116897583, "reward_std": 0.10200808010995388, "rewards/accuracy_reward": 0.8455781936645508, "rewards/format_reward": 0.9897959232330322, "step": 6859 }, { "completion_length": 248.16326904296875, "epoch": 0.690314465408805, "grad_norm": 0.7211328744888306, "kl": 0.127197265625, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.7952856421470642, "reward_std": 0.09762259759008884, "rewards/accuracy_reward": 0.8054897487163544, "rewards/format_reward": 0.9897959232330322, "step": 6860 }, { "completion_length": 194.11224365234375, "epoch": 0.6904150943396227, "grad_norm": 0.6261725425720215, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.897392213344574, "reward_std": 0.13782917335629463, "rewards/accuracy_reward": 0.9075963497161865, "rewards/format_reward": 0.9897959232330322, "step": 6861 }, { "completion_length": 285.12245178222656, "epoch": 0.6905157232704403, "grad_norm": 0.6706671714782715, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7898687720298767, "reward_std": 0.1349531076848507, "rewards/accuracy_reward": 0.7898687720298767, "rewards/format_reward": 1.0, "step": 6862 }, { "completion_length": 294.1632614135742, "epoch": 0.6906163522012578, "grad_norm": 0.8031715750694275, "kl": 0.0560302734375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.5279297232627869, "reward_std": 0.29688073694705963, "rewards/accuracy_reward": 0.568746030330658, "rewards/format_reward": 0.9591836333274841, "step": 6863 }, { "completion_length": 241.6734619140625, "epoch": 0.6907169811320755, "grad_norm": 0.7620147466659546, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8175214529037476, "reward_std": 0.19660478830337524, "rewards/accuracy_reward": 0.8481336832046509, "rewards/format_reward": 0.9693877398967743, "step": 6864 }, { "completion_length": 160.9285659790039, "epoch": 0.6908176100628931, "grad_norm": 0.31777143478393555, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.9291279911994934, "reward_std": 0.07006166083738208, "rewards/accuracy_reward": 0.939332127571106, "rewards/format_reward": 0.9897959232330322, "step": 6865 }, { "completion_length": 191.52040100097656, "epoch": 0.6909182389937107, "grad_norm": 0.48847445845603943, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8937331438064575, "reward_std": 0.10935335233807564, "rewards/accuracy_reward": 0.9039373397827148, "rewards/format_reward": 0.9897959232330322, "step": 6866 }, { "completion_length": 281.9183654785156, "epoch": 0.6910188679245283, "grad_norm": 0.5253016948699951, "kl": 0.0584716796875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.702097475528717, "reward_std": 0.20209750160574913, "rewards/accuracy_reward": 0.7225056290626526, "rewards/format_reward": 0.9795918166637421, "step": 6867 }, { "completion_length": 221.64285278320312, "epoch": 0.6911194968553459, "grad_norm": 0.7952848076820374, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7748199105262756, "reward_std": 0.137664794921875, "rewards/accuracy_reward": 0.785023957490921, "rewards/format_reward": 0.9897959232330322, "step": 6868 }, { "completion_length": 242.12244415283203, "epoch": 0.6912201257861635, "grad_norm": 1.4987825155258179, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5449658632278442, "reward_std": 0.2568305507302284, "rewards/accuracy_reward": 0.5653741210699081, "rewards/format_reward": 0.9795918464660645, "step": 6869 }, { "completion_length": 198.1938705444336, "epoch": 0.6913207547169812, "grad_norm": 0.7696750164031982, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7131519317626953, "reward_std": 0.2441297024488449, "rewards/accuracy_reward": 0.7539682686328888, "rewards/format_reward": 0.9591836631298065, "step": 6870 }, { "completion_length": 207.93877410888672, "epoch": 0.6914213836477987, "grad_norm": 1.1916916370391846, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7868306040763855, "reward_std": 0.1499793902039528, "rewards/accuracy_reward": 0.7868306040763855, "rewards/format_reward": 1.0, "step": 6871 }, { "completion_length": 230.2142791748047, "epoch": 0.6915220125786163, "grad_norm": 0.6966267824172974, "kl": 0.053466796875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8160820007324219, "reward_std": 0.171725794672966, "rewards/accuracy_reward": 0.8262861371040344, "rewards/format_reward": 0.9897959232330322, "step": 6872 }, { "completion_length": 261.42857360839844, "epoch": 0.691622641509434, "grad_norm": 0.6837520003318787, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8164369463920593, "reward_std": 0.1711144894361496, "rewards/accuracy_reward": 0.8266410827636719, "rewards/format_reward": 0.9897959232330322, "step": 6873 }, { "completion_length": 232.69387817382812, "epoch": 0.6917232704402516, "grad_norm": 0.9283671379089355, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.79124915599823, "reward_std": 0.14281513541936874, "rewards/accuracy_reward": 0.8014533221721649, "rewards/format_reward": 0.9897959232330322, "step": 6874 }, { "completion_length": 214.2653045654297, "epoch": 0.6918238993710691, "grad_norm": 1.6762906312942505, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8418265581130981, "reward_std": 0.14229674637317657, "rewards/accuracy_reward": 0.8520305752754211, "rewards/format_reward": 0.9897959232330322, "step": 6875 }, { "completion_length": 210.12244415283203, "epoch": 0.6919245283018868, "grad_norm": 0.5996108651161194, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8350582718849182, "reward_std": 0.07662389054894447, "rewards/accuracy_reward": 0.8554664850234985, "rewards/format_reward": 0.9795918464660645, "step": 6876 }, { "completion_length": 230.88774871826172, "epoch": 0.6920251572327044, "grad_norm": 0.7761682271957397, "kl": 0.095458984375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.743928611278534, "reward_std": 0.19028232246637344, "rewards/accuracy_reward": 0.7541326284408569, "rewards/format_reward": 0.9897959232330322, "step": 6877 }, { "completion_length": 168.448974609375, "epoch": 0.692125786163522, "grad_norm": 1.0378315448760986, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8030611872673035, "reward_std": 0.11345487833023071, "rewards/accuracy_reward": 0.8132652938365936, "rewards/format_reward": 0.9897959232330322, "step": 6878 }, { "completion_length": 238.53060150146484, "epoch": 0.6922264150943396, "grad_norm": 1.1115586757659912, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7565032839775085, "reward_std": 0.13616849854588509, "rewards/accuracy_reward": 0.7565033435821533, "rewards/format_reward": 1.0, "step": 6879 }, { "completion_length": 169.53060913085938, "epoch": 0.6923270440251572, "grad_norm": 0.8986800909042358, "kl": 0.0716552734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8544217944145203, "reward_std": 0.1886369064450264, "rewards/accuracy_reward": 0.8544217348098755, "rewards/format_reward": 1.0, "step": 6880 }, { "completion_length": 224.80612182617188, "epoch": 0.6924276729559748, "grad_norm": 0.9153855443000793, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.829931914806366, "reward_std": 0.1270286701619625, "rewards/accuracy_reward": 0.8503400981426239, "rewards/format_reward": 0.9795918166637421, "step": 6881 }, { "completion_length": 243.55101013183594, "epoch": 0.6925283018867925, "grad_norm": 0.6462942957878113, "kl": 0.0670166015625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6936882734298706, "reward_std": 0.2048315554857254, "rewards/accuracy_reward": 0.7140964269638062, "rewards/format_reward": 0.9795918464660645, "step": 6882 }, { "completion_length": 176.45917510986328, "epoch": 0.69262893081761, "grad_norm": 0.5895510911941528, "kl": 0.1005859375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7602968215942383, "reward_std": 0.111871637403965, "rewards/accuracy_reward": 0.7705009281635284, "rewards/format_reward": 0.9897959232330322, "step": 6883 }, { "completion_length": 236.75508880615234, "epoch": 0.6927295597484276, "grad_norm": 0.39732488989830017, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8979591727256775, "reward_std": 0.08099238574504852, "rewards/accuracy_reward": 0.918367326259613, "rewards/format_reward": 0.9795918464660645, "step": 6884 }, { "completion_length": 212.10203552246094, "epoch": 0.6928301886792453, "grad_norm": 0.8262245655059814, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8845772743225098, "reward_std": 0.13052881881594658, "rewards/accuracy_reward": 0.8947812914848328, "rewards/format_reward": 0.9897959232330322, "step": 6885 }, { "completion_length": 195.61224365234375, "epoch": 0.6929308176100629, "grad_norm": 1.4489343166351318, "kl": 0.1087646484375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7857142686843872, "reward_std": 0.2731986939907074, "rewards/accuracy_reward": 0.8265306055545807, "rewards/format_reward": 0.9591836333274841, "step": 6886 }, { "completion_length": 173.80612182617188, "epoch": 0.6930314465408806, "grad_norm": 0.8082853555679321, "kl": 0.098388671875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7536641955375671, "reward_std": 0.11914343386888504, "rewards/accuracy_reward": 0.7638682723045349, "rewards/format_reward": 0.9897959232330322, "step": 6887 }, { "completion_length": 254.8775405883789, "epoch": 0.6931320754716981, "grad_norm": 0.5427502393722534, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8632085919380188, "reward_std": 0.12246743962168694, "rewards/accuracy_reward": 0.8632085919380188, "rewards/format_reward": 1.0, "step": 6888 }, { "completion_length": 187.4285659790039, "epoch": 0.6932327044025157, "grad_norm": 1.1015959978103638, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.750158429145813, "reward_std": 0.15029089897871017, "rewards/accuracy_reward": 0.750158429145813, "rewards/format_reward": 1.0, "step": 6889 }, { "completion_length": 301.4183654785156, "epoch": 0.6933333333333334, "grad_norm": 0.9714811444282532, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6469873189926147, "reward_std": 0.227456197142601, "rewards/accuracy_reward": 0.6775995790958405, "rewards/format_reward": 0.9693877398967743, "step": 6890 }, { "completion_length": 221.75509643554688, "epoch": 0.693433962264151, "grad_norm": 0.4457911252975464, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8469387292861938, "reward_std": 0.06914122216403484, "rewards/accuracy_reward": 0.857142835855484, "rewards/format_reward": 0.9897959232330322, "step": 6891 }, { "completion_length": 242.02040100097656, "epoch": 0.6935345911949685, "grad_norm": 0.7945155501365662, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6360061168670654, "reward_std": 0.21995276026427746, "rewards/accuracy_reward": 0.6870266199111938, "rewards/format_reward": 0.9489795863628387, "step": 6892 }, { "completion_length": 189.07142639160156, "epoch": 0.6936352201257862, "grad_norm": 1.0231995582580566, "kl": 0.102294921875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7793932557106018, "reward_std": 0.13546598330140114, "rewards/accuracy_reward": 0.7895973026752472, "rewards/format_reward": 0.9897959232330322, "step": 6893 }, { "completion_length": 158.91836547851562, "epoch": 0.6937358490566038, "grad_norm": 0.25446048378944397, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7881673574447632, "reward_std": 0.053135938942432404, "rewards/accuracy_reward": 0.798371434211731, "rewards/format_reward": 0.9897959232330322, "step": 6894 }, { "completion_length": 228.25509643554688, "epoch": 0.6938364779874214, "grad_norm": 0.5419249534606934, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6270191073417664, "reward_std": 0.0907408557832241, "rewards/accuracy_reward": 0.6372232139110565, "rewards/format_reward": 0.9897959232330322, "step": 6895 }, { "completion_length": 242.06121826171875, "epoch": 0.693937106918239, "grad_norm": 0.9860469102859497, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8079063892364502, "reward_std": 0.1847693920135498, "rewards/accuracy_reward": 0.8079063892364502, "rewards/format_reward": 1.0, "step": 6896 }, { "completion_length": 209.22447967529297, "epoch": 0.6940377358490566, "grad_norm": 5.201344966888428, "kl": 0.193359375, "learning_rate": 1e-06, "loss": 0.0077, "reward": 1.6836734414100647, "reward_std": 0.19163841009140015, "rewards/accuracy_reward": 0.7040816396474838, "rewards/format_reward": 0.9795918166637421, "step": 6897 }, { "completion_length": 223.79591369628906, "epoch": 0.6941383647798742, "grad_norm": 0.8031338453292847, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7245308756828308, "reward_std": 0.18479404598474503, "rewards/accuracy_reward": 0.7347349524497986, "rewards/format_reward": 0.9897959232330322, "step": 6898 }, { "completion_length": 202.67346954345703, "epoch": 0.6942389937106919, "grad_norm": 0.753568708896637, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8165009021759033, "reward_std": 0.20929992198944092, "rewards/accuracy_reward": 0.8471131920814514, "rewards/format_reward": 0.9693877398967743, "step": 6899 }, { "completion_length": 265.6122360229492, "epoch": 0.6943396226415094, "grad_norm": 2.2439513206481934, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.799092948436737, "reward_std": 0.28475649654865265, "rewards/accuracy_reward": 0.8297052085399628, "rewards/format_reward": 0.9693877398967743, "step": 6900 }, { "completion_length": 272.2857131958008, "epoch": 0.694440251572327, "grad_norm": 0.8292939066886902, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6922696232795715, "reward_std": 0.1924561709165573, "rewards/accuracy_reward": 0.7126777768135071, "rewards/format_reward": 0.9795918464660645, "step": 6901 }, { "completion_length": 175.2448959350586, "epoch": 0.6945408805031447, "grad_norm": 1.3434311151504517, "kl": 0.0767822265625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8673468828201294, "reward_std": 0.09670460596680641, "rewards/accuracy_reward": 0.8673469126224518, "rewards/format_reward": 1.0, "step": 6902 }, { "completion_length": 204.33673095703125, "epoch": 0.6946415094339623, "grad_norm": 0.8086062669754028, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8741496205329895, "reward_std": 0.19121698290109634, "rewards/accuracy_reward": 0.8945578038692474, "rewards/format_reward": 0.9795918464660645, "step": 6903 }, { "completion_length": 259.32653045654297, "epoch": 0.6947421383647798, "grad_norm": 2.507340669631958, "kl": 0.199462890625, "learning_rate": 1e-06, "loss": 0.008, "reward": 1.758746325969696, "reward_std": 0.20677971839904785, "rewards/accuracy_reward": 0.7995626330375671, "rewards/format_reward": 0.9591836333274841, "step": 6904 }, { "completion_length": 235.91836547851562, "epoch": 0.6948427672955975, "grad_norm": 0.9482529759407043, "kl": 0.135986328125, "learning_rate": 1e-06, "loss": 0.0054, "reward": 1.7355744242668152, "reward_std": 0.15753905475139618, "rewards/accuracy_reward": 0.745778501033783, "rewards/format_reward": 0.9897959232330322, "step": 6905 }, { "completion_length": 247.18366241455078, "epoch": 0.6949433962264151, "grad_norm": 1.1406680345535278, "kl": 0.099365234375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.745422899723053, "reward_std": 0.17307641357183456, "rewards/accuracy_reward": 0.7658310830593109, "rewards/format_reward": 0.9795918166637421, "step": 6906 }, { "completion_length": 276.6428527832031, "epoch": 0.6950440251572327, "grad_norm": 0.9562528133392334, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8376151323318481, "reward_std": 0.16470851749181747, "rewards/accuracy_reward": 0.8478192687034607, "rewards/format_reward": 0.9897959232330322, "step": 6907 }, { "completion_length": 203.62244415283203, "epoch": 0.6951446540880503, "grad_norm": 0.8985860347747803, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.9387754797935486, "reward_std": 0.11805073916912079, "rewards/accuracy_reward": 0.9489795863628387, "rewards/format_reward": 0.9897959232330322, "step": 6908 }, { "completion_length": 180.87754821777344, "epoch": 0.6952452830188679, "grad_norm": 0.6941994428634644, "kl": 0.095703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.782993197441101, "reward_std": 0.08319712802767754, "rewards/accuracy_reward": 0.7931972742080688, "rewards/format_reward": 0.9897959232330322, "step": 6909 }, { "completion_length": 206.0204086303711, "epoch": 0.6953459119496855, "grad_norm": 0.7490121722221375, "kl": 0.0655517578125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.742225468158722, "reward_std": 0.12522855028510094, "rewards/accuracy_reward": 0.7626336514949799, "rewards/format_reward": 0.9795918464660645, "step": 6910 }, { "completion_length": 249.2040786743164, "epoch": 0.6954465408805032, "grad_norm": 0.6810418367385864, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7070474028587341, "reward_std": 0.2448555752635002, "rewards/accuracy_reward": 0.7274556159973145, "rewards/format_reward": 0.9795918464660645, "step": 6911 }, { "completion_length": 191.62245178222656, "epoch": 0.6955471698113208, "grad_norm": 0.6463564038276672, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8484987616539001, "reward_std": 0.14390341192483902, "rewards/accuracy_reward": 0.8587029278278351, "rewards/format_reward": 0.9897959232330322, "step": 6912 }, { "completion_length": 246.4387664794922, "epoch": 0.6956477987421383, "grad_norm": 0.8341658711433411, "kl": 0.12841796875, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.5991841554641724, "reward_std": 0.23868519812822342, "rewards/accuracy_reward": 0.6195923984050751, "rewards/format_reward": 0.9795918464660645, "step": 6913 }, { "completion_length": 191.6836700439453, "epoch": 0.695748427672956, "grad_norm": 1.641955852508545, "kl": 0.14990234375, "learning_rate": 1e-06, "loss": 0.006, "reward": 1.7918853163719177, "reward_std": 0.12097354233264923, "rewards/accuracy_reward": 0.8122934997081757, "rewards/format_reward": 0.9795918166637421, "step": 6914 }, { "completion_length": 273.2040710449219, "epoch": 0.6958490566037736, "grad_norm": 0.8504961133003235, "kl": 0.0595703125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7843402028083801, "reward_std": 0.22943364083766937, "rewards/accuracy_reward": 0.8251565992832184, "rewards/format_reward": 0.9591836333274841, "step": 6915 }, { "completion_length": 269.34693908691406, "epoch": 0.6959496855345912, "grad_norm": 1.941637396812439, "kl": 0.0523681640625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7368740439414978, "reward_std": 0.2503724694252014, "rewards/accuracy_reward": 0.7674863636493683, "rewards/format_reward": 0.9693877398967743, "step": 6916 }, { "completion_length": 258.08162689208984, "epoch": 0.6960503144654088, "grad_norm": 0.5973963737487793, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.4747980833053589, "reward_std": 0.22375348955392838, "rewards/accuracy_reward": 0.5054103285074234, "rewards/format_reward": 0.9693877398967743, "step": 6917 }, { "completion_length": 294.30611419677734, "epoch": 0.6961509433962264, "grad_norm": 0.7235993146896362, "kl": 0.0933837890625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.505812406539917, "reward_std": 0.28060800582170486, "rewards/accuracy_reward": 0.5568328499794006, "rewards/format_reward": 0.9489795565605164, "step": 6918 }, { "completion_length": 255.10203552246094, "epoch": 0.696251572327044, "grad_norm": 0.4562951922416687, "kl": 0.0848388671875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7383686304092407, "reward_std": 0.09319804236292839, "rewards/accuracy_reward": 0.7485727369785309, "rewards/format_reward": 0.9897959232330322, "step": 6919 }, { "completion_length": 233.78571319580078, "epoch": 0.6963522012578617, "grad_norm": 0.6902015805244446, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7702516913414001, "reward_std": 0.19127461314201355, "rewards/accuracy_reward": 0.7906597852706909, "rewards/format_reward": 0.9795918464660645, "step": 6920 }, { "completion_length": 238.6428451538086, "epoch": 0.6964528301886792, "grad_norm": 0.91241055727005, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7121840119361877, "reward_std": 0.20968008413910866, "rewards/accuracy_reward": 0.7427962720394135, "rewards/format_reward": 0.9693877398967743, "step": 6921 }, { "completion_length": 183.65306091308594, "epoch": 0.6965534591194968, "grad_norm": 0.8271381855010986, "kl": 0.111083984375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7887799739837646, "reward_std": 0.12083415687084198, "rewards/accuracy_reward": 0.8091881573200226, "rewards/format_reward": 0.9795918166637421, "step": 6922 }, { "completion_length": 281.22447967529297, "epoch": 0.6966540880503145, "grad_norm": 0.7395521998405457, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7309457063674927, "reward_std": 0.21708609908819199, "rewards/accuracy_reward": 0.7615579962730408, "rewards/format_reward": 0.9693877398967743, "step": 6923 }, { "completion_length": 239.52040100097656, "epoch": 0.6967547169811321, "grad_norm": 1.6072282791137695, "kl": 0.091552734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7313323020935059, "reward_std": 0.18329786136746407, "rewards/accuracy_reward": 0.741536408662796, "rewards/format_reward": 0.9897959232330322, "step": 6924 }, { "completion_length": 255.4387664794922, "epoch": 0.6968553459119496, "grad_norm": 3.5351815223693848, "kl": 0.12939453125, "learning_rate": 1e-06, "loss": 0.0052, "reward": 1.5884730815887451, "reward_std": 0.1480981931090355, "rewards/accuracy_reward": 0.5986772179603577, "rewards/format_reward": 0.9897959232330322, "step": 6925 }, { "completion_length": 204.7653045654297, "epoch": 0.6969559748427673, "grad_norm": 2.864454507827759, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8235665559768677, "reward_std": 0.16130249947309494, "rewards/accuracy_reward": 0.8439747393131256, "rewards/format_reward": 0.9795918464660645, "step": 6926 }, { "completion_length": 205.09182739257812, "epoch": 0.6970566037735849, "grad_norm": 0.8757882118225098, "kl": 0.10205078125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8237016797065735, "reward_std": 0.10251455381512642, "rewards/accuracy_reward": 0.8237017691135406, "rewards/format_reward": 1.0, "step": 6927 }, { "completion_length": 194.9285659790039, "epoch": 0.6971572327044026, "grad_norm": 0.7059340476989746, "kl": 0.10400390625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.864625871181488, "reward_std": 0.1704574078321457, "rewards/accuracy_reward": 0.8748299181461334, "rewards/format_reward": 0.9897959232330322, "step": 6928 }, { "completion_length": 170.28571319580078, "epoch": 0.6972578616352201, "grad_norm": 2.1437244415283203, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8333333730697632, "reward_std": 0.11608453467488289, "rewards/accuracy_reward": 0.8435374200344086, "rewards/format_reward": 0.9897959232330322, "step": 6929 }, { "completion_length": 175.87754440307617, "epoch": 0.6973584905660377, "grad_norm": 1.3254605531692505, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.80086749792099, "reward_std": 0.1393933668732643, "rewards/accuracy_reward": 0.8212757408618927, "rewards/format_reward": 0.9795918166637421, "step": 6930 }, { "completion_length": 183.7244873046875, "epoch": 0.6974591194968554, "grad_norm": 0.7639480829238892, "kl": 0.11962890625, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.762422263622284, "reward_std": 0.10248072072863579, "rewards/accuracy_reward": 0.7828304171562195, "rewards/format_reward": 0.9795918166637421, "step": 6931 }, { "completion_length": 206.85713958740234, "epoch": 0.697559748427673, "grad_norm": 0.6179865002632141, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.817792296409607, "reward_std": 0.1726093739271164, "rewards/accuracy_reward": 0.868812769651413, "rewards/format_reward": 0.9489795565605164, "step": 6932 }, { "completion_length": 182.9897918701172, "epoch": 0.6976603773584905, "grad_norm": 4.605338096618652, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7261447310447693, "reward_std": 0.19578655809164047, "rewards/accuracy_reward": 0.7261447012424469, "rewards/format_reward": 1.0, "step": 6933 }, { "completion_length": 255.61223602294922, "epoch": 0.6977610062893081, "grad_norm": 5.504959583282471, "kl": 0.112548828125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.6533212065696716, "reward_std": 0.15399858355522156, "rewards/accuracy_reward": 0.6737293601036072, "rewards/format_reward": 0.9795918464660645, "step": 6934 }, { "completion_length": 237.1326446533203, "epoch": 0.6978616352201258, "grad_norm": 1.0583800077438354, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7851919531822205, "reward_std": 0.1652705855667591, "rewards/accuracy_reward": 0.7953960597515106, "rewards/format_reward": 0.9897959232330322, "step": 6935 }, { "completion_length": 204.6836700439453, "epoch": 0.6979622641509434, "grad_norm": 0.7297905087471008, "kl": 0.059326171875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6275650262832642, "reward_std": 0.20664082467556, "rewards/accuracy_reward": 0.6581772714853287, "rewards/format_reward": 0.9693877398967743, "step": 6936 }, { "completion_length": 223.80611419677734, "epoch": 0.6980628930817611, "grad_norm": 0.8451403975486755, "kl": 0.0625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7963640689849854, "reward_std": 0.18555746972560883, "rewards/accuracy_reward": 0.8167722523212433, "rewards/format_reward": 0.9795918464660645, "step": 6937 }, { "completion_length": 194.26529693603516, "epoch": 0.6981635220125786, "grad_norm": 0.6177085638046265, "kl": 0.10791015625, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7977735996246338, "reward_std": 0.1251700520515442, "rewards/accuracy_reward": 0.807977706193924, "rewards/format_reward": 0.9897959232330322, "step": 6938 }, { "completion_length": 140.12245178222656, "epoch": 0.6982641509433962, "grad_norm": 1.185473084449768, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8149280548095703, "reward_std": 0.07224939949810505, "rewards/accuracy_reward": 0.8149280846118927, "rewards/format_reward": 1.0, "step": 6939 }, { "completion_length": 262.29591369628906, "epoch": 0.6983647798742139, "grad_norm": 1.0274909734725952, "kl": 0.0836181640625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.5849076509475708, "reward_std": 0.2673520967364311, "rewards/accuracy_reward": 0.6155199706554413, "rewards/format_reward": 0.9693877398967743, "step": 6940 }, { "completion_length": 172.1734619140625, "epoch": 0.6984654088050315, "grad_norm": 1.923300862312317, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.911564588546753, "reward_std": 0.20845618844032288, "rewards/accuracy_reward": 0.9319727718830109, "rewards/format_reward": 0.9795918464660645, "step": 6941 }, { "completion_length": 203.4693832397461, "epoch": 0.698566037735849, "grad_norm": 0.9457712173461914, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6827847957611084, "reward_std": 0.170182965695858, "rewards/accuracy_reward": 0.7133969664573669, "rewards/format_reward": 0.9693877398967743, "step": 6942 }, { "completion_length": 192.07141876220703, "epoch": 0.6986666666666667, "grad_norm": 0.4963360130786896, "kl": 0.09814453125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.787536382675171, "reward_std": 0.13751855120062828, "rewards/accuracy_reward": 0.8079445958137512, "rewards/format_reward": 0.9795918166637421, "step": 6943 }, { "completion_length": 240.98979949951172, "epoch": 0.6987672955974843, "grad_norm": 0.6593217253684998, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7114151120185852, "reward_std": 0.15144556015729904, "rewards/accuracy_reward": 0.7216192185878754, "rewards/format_reward": 0.9897959232330322, "step": 6944 }, { "completion_length": 226.9591827392578, "epoch": 0.6988679245283019, "grad_norm": 0.9894723296165466, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.674344003200531, "reward_std": 0.26174575835466385, "rewards/accuracy_reward": 0.7049562335014343, "rewards/format_reward": 0.9693877398967743, "step": 6945 }, { "completion_length": 207.11224365234375, "epoch": 0.6989685534591195, "grad_norm": 0.584664523601532, "kl": 0.0684814453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6962820887565613, "reward_std": 0.1300109662115574, "rewards/accuracy_reward": 0.6962821781635284, "rewards/format_reward": 1.0, "step": 6946 }, { "completion_length": 190.61224365234375, "epoch": 0.6990691823899371, "grad_norm": 0.5209723711013794, "kl": 0.11865234375, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.8534270524978638, "reward_std": 0.13943127915263176, "rewards/accuracy_reward": 0.8636311590671539, "rewards/format_reward": 0.9897959232330322, "step": 6947 }, { "completion_length": 214.39794921875, "epoch": 0.6991698113207547, "grad_norm": 1.8087656497955322, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7707781791687012, "reward_std": 0.22839686274528503, "rewards/accuracy_reward": 0.780982255935669, "rewards/format_reward": 0.9897959232330322, "step": 6948 }, { "completion_length": 183.38774871826172, "epoch": 0.6992704402515724, "grad_norm": 1.164220929145813, "kl": 0.101806640625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.653595209121704, "reward_std": 0.15605850517749786, "rewards/accuracy_reward": 0.6535952091217041, "rewards/format_reward": 1.0, "step": 6949 }, { "completion_length": 131.77550506591797, "epoch": 0.6993710691823899, "grad_norm": 1.1924420595169067, "kl": 0.13818359375, "learning_rate": 1e-06, "loss": 0.0055, "reward": 1.7086756229400635, "reward_std": 0.11948078125715256, "rewards/accuracy_reward": 0.7086756825447083, "rewards/format_reward": 1.0, "step": 6950 }, { "completion_length": 198.9591827392578, "epoch": 0.6994716981132075, "grad_norm": 0.936197817325592, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8845480680465698, "reward_std": 0.07840336859226227, "rewards/accuracy_reward": 0.89475217461586, "rewards/format_reward": 0.9897959232330322, "step": 6951 }, { "completion_length": 230.7040786743164, "epoch": 0.6995723270440252, "grad_norm": 0.8757817149162292, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.890338122844696, "reward_std": 0.17924565076828003, "rewards/accuracy_reward": 0.910746306180954, "rewards/format_reward": 0.9795918464660645, "step": 6952 }, { "completion_length": 205.25509643554688, "epoch": 0.6996729559748428, "grad_norm": 1.7395319938659668, "kl": 0.064697265625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.803055226802826, "reward_std": 0.19350522011518478, "rewards/accuracy_reward": 0.8132593631744385, "rewards/format_reward": 0.9897959232330322, "step": 6953 }, { "completion_length": 224.8265151977539, "epoch": 0.6997735849056603, "grad_norm": 1.5003527402877808, "kl": 0.124267578125, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.600282073020935, "reward_std": 0.1854584589600563, "rewards/accuracy_reward": 0.6104861199855804, "rewards/format_reward": 0.9897959232330322, "step": 6954 }, { "completion_length": 191.81632232666016, "epoch": 0.699874213836478, "grad_norm": 0.5826440453529358, "kl": 0.134765625, "learning_rate": 1e-06, "loss": 0.0054, "reward": 1.8228742480278015, "reward_std": 0.05438780225813389, "rewards/accuracy_reward": 0.8228742480278015, "rewards/format_reward": 1.0, "step": 6955 }, { "completion_length": 220.34693145751953, "epoch": 0.6999748427672956, "grad_norm": 0.5871669054031372, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7135766744613647, "reward_std": 0.11504267062991858, "rewards/accuracy_reward": 0.7339848279953003, "rewards/format_reward": 0.9795918166637421, "step": 6956 }, { "completion_length": 188.83673095703125, "epoch": 0.7000754716981132, "grad_norm": 0.41405171155929565, "kl": 0.08740234375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.9337916374206543, "reward_std": 0.06357140466570854, "rewards/accuracy_reward": 0.9337917268276215, "rewards/format_reward": 1.0, "step": 6957 }, { "completion_length": 274.83673095703125, "epoch": 0.7001761006289308, "grad_norm": 1.0113376379013062, "kl": 0.0718994140625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6471402645111084, "reward_std": 0.23550906032323837, "rewards/accuracy_reward": 0.6471401751041412, "rewards/format_reward": 1.0, "step": 6958 }, { "completion_length": 197.93877410888672, "epoch": 0.7002767295597484, "grad_norm": 0.9497712254524231, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.757414996623993, "reward_std": 0.22176816314458847, "rewards/accuracy_reward": 0.7574150264263153, "rewards/format_reward": 1.0, "step": 6959 }, { "completion_length": 231.70407104492188, "epoch": 0.700377358490566, "grad_norm": 0.5425301790237427, "kl": 0.111328125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7364872694015503, "reward_std": 0.09305316209793091, "rewards/accuracy_reward": 0.7364873290061951, "rewards/format_reward": 1.0, "step": 6960 }, { "completion_length": 168.79591369628906, "epoch": 0.7004779874213837, "grad_norm": 0.9699786901473999, "kl": 0.103759765625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8388652801513672, "reward_std": 0.19057874381542206, "rewards/accuracy_reward": 0.8592734634876251, "rewards/format_reward": 0.9795918464660645, "step": 6961 }, { "completion_length": 231.5, "epoch": 0.7005786163522013, "grad_norm": 1.2796671390533447, "kl": 0.116455078125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.714846670627594, "reward_std": 0.1914496049284935, "rewards/accuracy_reward": 0.7352548539638519, "rewards/format_reward": 0.9795918464660645, "step": 6962 }, { "completion_length": 225.69387817382812, "epoch": 0.7006792452830188, "grad_norm": 1.0070384740829468, "kl": 0.091064453125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7500959634780884, "reward_std": 0.1579090841114521, "rewards/accuracy_reward": 0.7603000104427338, "rewards/format_reward": 0.9897959232330322, "step": 6963 }, { "completion_length": 227.52040100097656, "epoch": 0.7007798742138365, "grad_norm": 0.8527175188064575, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6743547916412354, "reward_std": 0.16800560057163239, "rewards/accuracy_reward": 0.6947629302740097, "rewards/format_reward": 0.9795918166637421, "step": 6964 }, { "completion_length": 244.75509643554688, "epoch": 0.7008805031446541, "grad_norm": 0.6415805220603943, "kl": 0.09814453125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6737034916877747, "reward_std": 0.16094154119491577, "rewards/accuracy_reward": 0.6737034916877747, "rewards/format_reward": 1.0, "step": 6965 }, { "completion_length": 243.01020050048828, "epoch": 0.7009811320754717, "grad_norm": 0.6977823972702026, "kl": 0.100830078125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6700732111930847, "reward_std": 0.16410263255238533, "rewards/accuracy_reward": 0.6700732409954071, "rewards/format_reward": 1.0, "step": 6966 }, { "completion_length": 220.64285278320312, "epoch": 0.7010817610062893, "grad_norm": 1.3563225269317627, "kl": 0.0765380859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6699442267417908, "reward_std": 0.21681468933820724, "rewards/accuracy_reward": 0.6801484227180481, "rewards/format_reward": 0.9897959232330322, "step": 6967 }, { "completion_length": 244.24488830566406, "epoch": 0.7011823899371069, "grad_norm": 4.002674102783203, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.705401599407196, "reward_std": 0.22859369218349457, "rewards/accuracy_reward": 0.7360139191150665, "rewards/format_reward": 0.9693877398967743, "step": 6968 }, { "completion_length": 263.26529693603516, "epoch": 0.7012830188679245, "grad_norm": 1.0794119834899902, "kl": 0.09033203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7430676817893982, "reward_std": 0.1950230747461319, "rewards/accuracy_reward": 0.7634758353233337, "rewards/format_reward": 0.9795918464660645, "step": 6969 }, { "completion_length": 254.1428451538086, "epoch": 0.7013836477987422, "grad_norm": 0.7964116334915161, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7708462476730347, "reward_std": 0.17956513166427612, "rewards/accuracy_reward": 0.7708462178707123, "rewards/format_reward": 1.0, "step": 6970 }, { "completion_length": 271.5918273925781, "epoch": 0.7014842767295597, "grad_norm": 0.6185782551765442, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6883713006973267, "reward_std": 0.22079363465309143, "rewards/accuracy_reward": 0.7189835906028748, "rewards/format_reward": 0.9693877398967743, "step": 6971 }, { "completion_length": 212.46939086914062, "epoch": 0.7015849056603773, "grad_norm": 0.7651980519294739, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.9408162236213684, "reward_std": 0.1374439299106598, "rewards/accuracy_reward": 0.9408162832260132, "rewards/format_reward": 1.0, "step": 6972 }, { "completion_length": 206.61224365234375, "epoch": 0.701685534591195, "grad_norm": 0.589611828327179, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8681766390800476, "reward_std": 0.08318907953798771, "rewards/accuracy_reward": 0.86817666888237, "rewards/format_reward": 1.0, "step": 6973 }, { "completion_length": 277.4693908691406, "epoch": 0.7017861635220126, "grad_norm": 0.5564979314804077, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8109154105186462, "reward_std": 0.10007880628108978, "rewards/accuracy_reward": 0.8211195170879364, "rewards/format_reward": 0.9897959232330322, "step": 6974 }, { "completion_length": 281.6632537841797, "epoch": 0.7018867924528301, "grad_norm": 0.9017826318740845, "kl": 0.1201171875, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.6810759902000427, "reward_std": 0.27918580919504166, "rewards/accuracy_reward": 0.7116883099079132, "rewards/format_reward": 0.9693877398967743, "step": 6975 }, { "completion_length": 222.6938705444336, "epoch": 0.7019874213836478, "grad_norm": 0.6579614877700806, "kl": 0.10009765625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.763625979423523, "reward_std": 0.18035131692886353, "rewards/accuracy_reward": 0.7636260092258453, "rewards/format_reward": 1.0, "step": 6976 }, { "completion_length": 246.89796447753906, "epoch": 0.7020880503144654, "grad_norm": 0.782738983631134, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7527862191200256, "reward_std": 0.16492315381765366, "rewards/accuracy_reward": 0.7629904448986053, "rewards/format_reward": 0.9897959232330322, "step": 6977 }, { "completion_length": 224.16326141357422, "epoch": 0.702188679245283, "grad_norm": 0.662425696849823, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6240697503089905, "reward_std": 0.0716125387698412, "rewards/accuracy_reward": 0.6240697801113129, "rewards/format_reward": 1.0, "step": 6978 }, { "completion_length": 180.14285278320312, "epoch": 0.7022893081761006, "grad_norm": 1.3444854021072388, "kl": 0.1201171875, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.7851735353469849, "reward_std": 0.10667885094881058, "rewards/accuracy_reward": 0.7953775525093079, "rewards/format_reward": 0.9897959232330322, "step": 6979 }, { "completion_length": 154.6734619140625, "epoch": 0.7023899371069182, "grad_norm": 0.3701116442680359, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8186761140823364, "reward_std": 0.06235591322183609, "rewards/accuracy_reward": 0.8186761438846588, "rewards/format_reward": 1.0, "step": 6980 }, { "completion_length": 213.30611419677734, "epoch": 0.7024905660377359, "grad_norm": 0.9401167035102844, "kl": 0.1142578125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.6262920498847961, "reward_std": 0.3048735409975052, "rewards/accuracy_reward": 0.667108416557312, "rewards/format_reward": 0.9591836631298065, "step": 6981 }, { "completion_length": 197.74488830566406, "epoch": 0.7025911949685535, "grad_norm": 0.9164140224456787, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7162851691246033, "reward_std": 0.24755064398050308, "rewards/accuracy_reward": 0.746897429227829, "rewards/format_reward": 0.9693877398967743, "step": 6982 }, { "completion_length": 245.9795913696289, "epoch": 0.702691823899371, "grad_norm": 0.8218940496444702, "kl": 0.0699462890625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6334939002990723, "reward_std": 0.2330261990427971, "rewards/accuracy_reward": 0.6436980068683624, "rewards/format_reward": 0.9897959232330322, "step": 6983 }, { "completion_length": 263.6326446533203, "epoch": 0.7027924528301887, "grad_norm": 0.6966466307640076, "kl": 0.0631103515625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7758687138557434, "reward_std": 0.10429036989808083, "rewards/accuracy_reward": 0.7758687138557434, "rewards/format_reward": 1.0, "step": 6984 }, { "completion_length": 208.33673095703125, "epoch": 0.7028930817610063, "grad_norm": 0.5116955041885376, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8253989815711975, "reward_std": 0.07335223257541656, "rewards/accuracy_reward": 0.8356031179428101, "rewards/format_reward": 0.9897959232330322, "step": 6985 }, { "completion_length": 194.32652282714844, "epoch": 0.7029937106918239, "grad_norm": 0.7746355533599854, "kl": 0.091552734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7710232734680176, "reward_std": 0.08649081364274025, "rewards/accuracy_reward": 0.7710232734680176, "rewards/format_reward": 1.0, "step": 6986 }, { "completion_length": 174.23468780517578, "epoch": 0.7030943396226416, "grad_norm": 0.14428265392780304, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 6987 }, { "completion_length": 240.54080963134766, "epoch": 0.7031949685534591, "grad_norm": 0.6655307412147522, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8090938329696655, "reward_std": 0.1857026442885399, "rewards/accuracy_reward": 0.8192979395389557, "rewards/format_reward": 0.9897959232330322, "step": 6988 }, { "completion_length": 315.8571472167969, "epoch": 0.7032955974842767, "grad_norm": 1.040372610092163, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.599640667438507, "reward_std": 0.13970430195331573, "rewards/accuracy_reward": 0.5996407270431519, "rewards/format_reward": 1.0, "step": 6989 }, { "completion_length": 248.9795913696289, "epoch": 0.7033962264150944, "grad_norm": 0.7123767137527466, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.5972789525985718, "reward_std": 0.136667151004076, "rewards/accuracy_reward": 0.597278892993927, "rewards/format_reward": 1.0, "step": 6990 }, { "completion_length": 256.5816192626953, "epoch": 0.703496855345912, "grad_norm": 0.692683219909668, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6823198795318604, "reward_std": 0.1404770463705063, "rewards/accuracy_reward": 0.7129320651292801, "rewards/format_reward": 0.9693877398967743, "step": 6991 }, { "completion_length": 241.10204315185547, "epoch": 0.7035974842767295, "grad_norm": 0.5495135188102722, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8250404000282288, "reward_std": 0.1282636933028698, "rewards/accuracy_reward": 0.8352445363998413, "rewards/format_reward": 0.9897959232330322, "step": 6992 }, { "completion_length": 228.75509643554688, "epoch": 0.7036981132075472, "grad_norm": 0.8440304398536682, "kl": 0.1109619140625, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7397959232330322, "reward_std": 0.13129987940192223, "rewards/accuracy_reward": 0.7499999701976776, "rewards/format_reward": 0.9897959232330322, "step": 6993 }, { "completion_length": 244.01020050048828, "epoch": 0.7037987421383648, "grad_norm": 2.3346900939941406, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8328637480735779, "reward_std": 0.22755252569913864, "rewards/accuracy_reward": 0.8328637778759003, "rewards/format_reward": 1.0, "step": 6994 }, { "completion_length": 245.7551040649414, "epoch": 0.7038993710691824, "grad_norm": 0.48668375611305237, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8926384449005127, "reward_std": 0.08655962534248829, "rewards/accuracy_reward": 0.9130466282367706, "rewards/format_reward": 0.9795918166637421, "step": 6995 }, { "completion_length": 259.17346954345703, "epoch": 0.704, "grad_norm": 0.7605843544006348, "kl": 0.0615234375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5391275882720947, "reward_std": 0.15801266580820084, "rewards/accuracy_reward": 0.5493315756320953, "rewards/format_reward": 0.9897959232330322, "step": 6996 }, { "completion_length": 222.28571319580078, "epoch": 0.7041006289308176, "grad_norm": 1.0403114557266235, "kl": 0.09375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.751178503036499, "reward_std": 0.1556130163371563, "rewards/accuracy_reward": 0.7613826096057892, "rewards/format_reward": 0.9897959232330322, "step": 6997 }, { "completion_length": 267.6632614135742, "epoch": 0.7042012578616352, "grad_norm": 0.9111583828926086, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6447365283966064, "reward_std": 0.2960468679666519, "rewards/accuracy_reward": 0.6651447117328644, "rewards/format_reward": 0.9795918166637421, "step": 6998 }, { "completion_length": 260.2959213256836, "epoch": 0.7043018867924529, "grad_norm": 0.9717223644256592, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.736281156539917, "reward_std": 0.13452653214335442, "rewards/accuracy_reward": 0.7566893696784973, "rewards/format_reward": 0.9795918166637421, "step": 6999 }, { "completion_length": 176.06121826171875, "epoch": 0.7044025157232704, "grad_norm": 0.7294453382492065, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7103984355926514, "reward_std": 0.13238636776804924, "rewards/accuracy_reward": 0.7206025421619415, "rewards/format_reward": 0.9897959232330322, "step": 7000 }, { "completion_length": 177.82653045654297, "epoch": 0.704503144654088, "grad_norm": 4.988327503204346, "kl": 0.321533203125, "learning_rate": 1e-06, "loss": 0.0129, "reward": 1.8333333134651184, "reward_std": 0.15184257552027702, "rewards/accuracy_reward": 0.8435373902320862, "rewards/format_reward": 0.9897959232330322, "step": 7001 }, { "completion_length": 217.35713958740234, "epoch": 0.7046037735849057, "grad_norm": 0.7194982767105103, "kl": 0.0665283203125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8581146597862244, "reward_std": 0.18301359564065933, "rewards/accuracy_reward": 0.8683187365531921, "rewards/format_reward": 0.9897959232330322, "step": 7002 }, { "completion_length": 251.05101013183594, "epoch": 0.7047044025157233, "grad_norm": 0.2770492732524872, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.9406307339668274, "reward_std": 0.061626989394426346, "rewards/accuracy_reward": 0.9508349001407623, "rewards/format_reward": 0.9897959232330322, "step": 7003 }, { "completion_length": 192.23468780517578, "epoch": 0.7048050314465408, "grad_norm": 4.111672401428223, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7372353672981262, "reward_std": 0.18657179549336433, "rewards/accuracy_reward": 0.7576435804367065, "rewards/format_reward": 0.9795918166637421, "step": 7004 }, { "completion_length": 225.58162689208984, "epoch": 0.7049056603773585, "grad_norm": 0.5396813154220581, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.872934877872467, "reward_std": 0.08857348561286926, "rewards/accuracy_reward": 0.872934877872467, "rewards/format_reward": 1.0, "step": 7005 }, { "completion_length": 256.0408020019531, "epoch": 0.7050062893081761, "grad_norm": 0.951470673084259, "kl": 0.10302734375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8421800136566162, "reward_std": 0.14880462735891342, "rewards/accuracy_reward": 0.852384090423584, "rewards/format_reward": 0.9897959232330322, "step": 7006 }, { "completion_length": 211.62244415283203, "epoch": 0.7051069182389937, "grad_norm": 0.6659235954284668, "kl": 0.055419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8984337449073792, "reward_std": 0.14990247786045074, "rewards/accuracy_reward": 0.9188419580459595, "rewards/format_reward": 0.9795918166637421, "step": 7007 }, { "completion_length": 235.4693832397461, "epoch": 0.7052075471698113, "grad_norm": 1.0151513814926147, "kl": 0.102294921875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.5583775043487549, "reward_std": 0.22196391969919205, "rewards/accuracy_reward": 0.5685817003250122, "rewards/format_reward": 0.9897959232330322, "step": 7008 }, { "completion_length": 209.07142639160156, "epoch": 0.7053081761006289, "grad_norm": 2.251570463180542, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8306931853294373, "reward_std": 0.1632041409611702, "rewards/accuracy_reward": 0.8306931853294373, "rewards/format_reward": 1.0, "step": 7009 }, { "completion_length": 235.56121826171875, "epoch": 0.7054088050314465, "grad_norm": 0.8834598660469055, "kl": 0.122802734375, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.7043957114219666, "reward_std": 0.2674306109547615, "rewards/accuracy_reward": 0.7248038649559021, "rewards/format_reward": 0.9795918464660645, "step": 7010 }, { "completion_length": 186.85713958740234, "epoch": 0.7055094339622642, "grad_norm": 0.7093043923377991, "kl": 0.10205078125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7978895902633667, "reward_std": 0.09603862464427948, "rewards/accuracy_reward": 0.8182977735996246, "rewards/format_reward": 0.9795918166637421, "step": 7011 }, { "completion_length": 217.42857360839844, "epoch": 0.7056100628930818, "grad_norm": 1.5991162061691284, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.842486023902893, "reward_std": 0.1341356784105301, "rewards/accuracy_reward": 0.8424860537052155, "rewards/format_reward": 1.0, "step": 7012 }, { "completion_length": 193.63265228271484, "epoch": 0.7057106918238993, "grad_norm": 0.8042822480201721, "kl": 0.1337890625, "learning_rate": 1e-06, "loss": 0.0054, "reward": 1.7925530672073364, "reward_std": 0.08306461200118065, "rewards/accuracy_reward": 0.8027571737766266, "rewards/format_reward": 0.9897959232330322, "step": 7013 }, { "completion_length": 289.4795837402344, "epoch": 0.705811320754717, "grad_norm": 0.683194637298584, "kl": 0.0531005859375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8071593642234802, "reward_std": 0.12906895950436592, "rewards/accuracy_reward": 0.8173634707927704, "rewards/format_reward": 0.9897959232330322, "step": 7014 }, { "completion_length": 237.80612182617188, "epoch": 0.7059119496855346, "grad_norm": 1.070243239402771, "kl": 0.10302734375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.783608615398407, "reward_std": 0.14634298533201218, "rewards/accuracy_reward": 0.7938127517700195, "rewards/format_reward": 0.9897959232330322, "step": 7015 }, { "completion_length": 273.65306091308594, "epoch": 0.7060125786163522, "grad_norm": 0.8829271793365479, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.685749590396881, "reward_std": 0.260208398103714, "rewards/accuracy_reward": 0.6959537267684937, "rewards/format_reward": 0.9897959232330322, "step": 7016 }, { "completion_length": 263.62245178222656, "epoch": 0.7061132075471698, "grad_norm": 0.7113324999809265, "kl": 0.0596923828125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7931572794914246, "reward_std": 0.1834576204419136, "rewards/accuracy_reward": 0.8135654628276825, "rewards/format_reward": 0.9795918464660645, "step": 7017 }, { "completion_length": 203.53060150146484, "epoch": 0.7062138364779874, "grad_norm": 0.6794554591178894, "kl": 0.0814208984375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8228376507759094, "reward_std": 0.15152857080101967, "rewards/accuracy_reward": 0.833041787147522, "rewards/format_reward": 0.9897959232330322, "step": 7018 }, { "completion_length": 278.42857360839844, "epoch": 0.706314465408805, "grad_norm": 0.7426548004150391, "kl": 0.0704345703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7462562918663025, "reward_std": 0.12778611574321985, "rewards/accuracy_reward": 0.7666644155979156, "rewards/format_reward": 0.9795918166637421, "step": 7019 }, { "completion_length": 256.03060150146484, "epoch": 0.7064150943396227, "grad_norm": 1.191826581954956, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7541125416755676, "reward_std": 0.1840699203312397, "rewards/accuracy_reward": 0.7847248017787933, "rewards/format_reward": 0.9693877398967743, "step": 7020 }, { "completion_length": 305.33673095703125, "epoch": 0.7065157232704402, "grad_norm": 0.9692051410675049, "kl": 0.119384765625, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.850746512413025, "reward_std": 0.20584720373153687, "rewards/accuracy_reward": 0.8915628790855408, "rewards/format_reward": 0.9591836631298065, "step": 7021 }, { "completion_length": 262.5, "epoch": 0.7066163522012578, "grad_norm": 0.5153719782829285, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.763265311717987, "reward_std": 0.07579246163368225, "rewards/accuracy_reward": 0.7632652819156647, "rewards/format_reward": 1.0, "step": 7022 }, { "completion_length": 236.38774871826172, "epoch": 0.7067169811320755, "grad_norm": 0.8197109699249268, "kl": 0.09326171875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6383219957351685, "reward_std": 0.15499529987573624, "rewards/accuracy_reward": 0.6485261023044586, "rewards/format_reward": 0.9897959232330322, "step": 7023 }, { "completion_length": 233.59183502197266, "epoch": 0.7068176100628931, "grad_norm": 1.145511269569397, "kl": 0.0966796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.663188636302948, "reward_std": 0.15520590543746948, "rewards/accuracy_reward": 0.673392653465271, "rewards/format_reward": 0.9897959232330322, "step": 7024 }, { "completion_length": 244.70407104492188, "epoch": 0.7069182389937106, "grad_norm": 0.703139066696167, "kl": 0.111328125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.745222806930542, "reward_std": 0.1947457566857338, "rewards/accuracy_reward": 0.7758349478244781, "rewards/format_reward": 0.9693877398967743, "step": 7025 }, { "completion_length": 305.7040710449219, "epoch": 0.7070188679245283, "grad_norm": 0.6003127694129944, "kl": 0.0606689453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7024837136268616, "reward_std": 0.09574300236999989, "rewards/accuracy_reward": 0.7024837732315063, "rewards/format_reward": 1.0, "step": 7026 }, { "completion_length": 270.4591751098633, "epoch": 0.7071194968553459, "grad_norm": 1.575628399848938, "kl": 0.10986328125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.6753973960876465, "reward_std": 0.2196165844798088, "rewards/accuracy_reward": 0.6753974258899689, "rewards/format_reward": 1.0, "step": 7027 }, { "completion_length": 193.5204086303711, "epoch": 0.7072201257861636, "grad_norm": 0.8682259917259216, "kl": 0.100341796875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8221014738082886, "reward_std": 0.16708900779485703, "rewards/accuracy_reward": 0.8425096273422241, "rewards/format_reward": 0.9795918464660645, "step": 7028 }, { "completion_length": 302.846923828125, "epoch": 0.7073207547169811, "grad_norm": 1.2441462278366089, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6417805552482605, "reward_std": 0.2039778083562851, "rewards/accuracy_reward": 0.6723927855491638, "rewards/format_reward": 0.9693877398967743, "step": 7029 }, { "completion_length": 265.55101013183594, "epoch": 0.7074213836477987, "grad_norm": 0.9039466381072998, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6802416443824768, "reward_std": 0.2103966549038887, "rewards/accuracy_reward": 0.6904457211494446, "rewards/format_reward": 0.9897959232330322, "step": 7030 }, { "completion_length": 238.89794921875, "epoch": 0.7075220125786164, "grad_norm": 0.7462314367294312, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.748237431049347, "reward_std": 0.13424530252814293, "rewards/accuracy_reward": 0.7788496613502502, "rewards/format_reward": 0.9693877398967743, "step": 7031 }, { "completion_length": 339.6734619140625, "epoch": 0.707622641509434, "grad_norm": 0.645584762096405, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8566595911979675, "reward_std": 0.13541272282600403, "rewards/accuracy_reward": 0.8566596210002899, "rewards/format_reward": 1.0, "step": 7032 }, { "completion_length": 296.28570556640625, "epoch": 0.7077232704402515, "grad_norm": 0.6813359260559082, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.803419053554535, "reward_std": 0.15022124350070953, "rewards/accuracy_reward": 0.8136231005191803, "rewards/format_reward": 0.9897959232330322, "step": 7033 }, { "completion_length": 342.0305938720703, "epoch": 0.7078238993710692, "grad_norm": 0.8044772744178772, "kl": 0.056884765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7692320942878723, "reward_std": 0.23788324743509293, "rewards/accuracy_reward": 0.7896402776241302, "rewards/format_reward": 0.9795918166637421, "step": 7034 }, { "completion_length": 164.62244415283203, "epoch": 0.7079245283018868, "grad_norm": 0.866002082824707, "kl": 0.0677490234375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.9437317848205566, "reward_std": 0.03991415724158287, "rewards/accuracy_reward": 0.9437317550182343, "rewards/format_reward": 1.0, "step": 7035 }, { "completion_length": 222.55101776123047, "epoch": 0.7080251572327044, "grad_norm": 0.5099570751190186, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8481866121292114, "reward_std": 0.12594368308782578, "rewards/accuracy_reward": 0.8787989020347595, "rewards/format_reward": 0.9693877398967743, "step": 7036 }, { "completion_length": 270.448974609375, "epoch": 0.7081257861635221, "grad_norm": 0.7073610424995422, "kl": 0.06689453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.738095223903656, "reward_std": 0.24817461520433426, "rewards/accuracy_reward": 0.7585033178329468, "rewards/format_reward": 0.9795918464660645, "step": 7037 }, { "completion_length": 374.02040100097656, "epoch": 0.7082264150943396, "grad_norm": 0.6702665686607361, "kl": 0.058837890625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.599768042564392, "reward_std": 0.19612392038106918, "rewards/accuracy_reward": 0.6099721491336823, "rewards/format_reward": 0.9897959232330322, "step": 7038 }, { "completion_length": 247.2448959350586, "epoch": 0.7083270440251572, "grad_norm": 1.0632742643356323, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7733997702598572, "reward_std": 0.1624353975057602, "rewards/accuracy_reward": 0.773399829864502, "rewards/format_reward": 1.0, "step": 7039 }, { "completion_length": 256.0, "epoch": 0.7084276729559749, "grad_norm": 0.5600221157073975, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8139236569404602, "reward_std": 0.1284337043762207, "rewards/accuracy_reward": 0.813923716545105, "rewards/format_reward": 1.0, "step": 7040 }, { "completion_length": 246.73468780517578, "epoch": 0.7085283018867925, "grad_norm": 0.42134860157966614, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8714285492897034, "reward_std": 0.12639309465885162, "rewards/accuracy_reward": 0.8918367326259613, "rewards/format_reward": 0.9795918464660645, "step": 7041 }, { "completion_length": 228.55101776123047, "epoch": 0.70862893081761, "grad_norm": 0.4875975549221039, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8544217944145203, "reward_std": 0.11620355397462845, "rewards/accuracy_reward": 0.8646258413791656, "rewards/format_reward": 0.9897959232330322, "step": 7042 }, { "completion_length": 350.3775329589844, "epoch": 0.7087295597484277, "grad_norm": 0.6148512363433838, "kl": 0.0650634765625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6493763327598572, "reward_std": 0.24082190543413162, "rewards/accuracy_reward": 0.6697846055030823, "rewards/format_reward": 0.9795918166637421, "step": 7043 }, { "completion_length": 216.16326141357422, "epoch": 0.7088301886792453, "grad_norm": 0.5033717751502991, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.801895022392273, "reward_std": 0.09328317083418369, "rewards/accuracy_reward": 0.8120991289615631, "rewards/format_reward": 0.9897959232330322, "step": 7044 }, { "completion_length": 242.5, "epoch": 0.7089308176100629, "grad_norm": 1.5035743713378906, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6256595849990845, "reward_std": 0.16370853409171104, "rewards/accuracy_reward": 0.6256596148014069, "rewards/format_reward": 1.0, "step": 7045 }, { "completion_length": 243.03060913085938, "epoch": 0.7090314465408805, "grad_norm": 0.8756439685821533, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.847071349620819, "reward_std": 0.11784270778298378, "rewards/accuracy_reward": 0.8572753667831421, "rewards/format_reward": 0.9897959232330322, "step": 7046 }, { "completion_length": 344.3673400878906, "epoch": 0.7091320754716981, "grad_norm": 0.7487448453903198, "kl": 0.102294921875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.5586878657341003, "reward_std": 0.2494649700820446, "rewards/accuracy_reward": 0.599504217505455, "rewards/format_reward": 0.9591836631298065, "step": 7047 }, { "completion_length": 276.35713958740234, "epoch": 0.7092327044025157, "grad_norm": 0.6547510623931885, "kl": 0.0672607421875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7135921120643616, "reward_std": 0.18172364681959152, "rewards/accuracy_reward": 0.7340003550052643, "rewards/format_reward": 0.9795918464660645, "step": 7048 }, { "completion_length": 265.7653045654297, "epoch": 0.7093333333333334, "grad_norm": 0.7706139087677002, "kl": 0.1005859375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8408163189888, "reward_std": 0.14170966669917107, "rewards/accuracy_reward": 0.8612244725227356, "rewards/format_reward": 0.9795918166637421, "step": 7049 }, { "completion_length": 337.83673095703125, "epoch": 0.7094339622641509, "grad_norm": 0.7041263580322266, "kl": 0.0723876953125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6189064979553223, "reward_std": 0.2211986556649208, "rewards/accuracy_reward": 0.6393146514892578, "rewards/format_reward": 0.9795918464660645, "step": 7050 }, { "completion_length": 228.6836700439453, "epoch": 0.7095345911949685, "grad_norm": 0.7506524920463562, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.777587115764618, "reward_std": 0.09681493416428566, "rewards/accuracy_reward": 0.7775870561599731, "rewards/format_reward": 1.0, "step": 7051 }, { "completion_length": 297.9387664794922, "epoch": 0.7096352201257862, "grad_norm": 0.7854177951812744, "kl": 0.112060546875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7795608639717102, "reward_std": 0.09531604871153831, "rewards/accuracy_reward": 0.7897650003433228, "rewards/format_reward": 0.9897959232330322, "step": 7052 }, { "completion_length": 207.15306091308594, "epoch": 0.7097358490566038, "grad_norm": 0.5015435218811035, "kl": 0.12744140625, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.806544542312622, "reward_std": 0.070659089833498, "rewards/accuracy_reward": 0.8065446019172668, "rewards/format_reward": 1.0, "step": 7053 }, { "completion_length": 286.9387664794922, "epoch": 0.7098364779874213, "grad_norm": 0.6436585783958435, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.806824803352356, "reward_std": 0.19700008630752563, "rewards/accuracy_reward": 0.817028820514679, "rewards/format_reward": 0.9897959232330322, "step": 7054 }, { "completion_length": 266.5408172607422, "epoch": 0.709937106918239, "grad_norm": 0.8916599154472351, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7050452828407288, "reward_std": 0.19211416691541672, "rewards/accuracy_reward": 0.7356575727462769, "rewards/format_reward": 0.9693877398967743, "step": 7055 }, { "completion_length": 253.95917510986328, "epoch": 0.7100377358490566, "grad_norm": 0.6643577218055725, "kl": 0.071533203125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8411741256713867, "reward_std": 0.08990467339754105, "rewards/accuracy_reward": 0.8513782322406769, "rewards/format_reward": 0.9897959232330322, "step": 7056 }, { "completion_length": 270.9183654785156, "epoch": 0.7101383647798742, "grad_norm": 0.545373797416687, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7141637206077576, "reward_std": 0.1824101209640503, "rewards/accuracy_reward": 0.7243678867816925, "rewards/format_reward": 0.9897959232330322, "step": 7057 }, { "completion_length": 237.9081573486328, "epoch": 0.7102389937106919, "grad_norm": 1.3947721719741821, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8446063995361328, "reward_std": 0.15403500199317932, "rewards/accuracy_reward": 0.8650145530700684, "rewards/format_reward": 0.9795918166637421, "step": 7058 }, { "completion_length": 328.2040710449219, "epoch": 0.7103396226415094, "grad_norm": 1.8561742305755615, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.5884308815002441, "reward_std": 0.14538570493459702, "rewards/accuracy_reward": 0.5884309709072113, "rewards/format_reward": 1.0, "step": 7059 }, { "completion_length": 246.3571319580078, "epoch": 0.710440251572327, "grad_norm": 0.6136976480484009, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.77884179353714, "reward_std": 0.1751675233244896, "rewards/accuracy_reward": 0.78904590010643, "rewards/format_reward": 0.9897959232330322, "step": 7060 }, { "completion_length": 242.82652282714844, "epoch": 0.7105408805031447, "grad_norm": 0.7809953093528748, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7191296815872192, "reward_std": 0.15727569162845612, "rewards/accuracy_reward": 0.7191296815872192, "rewards/format_reward": 1.0, "step": 7061 }, { "completion_length": 312.34693908691406, "epoch": 0.7106415094339623, "grad_norm": 0.43288853764533997, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7445659637451172, "reward_std": 0.1707054078578949, "rewards/accuracy_reward": 0.7751781940460205, "rewards/format_reward": 0.9693877398967743, "step": 7062 }, { "completion_length": 206.12244415283203, "epoch": 0.7107421383647798, "grad_norm": 0.6332118511199951, "kl": 0.0987548828125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8168521523475647, "reward_std": 0.16097689792513847, "rewards/accuracy_reward": 0.8270562589168549, "rewards/format_reward": 0.9897959232330322, "step": 7063 }, { "completion_length": 265.37755584716797, "epoch": 0.7108427672955975, "grad_norm": 1.4699329137802124, "kl": 0.158203125, "learning_rate": 1e-06, "loss": 0.0063, "reward": 1.6546525359153748, "reward_std": 0.3088563233613968, "rewards/accuracy_reward": 0.7056730091571808, "rewards/format_reward": 0.9489795565605164, "step": 7064 }, { "completion_length": 212.39795684814453, "epoch": 0.7109433962264151, "grad_norm": 0.7569659352302551, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8558698296546936, "reward_std": 0.08678547665476799, "rewards/accuracy_reward": 0.8558698892593384, "rewards/format_reward": 1.0, "step": 7065 }, { "completion_length": 233.09183502197266, "epoch": 0.7110440251572328, "grad_norm": 0.8453511595726013, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6541131138801575, "reward_std": 0.10720411129295826, "rewards/accuracy_reward": 0.6745212972164154, "rewards/format_reward": 0.9795918464660645, "step": 7066 }, { "completion_length": 331.4183654785156, "epoch": 0.7111446540880503, "grad_norm": 1.0532461404800415, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6773809790611267, "reward_std": 0.31321388483047485, "rewards/accuracy_reward": 0.7181972563266754, "rewards/format_reward": 0.9591836631298065, "step": 7067 }, { "completion_length": 256.2653045654297, "epoch": 0.7112452830188679, "grad_norm": 1.1479182243347168, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.5701592564582825, "reward_std": 0.1446951925754547, "rewards/accuracy_reward": 0.5905674993991852, "rewards/format_reward": 0.9795918464660645, "step": 7068 }, { "completion_length": 312.0102081298828, "epoch": 0.7113459119496855, "grad_norm": 0.508539080619812, "kl": 0.0633544921875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7780590653419495, "reward_std": 0.13033869117498398, "rewards/accuracy_reward": 0.7882631719112396, "rewards/format_reward": 0.9897959232330322, "step": 7069 }, { "completion_length": 303.2346878051758, "epoch": 0.7114465408805032, "grad_norm": 1.8483214378356934, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7326067090034485, "reward_std": 0.2268664315342903, "rewards/accuracy_reward": 0.7530148327350616, "rewards/format_reward": 0.9795918166637421, "step": 7070 }, { "completion_length": 254.81631469726562, "epoch": 0.7115471698113207, "grad_norm": 0.7321781516075134, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7015872597694397, "reward_std": 0.11753438413143158, "rewards/accuracy_reward": 0.7015873193740845, "rewards/format_reward": 1.0, "step": 7071 }, { "completion_length": 215.08162689208984, "epoch": 0.7116477987421383, "grad_norm": 1.750402569770813, "kl": 0.0826416015625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.9183672666549683, "reward_std": 0.15402613580226898, "rewards/accuracy_reward": 0.918367326259613, "rewards/format_reward": 1.0, "step": 7072 }, { "completion_length": 311.1326446533203, "epoch": 0.711748427672956, "grad_norm": 1.3582267761230469, "kl": 0.17041015625, "learning_rate": 1e-06, "loss": 0.0068, "reward": 1.7102247476577759, "reward_std": 0.24493159353733063, "rewards/accuracy_reward": 0.7408370077610016, "rewards/format_reward": 0.9693877398967743, "step": 7073 }, { "completion_length": 274.6326446533203, "epoch": 0.7118490566037736, "grad_norm": 0.7616608738899231, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.714285671710968, "reward_std": 0.14599500223994255, "rewards/accuracy_reward": 0.7244897782802582, "rewards/format_reward": 0.9897959232330322, "step": 7074 }, { "completion_length": 238.6428451538086, "epoch": 0.7119496855345911, "grad_norm": 0.7958627343177795, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6777872443199158, "reward_std": 0.1482791230082512, "rewards/accuracy_reward": 0.6777873039245605, "rewards/format_reward": 1.0, "step": 7075 }, { "completion_length": 271.83673095703125, "epoch": 0.7120503144654088, "grad_norm": 0.7964718341827393, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6368716359138489, "reward_std": 0.14608248323202133, "rewards/accuracy_reward": 0.6470757126808167, "rewards/format_reward": 0.9897959232330322, "step": 7076 }, { "completion_length": 339.29591369628906, "epoch": 0.7121509433962264, "grad_norm": 0.6054609417915344, "kl": 0.0753173828125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6771451830863953, "reward_std": 0.25446245074272156, "rewards/accuracy_reward": 0.7179615199565887, "rewards/format_reward": 0.9591836631298065, "step": 7077 }, { "completion_length": 265.83673095703125, "epoch": 0.7122515723270441, "grad_norm": 0.7729660868644714, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7777460813522339, "reward_std": 0.1892162412405014, "rewards/accuracy_reward": 0.8083582818508148, "rewards/format_reward": 0.9693877398967743, "step": 7078 }, { "completion_length": 280.88775634765625, "epoch": 0.7123522012578616, "grad_norm": 8.545467376708984, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6720387935638428, "reward_std": 0.21426544338464737, "rewards/accuracy_reward": 0.6924469172954559, "rewards/format_reward": 0.9795918464660645, "step": 7079 }, { "completion_length": 263.1734619140625, "epoch": 0.7124528301886792, "grad_norm": 0.9025914669036865, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7764179110527039, "reward_std": 0.21375776082277298, "rewards/accuracy_reward": 0.7968261241912842, "rewards/format_reward": 0.9795918464660645, "step": 7080 }, { "completion_length": 237.0, "epoch": 0.7125534591194969, "grad_norm": 0.8938409686088562, "kl": 0.0731201171875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8329909443855286, "reward_std": 0.17567191645503044, "rewards/accuracy_reward": 0.8636031448841095, "rewards/format_reward": 0.9693877398967743, "step": 7081 }, { "completion_length": 222.1938705444336, "epoch": 0.7126540880503145, "grad_norm": 0.8465266227722168, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8446711897850037, "reward_std": 0.1971864402294159, "rewards/accuracy_reward": 0.8650793433189392, "rewards/format_reward": 0.9795918166637421, "step": 7082 }, { "completion_length": 189.8775405883789, "epoch": 0.7127547169811321, "grad_norm": 0.4576849937438965, "kl": 0.0965576171875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.867192268371582, "reward_std": 0.1207159049808979, "rewards/accuracy_reward": 0.8773963749408722, "rewards/format_reward": 0.9897959232330322, "step": 7083 }, { "completion_length": 210.67346954345703, "epoch": 0.7128553459119497, "grad_norm": 0.9622597098350525, "kl": 0.1494140625, "learning_rate": 1e-06, "loss": 0.006, "reward": 1.6272398829460144, "reward_std": 0.2227119281888008, "rewards/accuracy_reward": 0.678260326385498, "rewards/format_reward": 0.9489795565605164, "step": 7084 }, { "completion_length": 235.9387664794922, "epoch": 0.7129559748427673, "grad_norm": 0.8319945335388184, "kl": 0.05419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7755101323127747, "reward_std": 0.2450793907046318, "rewards/accuracy_reward": 0.8061224222183228, "rewards/format_reward": 0.9693877398967743, "step": 7085 }, { "completion_length": 169.07142639160156, "epoch": 0.7130566037735849, "grad_norm": 0.9306652545928955, "kl": 0.09130859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8136054277420044, "reward_std": 0.1938769668340683, "rewards/accuracy_reward": 0.8340136110782623, "rewards/format_reward": 0.9795918464660645, "step": 7086 }, { "completion_length": 213.6836700439453, "epoch": 0.7131572327044026, "grad_norm": 0.3936770558357239, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.9249756932258606, "reward_std": 0.09231215715408325, "rewards/accuracy_reward": 0.9453838765621185, "rewards/format_reward": 0.9795918464660645, "step": 7087 }, { "completion_length": 274.76529693603516, "epoch": 0.7132578616352201, "grad_norm": 0.9869894981384277, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7016789317131042, "reward_std": 0.2487214058637619, "rewards/accuracy_reward": 0.7629035413265228, "rewards/format_reward": 0.938775509595871, "step": 7088 }, { "completion_length": 252.55101776123047, "epoch": 0.7133584905660377, "grad_norm": 0.5499966740608215, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8579767942428589, "reward_std": 0.11581231281161308, "rewards/accuracy_reward": 0.8681810200214386, "rewards/format_reward": 0.9897959232330322, "step": 7089 }, { "completion_length": 202.2551040649414, "epoch": 0.7134591194968554, "grad_norm": 0.9753031134605408, "kl": 0.113037109375, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.834701955318451, "reward_std": 0.20101426914334297, "rewards/accuracy_reward": 0.8653142154216766, "rewards/format_reward": 0.9693877398967743, "step": 7090 }, { "completion_length": 307.6836700439453, "epoch": 0.713559748427673, "grad_norm": 0.9126659631729126, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.782871663570404, "reward_std": 0.25446952879428864, "rewards/accuracy_reward": 0.8134839236736298, "rewards/format_reward": 0.9693877398967743, "step": 7091 }, { "completion_length": 231.9285659790039, "epoch": 0.7136603773584905, "grad_norm": 2.1330277919769287, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8324829936027527, "reward_std": 0.17241720855236053, "rewards/accuracy_reward": 0.8528911769390106, "rewards/format_reward": 0.9795918166637421, "step": 7092 }, { "completion_length": 280.7244873046875, "epoch": 0.7137610062893082, "grad_norm": 1.2936569452285767, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.5928073525428772, "reward_std": 0.19260026887059212, "rewards/accuracy_reward": 0.6030113697052002, "rewards/format_reward": 0.9897959232330322, "step": 7093 }, { "completion_length": 191.55101776123047, "epoch": 0.7138616352201258, "grad_norm": 0.7735041379928589, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.745339035987854, "reward_std": 0.18703249096870422, "rewards/accuracy_reward": 0.7759513258934021, "rewards/format_reward": 0.9693877398967743, "step": 7094 }, { "completion_length": 237.7653045654297, "epoch": 0.7139622641509434, "grad_norm": 3.5539071559906006, "kl": 0.09130859375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7516291737556458, "reward_std": 0.12430758774280548, "rewards/accuracy_reward": 0.7516291737556458, "rewards/format_reward": 1.0, "step": 7095 }, { "completion_length": 208.86734771728516, "epoch": 0.714062893081761, "grad_norm": 1.7703951597213745, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7599854469299316, "reward_std": 0.2878684252500534, "rewards/accuracy_reward": 0.7905976176261902, "rewards/format_reward": 0.9693877398967743, "step": 7096 }, { "completion_length": 228.1734619140625, "epoch": 0.7141635220125786, "grad_norm": 0.8362542986869812, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8525198698043823, "reward_std": 0.1756669357419014, "rewards/accuracy_reward": 0.8627239763736725, "rewards/format_reward": 0.9897959232330322, "step": 7097 }, { "completion_length": 272.1836624145508, "epoch": 0.7142641509433962, "grad_norm": 0.553778350353241, "kl": 0.129638671875, "learning_rate": 1e-06, "loss": 0.0052, "reward": 1.713612139225006, "reward_std": 0.15782146994024515, "rewards/accuracy_reward": 0.7442244589328766, "rewards/format_reward": 0.9693877398967743, "step": 7098 }, { "completion_length": 201.6326446533203, "epoch": 0.7143647798742139, "grad_norm": 1.021934151649475, "kl": 0.114501953125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.7350490093231201, "reward_std": 0.13317476212978363, "rewards/accuracy_reward": 0.755457192659378, "rewards/format_reward": 0.9795918166637421, "step": 7099 }, { "completion_length": 180.4285659790039, "epoch": 0.7144654088050314, "grad_norm": 0.23563337326049805, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.9897959232330322, "reward_std": 0.026997461915016174, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.9897959232330322, "step": 7100 }, { "completion_length": 200.41836547851562, "epoch": 0.714566037735849, "grad_norm": 1.2874971628189087, "kl": 0.0606689453125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.877551019191742, "reward_std": 0.13498730212450027, "rewards/accuracy_reward": 0.8877550959587097, "rewards/format_reward": 0.9897959232330322, "step": 7101 }, { "completion_length": 240.6020278930664, "epoch": 0.7146666666666667, "grad_norm": 1.2553008794784546, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.677328646183014, "reward_std": 0.1322842724621296, "rewards/accuracy_reward": 0.6875327527523041, "rewards/format_reward": 0.9897959232330322, "step": 7102 }, { "completion_length": 229.7448959350586, "epoch": 0.7147672955974843, "grad_norm": 0.565849781036377, "kl": 0.0758056640625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8367347121238708, "reward_std": 0.10903036221861839, "rewards/accuracy_reward": 0.8571428060531616, "rewards/format_reward": 0.9795918166637421, "step": 7103 }, { "completion_length": 223.04080963134766, "epoch": 0.7148679245283018, "grad_norm": 0.5689104199409485, "kl": 0.110107421875, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7599204778671265, "reward_std": 0.14438876137137413, "rewards/accuracy_reward": 0.7599206268787384, "rewards/format_reward": 1.0, "step": 7104 }, { "completion_length": 267.4183578491211, "epoch": 0.7149685534591195, "grad_norm": 0.6096437573432922, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7190232872962952, "reward_std": 0.1611153781414032, "rewards/accuracy_reward": 0.7292273938655853, "rewards/format_reward": 0.9897959232330322, "step": 7105 }, { "completion_length": 263.0, "epoch": 0.7150691823899371, "grad_norm": 1.1201540231704712, "kl": 0.1005859375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8371917605400085, "reward_std": 0.137831162661314, "rewards/accuracy_reward": 0.8371918797492981, "rewards/format_reward": 1.0, "step": 7106 }, { "completion_length": 195.07142639160156, "epoch": 0.7151698113207547, "grad_norm": 0.5836694240570068, "kl": 0.100341796875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8594103455543518, "reward_std": 0.13630184531211853, "rewards/accuracy_reward": 0.8798185884952545, "rewards/format_reward": 0.9795918166637421, "step": 7107 }, { "completion_length": 186.55101776123047, "epoch": 0.7152704402515724, "grad_norm": 0.44754695892333984, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.810080349445343, "reward_std": 0.05420384928584099, "rewards/accuracy_reward": 0.8100804090499878, "rewards/format_reward": 1.0, "step": 7108 }, { "completion_length": 193.05101776123047, "epoch": 0.7153710691823899, "grad_norm": 0.7211006879806519, "kl": 0.1357421875, "learning_rate": 1e-06, "loss": 0.0054, "reward": 1.7153517603874207, "reward_std": 0.09567967429757118, "rewards/accuracy_reward": 0.7153517603874207, "rewards/format_reward": 1.0, "step": 7109 }, { "completion_length": 222.57142639160156, "epoch": 0.7154716981132075, "grad_norm": 15.954005241394043, "kl": 0.113525390625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7924774885177612, "reward_std": 0.13519179821014404, "rewards/accuracy_reward": 0.8128856122493744, "rewards/format_reward": 0.9795918464660645, "step": 7110 }, { "completion_length": 198.13265228271484, "epoch": 0.7155723270440252, "grad_norm": 0.7094162106513977, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7056962251663208, "reward_std": 0.16666677594184875, "rewards/accuracy_reward": 0.7159002125263214, "rewards/format_reward": 0.9897959232330322, "step": 7111 }, { "completion_length": 179.0203971862793, "epoch": 0.7156729559748428, "grad_norm": 0.4762299060821533, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.9073984622955322, "reward_std": 0.047262948006391525, "rewards/accuracy_reward": 0.9073984622955322, "rewards/format_reward": 1.0, "step": 7112 }, { "completion_length": 180.948974609375, "epoch": 0.7157735849056603, "grad_norm": 0.4349807798862457, "kl": 0.093017578125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8115755319595337, "reward_std": 0.05288620665669441, "rewards/accuracy_reward": 0.8115756213665009, "rewards/format_reward": 1.0, "step": 7113 }, { "completion_length": 209.95917510986328, "epoch": 0.715874213836478, "grad_norm": 0.7406825423240662, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7900874614715576, "reward_std": 0.13518477510660887, "rewards/accuracy_reward": 0.7900874018669128, "rewards/format_reward": 1.0, "step": 7114 }, { "completion_length": 247.61224365234375, "epoch": 0.7159748427672956, "grad_norm": 1.0728631019592285, "kl": 0.111572265625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7754431366920471, "reward_std": 0.24226078391075134, "rewards/accuracy_reward": 0.7856472730636597, "rewards/format_reward": 0.9897959232330322, "step": 7115 }, { "completion_length": 212.1530532836914, "epoch": 0.7160754716981133, "grad_norm": 1.1604162454605103, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7305474877357483, "reward_std": 0.1516328640282154, "rewards/accuracy_reward": 0.7509556114673615, "rewards/format_reward": 0.9795918166637421, "step": 7116 }, { "completion_length": 258.5816345214844, "epoch": 0.7161761006289308, "grad_norm": 0.7781994342803955, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.634784460067749, "reward_std": 0.12005173228681087, "rewards/accuracy_reward": 0.6347845047712326, "rewards/format_reward": 1.0, "step": 7117 }, { "completion_length": 271.48978424072266, "epoch": 0.7162767295597484, "grad_norm": 0.5553109049797058, "kl": 0.0594482421875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7251701354980469, "reward_std": 0.10660575330257416, "rewards/accuracy_reward": 0.7353741824626923, "rewards/format_reward": 0.9897959232330322, "step": 7118 }, { "completion_length": 161.63265228271484, "epoch": 0.716377358490566, "grad_norm": 0.8134598135948181, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8445254564285278, "reward_std": 0.07104973495006561, "rewards/accuracy_reward": 0.8445254266262054, "rewards/format_reward": 1.0, "step": 7119 }, { "completion_length": 162.79591369628906, "epoch": 0.7164779874213837, "grad_norm": 1.285135269165039, "kl": 0.09033203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8796715140342712, "reward_std": 0.07234731898643076, "rewards/accuracy_reward": 0.8898756504058838, "rewards/format_reward": 0.9897959232330322, "step": 7120 }, { "completion_length": 269.5102081298828, "epoch": 0.7165786163522012, "grad_norm": 0.8925808072090149, "kl": 0.09765625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6847825646400452, "reward_std": 0.15835213661193848, "rewards/accuracy_reward": 0.6949867606163025, "rewards/format_reward": 0.9897959232330322, "step": 7121 }, { "completion_length": 197.89794921875, "epoch": 0.7166792452830189, "grad_norm": 0.6049593687057495, "kl": 0.099853515625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7051830291748047, "reward_std": 0.08787937834858894, "rewards/accuracy_reward": 0.7051830291748047, "rewards/format_reward": 1.0, "step": 7122 }, { "completion_length": 275.9897766113281, "epoch": 0.7167798742138365, "grad_norm": 0.6792166829109192, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6641089916229248, "reward_std": 0.11005522683262825, "rewards/accuracy_reward": 0.6641090214252472, "rewards/format_reward": 1.0, "step": 7123 }, { "completion_length": 276.29591369628906, "epoch": 0.7168805031446541, "grad_norm": 0.4226991832256317, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.5007848739624023, "reward_std": 0.10199114307761192, "rewards/accuracy_reward": 0.5007849186658859, "rewards/format_reward": 1.0, "step": 7124 }, { "completion_length": 218.35713958740234, "epoch": 0.7169811320754716, "grad_norm": 0.7282012701034546, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8017117977142334, "reward_std": 0.13200143724679947, "rewards/accuracy_reward": 0.811915934085846, "rewards/format_reward": 0.9897959232330322, "step": 7125 }, { "completion_length": 231.28571319580078, "epoch": 0.7170817610062893, "grad_norm": 0.5675990581512451, "kl": 0.071533203125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7891554236412048, "reward_std": 0.12124720215797424, "rewards/accuracy_reward": 0.7993594706058502, "rewards/format_reward": 0.9897959232330322, "step": 7126 }, { "completion_length": 214.19387817382812, "epoch": 0.7171823899371069, "grad_norm": 0.6976785063743591, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8451165556907654, "reward_std": 0.06448354572057724, "rewards/accuracy_reward": 0.8553206622600555, "rewards/format_reward": 0.9897959232330322, "step": 7127 }, { "completion_length": 253.6836700439453, "epoch": 0.7172830188679246, "grad_norm": 0.5562286376953125, "kl": 0.098876953125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8138116002082825, "reward_std": 0.19029313325881958, "rewards/accuracy_reward": 0.8342197239398956, "rewards/format_reward": 0.9795918464660645, "step": 7128 }, { "completion_length": 215.6326446533203, "epoch": 0.7173836477987421, "grad_norm": 0.38244298100471497, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6887107491493225, "reward_std": 0.09275811910629272, "rewards/accuracy_reward": 0.6989148855209351, "rewards/format_reward": 0.9897959232330322, "step": 7129 }, { "completion_length": 189.78571319580078, "epoch": 0.7174842767295597, "grad_norm": 0.7794194221496582, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7544217705726624, "reward_std": 0.13599827140569687, "rewards/accuracy_reward": 0.7646258473396301, "rewards/format_reward": 0.9897959232330322, "step": 7130 }, { "completion_length": 194.38774871826172, "epoch": 0.7175849056603774, "grad_norm": 0.6055501103401184, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8440232872962952, "reward_std": 0.0965620637871325, "rewards/accuracy_reward": 0.8542273938655853, "rewards/format_reward": 0.9897959232330322, "step": 7131 }, { "completion_length": 268.46937561035156, "epoch": 0.717685534591195, "grad_norm": 1810.5164794921875, "kl": 53.0216064453125, "learning_rate": 1e-06, "loss": 2.118, "reward": 1.8170068860054016, "reward_std": 0.1661926619708538, "rewards/accuracy_reward": 0.8476190567016602, "rewards/format_reward": 0.9693877398967743, "step": 7132 }, { "completion_length": 275.0918273925781, "epoch": 0.7177861635220126, "grad_norm": 0.5908038020133972, "kl": 0.0714111328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.739583969116211, "reward_std": 0.07644582539796829, "rewards/accuracy_reward": 0.7395839691162109, "rewards/format_reward": 1.0, "step": 7133 }, { "completion_length": 198.54080963134766, "epoch": 0.7178867924528302, "grad_norm": 0.34597131609916687, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.9271137118339539, "reward_std": 0.07356392219662666, "rewards/accuracy_reward": 0.9373177587985992, "rewards/format_reward": 0.9897959232330322, "step": 7134 }, { "completion_length": 276.6836700439453, "epoch": 0.7179874213836478, "grad_norm": 0.6254596710205078, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.759183645248413, "reward_std": 0.18198619782924652, "rewards/accuracy_reward": 0.7591836750507355, "rewards/format_reward": 1.0, "step": 7135 }, { "completion_length": 243.2040786743164, "epoch": 0.7180880503144654, "grad_norm": 0.7194877862930298, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8714285492897034, "reward_std": 0.18261022493243217, "rewards/accuracy_reward": 0.8714285790920258, "rewards/format_reward": 1.0, "step": 7136 }, { "completion_length": 247.82653045654297, "epoch": 0.7181886792452831, "grad_norm": 0.6320289969444275, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7310899496078491, "reward_std": 0.10640648752450943, "rewards/accuracy_reward": 0.7310900390148163, "rewards/format_reward": 1.0, "step": 7137 }, { "completion_length": 369.72447204589844, "epoch": 0.7182893081761006, "grad_norm": 0.8173961043357849, "kl": 0.0706787109375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6533631086349487, "reward_std": 0.21259909868240356, "rewards/accuracy_reward": 0.683975338935852, "rewards/format_reward": 0.9693877398967743, "step": 7138 }, { "completion_length": 215.12245178222656, "epoch": 0.7183899371069182, "grad_norm": 0.8935379385948181, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7104769349098206, "reward_std": 0.09812816977500916, "rewards/accuracy_reward": 0.7104769051074982, "rewards/format_reward": 1.0, "step": 7139 }, { "completion_length": 274.72447967529297, "epoch": 0.7184905660377359, "grad_norm": 0.8355969786643982, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8607921600341797, "reward_std": 0.16708610206842422, "rewards/accuracy_reward": 0.8607921600341797, "rewards/format_reward": 1.0, "step": 7140 }, { "completion_length": 265.9183654785156, "epoch": 0.7185911949685535, "grad_norm": 0.45803022384643555, "kl": 0.093017578125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7191095352172852, "reward_std": 0.06855384260416031, "rewards/accuracy_reward": 0.719109445810318, "rewards/format_reward": 1.0, "step": 7141 }, { "completion_length": 220.93877410888672, "epoch": 0.718691823899371, "grad_norm": 0.4743267297744751, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7762935161590576, "reward_std": 0.10177094116806984, "rewards/accuracy_reward": 0.7762935161590576, "rewards/format_reward": 1.0, "step": 7142 }, { "completion_length": 210.9081573486328, "epoch": 0.7187924528301887, "grad_norm": 0.5359746813774109, "kl": 0.0718994140625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6909620761871338, "reward_std": 0.0695645920932293, "rewards/accuracy_reward": 0.6909620761871338, "rewards/format_reward": 1.0, "step": 7143 }, { "completion_length": 232.86734771728516, "epoch": 0.7188930817610063, "grad_norm": 0.7825474739074707, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6857141852378845, "reward_std": 0.1637197956442833, "rewards/accuracy_reward": 0.6959183812141418, "rewards/format_reward": 0.9897959232330322, "step": 7144 }, { "completion_length": 211.62244415283203, "epoch": 0.7189937106918239, "grad_norm": 0.5035791993141174, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8362488150596619, "reward_std": 0.11884406208992004, "rewards/accuracy_reward": 0.8464528322219849, "rewards/format_reward": 0.9897959232330322, "step": 7145 }, { "completion_length": 231.47958374023438, "epoch": 0.7190943396226415, "grad_norm": 0.20659413933753967, "kl": 0.062744140625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.92942613363266, "reward_std": 0.03077087551355362, "rewards/accuracy_reward": 0.9294261336326599, "rewards/format_reward": 1.0, "step": 7146 }, { "completion_length": 237.82652282714844, "epoch": 0.7191949685534591, "grad_norm": 7.565202236175537, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.75955468416214, "reward_std": 0.11827708780765533, "rewards/accuracy_reward": 0.7799628674983978, "rewards/format_reward": 0.9795918464660645, "step": 7147 }, { "completion_length": 205.12244415283203, "epoch": 0.7192955974842767, "grad_norm": 1.260758399963379, "kl": 0.06689453125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7889941334724426, "reward_std": 0.11049480736255646, "rewards/accuracy_reward": 0.788994163274765, "rewards/format_reward": 1.0, "step": 7148 }, { "completion_length": 192.23468780517578, "epoch": 0.7193962264150944, "grad_norm": 0.23760080337524414, "kl": 0.122802734375, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.9492063522338867, "reward_std": 0.007445101160556078, "rewards/accuracy_reward": 0.9492063522338867, "rewards/format_reward": 1.0, "step": 7149 }, { "completion_length": 260.10203552246094, "epoch": 0.7194968553459119, "grad_norm": 0.6744920015335083, "kl": 0.1044921875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8355861902236938, "reward_std": 0.13065512850880623, "rewards/accuracy_reward": 0.8559943437576294, "rewards/format_reward": 0.9795918464660645, "step": 7150 }, { "completion_length": 274.4183654785156, "epoch": 0.7195974842767295, "grad_norm": 0.5856417417526245, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7451939582824707, "reward_std": 0.22349383682012558, "rewards/accuracy_reward": 0.7656020820140839, "rewards/format_reward": 0.9795918166637421, "step": 7151 }, { "completion_length": 290.0305938720703, "epoch": 0.7196981132075472, "grad_norm": 0.6079718470573425, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7967484593391418, "reward_std": 0.13353026658296585, "rewards/accuracy_reward": 0.7967484891414642, "rewards/format_reward": 1.0, "step": 7152 }, { "completion_length": 259.04080963134766, "epoch": 0.7197987421383648, "grad_norm": 0.467873752117157, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8706462979316711, "reward_std": 0.05177764408290386, "rewards/accuracy_reward": 0.8706463277339935, "rewards/format_reward": 1.0, "step": 7153 }, { "completion_length": 281.3163146972656, "epoch": 0.7198993710691823, "grad_norm": 0.697821319103241, "kl": 0.10986328125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7376136779785156, "reward_std": 0.14525935426354408, "rewards/accuracy_reward": 0.7376136779785156, "rewards/format_reward": 1.0, "step": 7154 }, { "completion_length": 234.38774871826172, "epoch": 0.72, "grad_norm": 0.6597049236297607, "kl": 0.109619140625, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8656948804855347, "reward_std": 0.13266698643565178, "rewards/accuracy_reward": 0.8861030042171478, "rewards/format_reward": 0.9795918464660645, "step": 7155 }, { "completion_length": 227.81631469726562, "epoch": 0.7201006289308176, "grad_norm": 0.8176929354667664, "kl": 0.095703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.784076452255249, "reward_std": 0.07218968868255615, "rewards/accuracy_reward": 0.7942805886268616, "rewards/format_reward": 0.9897959232330322, "step": 7156 }, { "completion_length": 244.62244415283203, "epoch": 0.7202012578616352, "grad_norm": 0.8902701735496521, "kl": 0.11865234375, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.7927643656730652, "reward_std": 0.15637009590864182, "rewards/accuracy_reward": 0.802968442440033, "rewards/format_reward": 0.9897959232330322, "step": 7157 }, { "completion_length": 258.3061218261719, "epoch": 0.7203018867924529, "grad_norm": 0.761044979095459, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7293002009391785, "reward_std": 0.20313077047467232, "rewards/accuracy_reward": 0.7599124908447266, "rewards/format_reward": 0.9693877398967743, "step": 7158 }, { "completion_length": 300.87754821777344, "epoch": 0.7204025157232704, "grad_norm": 0.6486003994941711, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6418852806091309, "reward_std": 0.13364249095320702, "rewards/accuracy_reward": 0.6520893573760986, "rewards/format_reward": 0.9897959232330322, "step": 7159 }, { "completion_length": 243.41836547851562, "epoch": 0.720503144654088, "grad_norm": 0.529831051826477, "kl": 0.100341796875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8304169178009033, "reward_std": 0.14300421625375748, "rewards/accuracy_reward": 0.8508250713348389, "rewards/format_reward": 0.9795918464660645, "step": 7160 }, { "completion_length": 202.7040786743164, "epoch": 0.7206037735849057, "grad_norm": 0.9878495931625366, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.850123643875122, "reward_std": 0.10396205633878708, "rewards/accuracy_reward": 0.8501236438751221, "rewards/format_reward": 1.0, "step": 7161 }, { "completion_length": 195.82652282714844, "epoch": 0.7207044025157233, "grad_norm": 1.930823802947998, "kl": 0.126220703125, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.7899790406227112, "reward_std": 0.2351505532860756, "rewards/accuracy_reward": 0.8409994542598724, "rewards/format_reward": 0.9489795565605164, "step": 7162 }, { "completion_length": 240.9183578491211, "epoch": 0.7208050314465408, "grad_norm": 0.41183239221572876, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.872529923915863, "reward_std": 0.10440503060817719, "rewards/accuracy_reward": 0.872529923915863, "rewards/format_reward": 1.0, "step": 7163 }, { "completion_length": 237.41837310791016, "epoch": 0.7209056603773585, "grad_norm": 1.0730183124542236, "kl": 0.1259765625, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.5748775005340576, "reward_std": 0.24592071771621704, "rewards/accuracy_reward": 0.585081622004509, "rewards/format_reward": 0.9897959232330322, "step": 7164 }, { "completion_length": 239.9795913696289, "epoch": 0.7210062893081761, "grad_norm": 0.8444168567657471, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.844995141029358, "reward_std": 0.11278489604592323, "rewards/accuracy_reward": 0.8449951112270355, "rewards/format_reward": 1.0, "step": 7165 }, { "completion_length": 212.61224365234375, "epoch": 0.7211069182389938, "grad_norm": 0.47702986001968384, "kl": 0.054931640625, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7874149680137634, "reward_std": 0.1364540383219719, "rewards/accuracy_reward": 0.787414938211441, "rewards/format_reward": 1.0, "step": 7166 }, { "completion_length": 246.53060913085938, "epoch": 0.7212075471698113, "grad_norm": 0.4494304955005646, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.811069905757904, "reward_std": 0.10297854617238045, "rewards/accuracy_reward": 0.8212739825248718, "rewards/format_reward": 0.9897959232330322, "step": 7167 }, { "completion_length": 359.34693908691406, "epoch": 0.7213081761006289, "grad_norm": 0.38107773661613464, "kl": 0.0579833984375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6246808767318726, "reward_std": 0.12286417186260223, "rewards/accuracy_reward": 0.6552930921316147, "rewards/format_reward": 0.9693877398967743, "step": 7168 }, { "completion_length": 253.15306091308594, "epoch": 0.7214088050314466, "grad_norm": 0.8223977088928223, "kl": 0.0716552734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8571428656578064, "reward_std": 0.11805073544383049, "rewards/accuracy_reward": 0.8775509893894196, "rewards/format_reward": 0.9795918166637421, "step": 7169 }, { "completion_length": 218.78571319580078, "epoch": 0.7215094339622642, "grad_norm": 1.1897910833358765, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8608489632606506, "reward_std": 0.13482137769460678, "rewards/accuracy_reward": 0.8608490228652954, "rewards/format_reward": 1.0, "step": 7170 }, { "completion_length": 326.5918273925781, "epoch": 0.7216100628930817, "grad_norm": 0.5464408993721008, "kl": 0.101318359375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6961421370506287, "reward_std": 0.22303472459316254, "rewards/accuracy_reward": 0.7165502905845642, "rewards/format_reward": 0.9795918464660645, "step": 7171 }, { "completion_length": 318.79591369628906, "epoch": 0.7217106918238994, "grad_norm": 0.6065411567687988, "kl": 0.09423828125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.769698679447174, "reward_std": 0.16676265746355057, "rewards/accuracy_reward": 0.7901068329811096, "rewards/format_reward": 0.9795918166637421, "step": 7172 }, { "completion_length": 268.1734619140625, "epoch": 0.721811320754717, "grad_norm": 0.5515361428260803, "kl": 0.143310546875, "learning_rate": 1e-06, "loss": 0.0057, "reward": 1.7363017797470093, "reward_std": 0.04527577944099903, "rewards/accuracy_reward": 0.7363017499446869, "rewards/format_reward": 1.0, "step": 7173 }, { "completion_length": 253.89795684814453, "epoch": 0.7219119496855346, "grad_norm": 0.7652489542961121, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6502671241760254, "reward_std": 0.1333342231810093, "rewards/accuracy_reward": 0.6604712903499603, "rewards/format_reward": 0.9897959232330322, "step": 7174 }, { "completion_length": 286.1020278930664, "epoch": 0.7220125786163522, "grad_norm": 0.8846428990364075, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6579283475875854, "reward_std": 0.1716310977935791, "rewards/accuracy_reward": 0.6681324392557144, "rewards/format_reward": 0.9897959232330322, "step": 7175 }, { "completion_length": 285.0816345214844, "epoch": 0.7221132075471698, "grad_norm": 0.9391003251075745, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.745925784111023, "reward_std": 0.15879449248313904, "rewards/accuracy_reward": 0.7663339972496033, "rewards/format_reward": 0.9795918166637421, "step": 7176 }, { "completion_length": 298.0918426513672, "epoch": 0.7222138364779874, "grad_norm": 0.27298226952552795, "kl": 0.0657958984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6469387412071228, "reward_std": 0.04299160372465849, "rewards/accuracy_reward": 0.6673469245433807, "rewards/format_reward": 0.9795918166637421, "step": 7177 }, { "completion_length": 288.7142791748047, "epoch": 0.7223144654088051, "grad_norm": 0.9334774613380432, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.629450500011444, "reward_std": 0.21361514180898666, "rewards/accuracy_reward": 0.6702668070793152, "rewards/format_reward": 0.9591836333274841, "step": 7178 }, { "completion_length": 235.60203552246094, "epoch": 0.7224150943396226, "grad_norm": 1.2776747941970825, "kl": 0.109619140625, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8120799660682678, "reward_std": 0.08202394843101501, "rewards/accuracy_reward": 0.8120799958705902, "rewards/format_reward": 1.0, "step": 7179 }, { "completion_length": 280.37754821777344, "epoch": 0.7225157232704402, "grad_norm": 0.7569302320480347, "kl": 0.1614990234375, "learning_rate": 1e-06, "loss": 0.0065, "reward": 1.8683982491493225, "reward_std": 0.09604423679411411, "rewards/accuracy_reward": 0.8888063728809357, "rewards/format_reward": 0.9795918464660645, "step": 7180 }, { "completion_length": 225.1734619140625, "epoch": 0.7226163522012579, "grad_norm": 1.0465342998504639, "kl": 0.091064453125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8115645051002502, "reward_std": 0.19326404482126236, "rewards/accuracy_reward": 0.831972748041153, "rewards/format_reward": 0.9795918166637421, "step": 7181 }, { "completion_length": 261.87754821777344, "epoch": 0.7227169811320755, "grad_norm": 1.029756784439087, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.799034059047699, "reward_std": 0.18570678681135178, "rewards/accuracy_reward": 0.8092381060123444, "rewards/format_reward": 0.9897959232330322, "step": 7182 }, { "completion_length": 343.27549743652344, "epoch": 0.7228176100628931, "grad_norm": 0.6034689545631409, "kl": 0.0693359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8304678797721863, "reward_std": 0.19649037346243858, "rewards/accuracy_reward": 0.8508761525154114, "rewards/format_reward": 0.9795918166637421, "step": 7183 }, { "completion_length": 365.1836700439453, "epoch": 0.7229182389937107, "grad_norm": 0.9279220104217529, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7327502369880676, "reward_std": 0.14294131845235825, "rewards/accuracy_reward": 0.742954283952713, "rewards/format_reward": 0.9897959232330322, "step": 7184 }, { "completion_length": 204.26529693603516, "epoch": 0.7230188679245283, "grad_norm": 0.7680574655532837, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.9387754797935486, "reward_std": 0.06185103580355644, "rewards/accuracy_reward": 0.938775509595871, "rewards/format_reward": 1.0, "step": 7185 }, { "completion_length": 224.49999237060547, "epoch": 0.7231194968553459, "grad_norm": 1.6123532056808472, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6942864060401917, "reward_std": 0.16794444620609283, "rewards/accuracy_reward": 0.7044905424118042, "rewards/format_reward": 0.9897959232330322, "step": 7186 }, { "completion_length": 207.0204086303711, "epoch": 0.7232201257861636, "grad_norm": 0.29654863476753235, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8182641863822937, "reward_std": 0.0746152251958847, "rewards/accuracy_reward": 0.8182642459869385, "rewards/format_reward": 1.0, "step": 7187 }, { "completion_length": 289.0306091308594, "epoch": 0.7233207547169811, "grad_norm": 100.19015502929688, "kl": 2.3759765625, "learning_rate": 1e-06, "loss": 0.0947, "reward": 1.890767753124237, "reward_std": 0.12040568329393864, "rewards/accuracy_reward": 0.8907677233219147, "rewards/format_reward": 1.0, "step": 7188 }, { "completion_length": 213.2959213256836, "epoch": 0.7234213836477987, "grad_norm": 1.090782880783081, "kl": 0.09375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7492225170135498, "reward_std": 0.19138893485069275, "rewards/accuracy_reward": 0.7798347771167755, "rewards/format_reward": 0.9693877398967743, "step": 7189 }, { "completion_length": 280.89795684814453, "epoch": 0.7235220125786164, "grad_norm": 1.0090481042861938, "kl": 0.0714111328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7715109586715698, "reward_std": 0.1674092933535576, "rewards/accuracy_reward": 0.78171506524086, "rewards/format_reward": 0.9897959232330322, "step": 7190 }, { "completion_length": 224.96937561035156, "epoch": 0.723622641509434, "grad_norm": 1.734898328781128, "kl": 0.098876953125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6961590647697449, "reward_std": 0.17648477479815483, "rewards/accuracy_reward": 0.7063632011413574, "rewards/format_reward": 0.9897959232330322, "step": 7191 }, { "completion_length": 212.9285659790039, "epoch": 0.7237232704402515, "grad_norm": 0.5795485973358154, "kl": 0.11083984375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.715708076953888, "reward_std": 0.08239935338497162, "rewards/accuracy_reward": 0.746320366859436, "rewards/format_reward": 0.9693877398967743, "step": 7192 }, { "completion_length": 223.1530532836914, "epoch": 0.7238238993710692, "grad_norm": 0.4278144836425781, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8124664425849915, "reward_std": 0.1992785856127739, "rewards/accuracy_reward": 0.8226705491542816, "rewards/format_reward": 0.9897959232330322, "step": 7193 }, { "completion_length": 234.08163452148438, "epoch": 0.7239245283018868, "grad_norm": 0.5799892544746399, "kl": 0.0601806640625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8782283067703247, "reward_std": 0.07080480456352234, "rewards/accuracy_reward": 0.8782283067703247, "rewards/format_reward": 1.0, "step": 7194 }, { "completion_length": 261.6836700439453, "epoch": 0.7240251572327044, "grad_norm": 0.5798205733299255, "kl": 0.104736328125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7835165858268738, "reward_std": 0.12339992448687553, "rewards/accuracy_reward": 0.7835166156291962, "rewards/format_reward": 1.0, "step": 7195 }, { "completion_length": 235.65306091308594, "epoch": 0.724125786163522, "grad_norm": 0.5914596319198608, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8456736207008362, "reward_std": 0.1830991506576538, "rewards/accuracy_reward": 0.855877697467804, "rewards/format_reward": 0.9897959232330322, "step": 7196 }, { "completion_length": 201.4183578491211, "epoch": 0.7242264150943396, "grad_norm": 1.3728981018066406, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7820451259613037, "reward_std": 0.1656009927392006, "rewards/accuracy_reward": 0.7820451855659485, "rewards/format_reward": 1.0, "step": 7197 }, { "completion_length": 270.1734619140625, "epoch": 0.7243270440251572, "grad_norm": 0.7872005701065063, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8542847633361816, "reward_std": 0.17082379758358002, "rewards/accuracy_reward": 0.8746930062770844, "rewards/format_reward": 0.9795918166637421, "step": 7198 }, { "completion_length": 220.2142791748047, "epoch": 0.7244276729559749, "grad_norm": 0.7210969924926758, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8477397561073303, "reward_std": 0.15867971256375313, "rewards/accuracy_reward": 0.8783520758152008, "rewards/format_reward": 0.9693877398967743, "step": 7199 }, { "completion_length": 188.4795913696289, "epoch": 0.7245283018867924, "grad_norm": 0.3308786153793335, "kl": 0.1064453125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7984674572944641, "reward_std": 0.03996309172362089, "rewards/accuracy_reward": 0.7984674870967865, "rewards/format_reward": 1.0, "step": 7200 }, { "completion_length": 172.6326446533203, "epoch": 0.72462893081761, "grad_norm": 1.421807885169983, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8771460056304932, "reward_std": 0.12301318254321814, "rewards/accuracy_reward": 0.8771460652351379, "rewards/format_reward": 1.0, "step": 7201 }, { "completion_length": 304.7550964355469, "epoch": 0.7247295597484277, "grad_norm": 2.470614433288574, "kl": 0.1484375, "learning_rate": 1e-06, "loss": 0.0059, "reward": 1.7113500833511353, "reward_std": 0.2084863930940628, "rewards/accuracy_reward": 0.741962343454361, "rewards/format_reward": 0.9693877398967743, "step": 7202 }, { "completion_length": 218.7142791748047, "epoch": 0.7248301886792453, "grad_norm": 1.1475272178649902, "kl": 0.1171875, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7056121826171875, "reward_std": 0.1869823858141899, "rewards/accuracy_reward": 0.7260203659534454, "rewards/format_reward": 0.9795918166637421, "step": 7203 }, { "completion_length": 266.34693145751953, "epoch": 0.7249308176100628, "grad_norm": 0.9815234541893005, "kl": 0.093017578125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6851691007614136, "reward_std": 0.12303945794701576, "rewards/accuracy_reward": 0.685169130563736, "rewards/format_reward": 1.0, "step": 7204 }, { "completion_length": 256.82652282714844, "epoch": 0.7250314465408805, "grad_norm": 0.7587894797325134, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.742309331893921, "reward_std": 0.21863047033548355, "rewards/accuracy_reward": 0.7525134682655334, "rewards/format_reward": 0.9897959232330322, "step": 7205 }, { "completion_length": 335.18365478515625, "epoch": 0.7251320754716981, "grad_norm": 1.3505834341049194, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.678741455078125, "reward_std": 0.20899240486323833, "rewards/accuracy_reward": 0.6991496384143829, "rewards/format_reward": 0.9795918166637421, "step": 7206 }, { "completion_length": 200.77550506591797, "epoch": 0.7252327044025157, "grad_norm": 0.7250488996505737, "kl": 0.099609375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8251870274543762, "reward_std": 0.09489957243204117, "rewards/accuracy_reward": 0.8353910148143768, "rewards/format_reward": 0.9897959232330322, "step": 7207 }, { "completion_length": 162.21428680419922, "epoch": 0.7253333333333334, "grad_norm": 1.0867210626602173, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.854339838027954, "reward_std": 0.1444571278989315, "rewards/accuracy_reward": 0.8645438253879547, "rewards/format_reward": 0.9897959232330322, "step": 7208 }, { "completion_length": 213.37754821777344, "epoch": 0.7254339622641509, "grad_norm": 0.6712003946304321, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7915451526641846, "reward_std": 0.13380203396081924, "rewards/accuracy_reward": 0.8119533061981201, "rewards/format_reward": 0.9795918464660645, "step": 7209 }, { "completion_length": 177.89795684814453, "epoch": 0.7255345911949685, "grad_norm": 1.4341285228729248, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7499999403953552, "reward_std": 0.10335781797766685, "rewards/accuracy_reward": 0.7499999701976776, "rewards/format_reward": 1.0, "step": 7210 }, { "completion_length": 265.9183654785156, "epoch": 0.7256352201257862, "grad_norm": 0.3233524560928345, "kl": 0.0533447265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8387754559516907, "reward_std": 0.07204378396272659, "rewards/accuracy_reward": 0.8387755155563354, "rewards/format_reward": 1.0, "step": 7211 }, { "completion_length": 176.04081344604492, "epoch": 0.7257358490566038, "grad_norm": 0.7358527779579163, "kl": 0.09326171875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7346938848495483, "reward_std": 0.11917255818843842, "rewards/accuracy_reward": 0.734693855047226, "rewards/format_reward": 1.0, "step": 7212 }, { "completion_length": 278.98978424072266, "epoch": 0.7258364779874213, "grad_norm": 0.4994863271713257, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6163265109062195, "reward_std": 0.20298133045434952, "rewards/accuracy_reward": 0.6673469543457031, "rewards/format_reward": 0.9489795565605164, "step": 7213 }, { "completion_length": 269.51019287109375, "epoch": 0.725937106918239, "grad_norm": 0.5170620679855347, "kl": 0.0604248046875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7483060359954834, "reward_std": 0.10185422003269196, "rewards/accuracy_reward": 0.7483060657978058, "rewards/format_reward": 1.0, "step": 7214 }, { "completion_length": 240.79591369628906, "epoch": 0.7260377358490566, "grad_norm": 0.673886775970459, "kl": 0.099365234375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7428350448608398, "reward_std": 0.15452108904719353, "rewards/accuracy_reward": 0.7428350448608398, "rewards/format_reward": 1.0, "step": 7215 }, { "completion_length": 302.1224365234375, "epoch": 0.7261383647798743, "grad_norm": 4.580272197723389, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7151216268539429, "reward_std": 0.173635333776474, "rewards/accuracy_reward": 0.7253257632255554, "rewards/format_reward": 0.9897959232330322, "step": 7216 }, { "completion_length": 204.7448959350586, "epoch": 0.7262389937106918, "grad_norm": 1.1016393899917603, "kl": 0.0501708984375, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.8469387292861938, "reward_std": 0.1573527380824089, "rewards/accuracy_reward": 0.8469387590885162, "rewards/format_reward": 1.0, "step": 7217 }, { "completion_length": 253.57142639160156, "epoch": 0.7263396226415094, "grad_norm": 0.7009649276733398, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6709920167922974, "reward_std": 0.21131908893585205, "rewards/accuracy_reward": 0.7016042172908783, "rewards/format_reward": 0.9693877398967743, "step": 7218 }, { "completion_length": 217.09183502197266, "epoch": 0.7264402515723271, "grad_norm": 0.22788865864276886, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8827458024024963, "reward_std": 0.051682550460100174, "rewards/accuracy_reward": 0.8929499387741089, "rewards/format_reward": 0.9897959232330322, "step": 7219 }, { "completion_length": 258.09183502197266, "epoch": 0.7265408805031447, "grad_norm": 0.9685907959938049, "kl": 0.11083984375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.760953426361084, "reward_std": 0.14109481684863567, "rewards/accuracy_reward": 0.7711576223373413, "rewards/format_reward": 0.9897959232330322, "step": 7220 }, { "completion_length": 255.18366241455078, "epoch": 0.7266415094339622, "grad_norm": 0.934255063533783, "kl": 0.103271484375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7762230038642883, "reward_std": 0.1693984493613243, "rewards/accuracy_reward": 0.7762229740619659, "rewards/format_reward": 1.0, "step": 7221 }, { "completion_length": 228.2142791748047, "epoch": 0.7267421383647799, "grad_norm": 80.80815887451172, "kl": 3.85595703125, "learning_rate": 1e-06, "loss": 0.1542, "reward": 1.7653497457504272, "reward_std": 0.1602717861533165, "rewards/accuracy_reward": 0.7857578694820404, "rewards/format_reward": 0.9795918166637421, "step": 7222 }, { "completion_length": 245.69387817382812, "epoch": 0.7268427672955975, "grad_norm": 0.7149009704589844, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7855905890464783, "reward_std": 0.17672067135572433, "rewards/accuracy_reward": 0.8059987425804138, "rewards/format_reward": 0.9795918464660645, "step": 7223 }, { "completion_length": 203.24488830566406, "epoch": 0.7269433962264151, "grad_norm": 0.38589221239089966, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.820699691772461, "reward_std": 0.03856780473142862, "rewards/accuracy_reward": 0.8206996619701385, "rewards/format_reward": 1.0, "step": 7224 }, { "completion_length": 297.7142791748047, "epoch": 0.7270440251572327, "grad_norm": 0.7638319134712219, "kl": 0.10693359375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.6449756622314453, "reward_std": 0.18561600893735886, "rewards/accuracy_reward": 0.6449756622314453, "rewards/format_reward": 1.0, "step": 7225 }, { "completion_length": 233.6326446533203, "epoch": 0.7271446540880503, "grad_norm": 0.8749904036521912, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8905689120292664, "reward_std": 0.15602947026491165, "rewards/accuracy_reward": 0.9007730185985565, "rewards/format_reward": 0.9897959232330322, "step": 7226 }, { "completion_length": 243.09183502197266, "epoch": 0.7272452830188679, "grad_norm": 0.5627118349075317, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8679653406143188, "reward_std": 0.13899165391921997, "rewards/accuracy_reward": 0.8883735239505768, "rewards/format_reward": 0.9795918464660645, "step": 7227 }, { "completion_length": 239.4285659790039, "epoch": 0.7273459119496856, "grad_norm": 0.3416551947593689, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7848418951034546, "reward_std": 0.053479790687561035, "rewards/accuracy_reward": 0.7950459122657776, "rewards/format_reward": 0.9897959232330322, "step": 7228 }, { "completion_length": 208.60203552246094, "epoch": 0.7274465408805031, "grad_norm": 0.9224137663841248, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8225702047348022, "reward_std": 0.10982638597488403, "rewards/accuracy_reward": 0.83277428150177, "rewards/format_reward": 0.9897959232330322, "step": 7229 }, { "completion_length": 201.61223602294922, "epoch": 0.7275471698113207, "grad_norm": 0.7685241103172302, "kl": 0.10400390625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8018847703933716, "reward_std": 0.18907327950000763, "rewards/accuracy_reward": 0.812088817358017, "rewards/format_reward": 0.9897959232330322, "step": 7230 }, { "completion_length": 286.6428527832031, "epoch": 0.7276477987421384, "grad_norm": 0.8548588156700134, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6941131353378296, "reward_std": 0.22865618765354156, "rewards/accuracy_reward": 0.7145212888717651, "rewards/format_reward": 0.9795918166637421, "step": 7231 }, { "completion_length": 279.9387664794922, "epoch": 0.727748427672956, "grad_norm": 0.7389600872993469, "kl": 0.1142578125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.6022624373435974, "reward_std": 0.17289279401302338, "rewards/accuracy_reward": 0.6328746676445007, "rewards/format_reward": 0.9693877398967743, "step": 7232 }, { "completion_length": 217.5408172607422, "epoch": 0.7278490566037736, "grad_norm": 0.7513461112976074, "kl": 0.0809326171875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7264333963394165, "reward_std": 0.10628298297524452, "rewards/accuracy_reward": 0.7366374433040619, "rewards/format_reward": 0.9897959232330322, "step": 7233 }, { "completion_length": 201.65306091308594, "epoch": 0.7279496855345912, "grad_norm": 1.0231125354766846, "kl": 0.128173828125, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.7400433421134949, "reward_std": 0.1711188554763794, "rewards/accuracy_reward": 0.7808595597743988, "rewards/format_reward": 0.9591836631298065, "step": 7234 }, { "completion_length": 188.34693145751953, "epoch": 0.7280503144654088, "grad_norm": 4.205319881439209, "kl": 0.10693359375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.8399814367294312, "reward_std": 0.3113112896680832, "rewards/accuracy_reward": 0.8807977735996246, "rewards/format_reward": 0.9591836333274841, "step": 7235 }, { "completion_length": 239.06121826171875, "epoch": 0.7281509433962264, "grad_norm": 1.0431264638900757, "kl": 0.0709228515625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7704081535339355, "reward_std": 0.21697954833507538, "rewards/accuracy_reward": 0.7908163368701935, "rewards/format_reward": 0.9795918464660645, "step": 7236 }, { "completion_length": 158.448974609375, "epoch": 0.7282515723270441, "grad_norm": 1.0547467470169067, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.9370747804641724, "reward_std": 0.15035628899931908, "rewards/accuracy_reward": 0.9574829936027527, "rewards/format_reward": 0.9795918166637421, "step": 7237 }, { "completion_length": 305.58162689208984, "epoch": 0.7283522012578616, "grad_norm": 1.036134123802185, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.678004503250122, "reward_std": 0.22776715457439423, "rewards/accuracy_reward": 0.7086167931556702, "rewards/format_reward": 0.9693877398967743, "step": 7238 }, { "completion_length": 232.2040786743164, "epoch": 0.7284528301886792, "grad_norm": 0.6373802423477173, "kl": 0.110107421875, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.6977046728134155, "reward_std": 0.12395461276173592, "rewards/accuracy_reward": 0.7079087197780609, "rewards/format_reward": 0.9897959232330322, "step": 7239 }, { "completion_length": 309.9081573486328, "epoch": 0.7285534591194969, "grad_norm": 0.42449435591697693, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6249008774757385, "reward_std": 0.11749103665351868, "rewards/accuracy_reward": 0.6453089714050293, "rewards/format_reward": 0.9795918166637421, "step": 7240 }, { "completion_length": 178.32652282714844, "epoch": 0.7286540880503145, "grad_norm": 0.5975636839866638, "kl": 0.0765380859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8428571224212646, "reward_std": 0.09904124587774277, "rewards/accuracy_reward": 0.8530611991882324, "rewards/format_reward": 0.9897959232330322, "step": 7241 }, { "completion_length": 228.36734008789062, "epoch": 0.728754716981132, "grad_norm": 0.649900496006012, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8337220549583435, "reward_std": 0.1527635157108307, "rewards/accuracy_reward": 0.8439261317253113, "rewards/format_reward": 0.9897959232330322, "step": 7242 }, { "completion_length": 226.68366241455078, "epoch": 0.7288553459119497, "grad_norm": 0.47625410556793213, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7439760565757751, "reward_std": 0.22400283813476562, "rewards/accuracy_reward": 0.784792423248291, "rewards/format_reward": 0.9591836333274841, "step": 7243 }, { "completion_length": 183.39795684814453, "epoch": 0.7289559748427673, "grad_norm": 0.8171378374099731, "kl": 0.0614013671875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8673469424247742, "reward_std": 0.09217509999871254, "rewards/accuracy_reward": 0.8775510191917419, "rewards/format_reward": 0.9897959232330322, "step": 7244 }, { "completion_length": 228.7142791748047, "epoch": 0.7290566037735849, "grad_norm": 0.7662693858146667, "kl": 0.097900390625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7541049122810364, "reward_std": 0.19749311357736588, "rewards/accuracy_reward": 0.7745131254196167, "rewards/format_reward": 0.9795918464660645, "step": 7245 }, { "completion_length": 191.16326141357422, "epoch": 0.7291572327044025, "grad_norm": 1.2126227617263794, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7642371654510498, "reward_std": 0.2830779552459717, "rewards/accuracy_reward": 0.7846452593803406, "rewards/format_reward": 0.9795918464660645, "step": 7246 }, { "completion_length": 224.948974609375, "epoch": 0.7292578616352201, "grad_norm": 1.0212377309799194, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7240669131278992, "reward_std": 0.07920574024319649, "rewards/accuracy_reward": 0.7342709898948669, "rewards/format_reward": 0.9897959232330322, "step": 7247 }, { "completion_length": 151.77550506591797, "epoch": 0.7293584905660377, "grad_norm": 0.5700552463531494, "kl": 0.097900390625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.9081632494926453, "reward_std": 0.05399492383003235, "rewards/accuracy_reward": 0.9183673560619354, "rewards/format_reward": 0.9897959232330322, "step": 7248 }, { "completion_length": 193.90816497802734, "epoch": 0.7294591194968554, "grad_norm": 0.8169623017311096, "kl": 0.071533203125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8585196137428284, "reward_std": 0.1143595390021801, "rewards/accuracy_reward": 0.8585196435451508, "rewards/format_reward": 1.0, "step": 7249 }, { "completion_length": 208.1836700439453, "epoch": 0.7295597484276729, "grad_norm": 0.5404518842697144, "kl": 0.0643310546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7755101323127747, "reward_std": 0.15855563804507256, "rewards/accuracy_reward": 0.7857142686843872, "rewards/format_reward": 0.9897959232330322, "step": 7250 }, { "completion_length": 178.448974609375, "epoch": 0.7296603773584905, "grad_norm": 0.8991478681564331, "kl": 0.12158203125, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.8432281613349915, "reward_std": 0.13323249807581306, "rewards/accuracy_reward": 0.8432282507419586, "rewards/format_reward": 1.0, "step": 7251 }, { "completion_length": 215.52040100097656, "epoch": 0.7297610062893082, "grad_norm": 2.861848831176758, "kl": 0.132080078125, "learning_rate": 1e-06, "loss": 0.0053, "reward": 1.7782843112945557, "reward_std": 0.11557240039110184, "rewards/accuracy_reward": 0.788488358259201, "rewards/format_reward": 0.9897959232330322, "step": 7252 }, { "completion_length": 247.89795684814453, "epoch": 0.7298616352201258, "grad_norm": 0.8129749298095703, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7137495875358582, "reward_std": 0.18440651893615723, "rewards/accuracy_reward": 0.7137496769428253, "rewards/format_reward": 1.0, "step": 7253 }, { "completion_length": 237.02039337158203, "epoch": 0.7299622641509433, "grad_norm": 0.6573910117149353, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7422740459442139, "reward_std": 0.08432181552052498, "rewards/accuracy_reward": 0.7422740161418915, "rewards/format_reward": 1.0, "step": 7254 }, { "completion_length": 270.57141876220703, "epoch": 0.730062893081761, "grad_norm": 0.8744077086448669, "kl": 0.102783203125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.5156933665275574, "reward_std": 0.20697824656963348, "rewards/accuracy_reward": 0.525897428393364, "rewards/format_reward": 0.9897959232330322, "step": 7255 }, { "completion_length": 160.87754821777344, "epoch": 0.7301635220125786, "grad_norm": 0.48172155022621155, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.9101068377494812, "reward_std": 0.02881556563079357, "rewards/accuracy_reward": 0.9101068675518036, "rewards/format_reward": 1.0, "step": 7256 }, { "completion_length": 237.2755126953125, "epoch": 0.7302641509433963, "grad_norm": 0.46617060899734497, "kl": 0.1259765625, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.7923623323440552, "reward_std": 0.15293867886066437, "rewards/accuracy_reward": 0.8025664687156677, "rewards/format_reward": 0.9897959232330322, "step": 7257 }, { "completion_length": 224.32653045654297, "epoch": 0.7303647798742139, "grad_norm": 0.7233709692955017, "kl": 0.11767578125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.6925258040428162, "reward_std": 0.15545420348644257, "rewards/accuracy_reward": 0.7129339873790741, "rewards/format_reward": 0.9795918464660645, "step": 7258 }, { "completion_length": 175.33673095703125, "epoch": 0.7304654088050314, "grad_norm": 1.4347071647644043, "kl": 0.1005859375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6830859780311584, "reward_std": 0.19442053884267807, "rewards/accuracy_reward": 0.7136982083320618, "rewards/format_reward": 0.9693877398967743, "step": 7259 }, { "completion_length": 263.86734771728516, "epoch": 0.730566037735849, "grad_norm": 0.6239347457885742, "kl": 0.093017578125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8021084666252136, "reward_std": 0.1432151459157467, "rewards/accuracy_reward": 0.8123125731945038, "rewards/format_reward": 0.9897959232330322, "step": 7260 }, { "completion_length": 176.90816497802734, "epoch": 0.7306666666666667, "grad_norm": 1.6556041240692139, "kl": 0.120361328125, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.7574356198310852, "reward_std": 0.1208169125020504, "rewards/accuracy_reward": 0.7778437733650208, "rewards/format_reward": 0.9795918166637421, "step": 7261 }, { "completion_length": 200.81632232666016, "epoch": 0.7307672955974843, "grad_norm": 0.4442814290523529, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8469387888908386, "reward_std": 0.08099238574504852, "rewards/accuracy_reward": 0.8571428656578064, "rewards/format_reward": 0.9897959232330322, "step": 7262 }, { "completion_length": 234.57141876220703, "epoch": 0.7308679245283018, "grad_norm": 1.0114473104476929, "kl": 0.099853515625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7631171941757202, "reward_std": 0.10515728406608105, "rewards/accuracy_reward": 0.7631173431873322, "rewards/format_reward": 1.0, "step": 7263 }, { "completion_length": 178.22447967529297, "epoch": 0.7309685534591195, "grad_norm": 1.0914912223815918, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8137755393981934, "reward_std": 0.12588080391287804, "rewards/accuracy_reward": 0.8239795565605164, "rewards/format_reward": 0.9897959232330322, "step": 7264 }, { "completion_length": 256.1632614135742, "epoch": 0.7310691823899371, "grad_norm": 0.7041655778884888, "kl": 0.10302734375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.727968454360962, "reward_std": 0.17096707224845886, "rewards/accuracy_reward": 0.7483765780925751, "rewards/format_reward": 0.9795918464660645, "step": 7265 }, { "completion_length": 258.1938781738281, "epoch": 0.7311698113207548, "grad_norm": 0.8058981895446777, "kl": 0.09033203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8260611295700073, "reward_std": 0.1693994328379631, "rewards/accuracy_reward": 0.8362652361392975, "rewards/format_reward": 0.9897959232330322, "step": 7266 }, { "completion_length": 251.0204086303711, "epoch": 0.7312704402515723, "grad_norm": 0.9884995818138123, "kl": 0.111328125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8061842918395996, "reward_std": 0.2015942856669426, "rewards/accuracy_reward": 0.8265924751758575, "rewards/format_reward": 0.9795918166637421, "step": 7267 }, { "completion_length": 205.06122589111328, "epoch": 0.7313710691823899, "grad_norm": 1.2031484842300415, "kl": 0.108154296875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.8267735242843628, "reward_std": 0.2404862344264984, "rewards/accuracy_reward": 0.8369776606559753, "rewards/format_reward": 0.9897959232330322, "step": 7268 }, { "completion_length": 287.73468017578125, "epoch": 0.7314716981132076, "grad_norm": 0.41639500856399536, "kl": 0.09521484375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6762521862983704, "reward_std": 0.18439799547195435, "rewards/accuracy_reward": 0.7068645358085632, "rewards/format_reward": 0.9693877398967743, "step": 7269 }, { "completion_length": 237.448974609375, "epoch": 0.7315723270440252, "grad_norm": 1.8488192558288574, "kl": 0.091064453125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7654363512992859, "reward_std": 0.13144641742110252, "rewards/accuracy_reward": 0.7756404280662537, "rewards/format_reward": 0.9897959232330322, "step": 7270 }, { "completion_length": 251.85714721679688, "epoch": 0.7316729559748427, "grad_norm": 1.194434404373169, "kl": 0.08740234375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7030547261238098, "reward_std": 0.11367347836494446, "rewards/accuracy_reward": 0.7234629392623901, "rewards/format_reward": 0.9795918464660645, "step": 7271 }, { "completion_length": 216.6734619140625, "epoch": 0.7317735849056604, "grad_norm": 0.6213732361793518, "kl": 0.130859375, "learning_rate": 1e-06, "loss": 0.0052, "reward": 1.8253710865974426, "reward_std": 0.11615332588553429, "rewards/accuracy_reward": 0.8253710865974426, "rewards/format_reward": 1.0, "step": 7272 }, { "completion_length": 276.9081573486328, "epoch": 0.731874213836478, "grad_norm": 0.460145503282547, "kl": 0.095703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6478191614151, "reward_std": 0.08144684322178364, "rewards/accuracy_reward": 0.6478192508220673, "rewards/format_reward": 1.0, "step": 7273 }, { "completion_length": 276.29591369628906, "epoch": 0.7319748427672956, "grad_norm": 1.0935051441192627, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6879913806915283, "reward_std": 0.15164801105856895, "rewards/accuracy_reward": 0.6981954872608185, "rewards/format_reward": 0.9897959232330322, "step": 7274 }, { "completion_length": 222.02040100097656, "epoch": 0.7320754716981132, "grad_norm": 0.5667738914489746, "kl": 0.105224609375, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.654507339000702, "reward_std": 0.16222478449344635, "rewards/accuracy_reward": 0.6647114455699921, "rewards/format_reward": 0.9897959232330322, "step": 7275 }, { "completion_length": 255.8775405883789, "epoch": 0.7321761006289308, "grad_norm": 0.6639463901519775, "kl": 0.122314453125, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.7084813714027405, "reward_std": 0.18815027922391891, "rewards/accuracy_reward": 0.7186853885650635, "rewards/format_reward": 0.9897959232330322, "step": 7276 }, { "completion_length": 263.82652282714844, "epoch": 0.7322767295597484, "grad_norm": 0.6437299847602844, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7448456287384033, "reward_std": 0.13314592465758324, "rewards/accuracy_reward": 0.7550497055053711, "rewards/format_reward": 0.9897959232330322, "step": 7277 }, { "completion_length": 185.29591369628906, "epoch": 0.7323773584905661, "grad_norm": 1.6199655532836914, "kl": 0.093017578125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8203087449073792, "reward_std": 0.16713541001081467, "rewards/accuracy_reward": 0.8305128514766693, "rewards/format_reward": 0.9897959232330322, "step": 7278 }, { "completion_length": 260.1530532836914, "epoch": 0.7324779874213837, "grad_norm": 1.2273898124694824, "kl": 0.131103515625, "learning_rate": 1e-06, "loss": 0.0052, "reward": 1.7937988638877869, "reward_std": 0.21757879108190536, "rewards/accuracy_reward": 0.834615170955658, "rewards/format_reward": 0.9591836631298065, "step": 7279 }, { "completion_length": 241.57141876220703, "epoch": 0.7325786163522012, "grad_norm": 0.7688660621643066, "kl": 0.099853515625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8288806676864624, "reward_std": 0.19501742720603943, "rewards/accuracy_reward": 0.8492888510227203, "rewards/format_reward": 0.9795918166637421, "step": 7280 }, { "completion_length": 294.9897918701172, "epoch": 0.7326792452830189, "grad_norm": 0.5331536531448364, "kl": 0.09716796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.768480658531189, "reward_std": 0.1739475019276142, "rewards/accuracy_reward": 0.7990929782390594, "rewards/format_reward": 0.9693877398967743, "step": 7281 }, { "completion_length": 245.52040100097656, "epoch": 0.7327798742138365, "grad_norm": 0.3297519087791443, "kl": 0.098388671875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8506273031234741, "reward_std": 0.042278019711375237, "rewards/accuracy_reward": 0.850627213716507, "rewards/format_reward": 1.0, "step": 7282 }, { "completion_length": 233.86734008789062, "epoch": 0.7328805031446541, "grad_norm": 1.3561897277832031, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7408486604690552, "reward_std": 0.21573762595653534, "rewards/accuracy_reward": 0.7510527670383453, "rewards/format_reward": 0.9897959232330322, "step": 7283 }, { "completion_length": 326.7142791748047, "epoch": 0.7329811320754717, "grad_norm": 0.5320444107055664, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.796371877193451, "reward_std": 0.13879750669002533, "rewards/accuracy_reward": 0.7963718771934509, "rewards/format_reward": 1.0, "step": 7284 }, { "completion_length": 298.1836700439453, "epoch": 0.7330817610062893, "grad_norm": 0.9725347757339478, "kl": 0.0966796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7349489331245422, "reward_std": 0.18976694345474243, "rewards/accuracy_reward": 0.7451530694961548, "rewards/format_reward": 0.9897959232330322, "step": 7285 }, { "completion_length": 322.7449035644531, "epoch": 0.7331823899371069, "grad_norm": 0.5472360253334045, "kl": 0.093994140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8129029870033264, "reward_std": 0.1533258557319641, "rewards/accuracy_reward": 0.823107123374939, "rewards/format_reward": 0.9897959232330322, "step": 7286 }, { "completion_length": 236.33672332763672, "epoch": 0.7332830188679246, "grad_norm": 0.5747036337852478, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8757317662239075, "reward_std": 0.12086459621787071, "rewards/accuracy_reward": 0.8757317662239075, "rewards/format_reward": 1.0, "step": 7287 }, { "completion_length": 234.29591369628906, "epoch": 0.7333836477987421, "grad_norm": 0.5439446568489075, "kl": 0.09521484375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7927010655403137, "reward_std": 0.12215199507772923, "rewards/accuracy_reward": 0.8029051125049591, "rewards/format_reward": 0.9897959232330322, "step": 7288 }, { "completion_length": 260.1836700439453, "epoch": 0.7334842767295597, "grad_norm": 0.4351954460144043, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.853432297706604, "reward_std": 0.15234894305467606, "rewards/accuracy_reward": 0.8942485749721527, "rewards/format_reward": 0.9591836631298065, "step": 7289 }, { "completion_length": 299.3367233276367, "epoch": 0.7335849056603774, "grad_norm": 0.707130491733551, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7086601853370667, "reward_std": 0.20987140387296677, "rewards/accuracy_reward": 0.7392724454402924, "rewards/format_reward": 0.9693877398967743, "step": 7290 }, { "completion_length": 266.28570556640625, "epoch": 0.733685534591195, "grad_norm": 0.7563750147819519, "kl": 0.101806640625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.5851919651031494, "reward_std": 0.16295088455080986, "rewards/accuracy_reward": 0.595396101474762, "rewards/format_reward": 0.9897959232330322, "step": 7291 }, { "completion_length": 259.6428527832031, "epoch": 0.7337861635220125, "grad_norm": 0.880607545375824, "kl": 0.1083984375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.8903870582580566, "reward_std": 0.09911375120282173, "rewards/accuracy_reward": 0.9005911648273468, "rewards/format_reward": 0.9897959232330322, "step": 7292 }, { "completion_length": 242.30611419677734, "epoch": 0.7338867924528302, "grad_norm": 0.802690327167511, "kl": 0.095703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7554422616958618, "reward_std": 0.13298538699746132, "rewards/accuracy_reward": 0.755442202091217, "rewards/format_reward": 1.0, "step": 7293 }, { "completion_length": 325.8673400878906, "epoch": 0.7339874213836478, "grad_norm": 0.6737797856330872, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.676996886730194, "reward_std": 0.21452172845602036, "rewards/accuracy_reward": 0.7178132832050323, "rewards/format_reward": 0.9591836631298065, "step": 7294 }, { "completion_length": 277.0102005004883, "epoch": 0.7340880503144654, "grad_norm": 0.48170316219329834, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7674397826194763, "reward_std": 0.11824247613549232, "rewards/accuracy_reward": 0.7776437997817993, "rewards/format_reward": 0.9897959232330322, "step": 7295 }, { "completion_length": 327.83673095703125, "epoch": 0.734188679245283, "grad_norm": 0.3930598199367523, "kl": 0.0655517578125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5928804278373718, "reward_std": 0.0877805296331644, "rewards/accuracy_reward": 0.6030846238136292, "rewards/format_reward": 0.9897959232330322, "step": 7296 }, { "completion_length": 296.02039337158203, "epoch": 0.7342893081761006, "grad_norm": 0.45275062322616577, "kl": 0.0582275390625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7925170063972473, "reward_std": 0.12651263549923897, "rewards/accuracy_reward": 0.8027210533618927, "rewards/format_reward": 0.9897959232330322, "step": 7297 }, { "completion_length": 300.8163299560547, "epoch": 0.7343899371069182, "grad_norm": 0.6881774663925171, "kl": 0.06689453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7763303518295288, "reward_std": 0.1922212764620781, "rewards/accuracy_reward": 0.7967385053634644, "rewards/format_reward": 0.9795918166637421, "step": 7298 }, { "completion_length": 248.27549743652344, "epoch": 0.7344905660377359, "grad_norm": 0.7516162395477295, "kl": 0.10009765625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8552875518798828, "reward_std": 0.051167186349630356, "rewards/accuracy_reward": 0.8552875518798828, "rewards/format_reward": 1.0, "step": 7299 }, { "completion_length": 292.12244415283203, "epoch": 0.7345911949685534, "grad_norm": 0.3491104245185852, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7857142686843872, "reward_std": 0.08884849399328232, "rewards/accuracy_reward": 0.7857142686843872, "rewards/format_reward": 1.0, "step": 7300 }, { "completion_length": 405.7550964355469, "epoch": 0.734691823899371, "grad_norm": 0.5450453162193298, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7353078126907349, "reward_std": 0.2541331797838211, "rewards/accuracy_reward": 0.75571608543396, "rewards/format_reward": 0.9795918166637421, "step": 7301 }, { "completion_length": 261.2244873046875, "epoch": 0.7347924528301887, "grad_norm": 1.0578382015228271, "kl": 0.13720703125, "learning_rate": 1e-06, "loss": 0.0055, "reward": 1.8250144720077515, "reward_std": 0.187993872910738, "rewards/accuracy_reward": 0.8352185785770416, "rewards/format_reward": 0.9897959232330322, "step": 7302 }, { "completion_length": 274.6836700439453, "epoch": 0.7348930817610063, "grad_norm": 0.6844890713691711, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8091881275177002, "reward_std": 0.12973178923130035, "rewards/accuracy_reward": 0.8193921744823456, "rewards/format_reward": 0.9897959232330322, "step": 7303 }, { "completion_length": 344.12245178222656, "epoch": 0.734993710691824, "grad_norm": 1.0815305709838867, "kl": 0.062744140625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6885899305343628, "reward_std": 0.23156555742025375, "rewards/accuracy_reward": 0.6987940669059753, "rewards/format_reward": 0.9897959232330322, "step": 7304 }, { "completion_length": 220.51020050048828, "epoch": 0.7350943396226415, "grad_norm": 0.4075354337692261, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8962298035621643, "reward_std": 0.10044600069522858, "rewards/accuracy_reward": 0.9064338803291321, "rewards/format_reward": 0.9897959232330322, "step": 7305 }, { "completion_length": 317.5816192626953, "epoch": 0.7351949685534591, "grad_norm": 0.7620850205421448, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8288187384605408, "reward_std": 0.2033403292298317, "rewards/accuracy_reward": 0.8390228450298309, "rewards/format_reward": 0.9897959232330322, "step": 7306 }, { "completion_length": 307.2244873046875, "epoch": 0.7352955974842768, "grad_norm": 0.6479451060295105, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8001158237457275, "reward_std": 0.1725841723382473, "rewards/accuracy_reward": 0.8205239474773407, "rewards/format_reward": 0.9795918464660645, "step": 7307 }, { "completion_length": 240.60204315185547, "epoch": 0.7353962264150944, "grad_norm": 2.1667280197143555, "kl": 0.148193359375, "learning_rate": 1e-06, "loss": 0.0059, "reward": 1.5791061520576477, "reward_std": 0.2601955905556679, "rewards/accuracy_reward": 0.6097183525562286, "rewards/format_reward": 0.9693877398967743, "step": 7308 }, { "completion_length": 262.60203552246094, "epoch": 0.7354968553459119, "grad_norm": 0.4533674418926239, "kl": 0.102294921875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7828307747840881, "reward_std": 0.16777930036187172, "rewards/accuracy_reward": 0.8236471116542816, "rewards/format_reward": 0.9591836631298065, "step": 7309 }, { "completion_length": 219.44896697998047, "epoch": 0.7355974842767296, "grad_norm": 0.5927174091339111, "kl": 0.091552734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8227447271347046, "reward_std": 0.13247333467006683, "rewards/accuracy_reward": 0.822744756937027, "rewards/format_reward": 1.0, "step": 7310 }, { "completion_length": 180.1938705444336, "epoch": 0.7356981132075472, "grad_norm": 0.9169923663139343, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8997302055358887, "reward_std": 0.0924900509417057, "rewards/accuracy_reward": 0.8997302353382111, "rewards/format_reward": 1.0, "step": 7311 }, { "completion_length": 320.27549743652344, "epoch": 0.7357987421383648, "grad_norm": 0.6771814227104187, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7701665759086609, "reward_std": 0.24939871579408646, "rewards/accuracy_reward": 0.8109830021858215, "rewards/format_reward": 0.9591836631298065, "step": 7312 }, { "completion_length": 310.0714111328125, "epoch": 0.7358993710691824, "grad_norm": 0.5832561254501343, "kl": 0.0538330078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.8357627391815186, "reward_std": 0.10955962538719177, "rewards/accuracy_reward": 0.8357628285884857, "rewards/format_reward": 1.0, "step": 7313 }, { "completion_length": 381.8673400878906, "epoch": 0.736, "grad_norm": 0.48834457993507385, "kl": 0.06689453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7096938490867615, "reward_std": 0.23185943067073822, "rewards/accuracy_reward": 0.7198979556560516, "rewards/format_reward": 0.9897959232330322, "step": 7314 }, { "completion_length": 246.36734008789062, "epoch": 0.7361006289308176, "grad_norm": 0.9529852867126465, "kl": 0.0643310546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7452344298362732, "reward_std": 0.1867123693227768, "rewards/accuracy_reward": 0.7452344000339508, "rewards/format_reward": 1.0, "step": 7315 }, { "completion_length": 206.27550506591797, "epoch": 0.7362012578616353, "grad_norm": 0.3861009180545807, "kl": 0.119873046875, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.8394733667373657, "reward_std": 0.11347592994570732, "rewards/accuracy_reward": 0.8700856268405914, "rewards/format_reward": 0.9693877398967743, "step": 7316 }, { "completion_length": 240.25509643554688, "epoch": 0.7363018867924528, "grad_norm": 2.839797019958496, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.836734652519226, "reward_std": 0.1573527380824089, "rewards/accuracy_reward": 0.8469387590885162, "rewards/format_reward": 0.9897959232330322, "step": 7317 }, { "completion_length": 281.2550964355469, "epoch": 0.7364025157232704, "grad_norm": 1.7961949110031128, "kl": 0.059326171875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8495280146598816, "reward_std": 0.10901331901550293, "rewards/accuracy_reward": 0.8495280146598816, "rewards/format_reward": 1.0, "step": 7318 }, { "completion_length": 268.38775634765625, "epoch": 0.7365031446540881, "grad_norm": 0.5841841697692871, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7828959822654724, "reward_std": 0.186570318415761, "rewards/accuracy_reward": 0.8033041954040527, "rewards/format_reward": 0.9795918166637421, "step": 7319 }, { "completion_length": 252.31632232666016, "epoch": 0.7366037735849057, "grad_norm": 0.5911027789115906, "kl": 0.0692138671875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7982031106948853, "reward_std": 0.06078913062810898, "rewards/accuracy_reward": 0.7982031404972076, "rewards/format_reward": 1.0, "step": 7320 }, { "completion_length": 288.67346954345703, "epoch": 0.7367044025157232, "grad_norm": 0.5354681015014648, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8172519207000732, "reward_std": 0.14236343652009964, "rewards/accuracy_reward": 0.8376601338386536, "rewards/format_reward": 0.9795918464660645, "step": 7321 }, { "completion_length": 291.26529693603516, "epoch": 0.7368050314465409, "grad_norm": 0.5953422784805298, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7451409101486206, "reward_std": 0.13392624631524086, "rewards/accuracy_reward": 0.7451409101486206, "rewards/format_reward": 1.0, "step": 7322 }, { "completion_length": 291.8571319580078, "epoch": 0.7369056603773585, "grad_norm": 0.8379742503166199, "kl": 0.1123046875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7451915740966797, "reward_std": 0.22610396146774292, "rewards/accuracy_reward": 0.7860079109668732, "rewards/format_reward": 0.9591836631298065, "step": 7323 }, { "completion_length": 310.948974609375, "epoch": 0.7370062893081761, "grad_norm": 0.6825137734413147, "kl": 0.0670166015625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7671613097190857, "reward_std": 0.1076153814792633, "rewards/accuracy_reward": 0.7773654460906982, "rewards/format_reward": 0.9897959232330322, "step": 7324 }, { "completion_length": 243.44898223876953, "epoch": 0.7371069182389937, "grad_norm": 3.858905553817749, "kl": 0.149169921875, "learning_rate": 1e-06, "loss": 0.006, "reward": 1.7010988593101501, "reward_std": 0.176595326513052, "rewards/accuracy_reward": 0.7113029658794403, "rewards/format_reward": 0.9897959232330322, "step": 7325 }, { "completion_length": 268.07141876220703, "epoch": 0.7372075471698113, "grad_norm": 0.655671238899231, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.830704927444458, "reward_std": 0.17721257731318474, "rewards/accuracy_reward": 0.8409090340137482, "rewards/format_reward": 0.9897959232330322, "step": 7326 }, { "completion_length": 174.43877410888672, "epoch": 0.7373081761006289, "grad_norm": 1.3396024703979492, "kl": 0.0787353515625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7601858377456665, "reward_std": 0.08450158312916756, "rewards/accuracy_reward": 0.7601858377456665, "rewards/format_reward": 1.0, "step": 7327 }, { "completion_length": 346.57142639160156, "epoch": 0.7374088050314466, "grad_norm": 1.1264982223510742, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6626372933387756, "reward_std": 0.22567562945187092, "rewards/accuracy_reward": 0.6728413999080658, "rewards/format_reward": 0.9897959232330322, "step": 7328 }, { "completion_length": 305.37754821777344, "epoch": 0.7375094339622642, "grad_norm": 0.4781002998352051, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.715291678905487, "reward_std": 0.08664567396044731, "rewards/accuracy_reward": 0.7254956960678101, "rewards/format_reward": 0.9897959232330322, "step": 7329 }, { "completion_length": 299.7550964355469, "epoch": 0.7376100628930817, "grad_norm": 0.6278247237205505, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7229127287864685, "reward_std": 0.1788472793996334, "rewards/accuracy_reward": 0.733116865158081, "rewards/format_reward": 0.9897959232330322, "step": 7330 }, { "completion_length": 219.76529693603516, "epoch": 0.7377106918238994, "grad_norm": 1.1772396564483643, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7595375180244446, "reward_std": 0.23548922687768936, "rewards/accuracy_reward": 0.7799456715583801, "rewards/format_reward": 0.9795918464660645, "step": 7331 }, { "completion_length": 283.2653045654297, "epoch": 0.737811320754717, "grad_norm": 0.4455680251121521, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.853726863861084, "reward_std": 0.09079740196466446, "rewards/accuracy_reward": 0.8537268936634064, "rewards/format_reward": 1.0, "step": 7332 }, { "completion_length": 273.9285659790039, "epoch": 0.7379119496855346, "grad_norm": 1.0168792009353638, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7646384239196777, "reward_std": 0.08498090133070946, "rewards/accuracy_reward": 0.7748425006866455, "rewards/format_reward": 0.9897959232330322, "step": 7333 }, { "completion_length": 276.11224365234375, "epoch": 0.7380125786163522, "grad_norm": 0.6668614149093628, "kl": 0.0689697265625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7325512766838074, "reward_std": 0.14904595911502838, "rewards/accuracy_reward": 0.7529595196247101, "rewards/format_reward": 0.9795918464660645, "step": 7334 }, { "completion_length": 216.1938705444336, "epoch": 0.7381132075471698, "grad_norm": 0.6649454832077026, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.762424349784851, "reward_std": 0.04254044592380524, "rewards/accuracy_reward": 0.7624244391918182, "rewards/format_reward": 1.0, "step": 7335 }, { "completion_length": 296.54080963134766, "epoch": 0.7382138364779874, "grad_norm": 0.4319455623626709, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8273588418960571, "reward_std": 0.06760056409984827, "rewards/accuracy_reward": 0.8273588120937347, "rewards/format_reward": 1.0, "step": 7336 }, { "completion_length": 316.32652282714844, "epoch": 0.7383144654088051, "grad_norm": 2.4296934604644775, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7755101323127747, "reward_std": 0.19295761734247208, "rewards/accuracy_reward": 0.795918345451355, "rewards/format_reward": 0.9795918464660645, "step": 7337 }, { "completion_length": 201.73468780517578, "epoch": 0.7384150943396226, "grad_norm": 0.6769357323646545, "kl": 0.11767578125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.8446711897850037, "reward_std": 0.10636232048273087, "rewards/accuracy_reward": 0.8446711897850037, "rewards/format_reward": 1.0, "step": 7338 }, { "completion_length": 204.7244873046875, "epoch": 0.7385157232704402, "grad_norm": 0.7675618529319763, "kl": 0.0794677734375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.799171507358551, "reward_std": 0.13309310097247362, "rewards/accuracy_reward": 0.7991714775562286, "rewards/format_reward": 1.0, "step": 7339 }, { "completion_length": 190.54080963134766, "epoch": 0.7386163522012579, "grad_norm": 0.4299165904521942, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.9605441689491272, "reward_std": 0.01089851651340723, "rewards/accuracy_reward": 0.9605441987514496, "rewards/format_reward": 1.0, "step": 7340 }, { "completion_length": 233.46939086914062, "epoch": 0.7387169811320755, "grad_norm": 1.2273428440093994, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7035179138183594, "reward_std": 0.12012911960482597, "rewards/accuracy_reward": 0.7137219905853271, "rewards/format_reward": 0.9897959232330322, "step": 7341 }, { "completion_length": 257.9591827392578, "epoch": 0.738817610062893, "grad_norm": 0.4508943259716034, "kl": 0.11279296875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7817221879959106, "reward_std": 0.10776009783148766, "rewards/accuracy_reward": 0.7919263541698456, "rewards/format_reward": 0.9897959232330322, "step": 7342 }, { "completion_length": 212.36734008789062, "epoch": 0.7389182389937107, "grad_norm": 1.0772850513458252, "kl": 0.126953125, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.8600288033485413, "reward_std": 0.1778390072286129, "rewards/accuracy_reward": 0.890641063451767, "rewards/format_reward": 0.9693877398967743, "step": 7343 }, { "completion_length": 193.11224365234375, "epoch": 0.7390188679245283, "grad_norm": 1.0773372650146484, "kl": 0.1064453125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.8647039532661438, "reward_std": 0.15568984672427177, "rewards/accuracy_reward": 0.874908059835434, "rewards/format_reward": 0.9897959232330322, "step": 7344 }, { "completion_length": 301.9183654785156, "epoch": 0.739119496855346, "grad_norm": 0.7546032071113586, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7826234698295593, "reward_std": 0.2737870216369629, "rewards/accuracy_reward": 0.8030316531658173, "rewards/format_reward": 0.9795918166637421, "step": 7345 }, { "completion_length": 253.82652282714844, "epoch": 0.7392201257861635, "grad_norm": 0.6953586339950562, "kl": 0.1220703125, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.8137820363044739, "reward_std": 0.22196999937295914, "rewards/accuracy_reward": 0.8239862024784088, "rewards/format_reward": 0.9897959232330322, "step": 7346 }, { "completion_length": 262.4285659790039, "epoch": 0.7393207547169811, "grad_norm": 0.490334153175354, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.803427815437317, "reward_std": 0.10581738501787186, "rewards/accuracy_reward": 0.8136319220066071, "rewards/format_reward": 0.9897959232330322, "step": 7347 }, { "completion_length": 223.19386291503906, "epoch": 0.7394213836477987, "grad_norm": 0.6195043325424194, "kl": 0.098876953125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7345082759857178, "reward_std": 0.06250148173421621, "rewards/accuracy_reward": 0.7345083057880402, "rewards/format_reward": 1.0, "step": 7348 }, { "completion_length": 237.55101013183594, "epoch": 0.7395220125786164, "grad_norm": 0.3417641818523407, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7757936716079712, "reward_std": 0.08833055943250656, "rewards/accuracy_reward": 0.785997748374939, "rewards/format_reward": 0.9897959232330322, "step": 7349 }, { "completion_length": 234.2244873046875, "epoch": 0.7396226415094339, "grad_norm": 1.7640759944915771, "kl": 0.1044921875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8478830456733704, "reward_std": 0.11145690083503723, "rewards/accuracy_reward": 0.8580871820449829, "rewards/format_reward": 0.9897959232330322, "step": 7350 }, { "completion_length": 238.6428451538086, "epoch": 0.7397232704402515, "grad_norm": 4.782225131988525, "kl": 0.09423828125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8248894810676575, "reward_std": 0.19131248071789742, "rewards/accuracy_reward": 0.8452976942062378, "rewards/format_reward": 0.9795918166637421, "step": 7351 }, { "completion_length": 348.83673095703125, "epoch": 0.7398238993710692, "grad_norm": 0.5673884749412537, "kl": 0.100341796875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.656242311000824, "reward_std": 0.1942664012312889, "rewards/accuracy_reward": 0.6868546009063721, "rewards/format_reward": 0.9693877398967743, "step": 7352 }, { "completion_length": 239.18366241455078, "epoch": 0.7399245283018868, "grad_norm": 0.5250377058982849, "kl": 0.0777587890625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8094361424446106, "reward_std": 0.10534293577075005, "rewards/accuracy_reward": 0.8196402192115784, "rewards/format_reward": 0.9897959232330322, "step": 7353 }, { "completion_length": 206.36734771728516, "epoch": 0.7400251572327045, "grad_norm": 0.3444487452507019, "kl": 0.0731201171875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.862609326839447, "reward_std": 0.053196851164102554, "rewards/accuracy_reward": 0.8728134036064148, "rewards/format_reward": 0.9897959232330322, "step": 7354 }, { "completion_length": 206.7142791748047, "epoch": 0.740125786163522, "grad_norm": 1.5388356447219849, "kl": 0.140625, "learning_rate": 1e-06, "loss": 0.0056, "reward": 1.826044738292694, "reward_std": 0.17758852988481522, "rewards/accuracy_reward": 0.856656938791275, "rewards/format_reward": 0.9693877398967743, "step": 7355 }, { "completion_length": 270.5102005004883, "epoch": 0.7402264150943396, "grad_norm": 0.701774001121521, "kl": 0.094970703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.640063226222992, "reward_std": 0.16231492161750793, "rewards/accuracy_reward": 0.6604713797569275, "rewards/format_reward": 0.9795918166637421, "step": 7356 }, { "completion_length": 243.45917510986328, "epoch": 0.7403270440251573, "grad_norm": 1.5650099515914917, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7556445598602295, "reward_std": 0.16551544144749641, "rewards/accuracy_reward": 0.7658486664295197, "rewards/format_reward": 0.9897959232330322, "step": 7357 }, { "completion_length": 198.17346954345703, "epoch": 0.7404276729559749, "grad_norm": 0.841178834438324, "kl": 0.0635986328125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7879818081855774, "reward_std": 0.20532219856977463, "rewards/accuracy_reward": 0.7879818379878998, "rewards/format_reward": 1.0, "step": 7358 }, { "completion_length": 233.32652282714844, "epoch": 0.7405283018867924, "grad_norm": 2.5746474266052246, "kl": 0.11474609375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.8264788389205933, "reward_std": 0.11852159351110458, "rewards/accuracy_reward": 0.8264787495136261, "rewards/format_reward": 1.0, "step": 7359 }, { "completion_length": 261.7142791748047, "epoch": 0.74062893081761, "grad_norm": 1.5037111043930054, "kl": 0.115478515625, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7509253025054932, "reward_std": 0.18332581222057343, "rewards/accuracy_reward": 0.7611294686794281, "rewards/format_reward": 0.9897959232330322, "step": 7360 }, { "completion_length": 293.4081573486328, "epoch": 0.7407295597484277, "grad_norm": 0.5366891026496887, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7625629305839539, "reward_std": 0.16776952147483826, "rewards/accuracy_reward": 0.7625629007816315, "rewards/format_reward": 1.0, "step": 7361 }, { "completion_length": 287.51019287109375, "epoch": 0.7408301886792453, "grad_norm": 0.7172937393188477, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8424720764160156, "reward_std": 0.11424637958407402, "rewards/accuracy_reward": 0.8730843663215637, "rewards/format_reward": 0.9693877398967743, "step": 7362 }, { "completion_length": 243.74488830566406, "epoch": 0.7409308176100629, "grad_norm": 0.5098550915718079, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8142239451408386, "reward_std": 0.10657937824726105, "rewards/accuracy_reward": 0.8142238855361938, "rewards/format_reward": 1.0, "step": 7363 }, { "completion_length": 218.55101776123047, "epoch": 0.7410314465408805, "grad_norm": 1.2800289392471313, "kl": 0.095703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.9036442637443542, "reward_std": 0.1597098708152771, "rewards/accuracy_reward": 0.9342565536499023, "rewards/format_reward": 0.9693877398967743, "step": 7364 }, { "completion_length": 234.5204086303711, "epoch": 0.7411320754716981, "grad_norm": 0.9186762571334839, "kl": 0.0948486328125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6930314898490906, "reward_std": 0.25215157866477966, "rewards/accuracy_reward": 0.7236437797546387, "rewards/format_reward": 0.9693877398967743, "step": 7365 }, { "completion_length": 318.5816345214844, "epoch": 0.7412327044025158, "grad_norm": 0.9514591693878174, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7223145961761475, "reward_std": 0.15660563111305237, "rewards/accuracy_reward": 0.7631310224533081, "rewards/format_reward": 0.9591836631298065, "step": 7366 }, { "completion_length": 261.1836624145508, "epoch": 0.7413333333333333, "grad_norm": 0.7671111226081848, "kl": 0.0533447265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.772108793258667, "reward_std": 0.1831093281507492, "rewards/accuracy_reward": 0.772108793258667, "rewards/format_reward": 1.0, "step": 7367 }, { "completion_length": 223.91836547851562, "epoch": 0.7414339622641509, "grad_norm": 0.3852023780345917, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.841010570526123, "reward_std": 0.10744602605700493, "rewards/accuracy_reward": 0.851214587688446, "rewards/format_reward": 0.9897959232330322, "step": 7368 }, { "completion_length": 195.89795684814453, "epoch": 0.7415345911949686, "grad_norm": 0.5242933630943298, "kl": 0.1190185546875, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.868276596069336, "reward_std": 0.05153052136301994, "rewards/accuracy_reward": 0.8682765662670135, "rewards/format_reward": 1.0, "step": 7369 }, { "completion_length": 311.1938705444336, "epoch": 0.7416352201257862, "grad_norm": 0.5810715556144714, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7412204146385193, "reward_std": 0.18702194839715958, "rewards/accuracy_reward": 0.7616285383701324, "rewards/format_reward": 0.9795918166637421, "step": 7370 }, { "completion_length": 364.6836700439453, "epoch": 0.7417358490566037, "grad_norm": 0.6657962203025818, "kl": 0.0738525390625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7809935808181763, "reward_std": 0.23492423444986343, "rewards/accuracy_reward": 0.7911977171897888, "rewards/format_reward": 0.9897959232330322, "step": 7371 }, { "completion_length": 257.6632537841797, "epoch": 0.7418364779874214, "grad_norm": 1.6693083047866821, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.860544204711914, "reward_std": 0.10904193669557571, "rewards/accuracy_reward": 0.8707482516765594, "rewards/format_reward": 0.9897959232330322, "step": 7372 }, { "completion_length": 190.33673095703125, "epoch": 0.741937106918239, "grad_norm": 0.7457026839256287, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.9353742003440857, "reward_std": 0.1582230180501938, "rewards/accuracy_reward": 0.9455781877040863, "rewards/format_reward": 0.9897959232330322, "step": 7373 }, { "completion_length": 315.10203552246094, "epoch": 0.7420377358490566, "grad_norm": 0.8672521710395813, "kl": 0.099365234375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7046322226524353, "reward_std": 0.2469855174422264, "rewards/accuracy_reward": 0.7250404953956604, "rewards/format_reward": 0.9795918464660645, "step": 7374 }, { "completion_length": 204.05101013183594, "epoch": 0.7421383647798742, "grad_norm": 4.011114597320557, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8503401279449463, "reward_std": 0.19116578996181488, "rewards/accuracy_reward": 0.8809523582458496, "rewards/format_reward": 0.9693877398967743, "step": 7375 }, { "completion_length": 282.7550964355469, "epoch": 0.7422389937106918, "grad_norm": 1.120659351348877, "kl": 0.1025390625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6717167496681213, "reward_std": 0.10468153282999992, "rewards/accuracy_reward": 0.6717167794704437, "rewards/format_reward": 1.0, "step": 7376 }, { "completion_length": 283.2142791748047, "epoch": 0.7423396226415094, "grad_norm": 0.8627381920814514, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.764636218547821, "reward_std": 0.21351801604032516, "rewards/accuracy_reward": 0.785044401884079, "rewards/format_reward": 0.9795918464660645, "step": 7377 }, { "completion_length": 235.85713958740234, "epoch": 0.7424402515723271, "grad_norm": 0.4543250501155853, "kl": 0.098876953125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8426664471626282, "reward_std": 0.09923643618822098, "rewards/accuracy_reward": 0.8426664471626282, "rewards/format_reward": 1.0, "step": 7378 }, { "completion_length": 208.1836700439453, "epoch": 0.7425408805031447, "grad_norm": 0.7494449019432068, "kl": 0.065673828125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7989073991775513, "reward_std": 0.15616361796855927, "rewards/accuracy_reward": 0.809111475944519, "rewards/format_reward": 0.9897959232330322, "step": 7379 }, { "completion_length": 273.52040100097656, "epoch": 0.7426415094339622, "grad_norm": 0.7797496914863586, "kl": 0.1005859375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7188112139701843, "reward_std": 0.2456578090786934, "rewards/accuracy_reward": 0.7392194271087646, "rewards/format_reward": 0.9795918464660645, "step": 7380 }, { "completion_length": 194.29591369628906, "epoch": 0.7427421383647799, "grad_norm": 1.2627626657485962, "kl": 0.119873046875, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.7736446261405945, "reward_std": 0.17265219241380692, "rewards/accuracy_reward": 0.7838486731052399, "rewards/format_reward": 0.9897959232330322, "step": 7381 }, { "completion_length": 218.05101013183594, "epoch": 0.7428427672955975, "grad_norm": 1.6275370121002197, "kl": 0.138427734375, "learning_rate": 1e-06, "loss": 0.0055, "reward": 1.6907536387443542, "reward_std": 0.1562684252858162, "rewards/accuracy_reward": 0.700957864522934, "rewards/format_reward": 0.9897959232330322, "step": 7382 }, { "completion_length": 263.27550506591797, "epoch": 0.7429433962264151, "grad_norm": 0.4332583248615265, "kl": 0.0675048828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.9030611515045166, "reward_std": 0.02654126100242138, "rewards/accuracy_reward": 0.9030612707138062, "rewards/format_reward": 1.0, "step": 7383 }, { "completion_length": 170.13265228271484, "epoch": 0.7430440251572327, "grad_norm": 1.0269436836242676, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8396500945091248, "reward_std": 0.17882464081048965, "rewards/accuracy_reward": 0.8600582778453827, "rewards/format_reward": 0.9795918166637421, "step": 7384 }, { "completion_length": 238.16326904296875, "epoch": 0.7431446540880503, "grad_norm": 0.5965881943702698, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.9714285135269165, "reward_std": 0.0755928922444582, "rewards/accuracy_reward": 0.981632649898529, "rewards/format_reward": 0.9897959232330322, "step": 7385 }, { "completion_length": 191.948974609375, "epoch": 0.7432452830188679, "grad_norm": 0.6691675782203674, "kl": 0.10205078125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8232893347740173, "reward_std": 0.15459857136011124, "rewards/accuracy_reward": 0.833493560552597, "rewards/format_reward": 0.9897959232330322, "step": 7386 }, { "completion_length": 187.47958374023438, "epoch": 0.7433459119496856, "grad_norm": 0.9232305884361267, "kl": 0.09521484375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.841395616531372, "reward_std": 0.1587076298892498, "rewards/accuracy_reward": 0.86180379986763, "rewards/format_reward": 0.9795918166637421, "step": 7387 }, { "completion_length": 221.16326141357422, "epoch": 0.7434465408805031, "grad_norm": 3.9349756240844727, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.832712709903717, "reward_std": 0.10762882232666016, "rewards/accuracy_reward": 0.8429167568683624, "rewards/format_reward": 0.9897959232330322, "step": 7388 }, { "completion_length": 196.60203552246094, "epoch": 0.7435471698113207, "grad_norm": 1.1173051595687866, "kl": 0.0889892578125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7476630210876465, "reward_std": 0.13155679032206535, "rewards/accuracy_reward": 0.7578670978546143, "rewards/format_reward": 0.9897959232330322, "step": 7389 }, { "completion_length": 232.1530532836914, "epoch": 0.7436477987421384, "grad_norm": 0.6442086696624756, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.817541241645813, "reward_std": 0.13242438063025475, "rewards/accuracy_reward": 0.8481535315513611, "rewards/format_reward": 0.9693877398967743, "step": 7390 }, { "completion_length": 219.6530532836914, "epoch": 0.743748427672956, "grad_norm": 0.2952791452407837, "kl": 0.05810546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8040221333503723, "reward_std": 0.031107370741665363, "rewards/accuracy_reward": 0.8040221929550171, "rewards/format_reward": 1.0, "step": 7391 }, { "completion_length": 130.9897918701172, "epoch": 0.7438490566037735, "grad_norm": 1.387942910194397, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8095238208770752, "reward_std": 0.13344328850507736, "rewards/accuracy_reward": 0.8095237612724304, "rewards/format_reward": 1.0, "step": 7392 }, { "completion_length": 152.17346954345703, "epoch": 0.7439496855345912, "grad_norm": 1.0340853929519653, "kl": 0.091064453125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8478824496269226, "reward_std": 0.13340824097394943, "rewards/accuracy_reward": 0.8478824198246002, "rewards/format_reward": 1.0, "step": 7393 }, { "completion_length": 273.27549743652344, "epoch": 0.7440503144654088, "grad_norm": 2.255492925643921, "kl": 0.137451171875, "learning_rate": 1e-06, "loss": 0.0055, "reward": 1.6429539322853088, "reward_std": 0.3017628788948059, "rewards/accuracy_reward": 0.6837702989578247, "rewards/format_reward": 0.9591836333274841, "step": 7394 }, { "completion_length": 181.64285278320312, "epoch": 0.7441509433962264, "grad_norm": 0.9039982557296753, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7173393964767456, "reward_std": 0.13667722418904305, "rewards/accuracy_reward": 0.7275435328483582, "rewards/format_reward": 0.9897959232330322, "step": 7395 }, { "completion_length": 291.5918273925781, "epoch": 0.744251572327044, "grad_norm": 0.7207122445106506, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6674860715866089, "reward_std": 0.11548366770148277, "rewards/accuracy_reward": 0.6674860715866089, "rewards/format_reward": 1.0, "step": 7396 }, { "completion_length": 225.66326904296875, "epoch": 0.7443522012578616, "grad_norm": 1.7722910642623901, "kl": 0.125244140625, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.787806212902069, "reward_std": 0.11405743658542633, "rewards/accuracy_reward": 0.7878062129020691, "rewards/format_reward": 1.0, "step": 7397 }, { "completion_length": 211.88774871826172, "epoch": 0.7444528301886792, "grad_norm": 0.47867077589035034, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.60588538646698, "reward_std": 0.052482787519693375, "rewards/accuracy_reward": 0.6160894632339478, "rewards/format_reward": 0.9897959232330322, "step": 7398 }, { "completion_length": 231.0102081298828, "epoch": 0.7445534591194969, "grad_norm": 0.8011390566825867, "kl": 0.1220703125, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.7386303544044495, "reward_std": 0.10672809928655624, "rewards/accuracy_reward": 0.7386303544044495, "rewards/format_reward": 1.0, "step": 7399 }, { "completion_length": 220.7244873046875, "epoch": 0.7446540880503144, "grad_norm": 1.4627901315689087, "kl": 0.0858154296875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8653061389923096, "reward_std": 0.11559410812333226, "rewards/accuracy_reward": 0.8857142627239227, "rewards/format_reward": 0.9795918166637421, "step": 7400 }, { "completion_length": 152.86734008789062, "epoch": 0.744754716981132, "grad_norm": 1.3010525703430176, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7744445204734802, "reward_std": 0.10496991127729416, "rewards/accuracy_reward": 0.774444580078125, "rewards/format_reward": 1.0, "step": 7401 }, { "completion_length": 171.65306091308594, "epoch": 0.7448553459119497, "grad_norm": 1.2203149795532227, "kl": 0.175537109375, "learning_rate": 1e-06, "loss": 0.007, "reward": 1.7728200554847717, "reward_std": 0.19680801779031754, "rewards/accuracy_reward": 0.7728200256824493, "rewards/format_reward": 1.0, "step": 7402 }, { "completion_length": 254.9897918701172, "epoch": 0.7449559748427673, "grad_norm": 0.7134788036346436, "kl": 0.117919921875, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7708520889282227, "reward_std": 0.15036949515342712, "rewards/accuracy_reward": 0.7810561656951904, "rewards/format_reward": 0.9897959232330322, "step": 7403 }, { "completion_length": 232.9591827392578, "epoch": 0.745056603773585, "grad_norm": 0.6155134439468384, "kl": 0.114990234375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.8228972554206848, "reward_std": 0.09210261702537537, "rewards/accuracy_reward": 0.8228973746299744, "rewards/format_reward": 1.0, "step": 7404 }, { "completion_length": 221.83673095703125, "epoch": 0.7451572327044025, "grad_norm": 0.5293638706207275, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.755333960056305, "reward_std": 0.16364650800824165, "rewards/accuracy_reward": 0.7655380368232727, "rewards/format_reward": 0.9897959232330322, "step": 7405 }, { "completion_length": 243.55101776123047, "epoch": 0.7452578616352201, "grad_norm": 0.862612783908844, "kl": 0.0872802734375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.67881441116333, "reward_std": 0.13042057678103447, "rewards/accuracy_reward": 0.6788143813610077, "rewards/format_reward": 1.0, "step": 7406 }, { "completion_length": 256.58162689208984, "epoch": 0.7453584905660378, "grad_norm": 0.8567844033241272, "kl": 0.103759765625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7469369173049927, "reward_std": 0.1589285209774971, "rewards/accuracy_reward": 0.7469369471073151, "rewards/format_reward": 1.0, "step": 7407 }, { "completion_length": 205.9285659790039, "epoch": 0.7454591194968554, "grad_norm": 0.7992783188819885, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.5530611276626587, "reward_std": 0.0910726934671402, "rewards/accuracy_reward": 0.5632652789354324, "rewards/format_reward": 0.9897959232330322, "step": 7408 }, { "completion_length": 185.6326446533203, "epoch": 0.7455597484276729, "grad_norm": 0.8248860836029053, "kl": 0.102783203125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8162941336631775, "reward_std": 0.16205722093582153, "rewards/accuracy_reward": 0.8264981806278229, "rewards/format_reward": 0.9897959232330322, "step": 7409 }, { "completion_length": 228.91836547851562, "epoch": 0.7456603773584906, "grad_norm": 1.002895474433899, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.844851553440094, "reward_std": 0.19714906066656113, "rewards/accuracy_reward": 0.8856679201126099, "rewards/format_reward": 0.9591836631298065, "step": 7410 }, { "completion_length": 254.68366241455078, "epoch": 0.7457610062893082, "grad_norm": 0.48973679542541504, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.729873776435852, "reward_std": 0.14221162348985672, "rewards/accuracy_reward": 0.7400778830051422, "rewards/format_reward": 0.9897959232330322, "step": 7411 }, { "completion_length": 242.01020050048828, "epoch": 0.7458616352201258, "grad_norm": 0.785206139087677, "kl": 0.110107421875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7691012620925903, "reward_std": 0.12944461405277252, "rewards/accuracy_reward": 0.7793053686618805, "rewards/format_reward": 0.9897959232330322, "step": 7412 }, { "completion_length": 227.6836700439453, "epoch": 0.7459622641509434, "grad_norm": 0.963265061378479, "kl": 0.142822265625, "learning_rate": 1e-06, "loss": 0.0057, "reward": 1.9147472977638245, "reward_std": 0.11252019926905632, "rewards/accuracy_reward": 0.924951434135437, "rewards/format_reward": 0.9897959232330322, "step": 7413 }, { "completion_length": 237.49999237060547, "epoch": 0.746062893081761, "grad_norm": 0.5274738669395447, "kl": 0.095458984375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.5661715865135193, "reward_std": 0.06910106353461742, "rewards/accuracy_reward": 0.5661715567111969, "rewards/format_reward": 1.0, "step": 7414 }, { "completion_length": 249.90814971923828, "epoch": 0.7461635220125786, "grad_norm": 0.9988827705383301, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.798907458782196, "reward_std": 0.12901538610458374, "rewards/accuracy_reward": 0.798907458782196, "rewards/format_reward": 1.0, "step": 7415 }, { "completion_length": 242.60203552246094, "epoch": 0.7462641509433963, "grad_norm": 0.6465755701065063, "kl": 0.128173828125, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.669180154800415, "reward_std": 0.09024084452539682, "rewards/accuracy_reward": 0.669180154800415, "rewards/format_reward": 1.0, "step": 7416 }, { "completion_length": 236.45917510986328, "epoch": 0.7463647798742138, "grad_norm": 0.8616350293159485, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.886166512966156, "reward_std": 0.21955933421850204, "rewards/accuracy_reward": 0.9167787730693817, "rewards/format_reward": 0.9693877398967743, "step": 7417 }, { "completion_length": 214.1632537841797, "epoch": 0.7464654088050314, "grad_norm": 0.8837851285934448, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7517006993293762, "reward_std": 0.1011742502450943, "rewards/accuracy_reward": 0.7517006397247314, "rewards/format_reward": 1.0, "step": 7418 }, { "completion_length": 252.56122589111328, "epoch": 0.7465660377358491, "grad_norm": 0.9609212279319763, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6305106282234192, "reward_std": 0.10060584917664528, "rewards/accuracy_reward": 0.6407146751880646, "rewards/format_reward": 0.9897959232330322, "step": 7419 }, { "completion_length": 281.0408172607422, "epoch": 0.7466666666666667, "grad_norm": 0.9062355756759644, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.782225251197815, "reward_std": 0.15741395205259323, "rewards/accuracy_reward": 0.812837541103363, "rewards/format_reward": 0.9693877398967743, "step": 7420 }, { "completion_length": 223.10203552246094, "epoch": 0.7467672955974842, "grad_norm": 0.8473858833312988, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8147615194320679, "reward_std": 0.20621446520090103, "rewards/accuracy_reward": 0.8249655961990356, "rewards/format_reward": 0.9897959232330322, "step": 7421 }, { "completion_length": 306.83673095703125, "epoch": 0.7468679245283019, "grad_norm": 0.5234261751174927, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.5861162543296814, "reward_std": 0.11974921077489853, "rewards/accuracy_reward": 0.5963203459978104, "rewards/format_reward": 0.9897959232330322, "step": 7422 }, { "completion_length": 185.66326141357422, "epoch": 0.7469685534591195, "grad_norm": 0.7730229496955872, "kl": 0.113037109375, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7959896922111511, "reward_std": 0.14557841047644615, "rewards/accuracy_reward": 0.8061937689781189, "rewards/format_reward": 0.9897959232330322, "step": 7423 }, { "completion_length": 240.59183502197266, "epoch": 0.7470691823899371, "grad_norm": 1.0131847858428955, "kl": 0.1513671875, "learning_rate": 1e-06, "loss": 0.006, "reward": 1.80647873878479, "reward_std": 0.13475338369607925, "rewards/accuracy_reward": 0.8268868923187256, "rewards/format_reward": 0.9795918166637421, "step": 7424 }, { "completion_length": 266.2346878051758, "epoch": 0.7471698113207547, "grad_norm": 0.535319983959198, "kl": 0.0684814453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7251700162887573, "reward_std": 0.1440260112285614, "rewards/accuracy_reward": 0.7659863531589508, "rewards/format_reward": 0.9591836631298065, "step": 7425 }, { "completion_length": 232.80612182617188, "epoch": 0.7472704402515723, "grad_norm": 0.4478834569454193, "kl": 0.0736083984375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.817315936088562, "reward_std": 0.23019827902317047, "rewards/accuracy_reward": 0.8479282557964325, "rewards/format_reward": 0.9693877398967743, "step": 7426 }, { "completion_length": 286.8163299560547, "epoch": 0.7473710691823899, "grad_norm": 1.2810819149017334, "kl": 0.107421875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.5197577476501465, "reward_std": 0.2145918682217598, "rewards/accuracy_reward": 0.5707782208919525, "rewards/format_reward": 0.9489795565605164, "step": 7427 }, { "completion_length": 205.39794921875, "epoch": 0.7474716981132076, "grad_norm": 0.6106846928596497, "kl": 0.103759765625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8360543251037598, "reward_std": 0.14469320699572563, "rewards/accuracy_reward": 0.8564625680446625, "rewards/format_reward": 0.9795918464660645, "step": 7428 }, { "completion_length": 245.06121826171875, "epoch": 0.7475723270440252, "grad_norm": 0.920517086982727, "kl": 0.1142578125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.8840143084526062, "reward_std": 0.18205254524946213, "rewards/accuracy_reward": 0.8942184150218964, "rewards/format_reward": 0.9897959232330322, "step": 7429 }, { "completion_length": 195.03060913085938, "epoch": 0.7476729559748427, "grad_norm": 1.0434391498565674, "kl": 0.11669921875, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.689828336238861, "reward_std": 0.1755182035267353, "rewards/accuracy_reward": 0.7102364599704742, "rewards/format_reward": 0.9795918464660645, "step": 7430 }, { "completion_length": 231.6530532836914, "epoch": 0.7477735849056604, "grad_norm": 1.38631010055542, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6722015738487244, "reward_std": 0.22146886587142944, "rewards/accuracy_reward": 0.6824056506156921, "rewards/format_reward": 0.9897959232330322, "step": 7431 }, { "completion_length": 244.66326141357422, "epoch": 0.747874213836478, "grad_norm": 0.6948116421699524, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7329623103141785, "reward_std": 0.13436882197856903, "rewards/accuracy_reward": 0.7431663572788239, "rewards/format_reward": 0.9897959232330322, "step": 7432 }, { "completion_length": 184.9795913696289, "epoch": 0.7479748427672956, "grad_norm": 0.7984715700149536, "kl": 0.103759765625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.882189154624939, "reward_std": 0.144390307366848, "rewards/accuracy_reward": 0.8821892142295837, "rewards/format_reward": 1.0, "step": 7433 }, { "completion_length": 248.38774871826172, "epoch": 0.7480754716981132, "grad_norm": 3.648538827896118, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.860544204711914, "reward_std": 0.07984934002161026, "rewards/accuracy_reward": 0.8605442345142365, "rewards/format_reward": 1.0, "step": 7434 }, { "completion_length": 236.9795913696289, "epoch": 0.7481761006289308, "grad_norm": 0.490778386592865, "kl": 0.11083984375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.9162668585777283, "reward_std": 0.060311805456876755, "rewards/accuracy_reward": 0.9162668585777283, "rewards/format_reward": 1.0, "step": 7435 }, { "completion_length": 203.40816497802734, "epoch": 0.7482767295597484, "grad_norm": 0.5343335270881653, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8096227049827576, "reward_std": 0.09559293277561665, "rewards/accuracy_reward": 0.8096228837966919, "rewards/format_reward": 1.0, "step": 7436 }, { "completion_length": 199.39795684814453, "epoch": 0.7483773584905661, "grad_norm": 0.3727015256881714, "kl": 0.113037109375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.8734851479530334, "reward_std": 0.033649214543402195, "rewards/accuracy_reward": 0.8734852075576782, "rewards/format_reward": 1.0, "step": 7437 }, { "completion_length": 257.81632232666016, "epoch": 0.7484779874213836, "grad_norm": 1.064249038696289, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7892060279846191, "reward_std": 0.1492563709616661, "rewards/accuracy_reward": 0.8198182582855225, "rewards/format_reward": 0.9693877398967743, "step": 7438 }, { "completion_length": 185.29591369628906, "epoch": 0.7485786163522012, "grad_norm": 1.289664626121521, "kl": 0.1337890625, "learning_rate": 1e-06, "loss": 0.0054, "reward": 1.815192699432373, "reward_std": 0.26156187802553177, "rewards/accuracy_reward": 0.8560090363025665, "rewards/format_reward": 0.9591836333274841, "step": 7439 }, { "completion_length": 253.15306091308594, "epoch": 0.7486792452830189, "grad_norm": 0.35421857237815857, "kl": 0.10791015625, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.896289348602295, "reward_std": 0.09264988452196121, "rewards/accuracy_reward": 0.9064934849739075, "rewards/format_reward": 0.9897959232330322, "step": 7440 }, { "completion_length": 308.29591369628906, "epoch": 0.7487798742138365, "grad_norm": 0.5634960532188416, "kl": 0.105712890625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7879061102867126, "reward_std": 0.12088912352919579, "rewards/accuracy_reward": 0.798110157251358, "rewards/format_reward": 0.9897959232330322, "step": 7441 }, { "completion_length": 286.1428451538086, "epoch": 0.748880503144654, "grad_norm": 0.6614720821380615, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7851045727729797, "reward_std": 0.17517628148198128, "rewards/accuracy_reward": 0.7953087389469147, "rewards/format_reward": 0.9897959232330322, "step": 7442 }, { "completion_length": 152.4285659790039, "epoch": 0.7489811320754717, "grad_norm": 1.2164665460586548, "kl": 0.154296875, "learning_rate": 1e-06, "loss": 0.0062, "reward": 1.841187298297882, "reward_std": 0.2162860929965973, "rewards/accuracy_reward": 0.86159548163414, "rewards/format_reward": 0.9795918166637421, "step": 7443 }, { "completion_length": 239.80612182617188, "epoch": 0.7490817610062893, "grad_norm": 1.0818579196929932, "kl": 0.119140625, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.8085519671440125, "reward_std": 0.14162448793649673, "rewards/accuracy_reward": 0.8187560439109802, "rewards/format_reward": 0.9897959232330322, "step": 7444 }, { "completion_length": 217.9285659790039, "epoch": 0.749182389937107, "grad_norm": 1.3769681453704834, "kl": 0.107421875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7094104290008545, "reward_std": 0.18272941559553146, "rewards/accuracy_reward": 0.7094103693962097, "rewards/format_reward": 1.0, "step": 7445 }, { "completion_length": 270.2653045654297, "epoch": 0.7492830188679245, "grad_norm": 0.4968995451927185, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.589812695980072, "reward_std": 0.10645403154194355, "rewards/accuracy_reward": 0.6000167727470398, "rewards/format_reward": 0.9897959232330322, "step": 7446 }, { "completion_length": 192.23468780517578, "epoch": 0.7493836477987421, "grad_norm": 0.8393098711967468, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7903112769126892, "reward_std": 0.17180858552455902, "rewards/accuracy_reward": 0.8107194602489471, "rewards/format_reward": 0.9795918464660645, "step": 7447 }, { "completion_length": 260.84693908691406, "epoch": 0.7494842767295598, "grad_norm": 0.6245037913322449, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7799164652824402, "reward_std": 0.09957988187670708, "rewards/accuracy_reward": 0.790120542049408, "rewards/format_reward": 0.9897959232330322, "step": 7448 }, { "completion_length": 364.2550964355469, "epoch": 0.7495849056603774, "grad_norm": 0.581663966178894, "kl": 0.124755859375, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.6088928580284119, "reward_std": 0.2977071851491928, "rewards/accuracy_reward": 0.6599132418632507, "rewards/format_reward": 0.9489795565605164, "step": 7449 }, { "completion_length": 269.67346954345703, "epoch": 0.7496855345911949, "grad_norm": 0.6898571848869324, "kl": 0.121826171875, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.7399904131889343, "reward_std": 0.15401863120496273, "rewards/accuracy_reward": 0.760398656129837, "rewards/format_reward": 0.9795918166637421, "step": 7450 }, { "completion_length": 251.38774871826172, "epoch": 0.7497861635220125, "grad_norm": 1.931182861328125, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8637136220932007, "reward_std": 0.12811000645160675, "rewards/accuracy_reward": 0.8943259119987488, "rewards/format_reward": 0.9693877398967743, "step": 7451 }, { "completion_length": 313.74488830566406, "epoch": 0.7498867924528302, "grad_norm": 0.5410856604576111, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7841681838035583, "reward_std": 0.21668971329927444, "rewards/accuracy_reward": 0.8351886570453644, "rewards/format_reward": 0.9489795863628387, "step": 7452 }, { "completion_length": 239.81631469726562, "epoch": 0.7499874213836478, "grad_norm": 0.5841272473335266, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7987123727798462, "reward_std": 0.09411870688199997, "rewards/accuracy_reward": 0.8089163899421692, "rewards/format_reward": 0.9897959232330322, "step": 7453 }, { "completion_length": 275.3775329589844, "epoch": 0.7500880503144655, "grad_norm": 0.730103611946106, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.653255581855774, "reward_std": 0.20277955383062363, "rewards/accuracy_reward": 0.6634596288204193, "rewards/format_reward": 0.9897959232330322, "step": 7454 }, { "completion_length": 241.48978424072266, "epoch": 0.750188679245283, "grad_norm": 0.4081350266933441, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8673468828201294, "reward_std": 0.10003121197223663, "rewards/accuracy_reward": 0.8775510191917419, "rewards/format_reward": 0.9897959232330322, "step": 7455 }, { "completion_length": 260.83673095703125, "epoch": 0.7502893081761006, "grad_norm": 0.9321144819259644, "kl": 0.0941162109375, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7074829936027527, "reward_std": 0.11762854643166065, "rewards/accuracy_reward": 0.7278911173343658, "rewards/format_reward": 0.9795918166637421, "step": 7456 }, { "completion_length": 233.33673095703125, "epoch": 0.7503899371069183, "grad_norm": 0.5402332544326782, "kl": 0.066162109375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7337673902511597, "reward_std": 0.12008598819375038, "rewards/accuracy_reward": 0.7439715564250946, "rewards/format_reward": 0.9897959232330322, "step": 7457 }, { "completion_length": 218.6734619140625, "epoch": 0.7504905660377359, "grad_norm": 4.437101364135742, "kl": 0.21142578125, "learning_rate": 1e-06, "loss": 0.0085, "reward": 1.706462562084198, "reward_std": 0.2192060649394989, "rewards/accuracy_reward": 0.7166666686534882, "rewards/format_reward": 0.9897959232330322, "step": 7458 }, { "completion_length": 201.95917510986328, "epoch": 0.7505911949685534, "grad_norm": 1.4322571754455566, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7785714268684387, "reward_std": 0.20892097800970078, "rewards/accuracy_reward": 0.7989795506000519, "rewards/format_reward": 0.9795918464660645, "step": 7459 }, { "completion_length": 196.0204086303711, "epoch": 0.7506918238993711, "grad_norm": 1.1194285154342651, "kl": 0.091552734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.785893976688385, "reward_std": 0.09494351595640182, "rewards/accuracy_reward": 0.7858939170837402, "rewards/format_reward": 1.0, "step": 7460 }, { "completion_length": 226.9285659790039, "epoch": 0.7507924528301887, "grad_norm": 2.143526077270508, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8281909823417664, "reward_std": 0.157084122300148, "rewards/accuracy_reward": 0.8485991656780243, "rewards/format_reward": 0.9795918166637421, "step": 7461 }, { "completion_length": 205.4081573486328, "epoch": 0.7508930817610063, "grad_norm": 1.1820303201675415, "kl": 0.1279296875, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.7557568550109863, "reward_std": 0.18821940571069717, "rewards/accuracy_reward": 0.776165097951889, "rewards/format_reward": 0.9795918166637421, "step": 7462 }, { "completion_length": 280.39794921875, "epoch": 0.7509937106918239, "grad_norm": 0.6439759135246277, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.728571355342865, "reward_std": 0.16815172135829926, "rewards/accuracy_reward": 0.7489795684814453, "rewards/format_reward": 0.9795918166637421, "step": 7463 }, { "completion_length": 203.15306091308594, "epoch": 0.7510943396226415, "grad_norm": 0.6313992738723755, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7534088492393494, "reward_std": 0.19050460681319237, "rewards/accuracy_reward": 0.7636129260063171, "rewards/format_reward": 0.9897959232330322, "step": 7464 }, { "completion_length": 197.51020050048828, "epoch": 0.7511949685534591, "grad_norm": 1.1208407878875732, "kl": 0.089111328125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7744220495224, "reward_std": 0.15984784066677094, "rewards/accuracy_reward": 0.7846261262893677, "rewards/format_reward": 0.9897959232330322, "step": 7465 }, { "completion_length": 180.7244873046875, "epoch": 0.7512955974842768, "grad_norm": 0.44363805651664734, "kl": 0.10009765625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7714285254478455, "reward_std": 0.07442733459174633, "rewards/accuracy_reward": 0.7714285850524902, "rewards/format_reward": 1.0, "step": 7466 }, { "completion_length": 213.5, "epoch": 0.7513962264150943, "grad_norm": 0.4703902304172516, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7925479412078857, "reward_std": 0.08353029750287533, "rewards/accuracy_reward": 0.7925479114055634, "rewards/format_reward": 1.0, "step": 7467 }, { "completion_length": 187.1836700439453, "epoch": 0.7514968553459119, "grad_norm": 1.0029798746109009, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.782069981098175, "reward_std": 0.16453280299901962, "rewards/accuracy_reward": 0.7922740578651428, "rewards/format_reward": 0.9897959232330322, "step": 7468 }, { "completion_length": 221.22449493408203, "epoch": 0.7515974842767296, "grad_norm": 1.261692762374878, "kl": 0.100830078125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8401360511779785, "reward_std": 0.16122955828905106, "rewards/accuracy_reward": 0.8707482814788818, "rewards/format_reward": 0.9693877398967743, "step": 7469 }, { "completion_length": 226.6326446533203, "epoch": 0.7516981132075472, "grad_norm": 0.8376520872116089, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8877550959587097, "reward_std": 0.14284342154860497, "rewards/accuracy_reward": 0.8979591429233551, "rewards/format_reward": 0.9897959232330322, "step": 7470 }, { "completion_length": 238.31632232666016, "epoch": 0.7517987421383647, "grad_norm": 0.4770005941390991, "kl": 0.100341796875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8311466574668884, "reward_std": 0.10067816078662872, "rewards/accuracy_reward": 0.8617589175701141, "rewards/format_reward": 0.9693877398967743, "step": 7471 }, { "completion_length": 294.9387664794922, "epoch": 0.7518993710691824, "grad_norm": 0.9314574003219604, "kl": 0.0723876953125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8345956206321716, "reward_std": 0.20155760645866394, "rewards/accuracy_reward": 0.844799816608429, "rewards/format_reward": 0.9897959232330322, "step": 7472 }, { "completion_length": 260.84693908691406, "epoch": 0.752, "grad_norm": 0.5056350827217102, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7660803198814392, "reward_std": 0.14332053065299988, "rewards/accuracy_reward": 0.7864884436130524, "rewards/format_reward": 0.9795918464660645, "step": 7473 }, { "completion_length": 283.5408172607422, "epoch": 0.7521006289308176, "grad_norm": 0.8209218382835388, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6948979496955872, "reward_std": 0.1449630670249462, "rewards/accuracy_reward": 0.6948979496955872, "rewards/format_reward": 1.0, "step": 7474 }, { "completion_length": 205.5408172607422, "epoch": 0.7522012578616353, "grad_norm": 1.0340758562088013, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7539708614349365, "reward_std": 0.16039611399173737, "rewards/accuracy_reward": 0.7641749382019043, "rewards/format_reward": 0.9897959232330322, "step": 7475 }, { "completion_length": 316.23468017578125, "epoch": 0.7523018867924528, "grad_norm": 0.8015957474708557, "kl": 0.066162109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7109997868537903, "reward_std": 0.20277702808380127, "rewards/accuracy_reward": 0.7212038636207581, "rewards/format_reward": 0.9897959232330322, "step": 7476 }, { "completion_length": 210.83673095703125, "epoch": 0.7524025157232704, "grad_norm": 0.8559679985046387, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8460994362831116, "reward_std": 0.07360735349357128, "rewards/accuracy_reward": 0.8460994362831116, "rewards/format_reward": 1.0, "step": 7477 }, { "completion_length": 157.87754821777344, "epoch": 0.7525031446540881, "grad_norm": 0.8550240993499756, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.764723002910614, "reward_std": 0.15212933346629143, "rewards/accuracy_reward": 0.785131186246872, "rewards/format_reward": 0.9795918166637421, "step": 7478 }, { "completion_length": 236.93877410888672, "epoch": 0.7526037735849057, "grad_norm": 1.2675316333770752, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8096938133239746, "reward_std": 0.10541799291968346, "rewards/accuracy_reward": 0.8096938729286194, "rewards/format_reward": 1.0, "step": 7479 }, { "completion_length": 223.88774871826172, "epoch": 0.7527044025157232, "grad_norm": 0.6457496881484985, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.732653021812439, "reward_std": 0.15942562371492386, "rewards/accuracy_reward": 0.7326530516147614, "rewards/format_reward": 1.0, "step": 7480 }, { "completion_length": 251.948974609375, "epoch": 0.7528050314465409, "grad_norm": 0.41229015588760376, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7390140891075134, "reward_std": 0.13968760520219803, "rewards/accuracy_reward": 0.7594222724437714, "rewards/format_reward": 0.9795918464660645, "step": 7481 }, { "completion_length": 203.86734771728516, "epoch": 0.7529056603773585, "grad_norm": 0.46657347679138184, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8163264989852905, "reward_std": 0.15402613580226898, "rewards/accuracy_reward": 0.8265306055545807, "rewards/format_reward": 0.9897959232330322, "step": 7482 }, { "completion_length": 267.8571319580078, "epoch": 0.7530062893081761, "grad_norm": 0.7952439785003662, "kl": 0.09130859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7036031484603882, "reward_std": 0.2279226928949356, "rewards/accuracy_reward": 0.7138071656227112, "rewards/format_reward": 0.9897959232330322, "step": 7483 }, { "completion_length": 233.2244873046875, "epoch": 0.7531069182389937, "grad_norm": 0.9533845782279968, "kl": 0.10693359375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.631283164024353, "reward_std": 0.24768835306167603, "rewards/accuracy_reward": 0.6516912877559662, "rewards/format_reward": 0.9795918464660645, "step": 7484 }, { "completion_length": 239.08162689208984, "epoch": 0.7532075471698113, "grad_norm": 1.0959988832473755, "kl": 0.11669921875, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.6642848253250122, "reward_std": 0.3068840429186821, "rewards/accuracy_reward": 0.6948970854282379, "rewards/format_reward": 0.9693877398967743, "step": 7485 }, { "completion_length": 283.5204086303711, "epoch": 0.753308176100629, "grad_norm": 7.316006183624268, "kl": 0.098388671875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6696130633354187, "reward_std": 0.2591204196214676, "rewards/accuracy_reward": 0.6900211870670319, "rewards/format_reward": 0.9795918166637421, "step": 7486 }, { "completion_length": 221.73468780517578, "epoch": 0.7534088050314466, "grad_norm": 1.4537159204483032, "kl": 0.1142578125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.69217848777771, "reward_std": 0.20140183717012405, "rewards/accuracy_reward": 0.7125866711139679, "rewards/format_reward": 0.9795918464660645, "step": 7487 }, { "completion_length": 182.80611419677734, "epoch": 0.7535094339622641, "grad_norm": 1.2015751600265503, "kl": 0.120361328125, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.8043603897094727, "reward_std": 0.12050947127863765, "rewards/accuracy_reward": 0.8043604493141174, "rewards/format_reward": 1.0, "step": 7488 }, { "completion_length": 224.10203552246094, "epoch": 0.7536100628930817, "grad_norm": 0.8253113031387329, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7639886736869812, "reward_std": 0.09848594665527344, "rewards/accuracy_reward": 0.7639886438846588, "rewards/format_reward": 1.0, "step": 7489 }, { "completion_length": 233.23468780517578, "epoch": 0.7537106918238994, "grad_norm": 1.0608571767807007, "kl": 0.134033203125, "learning_rate": 1e-06, "loss": 0.0053, "reward": 1.7318387627601624, "reward_std": 0.20651407539844513, "rewards/accuracy_reward": 0.7624509930610657, "rewards/format_reward": 0.9693877398967743, "step": 7490 }, { "completion_length": 213.99999237060547, "epoch": 0.753811320754717, "grad_norm": 1.5681838989257812, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7124263644218445, "reward_std": 0.14496111497282982, "rewards/accuracy_reward": 0.712426483631134, "rewards/format_reward": 1.0, "step": 7491 }, { "completion_length": 246.89794921875, "epoch": 0.7539119496855345, "grad_norm": 1.1289825439453125, "kl": 0.0726318359375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8061224222183228, "reward_std": 0.11584595590829849, "rewards/accuracy_reward": 0.8163264989852905, "rewards/format_reward": 0.9897959232330322, "step": 7492 }, { "completion_length": 196.54080963134766, "epoch": 0.7540125786163522, "grad_norm": 1.045139193534851, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7464928030967712, "reward_std": 0.20325159281492233, "rewards/accuracy_reward": 0.7669009566307068, "rewards/format_reward": 0.9795918464660645, "step": 7493 }, { "completion_length": 178.58163452148438, "epoch": 0.7541132075471698, "grad_norm": 0.9945525527000427, "kl": 0.109130859375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8088170289993286, "reward_std": 0.12213753163814545, "rewards/accuracy_reward": 0.819021075963974, "rewards/format_reward": 0.9897959232330322, "step": 7494 }, { "completion_length": 236.67346954345703, "epoch": 0.7542138364779875, "grad_norm": 2.053584337234497, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8192419409751892, "reward_std": 0.09091071411967278, "rewards/accuracy_reward": 0.829446017742157, "rewards/format_reward": 0.9897959232330322, "step": 7495 }, { "completion_length": 201.73468780517578, "epoch": 0.754314465408805, "grad_norm": 0.7126086950302124, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.714223861694336, "reward_std": 0.18889287114143372, "rewards/accuracy_reward": 0.7244279682636261, "rewards/format_reward": 0.9897959232330322, "step": 7496 }, { "completion_length": 190.1326446533203, "epoch": 0.7544150943396226, "grad_norm": 0.3162676990032196, "kl": 0.100341796875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.831714928150177, "reward_std": 0.026855651289224625, "rewards/accuracy_reward": 0.8317151069641113, "rewards/format_reward": 1.0, "step": 7497 }, { "completion_length": 195.23468780517578, "epoch": 0.7545157232704403, "grad_norm": 1.0713062286376953, "kl": 0.0986328125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7133716940879822, "reward_std": 0.09469174593687057, "rewards/accuracy_reward": 0.7133717834949493, "rewards/format_reward": 1.0, "step": 7498 }, { "completion_length": 158.78571319580078, "epoch": 0.7546163522012579, "grad_norm": 0.35902610421180725, "kl": 0.0836181640625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.9591836333274841, "reward_std": 0.08884849399328232, "rewards/accuracy_reward": 0.9693877398967743, "rewards/format_reward": 0.9897959232330322, "step": 7499 }, { "completion_length": 206.05101776123047, "epoch": 0.7547169811320755, "grad_norm": 1.0746408700942993, "kl": 0.098876953125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7851914763450623, "reward_std": 0.1545170247554779, "rewards/accuracy_reward": 0.8055996596813202, "rewards/format_reward": 0.9795918464660645, "step": 7500 }, { "completion_length": 180.01020050048828, "epoch": 0.754817610062893, "grad_norm": 0.9070863723754883, "kl": 0.101806640625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7887917160987854, "reward_std": 0.1684190109372139, "rewards/accuracy_reward": 0.8091998398303986, "rewards/format_reward": 0.9795918166637421, "step": 7501 }, { "completion_length": 176.14285278320312, "epoch": 0.7549182389937107, "grad_norm": 0.6110453605651855, "kl": 0.138671875, "learning_rate": 1e-06, "loss": 0.0055, "reward": 1.844842553138733, "reward_std": 0.10213738307356834, "rewards/accuracy_reward": 0.8550465703010559, "rewards/format_reward": 0.9897959232330322, "step": 7502 }, { "completion_length": 250.03060913085938, "epoch": 0.7550188679245283, "grad_norm": 0.37275224924087524, "kl": 0.09033203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.850600004196167, "reward_std": 0.09237417578697205, "rewards/accuracy_reward": 0.8608041107654572, "rewards/format_reward": 0.9897959232330322, "step": 7503 }, { "completion_length": 228.2551040649414, "epoch": 0.755119496855346, "grad_norm": 0.6666489243507385, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7459890842437744, "reward_std": 0.18990408629179, "rewards/accuracy_reward": 0.7663972675800323, "rewards/format_reward": 0.9795918464660645, "step": 7504 }, { "completion_length": 195.58162689208984, "epoch": 0.7552201257861635, "grad_norm": 1.0566829442977905, "kl": 0.0599365234375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.640399992465973, "reward_std": 0.1989515870809555, "rewards/accuracy_reward": 0.6506040692329407, "rewards/format_reward": 0.9897959232330322, "step": 7505 }, { "completion_length": 291.99998474121094, "epoch": 0.7553207547169811, "grad_norm": 0.34299394488334656, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7551020383834839, "reward_std": 0.08099238574504852, "rewards/accuracy_reward": 0.7755101919174194, "rewards/format_reward": 0.9795918166637421, "step": 7506 }, { "completion_length": 272.27550506591797, "epoch": 0.7554213836477988, "grad_norm": 0.43199875950813293, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7703179717063904, "reward_std": 0.06284253671765327, "rewards/accuracy_reward": 0.770317941904068, "rewards/format_reward": 1.0, "step": 7507 }, { "completion_length": 184.46938705444336, "epoch": 0.7555220125786164, "grad_norm": 0.7684250473976135, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8641928434371948, "reward_std": 0.0837159976363182, "rewards/accuracy_reward": 0.8641929030418396, "rewards/format_reward": 1.0, "step": 7508 }, { "completion_length": 243.31632232666016, "epoch": 0.7556226415094339, "grad_norm": 0.6524930000305176, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7557318210601807, "reward_std": 0.17218627035617828, "rewards/accuracy_reward": 0.7659360468387604, "rewards/format_reward": 0.9897959232330322, "step": 7509 }, { "completion_length": 170.57142639160156, "epoch": 0.7557232704402516, "grad_norm": 0.400743305683136, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8661478161811829, "reward_std": 0.09570474922657013, "rewards/accuracy_reward": 0.8865559697151184, "rewards/format_reward": 0.9795918464660645, "step": 7510 }, { "completion_length": 209.4693832397461, "epoch": 0.7558238993710692, "grad_norm": 0.9759964346885681, "kl": 0.111083984375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8147990107536316, "reward_std": 0.14282869547605515, "rewards/accuracy_reward": 0.8352072536945343, "rewards/format_reward": 0.9795918166637421, "step": 7511 }, { "completion_length": 161.948974609375, "epoch": 0.7559245283018868, "grad_norm": 0.5440390110015869, "kl": 0.0882568359375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8718543648719788, "reward_std": 0.1168598160147667, "rewards/accuracy_reward": 0.8820584416389465, "rewards/format_reward": 0.9897959232330322, "step": 7512 }, { "completion_length": 243.6836700439453, "epoch": 0.7560251572327044, "grad_norm": 0.6697885990142822, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.74906724691391, "reward_std": 0.11916085705161095, "rewards/accuracy_reward": 0.7592712938785553, "rewards/format_reward": 0.9897959232330322, "step": 7513 }, { "completion_length": 226.4081573486328, "epoch": 0.756125786163522, "grad_norm": 1.5396045446395874, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7420151829719543, "reward_std": 0.24105332046747208, "rewards/accuracy_reward": 0.7930355966091156, "rewards/format_reward": 0.9489795565605164, "step": 7514 }, { "completion_length": 218.84693145751953, "epoch": 0.7562264150943396, "grad_norm": 4.553775310516357, "kl": 0.215087890625, "learning_rate": 1e-06, "loss": 0.0086, "reward": 1.6931129097938538, "reward_std": 0.2543284595012665, "rewards/accuracy_reward": 0.7237251698970795, "rewards/format_reward": 0.9693877398967743, "step": 7515 }, { "completion_length": 214.27550506591797, "epoch": 0.7563270440251573, "grad_norm": 0.6623004674911499, "kl": 0.10888671875, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7979609966278076, "reward_std": 0.11418173462152481, "rewards/accuracy_reward": 0.8081651329994202, "rewards/format_reward": 0.9897959232330322, "step": 7516 }, { "completion_length": 185.12244415283203, "epoch": 0.7564276729559748, "grad_norm": 0.9085807800292969, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.9387754797935486, "reward_std": 0.14284341782331467, "rewards/accuracy_reward": 0.9591836631298065, "rewards/format_reward": 0.9795918166637421, "step": 7517 }, { "completion_length": 260.7142791748047, "epoch": 0.7565283018867924, "grad_norm": 1.0064687728881836, "kl": 0.108154296875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7629870176315308, "reward_std": 0.29868707805871964, "rewards/accuracy_reward": 0.7935992479324341, "rewards/format_reward": 0.9693877398967743, "step": 7518 }, { "completion_length": 260.05101013183594, "epoch": 0.7566289308176101, "grad_norm": 0.46330732107162476, "kl": 0.0545654296875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7829607725143433, "reward_std": 0.12921453639864922, "rewards/accuracy_reward": 0.7829608023166656, "rewards/format_reward": 1.0, "step": 7519 }, { "completion_length": 268.27550506591797, "epoch": 0.7567295597484277, "grad_norm": 0.6184827089309692, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6734694242477417, "reward_std": 0.16118236258625984, "rewards/accuracy_reward": 0.6734693646430969, "rewards/format_reward": 1.0, "step": 7520 }, { "completion_length": 214.2040786743164, "epoch": 0.7568301886792452, "grad_norm": 1.3551794290542603, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7497094869613647, "reward_std": 0.16830559819936752, "rewards/accuracy_reward": 0.7599136233329773, "rewards/format_reward": 0.9897959232330322, "step": 7521 }, { "completion_length": 234.79591369628906, "epoch": 0.7569308176100629, "grad_norm": 0.8320236802101135, "kl": 0.0723876953125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8269636631011963, "reward_std": 0.14440114796161652, "rewards/accuracy_reward": 0.857575923204422, "rewards/format_reward": 0.9693877398967743, "step": 7522 }, { "completion_length": 197.3775405883789, "epoch": 0.7570314465408805, "grad_norm": 1.0175046920776367, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.760907769203186, "reward_std": 0.17339109629392624, "rewards/accuracy_reward": 0.7711118161678314, "rewards/format_reward": 0.9897959232330322, "step": 7523 }, { "completion_length": 220.09182739257812, "epoch": 0.7571320754716981, "grad_norm": 0.6379559636116028, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8999027609825134, "reward_std": 0.14946797490119934, "rewards/accuracy_reward": 0.9203109443187714, "rewards/format_reward": 0.9795918166637421, "step": 7524 }, { "completion_length": 212.9795913696289, "epoch": 0.7572327044025158, "grad_norm": 0.7456223368644714, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8112854957580566, "reward_std": 0.07505989074707031, "rewards/accuracy_reward": 0.8112855553627014, "rewards/format_reward": 1.0, "step": 7525 }, { "completion_length": 235.57142639160156, "epoch": 0.7573333333333333, "grad_norm": 0.4429088830947876, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8833190202713013, "reward_std": 0.08805302530527115, "rewards/accuracy_reward": 0.8833190500736237, "rewards/format_reward": 1.0, "step": 7526 }, { "completion_length": 253.52040100097656, "epoch": 0.7574339622641509, "grad_norm": 1.310076355934143, "kl": 0.10009765625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7604580521583557, "reward_std": 0.27061326801776886, "rewards/accuracy_reward": 0.7910703122615814, "rewards/format_reward": 0.9693877398967743, "step": 7527 }, { "completion_length": 246.80611419677734, "epoch": 0.7575345911949686, "grad_norm": 0.9957230091094971, "kl": 0.0623779296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7900065183639526, "reward_std": 0.16374315321445465, "rewards/accuracy_reward": 0.8002105355262756, "rewards/format_reward": 0.9897959232330322, "step": 7528 }, { "completion_length": 243.40816497802734, "epoch": 0.7576352201257862, "grad_norm": 0.7199671268463135, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.827782928943634, "reward_std": 0.15211254730820656, "rewards/accuracy_reward": 0.848191112279892, "rewards/format_reward": 0.9795918166637421, "step": 7529 }, { "completion_length": 206.4081573486328, "epoch": 0.7577358490566037, "grad_norm": 0.8189510107040405, "kl": 0.1357421875, "learning_rate": 1e-06, "loss": 0.0054, "reward": 1.764503300189972, "reward_std": 0.20426661521196365, "rewards/accuracy_reward": 0.7951156497001648, "rewards/format_reward": 0.9693877398967743, "step": 7530 }, { "completion_length": 177.38774871826172, "epoch": 0.7578364779874214, "grad_norm": 0.6433958411216736, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7265305519104004, "reward_std": 0.1012108214199543, "rewards/accuracy_reward": 0.7469387352466583, "rewards/format_reward": 0.9795918166637421, "step": 7531 }, { "completion_length": 357.2244873046875, "epoch": 0.757937106918239, "grad_norm": 0.7676022052764893, "kl": 0.12548828125, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.6597238779067993, "reward_std": 0.2422114983201027, "rewards/accuracy_reward": 0.7107443809509277, "rewards/format_reward": 0.9489795565605164, "step": 7532 }, { "completion_length": 331.0, "epoch": 0.7580377358490566, "grad_norm": 0.5672006011009216, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.769590973854065, "reward_std": 0.15627074986696243, "rewards/accuracy_reward": 0.7899991273880005, "rewards/format_reward": 0.9795918464660645, "step": 7533 }, { "completion_length": 257.27549743652344, "epoch": 0.7581383647798742, "grad_norm": 0.8456624746322632, "kl": 0.109375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.9285714030265808, "reward_std": 0.08317594230175018, "rewards/accuracy_reward": 0.9285714328289032, "rewards/format_reward": 1.0, "step": 7534 }, { "completion_length": 215.4081573486328, "epoch": 0.7582389937106918, "grad_norm": 0.8435761332511902, "kl": 0.13427734375, "learning_rate": 1e-06, "loss": 0.0054, "reward": 1.8702014684677124, "reward_std": 0.13777323067188263, "rewards/accuracy_reward": 0.8804056644439697, "rewards/format_reward": 0.9897959232330322, "step": 7535 }, { "completion_length": 282.97957611083984, "epoch": 0.7583396226415094, "grad_norm": 0.7068774104118347, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6332644820213318, "reward_std": 0.22920598834753036, "rewards/accuracy_reward": 0.6434686183929443, "rewards/format_reward": 0.9897959232330322, "step": 7536 }, { "completion_length": 245.9591827392578, "epoch": 0.7584402515723271, "grad_norm": 0.9853916764259338, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6204081773757935, "reward_std": 0.19832457602024078, "rewards/accuracy_reward": 0.6408163011074066, "rewards/format_reward": 0.9795918464660645, "step": 7537 }, { "completion_length": 246.54080963134766, "epoch": 0.7585408805031446, "grad_norm": 0.6625910401344299, "kl": 0.1025390625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6579228639602661, "reward_std": 0.16429885476827621, "rewards/accuracy_reward": 0.6681269109249115, "rewards/format_reward": 0.9897959232330322, "step": 7538 }, { "completion_length": 282.6428527832031, "epoch": 0.7586415094339622, "grad_norm": 0.5866534113883972, "kl": 0.0567626953125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6896553039550781, "reward_std": 0.18907494843006134, "rewards/accuracy_reward": 0.7304717600345612, "rewards/format_reward": 0.9591836333274841, "step": 7539 }, { "completion_length": 284.6734619140625, "epoch": 0.7587421383647799, "grad_norm": 0.576056957244873, "kl": 0.0716552734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7526282668113708, "reward_std": 0.13607216626405716, "rewards/accuracy_reward": 0.7526283264160156, "rewards/format_reward": 1.0, "step": 7540 }, { "completion_length": 244.04080963134766, "epoch": 0.7588427672955975, "grad_norm": 0.8468701839447021, "kl": 0.09814453125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.769750714302063, "reward_std": 0.18593450635671616, "rewards/accuracy_reward": 0.7901588678359985, "rewards/format_reward": 0.9795918166637421, "step": 7541 }, { "completion_length": 304.82652282714844, "epoch": 0.758943396226415, "grad_norm": 0.8030463457107544, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8948165774345398, "reward_std": 0.2375512346625328, "rewards/accuracy_reward": 0.9356328845024109, "rewards/format_reward": 0.9591836333274841, "step": 7542 }, { "completion_length": 243.11224365234375, "epoch": 0.7590440251572327, "grad_norm": 0.5206514000892639, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8135554790496826, "reward_std": 0.08442693948745728, "rewards/accuracy_reward": 0.8237595856189728, "rewards/format_reward": 0.9897959232330322, "step": 7543 }, { "completion_length": 315.7755126953125, "epoch": 0.7591446540880503, "grad_norm": 0.9439854025840759, "kl": 0.10693359375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7991429567337036, "reward_std": 0.2623923495411873, "rewards/accuracy_reward": 0.7991430163383484, "rewards/format_reward": 1.0, "step": 7544 }, { "completion_length": 238.23468780517578, "epoch": 0.759245283018868, "grad_norm": 0.9690129160881042, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7312085628509521, "reward_std": 0.22485966980457306, "rewards/accuracy_reward": 0.7312085628509521, "rewards/format_reward": 1.0, "step": 7545 }, { "completion_length": 281.2346954345703, "epoch": 0.7593459119496855, "grad_norm": 0.40328189730644226, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7823157906532288, "reward_std": 0.20974205434322357, "rewards/accuracy_reward": 0.8333361744880676, "rewards/format_reward": 0.9489795565605164, "step": 7546 }, { "completion_length": 263.34693145751953, "epoch": 0.7594465408805031, "grad_norm": 0.7668805122375488, "kl": 0.10986328125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7017006874084473, "reward_std": 0.2172130011022091, "rewards/accuracy_reward": 0.7221088409423828, "rewards/format_reward": 0.9795918166637421, "step": 7547 }, { "completion_length": 245.8775405883789, "epoch": 0.7595471698113208, "grad_norm": 0.6922615766525269, "kl": 0.0902099609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.681498646736145, "reward_std": 0.18292183429002762, "rewards/accuracy_reward": 0.7019068598747253, "rewards/format_reward": 0.9795918464660645, "step": 7548 }, { "completion_length": 248.60204315185547, "epoch": 0.7596477987421384, "grad_norm": 0.4087236821651459, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7370129227638245, "reward_std": 0.14761308580636978, "rewards/accuracy_reward": 0.747217059135437, "rewards/format_reward": 0.9897959232330322, "step": 7549 }, { "completion_length": 314.2244873046875, "epoch": 0.759748427672956, "grad_norm": 1.0381004810333252, "kl": 0.0799560546875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6888311505317688, "reward_std": 0.28520385175943375, "rewards/accuracy_reward": 0.7194434404373169, "rewards/format_reward": 0.9693877398967743, "step": 7550 }, { "completion_length": 231.06121826171875, "epoch": 0.7598490566037736, "grad_norm": 0.7363719940185547, "kl": 0.130859375, "learning_rate": 1e-06, "loss": 0.0052, "reward": 1.667326271533966, "reward_std": 0.08003421872854233, "rewards/accuracy_reward": 0.6673263311386108, "rewards/format_reward": 1.0, "step": 7551 }, { "completion_length": 264.64286041259766, "epoch": 0.7599496855345912, "grad_norm": 1.270902156829834, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7328017950057983, "reward_std": 0.09474376402795315, "rewards/accuracy_reward": 0.7430058419704437, "rewards/format_reward": 0.9897959232330322, "step": 7552 }, { "completion_length": 280.6836700439453, "epoch": 0.7600503144654088, "grad_norm": 0.577488899230957, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8102040886878967, "reward_std": 0.2529129385948181, "rewards/accuracy_reward": 0.8306122422218323, "rewards/format_reward": 0.9795918464660645, "step": 7553 }, { "completion_length": 196.75509643554688, "epoch": 0.7601509433962265, "grad_norm": 0.6974973678588867, "kl": 0.13525390625, "learning_rate": 1e-06, "loss": 0.0054, "reward": 1.8409863114356995, "reward_std": 0.1382664479315281, "rewards/accuracy_reward": 0.851190447807312, "rewards/format_reward": 0.9897959232330322, "step": 7554 }, { "completion_length": 313.9591827392578, "epoch": 0.760251572327044, "grad_norm": 0.8255303502082825, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7977736592292786, "reward_std": 0.24882426857948303, "rewards/accuracy_reward": 0.7977736294269562, "rewards/format_reward": 1.0, "step": 7555 }, { "completion_length": 259.7244873046875, "epoch": 0.7603522012578616, "grad_norm": 2.199215888977051, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8401361107826233, "reward_std": 0.21325845271348953, "rewards/accuracy_reward": 0.8605442047119141, "rewards/format_reward": 0.9795918464660645, "step": 7556 }, { "completion_length": 298.7653045654297, "epoch": 0.7604528301886793, "grad_norm": 0.6573279500007629, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8241496086120605, "reward_std": 0.2177976816892624, "rewards/accuracy_reward": 0.8547618389129639, "rewards/format_reward": 0.9693877398967743, "step": 7557 }, { "completion_length": 147.19387817382812, "epoch": 0.7605534591194969, "grad_norm": 0.70831698179245, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.9626365900039673, "reward_std": 0.08623021468520164, "rewards/accuracy_reward": 0.9728406369686127, "rewards/format_reward": 0.9897959232330322, "step": 7558 }, { "completion_length": 220.05101013183594, "epoch": 0.7606540880503144, "grad_norm": 0.7426782846450806, "kl": 0.12646484375, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.6240281462669373, "reward_std": 0.1288326159119606, "rewards/accuracy_reward": 0.6444363296031952, "rewards/format_reward": 0.9795918166637421, "step": 7559 }, { "completion_length": 243.37754821777344, "epoch": 0.7607547169811321, "grad_norm": 0.4039209485054016, "kl": 0.0855712890625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7866573929786682, "reward_std": 0.056738268584012985, "rewards/accuracy_reward": 0.7866574227809906, "rewards/format_reward": 1.0, "step": 7560 }, { "completion_length": 195.2653045654297, "epoch": 0.7608553459119497, "grad_norm": 0.905555009841919, "kl": 0.098876953125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8599618077278137, "reward_std": 0.1799742877483368, "rewards/accuracy_reward": 0.8803699910640717, "rewards/format_reward": 0.9795918464660645, "step": 7561 }, { "completion_length": 213.9183578491211, "epoch": 0.7609559748427673, "grad_norm": 1.0549345016479492, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7517006993293762, "reward_std": 0.18325404077768326, "rewards/accuracy_reward": 0.7517006993293762, "rewards/format_reward": 1.0, "step": 7562 }, { "completion_length": 252.32653045654297, "epoch": 0.7610566037735849, "grad_norm": 0.5613099336624146, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.859183669090271, "reward_std": 0.09311423124745488, "rewards/accuracy_reward": 0.8693877458572388, "rewards/format_reward": 0.9897959232330322, "step": 7563 }, { "completion_length": 243.02040100097656, "epoch": 0.7611572327044025, "grad_norm": 0.9543222188949585, "kl": 0.1005859375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.870624601840973, "reward_std": 0.15943368524312973, "rewards/accuracy_reward": 0.8808286786079407, "rewards/format_reward": 0.9897959232330322, "step": 7564 }, { "completion_length": 256.6122360229492, "epoch": 0.7612578616352201, "grad_norm": 0.5047045946121216, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.723329484462738, "reward_std": 0.07726302370429039, "rewards/accuracy_reward": 0.7335335612297058, "rewards/format_reward": 0.9897959232330322, "step": 7565 }, { "completion_length": 215.9387664794922, "epoch": 0.7613584905660378, "grad_norm": 1.166867733001709, "kl": 0.11083984375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7326054573059082, "reward_std": 0.20828214287757874, "rewards/accuracy_reward": 0.7530136406421661, "rewards/format_reward": 0.9795918166637421, "step": 7566 }, { "completion_length": 247.78571319580078, "epoch": 0.7614591194968553, "grad_norm": 0.7391228079795837, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8530943393707275, "reward_std": 0.10221169888973236, "rewards/accuracy_reward": 0.8632984459400177, "rewards/format_reward": 0.9897959232330322, "step": 7567 }, { "completion_length": 317.2040710449219, "epoch": 0.7615597484276729, "grad_norm": 0.4123092591762543, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.69489586353302, "reward_std": 0.11752507835626602, "rewards/accuracy_reward": 0.7050999701023102, "rewards/format_reward": 0.9897959232330322, "step": 7568 }, { "completion_length": 238.7448959350586, "epoch": 0.7616603773584906, "grad_norm": 0.6951234340667725, "kl": 0.100830078125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7886473536491394, "reward_std": 0.09607364609837532, "rewards/accuracy_reward": 0.7988514602184296, "rewards/format_reward": 0.9897959232330322, "step": 7569 }, { "completion_length": 292.5408172607422, "epoch": 0.7617610062893082, "grad_norm": 0.7129680514335632, "kl": 0.11865234375, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7300246953964233, "reward_std": 0.1534112822264433, "rewards/accuracy_reward": 0.7402287721633911, "rewards/format_reward": 0.9897959232330322, "step": 7570 }, { "completion_length": 184.32653045654297, "epoch": 0.7618616352201257, "grad_norm": 0.7979307770729065, "kl": 0.11767578125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7691229581832886, "reward_std": 0.10654493607580662, "rewards/accuracy_reward": 0.7691229283809662, "rewards/format_reward": 1.0, "step": 7571 }, { "completion_length": 199.06122589111328, "epoch": 0.7619622641509434, "grad_norm": 1.2858506441116333, "kl": 0.101806640625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6810778379440308, "reward_std": 0.1719379499554634, "rewards/accuracy_reward": 0.6912818551063538, "rewards/format_reward": 0.9897959232330322, "step": 7572 }, { "completion_length": 234.74488830566406, "epoch": 0.762062893081761, "grad_norm": 1.1069256067276, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.780407428741455, "reward_std": 0.16814132034778595, "rewards/accuracy_reward": 0.7906115651130676, "rewards/format_reward": 0.9897959232330322, "step": 7573 }, { "completion_length": 346.28570556640625, "epoch": 0.7621635220125786, "grad_norm": 0.5660962462425232, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.4533881545066833, "reward_std": 0.16920219361782074, "rewards/accuracy_reward": 0.4533880800008774, "rewards/format_reward": 1.0, "step": 7574 }, { "completion_length": 242.6938705444336, "epoch": 0.7622641509433963, "grad_norm": 0.5897194743156433, "kl": 0.1201171875, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.7726216912269592, "reward_std": 0.09997399151325226, "rewards/accuracy_reward": 0.7828257381916046, "rewards/format_reward": 0.9897959232330322, "step": 7575 }, { "completion_length": 251.81631469726562, "epoch": 0.7623647798742138, "grad_norm": 5.994319438934326, "kl": 0.0689697265625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7551258206367493, "reward_std": 0.12630026042461395, "rewards/accuracy_reward": 0.7755340039730072, "rewards/format_reward": 0.9795918464660645, "step": 7576 }, { "completion_length": 263.28570556640625, "epoch": 0.7624654088050314, "grad_norm": 0.7019866108894348, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6640669703483582, "reward_std": 0.21708041429519653, "rewards/accuracy_reward": 0.6742710769176483, "rewards/format_reward": 0.9897959232330322, "step": 7577 }, { "completion_length": 239.9183578491211, "epoch": 0.7625660377358491, "grad_norm": 0.4825810492038727, "kl": 0.0687255859375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8407978415489197, "reward_std": 0.09577721357345581, "rewards/accuracy_reward": 0.8407979011535645, "rewards/format_reward": 1.0, "step": 7578 }, { "completion_length": 243.86734008789062, "epoch": 0.7626666666666667, "grad_norm": 0.6169489622116089, "kl": 0.109130859375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8819108605384827, "reward_std": 0.09862302988767624, "rewards/accuracy_reward": 0.9023191034793854, "rewards/format_reward": 0.9795918464660645, "step": 7579 }, { "completion_length": 219.08162689208984, "epoch": 0.7627672955974842, "grad_norm": 1.019680142402649, "kl": 0.14208984375, "learning_rate": 1e-06, "loss": 0.0058, "reward": 1.646773874759674, "reward_std": 0.2889258414506912, "rewards/accuracy_reward": 0.6875901520252228, "rewards/format_reward": 0.9591836333274841, "step": 7580 }, { "completion_length": 227.57142639160156, "epoch": 0.7628679245283019, "grad_norm": 0.7161943316459656, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8213799595832825, "reward_std": 0.188314538449049, "rewards/accuracy_reward": 0.8417880833148956, "rewards/format_reward": 0.9795918464660645, "step": 7581 }, { "completion_length": 214.12244415283203, "epoch": 0.7629685534591195, "grad_norm": 1.1382685899734497, "kl": 0.117431640625, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.840036153793335, "reward_std": 0.15795083343982697, "rewards/accuracy_reward": 0.8604443073272705, "rewards/format_reward": 0.9795918166637421, "step": 7582 }, { "completion_length": 241.5204086303711, "epoch": 0.7630691823899372, "grad_norm": 0.3734399378299713, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7145965695381165, "reward_std": 0.09289892762899399, "rewards/accuracy_reward": 0.7145965993404388, "rewards/format_reward": 1.0, "step": 7583 }, { "completion_length": 219.57141876220703, "epoch": 0.7631698113207547, "grad_norm": 0.25356820225715637, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.887755036354065, "reward_std": 0.08319716155529022, "rewards/accuracy_reward": 0.9081632494926453, "rewards/format_reward": 0.9795918166637421, "step": 7584 }, { "completion_length": 306.29591369628906, "epoch": 0.7632704402515723, "grad_norm": 0.5652891397476196, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.754991888999939, "reward_std": 0.21305803954601288, "rewards/accuracy_reward": 0.7856041491031647, "rewards/format_reward": 0.9693877398967743, "step": 7585 }, { "completion_length": 212.81632232666016, "epoch": 0.76337106918239, "grad_norm": 1.0133765935897827, "kl": 0.09814453125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7377859950065613, "reward_std": 0.17638903856277466, "rewards/accuracy_reward": 0.7786023318767548, "rewards/format_reward": 0.9591836333274841, "step": 7586 }, { "completion_length": 289.10203552246094, "epoch": 0.7634716981132076, "grad_norm": 0.6995899081230164, "kl": 0.0587158203125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8290550708770752, "reward_std": 0.09062104672193527, "rewards/accuracy_reward": 0.8494631946086884, "rewards/format_reward": 0.9795918464660645, "step": 7587 }, { "completion_length": 291.3061065673828, "epoch": 0.7635723270440251, "grad_norm": 0.5057153701782227, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7472531199455261, "reward_std": 0.09803740680217743, "rewards/accuracy_reward": 0.7472530901432037, "rewards/format_reward": 1.0, "step": 7588 }, { "completion_length": 251.2142791748047, "epoch": 0.7636729559748427, "grad_norm": 0.8665660619735718, "kl": 0.0760498046875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8562925457954407, "reward_std": 0.23461037129163742, "rewards/accuracy_reward": 0.8869048058986664, "rewards/format_reward": 0.9693877398967743, "step": 7589 }, { "completion_length": 217.86734008789062, "epoch": 0.7637735849056604, "grad_norm": 0.9413653612136841, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6321995258331299, "reward_std": 0.20385459065437317, "rewards/accuracy_reward": 0.6321995258331299, "rewards/format_reward": 1.0, "step": 7590 }, { "completion_length": 208.58163452148438, "epoch": 0.763874213836478, "grad_norm": 0.7765293121337891, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7170310020446777, "reward_std": 0.15231634676456451, "rewards/accuracy_reward": 0.7374392151832581, "rewards/format_reward": 0.9795918464660645, "step": 7591 }, { "completion_length": 248.78570556640625, "epoch": 0.7639748427672955, "grad_norm": 0.8258434534072876, "kl": 0.1187744140625, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.8315964937210083, "reward_std": 0.055779874324798584, "rewards/accuracy_reward": 0.8315966129302979, "rewards/format_reward": 1.0, "step": 7592 }, { "completion_length": 239.79591369628906, "epoch": 0.7640754716981132, "grad_norm": 0.8488466143608093, "kl": 0.09912109375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7130467295646667, "reward_std": 0.16202953457832336, "rewards/accuracy_reward": 0.7334547936916351, "rewards/format_reward": 0.9795918464660645, "step": 7593 }, { "completion_length": 260.83673095703125, "epoch": 0.7641761006289308, "grad_norm": 0.5205745697021484, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.784095585346222, "reward_std": 0.15152832120656967, "rewards/accuracy_reward": 0.7942996621131897, "rewards/format_reward": 0.9897959232330322, "step": 7594 }, { "completion_length": 235.56122589111328, "epoch": 0.7642767295597485, "grad_norm": 0.36169949173927307, "kl": 0.066162109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7468098998069763, "reward_std": 0.05962027795612812, "rewards/accuracy_reward": 0.7468099296092987, "rewards/format_reward": 1.0, "step": 7595 }, { "completion_length": 311.948974609375, "epoch": 0.764377358490566, "grad_norm": 0.7141547799110413, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.5256630778312683, "reward_std": 0.21158049255609512, "rewards/accuracy_reward": 0.5664794445037842, "rewards/format_reward": 0.9591836631298065, "step": 7596 }, { "completion_length": 209.6428451538086, "epoch": 0.7644779874213836, "grad_norm": 0.31828194856643677, "kl": 0.099609375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8348175883293152, "reward_std": 0.012726725079119205, "rewards/accuracy_reward": 0.8348175585269928, "rewards/format_reward": 1.0, "step": 7597 }, { "completion_length": 287.6836700439453, "epoch": 0.7645786163522013, "grad_norm": 0.4784647524356842, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.797933042049408, "reward_std": 0.06651574373245239, "rewards/accuracy_reward": 0.8081371486186981, "rewards/format_reward": 0.9897959232330322, "step": 7598 }, { "completion_length": 292.3061065673828, "epoch": 0.7646792452830189, "grad_norm": 0.41439053416252136, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8062254786491394, "reward_std": 0.1292657069861889, "rewards/accuracy_reward": 0.8164296448230743, "rewards/format_reward": 0.9897959232330322, "step": 7599 }, { "completion_length": 210.12244415283203, "epoch": 0.7647798742138365, "grad_norm": 0.7382646799087524, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8673468828201294, "reward_std": 0.17769698798656464, "rewards/accuracy_reward": 0.8775509893894196, "rewards/format_reward": 0.9897959232330322, "step": 7600 }, { "completion_length": 240.31632232666016, "epoch": 0.7648805031446541, "grad_norm": 41.58851623535156, "kl": 0.12353515625, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.7743141651153564, "reward_std": 0.20709989964962006, "rewards/accuracy_reward": 0.7845182120800018, "rewards/format_reward": 0.9897959232330322, "step": 7601 }, { "completion_length": 239.09183502197266, "epoch": 0.7649811320754717, "grad_norm": 0.47766736149787903, "kl": 0.1082763671875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.8239177465438843, "reward_std": 0.11222352413460612, "rewards/accuracy_reward": 0.8239177167415619, "rewards/format_reward": 1.0, "step": 7602 }, { "completion_length": 233.72447967529297, "epoch": 0.7650817610062893, "grad_norm": 0.3461768329143524, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8536759614944458, "reward_std": 0.07741439715027809, "rewards/accuracy_reward": 0.853675901889801, "rewards/format_reward": 1.0, "step": 7603 }, { "completion_length": 321.32652282714844, "epoch": 0.765182389937107, "grad_norm": 0.9307733774185181, "kl": 0.114990234375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.6141569018363953, "reward_std": 0.2793293744325638, "rewards/accuracy_reward": 0.6549732685089111, "rewards/format_reward": 0.9591836333274841, "step": 7604 }, { "completion_length": 235.73468780517578, "epoch": 0.7652830188679245, "grad_norm": 0.7984287738800049, "kl": 0.125244140625, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.8646449446678162, "reward_std": 0.22008004784584045, "rewards/accuracy_reward": 0.8952572047710419, "rewards/format_reward": 0.9693877398967743, "step": 7605 }, { "completion_length": 233.7448959350586, "epoch": 0.7653836477987421, "grad_norm": 1.1926865577697754, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8557823300361633, "reward_std": 0.15916182100772858, "rewards/accuracy_reward": 0.8863945305347443, "rewards/format_reward": 0.9693877398967743, "step": 7606 }, { "completion_length": 190.78571319580078, "epoch": 0.7654842767295598, "grad_norm": 0.9561316967010498, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8826825022697449, "reward_std": 0.1442306861281395, "rewards/accuracy_reward": 0.8928865492343903, "rewards/format_reward": 0.9897959232330322, "step": 7607 }, { "completion_length": 249.85713958740234, "epoch": 0.7655849056603774, "grad_norm": 0.8191724419593811, "kl": 0.147216796875, "learning_rate": 1e-06, "loss": 0.0059, "reward": 1.7545934319496155, "reward_std": 0.15955089405179024, "rewards/accuracy_reward": 0.7852056622505188, "rewards/format_reward": 0.9693877398967743, "step": 7608 }, { "completion_length": 245.25509643554688, "epoch": 0.7656855345911949, "grad_norm": 1.6226049661636353, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7381161451339722, "reward_std": 0.22842518240213394, "rewards/accuracy_reward": 0.7687284648418427, "rewards/format_reward": 0.9693877398967743, "step": 7609 }, { "completion_length": 245.4285659790039, "epoch": 0.7657861635220126, "grad_norm": 0.6126834154129028, "kl": 0.097412109375, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8059678673744202, "reward_std": 0.19041073322296143, "rewards/accuracy_reward": 0.8365800976753235, "rewards/format_reward": 0.9693877398967743, "step": 7610 }, { "completion_length": 231.48979949951172, "epoch": 0.7658867924528302, "grad_norm": 0.47844067215919495, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.723922848701477, "reward_std": 0.07647200860083103, "rewards/accuracy_reward": 0.7239229083061218, "rewards/format_reward": 1.0, "step": 7611 }, { "completion_length": 296.69386291503906, "epoch": 0.7659874213836478, "grad_norm": 0.8002999424934387, "kl": 0.1259765625, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.6965361833572388, "reward_std": 0.22151728719472885, "rewards/accuracy_reward": 0.7169443666934967, "rewards/format_reward": 0.9795918464660645, "step": 7612 }, { "completion_length": 311.17346954345703, "epoch": 0.7660880503144654, "grad_norm": 0.638687252998352, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.751515507698059, "reward_std": 0.24016210436820984, "rewards/accuracy_reward": 0.771923691034317, "rewards/format_reward": 0.9795918464660645, "step": 7613 }, { "completion_length": 254.448974609375, "epoch": 0.766188679245283, "grad_norm": 0.8690004944801331, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7526885271072388, "reward_std": 0.17378765903413296, "rewards/accuracy_reward": 0.7628926932811737, "rewards/format_reward": 0.9897959232330322, "step": 7614 }, { "completion_length": 210.54080963134766, "epoch": 0.7662893081761006, "grad_norm": 0.9915165901184082, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8300556540489197, "reward_std": 0.1732485070824623, "rewards/accuracy_reward": 0.8402597308158875, "rewards/format_reward": 0.9897959232330322, "step": 7615 }, { "completion_length": 193.36734008789062, "epoch": 0.7663899371069183, "grad_norm": 1.3734235763549805, "kl": 0.12353515625, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.7755101919174194, "reward_std": 0.28568682819604874, "rewards/accuracy_reward": 0.8163265287876129, "rewards/format_reward": 0.9591836631298065, "step": 7616 }, { "completion_length": 328.15306091308594, "epoch": 0.7664905660377358, "grad_norm": 0.6555095911026001, "kl": 0.0653076171875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8052639365196228, "reward_std": 0.13309557735919952, "rewards/accuracy_reward": 0.8256721794605255, "rewards/format_reward": 0.9795918464660645, "step": 7617 }, { "completion_length": 211.60203552246094, "epoch": 0.7665911949685534, "grad_norm": 0.36188051104545593, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8758988976478577, "reward_std": 0.05332885030657053, "rewards/accuracy_reward": 0.8963070809841156, "rewards/format_reward": 0.9795918166637421, "step": 7618 }, { "completion_length": 244.79591369628906, "epoch": 0.7666918238993711, "grad_norm": 0.5990508794784546, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8390832543373108, "reward_std": 0.14484064280986786, "rewards/accuracy_reward": 0.8594914078712463, "rewards/format_reward": 0.9795918166637421, "step": 7619 }, { "completion_length": 265.55101776123047, "epoch": 0.7667924528301887, "grad_norm": 0.7579560279846191, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6256903409957886, "reward_std": 0.21467824280261993, "rewards/accuracy_reward": 0.6563026010990143, "rewards/format_reward": 0.9693877398967743, "step": 7620 }, { "completion_length": 241.4795913696289, "epoch": 0.7668930817610062, "grad_norm": 2.0356531143188477, "kl": 0.09765625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7479283213615417, "reward_std": 0.21935118734836578, "rewards/accuracy_reward": 0.7683364152908325, "rewards/format_reward": 0.9795918166637421, "step": 7621 }, { "completion_length": 228.32652282714844, "epoch": 0.7669937106918239, "grad_norm": 1.1480637788772583, "kl": 0.09814453125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7059309482574463, "reward_std": 0.1052483469247818, "rewards/accuracy_reward": 0.7059310078620911, "rewards/format_reward": 1.0, "step": 7622 }, { "completion_length": 234.66326904296875, "epoch": 0.7670943396226415, "grad_norm": 0.6309643387794495, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8214285373687744, "reward_std": 0.18668005242943764, "rewards/accuracy_reward": 0.8316326439380646, "rewards/format_reward": 0.9897959232330322, "step": 7623 }, { "completion_length": 245.56121826171875, "epoch": 0.7671949685534591, "grad_norm": 0.8942287564277649, "kl": 0.1204833984375, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.7654950022697449, "reward_std": 0.27602532505989075, "rewards/accuracy_reward": 0.796107292175293, "rewards/format_reward": 0.9693877398967743, "step": 7624 }, { "completion_length": 237.73468780517578, "epoch": 0.7672955974842768, "grad_norm": 0.5488008856773376, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8400056958198547, "reward_std": 0.12366711720824242, "rewards/accuracy_reward": 0.8502097725868225, "rewards/format_reward": 0.9897959232330322, "step": 7625 }, { "completion_length": 210.56122589111328, "epoch": 0.7673962264150943, "grad_norm": 0.7004402279853821, "kl": 0.099365234375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.877551019191742, "reward_std": 0.18555308878421783, "rewards/accuracy_reward": 0.9081632494926453, "rewards/format_reward": 0.9693877398967743, "step": 7626 }, { "completion_length": 210.2244873046875, "epoch": 0.7674968553459119, "grad_norm": 2.214496374130249, "kl": 0.15478515625, "learning_rate": 1e-06, "loss": 0.0062, "reward": 1.6570487022399902, "reward_std": 0.1290365718305111, "rewards/accuracy_reward": 0.6672528088092804, "rewards/format_reward": 0.9897959232330322, "step": 7627 }, { "completion_length": 231.4285659790039, "epoch": 0.7675974842767296, "grad_norm": 1.5951162576675415, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7469540238380432, "reward_std": 0.14143721386790276, "rewards/accuracy_reward": 0.7571581304073334, "rewards/format_reward": 0.9897959232330322, "step": 7628 }, { "completion_length": 260.23468017578125, "epoch": 0.7676981132075472, "grad_norm": 0.7739973664283752, "kl": 0.13671875, "learning_rate": 1e-06, "loss": 0.0055, "reward": 1.786525011062622, "reward_std": 0.13839098438620567, "rewards/accuracy_reward": 0.7865250706672668, "rewards/format_reward": 1.0, "step": 7629 }, { "completion_length": 222.75509643554688, "epoch": 0.7677987421383647, "grad_norm": 0.8119737505912781, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.9235827326774597, "reward_std": 0.10367152094841003, "rewards/accuracy_reward": 0.9235827326774597, "rewards/format_reward": 1.0, "step": 7630 }, { "completion_length": 176.82653045654297, "epoch": 0.7678993710691824, "grad_norm": 0.5217236280441284, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.9085519909858704, "reward_std": 0.04720456153154373, "rewards/accuracy_reward": 0.908551961183548, "rewards/format_reward": 1.0, "step": 7631 }, { "completion_length": 247.09182739257812, "epoch": 0.768, "grad_norm": 0.35287708044052124, "kl": 0.096923828125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7304561138153076, "reward_std": 0.046149423345923424, "rewards/accuracy_reward": 0.7406602501869202, "rewards/format_reward": 0.9897959232330322, "step": 7632 }, { "completion_length": 303.02039337158203, "epoch": 0.7681006289308177, "grad_norm": 1.9763485193252563, "kl": 0.14111328125, "learning_rate": 1e-06, "loss": 0.0056, "reward": 1.7439146041870117, "reward_std": 0.1026778593659401, "rewards/accuracy_reward": 0.7541186809539795, "rewards/format_reward": 0.9897959232330322, "step": 7633 }, { "completion_length": 217.14285278320312, "epoch": 0.7682012578616352, "grad_norm": 1.0765531063079834, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.717068612575531, "reward_std": 0.17273850366473198, "rewards/accuracy_reward": 0.7170686423778534, "rewards/format_reward": 1.0, "step": 7634 }, { "completion_length": 330.8163146972656, "epoch": 0.7683018867924528, "grad_norm": 0.7113745212554932, "kl": 0.05908203125, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7966957688331604, "reward_std": 0.16804473847150803, "rewards/accuracy_reward": 0.8068998754024506, "rewards/format_reward": 0.9897959232330322, "step": 7635 }, { "completion_length": 218.7040786743164, "epoch": 0.7684025157232705, "grad_norm": 6.087150573730469, "kl": 0.140625, "learning_rate": 1e-06, "loss": 0.0056, "reward": 1.7761880159378052, "reward_std": 0.14672836661338806, "rewards/accuracy_reward": 0.7863921523094177, "rewards/format_reward": 0.9897959232330322, "step": 7636 }, { "completion_length": 198.19386672973633, "epoch": 0.7685031446540881, "grad_norm": 0.33022138476371765, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8561989068984985, "reward_std": 0.0765974372625351, "rewards/accuracy_reward": 0.8561989068984985, "rewards/format_reward": 1.0, "step": 7637 }, { "completion_length": 329.9183654785156, "epoch": 0.7686037735849056, "grad_norm": 1.121315598487854, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7823129892349243, "reward_std": 0.14173395186662674, "rewards/accuracy_reward": 0.8027211129665375, "rewards/format_reward": 0.9795918166637421, "step": 7638 }, { "completion_length": 346.2142639160156, "epoch": 0.7687044025157233, "grad_norm": 1.0293128490447998, "kl": 0.0574951171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.763729751110077, "reward_std": 0.13551979884505272, "rewards/accuracy_reward": 0.7637297511100769, "rewards/format_reward": 1.0, "step": 7639 }, { "completion_length": 256.0408020019531, "epoch": 0.7688050314465409, "grad_norm": 1.2693883180618286, "kl": 0.103271484375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.589320957660675, "reward_std": 0.25414592027664185, "rewards/accuracy_reward": 0.6199332773685455, "rewards/format_reward": 0.9693877398967743, "step": 7640 }, { "completion_length": 236.64285278320312, "epoch": 0.7689056603773585, "grad_norm": 0.4844970107078552, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8281004428863525, "reward_std": 0.1045607216656208, "rewards/accuracy_reward": 0.8281004726886749, "rewards/format_reward": 1.0, "step": 7641 }, { "completion_length": 300.27549743652344, "epoch": 0.769006289308176, "grad_norm": 1.8025429248809814, "kl": 0.112060546875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7963411808013916, "reward_std": 0.14076710492372513, "rewards/accuracy_reward": 0.8167493343353271, "rewards/format_reward": 0.9795918464660645, "step": 7642 }, { "completion_length": 264.0408172607422, "epoch": 0.7691069182389937, "grad_norm": 0.45245054364204407, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6273542642593384, "reward_std": 0.1114429160952568, "rewards/accuracy_reward": 0.6477624475955963, "rewards/format_reward": 0.9795918464660645, "step": 7643 }, { "completion_length": 277.60203552246094, "epoch": 0.7692075471698113, "grad_norm": 1.6043848991394043, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7409785389900208, "reward_std": 0.1479623056948185, "rewards/accuracy_reward": 0.740978479385376, "rewards/format_reward": 1.0, "step": 7644 }, { "completion_length": 244.4285659790039, "epoch": 0.769308176100629, "grad_norm": 0.43927887082099915, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8358776569366455, "reward_std": 0.12446214258670807, "rewards/accuracy_reward": 0.8460817635059357, "rewards/format_reward": 0.9897959232330322, "step": 7645 }, { "completion_length": 296.8163146972656, "epoch": 0.7694088050314465, "grad_norm": 0.5597967505455017, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.696610689163208, "reward_std": 0.1882670819759369, "rewards/accuracy_reward": 0.7170188128948212, "rewards/format_reward": 0.9795918166637421, "step": 7646 }, { "completion_length": 259.1326446533203, "epoch": 0.7695094339622641, "grad_norm": 2.647418260574341, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.686988115310669, "reward_std": 0.3104426860809326, "rewards/accuracy_reward": 0.6971922516822815, "rewards/format_reward": 0.9897959232330322, "step": 7647 }, { "completion_length": 224.4387664794922, "epoch": 0.7696100628930818, "grad_norm": 0.5776845216751099, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8689168095588684, "reward_std": 0.02662210538983345, "rewards/accuracy_reward": 0.8689168095588684, "rewards/format_reward": 1.0, "step": 7648 }, { "completion_length": 274.1326370239258, "epoch": 0.7697106918238994, "grad_norm": 0.4603709578514099, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8849008083343506, "reward_std": 0.05180846154689789, "rewards/accuracy_reward": 0.8849007785320282, "rewards/format_reward": 1.0, "step": 7649 }, { "completion_length": 224.4795913696289, "epoch": 0.769811320754717, "grad_norm": 0.9973803758621216, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7551020383834839, "reward_std": 0.11802951619029045, "rewards/accuracy_reward": 0.7755101919174194, "rewards/format_reward": 0.9795918166637421, "step": 7650 }, { "completion_length": 234.25509643554688, "epoch": 0.7699119496855346, "grad_norm": 0.6332511901855469, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.739195466041565, "reward_std": 0.11754266917705536, "rewards/accuracy_reward": 0.7596036791801453, "rewards/format_reward": 0.9795918464660645, "step": 7651 }, { "completion_length": 256.11224365234375, "epoch": 0.7700125786163522, "grad_norm": 0.7275934815406799, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7173469066619873, "reward_std": 0.16403119266033173, "rewards/accuracy_reward": 0.7275510430335999, "rewards/format_reward": 0.9897959232330322, "step": 7652 }, { "completion_length": 245.55101776123047, "epoch": 0.7701132075471698, "grad_norm": 0.5222304463386536, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.860544204711914, "reward_std": 0.07582557946443558, "rewards/accuracy_reward": 0.8605441749095917, "rewards/format_reward": 1.0, "step": 7653 }, { "completion_length": 224.38774871826172, "epoch": 0.7702138364779875, "grad_norm": 0.8548910021781921, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7975085973739624, "reward_std": 0.14110246300697327, "rewards/accuracy_reward": 0.8077126145362854, "rewards/format_reward": 0.9897959232330322, "step": 7654 }, { "completion_length": 223.66326141357422, "epoch": 0.770314465408805, "grad_norm": 0.14034363627433777, "kl": 0.06982421875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.9539269804954529, "reward_std": 0.007572075352072716, "rewards/accuracy_reward": 0.9539270102977753, "rewards/format_reward": 1.0, "step": 7655 }, { "completion_length": 288.7653045654297, "epoch": 0.7704150943396226, "grad_norm": 0.38407737016677856, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8663221597671509, "reward_std": 0.13420773297548294, "rewards/accuracy_reward": 0.8765262365341187, "rewards/format_reward": 0.9897959232330322, "step": 7656 }, { "completion_length": 245.59182739257812, "epoch": 0.7705157232704403, "grad_norm": 0.1690361201763153, "kl": 0.0538330078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.828571379184723, "reward_std": 0.02020305208861828, "rewards/accuracy_reward": 0.8285713791847229, "rewards/format_reward": 1.0, "step": 7657 }, { "completion_length": 178.448974609375, "epoch": 0.7706163522012579, "grad_norm": 2.0632009506225586, "kl": 0.110595703125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7334674000740051, "reward_std": 0.21516241878271103, "rewards/accuracy_reward": 0.7538754642009735, "rewards/format_reward": 0.9795918166637421, "step": 7658 }, { "completion_length": 262.1530532836914, "epoch": 0.7707169811320754, "grad_norm": 0.8531783223152161, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7402801513671875, "reward_std": 0.17707077413797379, "rewards/accuracy_reward": 0.7708924114704132, "rewards/format_reward": 0.9693877398967743, "step": 7659 }, { "completion_length": 229.2040786743164, "epoch": 0.7708176100628931, "grad_norm": 0.6991991996765137, "kl": 0.109619140625, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7583079934120178, "reward_std": 0.05753705091774464, "rewards/accuracy_reward": 0.768512099981308, "rewards/format_reward": 0.9897959232330322, "step": 7660 }, { "completion_length": 241.76529693603516, "epoch": 0.7709182389937107, "grad_norm": 0.6882396340370178, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7653061151504517, "reward_std": 0.11917255818843842, "rewards/accuracy_reward": 0.7653061151504517, "rewards/format_reward": 1.0, "step": 7661 }, { "completion_length": 319.2346954345703, "epoch": 0.7710188679245283, "grad_norm": 0.6690085530281067, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6674513220787048, "reward_std": 0.14045298099517822, "rewards/accuracy_reward": 0.6776553690433502, "rewards/format_reward": 0.9897959232330322, "step": 7662 }, { "completion_length": 292.2040710449219, "epoch": 0.7711194968553459, "grad_norm": 1.8329862356185913, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.8489795327186584, "reward_std": 0.1407705321907997, "rewards/accuracy_reward": 0.8489795625209808, "rewards/format_reward": 1.0, "step": 7663 }, { "completion_length": 283.89795684814453, "epoch": 0.7712201257861635, "grad_norm": 0.7636389136314392, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7371578216552734, "reward_std": 0.09011360257863998, "rewards/accuracy_reward": 0.7371578812599182, "rewards/format_reward": 1.0, "step": 7664 }, { "completion_length": 232.9693832397461, "epoch": 0.7713207547169811, "grad_norm": 0.8209917545318604, "kl": 0.124755859375, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.7959821820259094, "reward_std": 0.1296936199069023, "rewards/accuracy_reward": 0.8367985188961029, "rewards/format_reward": 0.9591836333274841, "step": 7665 }, { "completion_length": 269.3673400878906, "epoch": 0.7714213836477988, "grad_norm": 0.6367780566215515, "kl": 0.1015625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6044101119041443, "reward_std": 0.19740622490644455, "rewards/accuracy_reward": 0.6146142035722733, "rewards/format_reward": 0.9897959232330322, "step": 7666 }, { "completion_length": 291.1122360229492, "epoch": 0.7715220125786163, "grad_norm": 0.8830443024635315, "kl": 0.0687255859375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7410416007041931, "reward_std": 0.13915224373340607, "rewards/accuracy_reward": 0.7614497542381287, "rewards/format_reward": 0.9795918166637421, "step": 7667 }, { "completion_length": 267.7040710449219, "epoch": 0.7716226415094339, "grad_norm": 1.8371121883392334, "kl": 0.1103515625, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7625850439071655, "reward_std": 0.24454527348279953, "rewards/accuracy_reward": 0.8034013509750366, "rewards/format_reward": 0.9591836631298065, "step": 7668 }, { "completion_length": 268.76529693603516, "epoch": 0.7717232704402516, "grad_norm": 0.6710571050643921, "kl": 0.0701904296875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7670877575874329, "reward_std": 0.22024544328451157, "rewards/accuracy_reward": 0.7977000176906586, "rewards/format_reward": 0.9693877398967743, "step": 7669 }, { "completion_length": 341.6530456542969, "epoch": 0.7718238993710692, "grad_norm": 0.7104308605194092, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.5385048985481262, "reward_std": 0.2578720673918724, "rewards/accuracy_reward": 0.5793212354183197, "rewards/format_reward": 0.9591836631298065, "step": 7670 }, { "completion_length": 292.448974609375, "epoch": 0.7719245283018868, "grad_norm": 2.1658308506011963, "kl": 0.096923828125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7713961005210876, "reward_std": 0.2825171425938606, "rewards/accuracy_reward": 0.8020084500312805, "rewards/format_reward": 0.9693877398967743, "step": 7671 }, { "completion_length": 270.92857360839844, "epoch": 0.7720251572327044, "grad_norm": 0.6642795205116272, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7530126571655273, "reward_std": 0.22789674252271652, "rewards/accuracy_reward": 0.793828934431076, "rewards/format_reward": 0.9591836333274841, "step": 7672 }, { "completion_length": 355.4795837402344, "epoch": 0.772125786163522, "grad_norm": 0.7403506636619568, "kl": 0.0504150390625, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.6715257167816162, "reward_std": 0.1651739552617073, "rewards/accuracy_reward": 0.6919339001178741, "rewards/format_reward": 0.9795918464660645, "step": 7673 }, { "completion_length": 315.5918273925781, "epoch": 0.7722264150943396, "grad_norm": 0.9115313291549683, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8229326009750366, "reward_std": 0.11372848972678185, "rewards/accuracy_reward": 0.822932630777359, "rewards/format_reward": 1.0, "step": 7674 }, { "completion_length": 226.0204086303711, "epoch": 0.7723270440251573, "grad_norm": 7.34258508682251, "kl": 0.146728515625, "learning_rate": 1e-06, "loss": 0.0059, "reward": 1.822465181350708, "reward_std": 0.09789050370454788, "rewards/accuracy_reward": 0.8326692283153534, "rewards/format_reward": 0.9897959232330322, "step": 7675 }, { "completion_length": 289.2653045654297, "epoch": 0.7724276729559748, "grad_norm": 1.1270712614059448, "kl": 0.0755615234375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7489795684814453, "reward_std": 0.18787917494773865, "rewards/accuracy_reward": 0.7591836750507355, "rewards/format_reward": 0.9897959232330322, "step": 7676 }, { "completion_length": 246.74488830566406, "epoch": 0.7725283018867924, "grad_norm": 0.7321379780769348, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6755642294883728, "reward_std": 0.21950097382068634, "rewards/accuracy_reward": 0.7061764895915985, "rewards/format_reward": 0.9693877398967743, "step": 7677 }, { "completion_length": 210.9183578491211, "epoch": 0.7726289308176101, "grad_norm": 0.7566052675247192, "kl": 0.116943359375, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.8099780678749084, "reward_std": 0.16076519340276718, "rewards/accuracy_reward": 0.8201821744441986, "rewards/format_reward": 0.9897959232330322, "step": 7678 }, { "completion_length": 212.0, "epoch": 0.7727295597484277, "grad_norm": 0.7058329582214355, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8514495491981506, "reward_std": 0.08978241495788097, "rewards/accuracy_reward": 0.8616536855697632, "rewards/format_reward": 0.9897959232330322, "step": 7679 }, { "completion_length": 204.33673095703125, "epoch": 0.7728301886792452, "grad_norm": 0.5182393193244934, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7974010705947876, "reward_std": 0.10193035006523132, "rewards/accuracy_reward": 0.8178093135356903, "rewards/format_reward": 0.9795918464660645, "step": 7680 }, { "completion_length": 316.5306091308594, "epoch": 0.7729308176100629, "grad_norm": 0.6981468796730042, "kl": 0.0635986328125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7070229053497314, "reward_std": 0.1789037585258484, "rewards/accuracy_reward": 0.727431058883667, "rewards/format_reward": 0.9795918166637421, "step": 7681 }, { "completion_length": 179.86734771728516, "epoch": 0.7730314465408805, "grad_norm": 0.7822090983390808, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.9387754797935486, "reward_std": 0.12370207160711288, "rewards/accuracy_reward": 0.9591836631298065, "rewards/format_reward": 0.9795918166637421, "step": 7682 }, { "completion_length": 247.82652282714844, "epoch": 0.7731320754716982, "grad_norm": 0.44152045249938965, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.814444661140442, "reward_std": 0.11535107344388962, "rewards/accuracy_reward": 0.8144446909427643, "rewards/format_reward": 1.0, "step": 7683 }, { "completion_length": 328.92857360839844, "epoch": 0.7732327044025157, "grad_norm": 0.48913413286209106, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8826839327812195, "reward_std": 0.0941300094127655, "rewards/accuracy_reward": 0.8928880393505096, "rewards/format_reward": 0.9897959232330322, "step": 7684 }, { "completion_length": 270.9285583496094, "epoch": 0.7733333333333333, "grad_norm": 0.6062639951705933, "kl": 0.1171875, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.838033378124237, "reward_std": 0.22471268475055695, "rewards/accuracy_reward": 0.8890537619590759, "rewards/format_reward": 0.9489795565605164, "step": 7685 }, { "completion_length": 210.6734619140625, "epoch": 0.773433962264151, "grad_norm": 0.7215941548347473, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8158008456230164, "reward_std": 0.08296167850494385, "rewards/accuracy_reward": 0.8158008456230164, "rewards/format_reward": 1.0, "step": 7686 }, { "completion_length": 250.96939086914062, "epoch": 0.7735345911949686, "grad_norm": 1.3444925546646118, "kl": 0.0760498046875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7822828888893127, "reward_std": 0.23873692750930786, "rewards/accuracy_reward": 0.833303302526474, "rewards/format_reward": 0.9489795863628387, "step": 7687 }, { "completion_length": 193.6938705444336, "epoch": 0.7736352201257861, "grad_norm": 0.18185646831989288, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.9434522986412048, "reward_std": 0.025646967813372612, "rewards/accuracy_reward": 0.943452388048172, "rewards/format_reward": 1.0, "step": 7688 }, { "completion_length": 260.33673095703125, "epoch": 0.7737358490566038, "grad_norm": 0.51614910364151, "kl": 0.099853515625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.740156650543213, "reward_std": 0.11945989355444908, "rewards/accuracy_reward": 0.7401566803455353, "rewards/format_reward": 1.0, "step": 7689 }, { "completion_length": 297.9795837402344, "epoch": 0.7738364779874214, "grad_norm": 0.9417874813079834, "kl": 0.06884765625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.744293987751007, "reward_std": 0.18763290904462337, "rewards/accuracy_reward": 0.7544982135295868, "rewards/format_reward": 0.9897959232330322, "step": 7690 }, { "completion_length": 317.551025390625, "epoch": 0.773937106918239, "grad_norm": 0.7586870193481445, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6770219802856445, "reward_std": 0.24394556134939194, "rewards/accuracy_reward": 0.6872261166572571, "rewards/format_reward": 0.9897959232330322, "step": 7691 }, { "completion_length": 318.1734619140625, "epoch": 0.7740377358490566, "grad_norm": 4.051095485687256, "kl": 0.09716796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.549384593963623, "reward_std": 0.22177176177501678, "rewards/accuracy_reward": 0.5697927474975586, "rewards/format_reward": 0.9795918464660645, "step": 7692 }, { "completion_length": 215.84693908691406, "epoch": 0.7741383647798742, "grad_norm": 0.4169742166996002, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8826959729194641, "reward_std": 0.1079564169049263, "rewards/accuracy_reward": 0.892900139093399, "rewards/format_reward": 0.9897959232330322, "step": 7693 }, { "completion_length": 247.79591369628906, "epoch": 0.7742389937106918, "grad_norm": 1.380784034729004, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6308759450912476, "reward_std": 0.3138524815440178, "rewards/accuracy_reward": 0.681896299123764, "rewards/format_reward": 0.9489795565605164, "step": 7694 }, { "completion_length": 209.5204086303711, "epoch": 0.7743396226415095, "grad_norm": 0.3832031488418579, "kl": 0.091552734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7680159211158752, "reward_std": 0.0766611136496067, "rewards/accuracy_reward": 0.7884241342544556, "rewards/format_reward": 0.9795918166637421, "step": 7695 }, { "completion_length": 231.7653045654297, "epoch": 0.7744402515723271, "grad_norm": 0.6840155720710754, "kl": 0.05224609375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.7555879354476929, "reward_std": 0.11008204519748688, "rewards/accuracy_reward": 0.7657920122146606, "rewards/format_reward": 0.9897959232330322, "step": 7696 }, { "completion_length": 256.34693145751953, "epoch": 0.7745408805031446, "grad_norm": 0.5080065727233887, "kl": 0.072021484375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7856524586677551, "reward_std": 0.12701858580112457, "rewards/accuracy_reward": 0.8060606420040131, "rewards/format_reward": 0.9795918464660645, "step": 7697 }, { "completion_length": 190.77550506591797, "epoch": 0.7746415094339623, "grad_norm": 0.5202141404151917, "kl": 0.114013671875, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.7224153876304626, "reward_std": 0.1011330895125866, "rewards/accuracy_reward": 0.7224154472351074, "rewards/format_reward": 1.0, "step": 7698 }, { "completion_length": 211.75509643554688, "epoch": 0.7747421383647799, "grad_norm": 0.6290675401687622, "kl": 0.056884765625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.714285671710968, "reward_std": 0.15402613580226898, "rewards/accuracy_reward": 0.7244897782802582, "rewards/format_reward": 0.9897959232330322, "step": 7699 }, { "completion_length": 311.96937561035156, "epoch": 0.7748427672955975, "grad_norm": 0.7638453841209412, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8566595911979675, "reward_std": 0.1425428744405508, "rewards/accuracy_reward": 0.8668636977672577, "rewards/format_reward": 0.9897959232330322, "step": 7700 }, { "completion_length": 224.9081573486328, "epoch": 0.7749433962264151, "grad_norm": 0.7405843734741211, "kl": 0.100341796875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7649652361869812, "reward_std": 0.18232423067092896, "rewards/accuracy_reward": 0.7853734791278839, "rewards/format_reward": 0.9795918464660645, "step": 7701 }, { "completion_length": 259.47957611083984, "epoch": 0.7750440251572327, "grad_norm": 0.6639258861541748, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6720882058143616, "reward_std": 0.15503989160060883, "rewards/accuracy_reward": 0.6924963891506195, "rewards/format_reward": 0.9795918166637421, "step": 7702 }, { "completion_length": 179.69387817382812, "epoch": 0.7751446540880503, "grad_norm": 1.3768410682678223, "kl": 0.141357421875, "learning_rate": 1e-06, "loss": 0.0057, "reward": 1.74252849817276, "reward_std": 0.19211546331644058, "rewards/accuracy_reward": 0.7527326047420502, "rewards/format_reward": 0.9897959232330322, "step": 7703 }, { "completion_length": 258.6938781738281, "epoch": 0.775245283018868, "grad_norm": 4.301003932952881, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.794054627418518, "reward_std": 0.15792301297187805, "rewards/accuracy_reward": 0.8144627809524536, "rewards/format_reward": 0.9795918166637421, "step": 7704 }, { "completion_length": 351.1836700439453, "epoch": 0.7753459119496855, "grad_norm": 1.2343969345092773, "kl": 0.14599609375, "learning_rate": 1e-06, "loss": 0.0058, "reward": 1.7930436730384827, "reward_std": 0.14292842894792557, "rewards/accuracy_reward": 0.7930436432361603, "rewards/format_reward": 1.0, "step": 7705 }, { "completion_length": 194.14285278320312, "epoch": 0.7754465408805031, "grad_norm": 0.41534531116485596, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.823013424873352, "reward_std": 0.08906963095068932, "rewards/accuracy_reward": 0.823013424873352, "rewards/format_reward": 1.0, "step": 7706 }, { "completion_length": 231.0, "epoch": 0.7755471698113208, "grad_norm": 1.3077021837234497, "kl": 0.099365234375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7183551788330078, "reward_std": 0.22917808592319489, "rewards/accuracy_reward": 0.7387633323669434, "rewards/format_reward": 0.9795918464660645, "step": 7707 }, { "completion_length": 219.2142791748047, "epoch": 0.7756477987421384, "grad_norm": 0.6804635524749756, "kl": 0.120361328125, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.8918536901474, "reward_std": 0.12533456832170486, "rewards/accuracy_reward": 0.8918536603450775, "rewards/format_reward": 1.0, "step": 7708 }, { "completion_length": 195.23468780517578, "epoch": 0.7757484276729559, "grad_norm": 0.47859323024749756, "kl": 0.0555419921875, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.814706802368164, "reward_std": 0.13843969255685806, "rewards/accuracy_reward": 0.835114985704422, "rewards/format_reward": 0.9795918464660645, "step": 7709 }, { "completion_length": 199.07142639160156, "epoch": 0.7758490566037736, "grad_norm": 0.48483502864837646, "kl": 0.0631103515625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6805357933044434, "reward_std": 0.13825394958257675, "rewards/accuracy_reward": 0.6907399296760559, "rewards/format_reward": 0.9897959232330322, "step": 7710 }, { "completion_length": 298.8571319580078, "epoch": 0.7759496855345912, "grad_norm": 0.4063293933868408, "kl": 0.110595703125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.821022629737854, "reward_std": 0.06368255615234375, "rewards/accuracy_reward": 0.8312267363071442, "rewards/format_reward": 0.9897959232330322, "step": 7711 }, { "completion_length": 284.17345428466797, "epoch": 0.7760503144654088, "grad_norm": 0.4295378625392914, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.779845416545868, "reward_std": 0.08347287774085999, "rewards/accuracy_reward": 0.7798454761505127, "rewards/format_reward": 1.0, "step": 7712 }, { "completion_length": 245.57141876220703, "epoch": 0.7761509433962264, "grad_norm": 0.6952652931213379, "kl": 0.101806640625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.868027150630951, "reward_std": 0.13679739460349083, "rewards/accuracy_reward": 0.8782312572002411, "rewards/format_reward": 0.9897959232330322, "step": 7713 }, { "completion_length": 212.39795684814453, "epoch": 0.776251572327044, "grad_norm": 0.30715060234069824, "kl": 0.05712890625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.9355549812316895, "reward_std": 0.07256486639380455, "rewards/accuracy_reward": 0.9355549216270447, "rewards/format_reward": 1.0, "step": 7714 }, { "completion_length": 227.86734008789062, "epoch": 0.7763522012578616, "grad_norm": 0.5738506317138672, "kl": 0.0684814453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8040816187858582, "reward_std": 0.1270286738872528, "rewards/accuracy_reward": 0.8040816485881805, "rewards/format_reward": 1.0, "step": 7715 }, { "completion_length": 251.49999237060547, "epoch": 0.7764528301886793, "grad_norm": 0.8427979946136475, "kl": 0.0614013671875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7682215571403503, "reward_std": 0.26215851306915283, "rewards/accuracy_reward": 0.8090378642082214, "rewards/format_reward": 0.9591836333274841, "step": 7716 }, { "completion_length": 239.56122589111328, "epoch": 0.7765534591194968, "grad_norm": 0.9821767210960388, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6631096005439758, "reward_std": 0.2385271042585373, "rewards/accuracy_reward": 0.6835177838802338, "rewards/format_reward": 0.9795918166637421, "step": 7717 }, { "completion_length": 281.79590606689453, "epoch": 0.7766540880503144, "grad_norm": 0.716569721698761, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7278911471366882, "reward_std": 0.28673261404037476, "rewards/accuracy_reward": 0.7482993304729462, "rewards/format_reward": 0.9795918166637421, "step": 7718 }, { "completion_length": 224.56121826171875, "epoch": 0.7767547169811321, "grad_norm": 0.7115084528923035, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7590135335922241, "reward_std": 0.15536118298768997, "rewards/accuracy_reward": 0.7692176401615143, "rewards/format_reward": 0.9897959232330322, "step": 7719 }, { "completion_length": 280.3877410888672, "epoch": 0.7768553459119497, "grad_norm": 1.3112940788269043, "kl": 0.1029052734375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.696987271308899, "reward_std": 0.2539209872484207, "rewards/accuracy_reward": 0.7071914374828339, "rewards/format_reward": 0.9897959232330322, "step": 7720 }, { "completion_length": 270.6938705444336, "epoch": 0.7769559748427673, "grad_norm": 1.2312908172607422, "kl": 0.0716552734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.877551019191742, "reward_std": 0.09217509999871254, "rewards/accuracy_reward": 0.8775509893894196, "rewards/format_reward": 1.0, "step": 7721 }, { "completion_length": 278.9591827392578, "epoch": 0.7770566037735849, "grad_norm": 0.758465051651001, "kl": 0.089111328125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7192365527153015, "reward_std": 0.1879262700676918, "rewards/accuracy_reward": 0.7294406294822693, "rewards/format_reward": 0.9897959232330322, "step": 7722 }, { "completion_length": 201.7040786743164, "epoch": 0.7771572327044025, "grad_norm": 1.4878278970718384, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8537415266036987, "reward_std": 0.09493381530046463, "rewards/accuracy_reward": 0.8639455735683441, "rewards/format_reward": 0.9897959232330322, "step": 7723 }, { "completion_length": 287.51019287109375, "epoch": 0.7772578616352201, "grad_norm": 1.1486579179763794, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7016403675079346, "reward_std": 0.26918216049671173, "rewards/accuracy_reward": 0.7220486402511597, "rewards/format_reward": 0.9795918464660645, "step": 7724 }, { "completion_length": 253.39795684814453, "epoch": 0.7773584905660378, "grad_norm": 0.6864271759986877, "kl": 0.1044921875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.771390438079834, "reward_std": 0.2043653316795826, "rewards/accuracy_reward": 0.8020025789737701, "rewards/format_reward": 0.9693877398967743, "step": 7725 }, { "completion_length": 319.05101013183594, "epoch": 0.7774591194968553, "grad_norm": 0.440604567527771, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8298591375350952, "reward_std": 0.14968078583478928, "rewards/accuracy_reward": 0.8400633633136749, "rewards/format_reward": 0.9897959232330322, "step": 7726 }, { "completion_length": 266.76529693603516, "epoch": 0.777559748427673, "grad_norm": 1.0607317686080933, "kl": 0.0867919921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.705604076385498, "reward_std": 0.2374935746192932, "rewards/accuracy_reward": 0.715808242559433, "rewards/format_reward": 0.9897959232330322, "step": 7727 }, { "completion_length": 209.75509643554688, "epoch": 0.7776603773584906, "grad_norm": 0.6330199241638184, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.814774513244629, "reward_std": 0.15074018761515617, "rewards/accuracy_reward": 0.8249785900115967, "rewards/format_reward": 0.9897959232330322, "step": 7728 }, { "completion_length": 259.05101776123047, "epoch": 0.7777610062893082, "grad_norm": 0.8699803948402405, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7781385779380798, "reward_std": 0.16644524782896042, "rewards/accuracy_reward": 0.7985466718673706, "rewards/format_reward": 0.9795918464660645, "step": 7729 }, { "completion_length": 232.0408172607422, "epoch": 0.7778616352201257, "grad_norm": 0.46604371070861816, "kl": 0.108642578125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7057515978813171, "reward_std": 0.05462254211306572, "rewards/accuracy_reward": 0.7057515382766724, "rewards/format_reward": 1.0, "step": 7730 }, { "completion_length": 289.3571319580078, "epoch": 0.7779622641509434, "grad_norm": 0.6513673067092896, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.797852635383606, "reward_std": 0.11756625771522522, "rewards/accuracy_reward": 0.8182608187198639, "rewards/format_reward": 0.9795918464660645, "step": 7731 }, { "completion_length": 222.82652282714844, "epoch": 0.778062893081761, "grad_norm": 0.7647545337677002, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7683473825454712, "reward_std": 0.09243802726268768, "rewards/accuracy_reward": 0.7785515189170837, "rewards/format_reward": 0.9897959232330322, "step": 7732 }, { "completion_length": 211.78570556640625, "epoch": 0.7781635220125787, "grad_norm": 1.9555516242980957, "kl": 0.115234375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.8877550959587097, "reward_std": 0.23389668762683868, "rewards/accuracy_reward": 0.9081632494926453, "rewards/format_reward": 0.9795918464660645, "step": 7733 }, { "completion_length": 314.2244873046875, "epoch": 0.7782641509433962, "grad_norm": 0.7739964127540588, "kl": 0.109619140625, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.6116737127304077, "reward_std": 0.22235558182001114, "rewards/accuracy_reward": 0.6116738319396973, "rewards/format_reward": 1.0, "step": 7734 }, { "completion_length": 248.22447967529297, "epoch": 0.7783647798742138, "grad_norm": 0.8816214799880981, "kl": 0.0565185546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.7785875797271729, "reward_std": 0.26033370196819305, "rewards/accuracy_reward": 0.7887916266918182, "rewards/format_reward": 0.9897959232330322, "step": 7735 }, { "completion_length": 198.9081573486328, "epoch": 0.7784654088050315, "grad_norm": 0.9390023946762085, "kl": 0.0650634765625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7734693884849548, "reward_std": 0.2024763971567154, "rewards/accuracy_reward": 0.7734693586826324, "rewards/format_reward": 1.0, "step": 7736 }, { "completion_length": 250.4387664794922, "epoch": 0.7785660377358491, "grad_norm": 0.7525686025619507, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.800000011920929, "reward_std": 0.1889568492770195, "rewards/accuracy_reward": 0.8204081058502197, "rewards/format_reward": 0.9795918166637421, "step": 7737 }, { "completion_length": 220.53060913085938, "epoch": 0.7786666666666666, "grad_norm": 0.505897045135498, "kl": 0.062255859375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8168123364448547, "reward_std": 0.1076425202190876, "rewards/accuracy_reward": 0.8270165026187897, "rewards/format_reward": 0.9897959232330322, "step": 7738 }, { "completion_length": 334.89794921875, "epoch": 0.7787672955974843, "grad_norm": 0.7615912556648254, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.5985774397850037, "reward_std": 0.2142191380262375, "rewards/accuracy_reward": 0.5985774397850037, "rewards/format_reward": 1.0, "step": 7739 }, { "completion_length": 270.1530456542969, "epoch": 0.7788679245283019, "grad_norm": 0.5022823214530945, "kl": 0.0799560546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8291544914245605, "reward_std": 0.09185676649212837, "rewards/accuracy_reward": 0.8291544914245605, "rewards/format_reward": 1.0, "step": 7740 }, { "completion_length": 257.8367233276367, "epoch": 0.7789685534591195, "grad_norm": 0.7735737562179565, "kl": 0.093017578125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.808725118637085, "reward_std": 0.15564272552728653, "rewards/accuracy_reward": 0.8393374085426331, "rewards/format_reward": 0.9693877398967743, "step": 7741 }, { "completion_length": 177.1836700439453, "epoch": 0.779069182389937, "grad_norm": 0.8615544438362122, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.876093327999115, "reward_std": 0.1458001807332039, "rewards/accuracy_reward": 0.8760932683944702, "rewards/format_reward": 1.0, "step": 7742 }, { "completion_length": 364.8877410888672, "epoch": 0.7791698113207547, "grad_norm": 1.0992470979690552, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6277929544448853, "reward_std": 0.2774766832590103, "rewards/accuracy_reward": 0.6584051847457886, "rewards/format_reward": 0.9693877398967743, "step": 7743 }, { "completion_length": 228.0, "epoch": 0.7792704402515723, "grad_norm": 0.6892944574356079, "kl": 0.0760498046875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.698306143283844, "reward_std": 0.17350181937217712, "rewards/accuracy_reward": 0.7187142074108124, "rewards/format_reward": 0.9795918464660645, "step": 7744 }, { "completion_length": 256.2244873046875, "epoch": 0.77937106918239, "grad_norm": 1.7710062265396118, "kl": 0.0665283203125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7719380259513855, "reward_std": 0.16792390495538712, "rewards/accuracy_reward": 0.7719379961490631, "rewards/format_reward": 1.0, "step": 7745 }, { "completion_length": 235.2653045654297, "epoch": 0.7794716981132076, "grad_norm": 0.5139003992080688, "kl": 0.091064453125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8699751496315002, "reward_std": 0.1623198166489601, "rewards/accuracy_reward": 0.8903834521770477, "rewards/format_reward": 0.9795918166637421, "step": 7746 }, { "completion_length": 222.03060150146484, "epoch": 0.7795723270440251, "grad_norm": 0.9607926607131958, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8175633549690247, "reward_std": 0.17648563534021378, "rewards/accuracy_reward": 0.8379715383052826, "rewards/format_reward": 0.9795918166637421, "step": 7747 }, { "completion_length": 274.5408172607422, "epoch": 0.7796729559748428, "grad_norm": 0.9500747919082642, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.4667600989341736, "reward_std": 0.16650021076202393, "rewards/accuracy_reward": 0.4667601138353348, "rewards/format_reward": 1.0, "step": 7748 }, { "completion_length": 196.9897918701172, "epoch": 0.7797735849056604, "grad_norm": 0.6073805689811707, "kl": 0.106689453125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.818209707736969, "reward_std": 0.1647230014204979, "rewards/accuracy_reward": 0.8284137547016144, "rewards/format_reward": 0.9897959232330322, "step": 7749 }, { "completion_length": 247.27550506591797, "epoch": 0.779874213836478, "grad_norm": 0.5423747301101685, "kl": 0.0826416015625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7693332433700562, "reward_std": 0.10142381861805916, "rewards/accuracy_reward": 0.7897414267063141, "rewards/format_reward": 0.9795918464660645, "step": 7750 }, { "completion_length": 216.97958374023438, "epoch": 0.7799748427672956, "grad_norm": 0.7446225881576538, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.873469352722168, "reward_std": 0.10493757575750351, "rewards/accuracy_reward": 0.8836734592914581, "rewards/format_reward": 0.9897959232330322, "step": 7751 }, { "completion_length": 311.11224365234375, "epoch": 0.7800754716981132, "grad_norm": 0.6872840523719788, "kl": 0.0726318359375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.714285671710968, "reward_std": 0.18898223340511322, "rewards/accuracy_reward": 0.7244897782802582, "rewards/format_reward": 0.9897959232330322, "step": 7752 }, { "completion_length": 309.38775634765625, "epoch": 0.7801761006289308, "grad_norm": 0.4696425795555115, "kl": 0.053466796875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.726313054561615, "reward_std": 0.1736023724079132, "rewards/accuracy_reward": 0.7365171611309052, "rewards/format_reward": 0.9897959232330322, "step": 7753 }, { "completion_length": 208.0102081298828, "epoch": 0.7802767295597485, "grad_norm": 0.4802624583244324, "kl": 0.1085205078125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.9252785444259644, "reward_std": 0.051713259890675545, "rewards/accuracy_reward": 0.9252784550189972, "rewards/format_reward": 1.0, "step": 7754 }, { "completion_length": 206.06121826171875, "epoch": 0.780377358490566, "grad_norm": 0.7203904390335083, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.829746425151825, "reward_std": 0.1487193927168846, "rewards/accuracy_reward": 0.8399505317211151, "rewards/format_reward": 0.9897959232330322, "step": 7755 }, { "completion_length": 272.41835021972656, "epoch": 0.7804779874213836, "grad_norm": 0.3347095251083374, "kl": 0.057861328125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.9052733778953552, "reward_std": 0.05174499284476042, "rewards/accuracy_reward": 0.9052733182907104, "rewards/format_reward": 1.0, "step": 7756 }, { "completion_length": 349.7755126953125, "epoch": 0.7805786163522013, "grad_norm": 0.6412246227264404, "kl": 0.071533203125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7211053371429443, "reward_std": 0.1975051537156105, "rewards/accuracy_reward": 0.7415135204792023, "rewards/format_reward": 0.9795918464660645, "step": 7757 }, { "completion_length": 333.52040100097656, "epoch": 0.7806792452830189, "grad_norm": 0.658523440361023, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7297760844230652, "reward_std": 0.18100889772176743, "rewards/accuracy_reward": 0.7297761142253876, "rewards/format_reward": 1.0, "step": 7758 }, { "completion_length": 360.9897918701172, "epoch": 0.7807798742138364, "grad_norm": 0.5463160872459412, "kl": 0.060791015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6743969917297363, "reward_std": 0.07187874242663383, "rewards/accuracy_reward": 0.6743970513343811, "rewards/format_reward": 1.0, "step": 7759 }, { "completion_length": 298.6428527832031, "epoch": 0.7808805031446541, "grad_norm": 0.8803116083145142, "kl": 0.1044921875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.6167269945144653, "reward_std": 0.29040922224521637, "rewards/accuracy_reward": 0.6473392844200134, "rewards/format_reward": 0.9693877398967743, "step": 7760 }, { "completion_length": 199.84693908691406, "epoch": 0.7809811320754717, "grad_norm": 0.4845852255821228, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8428571224212646, "reward_std": 0.13408707082271576, "rewards/accuracy_reward": 0.8428571224212646, "rewards/format_reward": 1.0, "step": 7761 }, { "completion_length": 343.1938781738281, "epoch": 0.7810817610062893, "grad_norm": 1.1092523336410522, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6804664134979248, "reward_std": 0.08845193684101105, "rewards/accuracy_reward": 0.6804664731025696, "rewards/format_reward": 1.0, "step": 7762 }, { "completion_length": 265.36734771728516, "epoch": 0.7811823899371069, "grad_norm": 0.9352918863296509, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7028782963752747, "reward_std": 0.21781011670827866, "rewards/accuracy_reward": 0.7130823731422424, "rewards/format_reward": 0.9897959232330322, "step": 7763 }, { "completion_length": 186.95917892456055, "epoch": 0.7812830188679245, "grad_norm": 2.366814613342285, "kl": 0.14404296875, "learning_rate": 1e-06, "loss": 0.0058, "reward": 1.7928209900856018, "reward_std": 0.1724909245967865, "rewards/accuracy_reward": 0.7928210496902466, "rewards/format_reward": 1.0, "step": 7764 }, { "completion_length": 368.2550964355469, "epoch": 0.7813836477987421, "grad_norm": 1.12260901927948, "kl": 0.115966796875, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.6537188291549683, "reward_std": 0.20997944474220276, "rewards/accuracy_reward": 0.663922905921936, "rewards/format_reward": 0.9897959232330322, "step": 7765 }, { "completion_length": 228.29591369628906, "epoch": 0.7814842767295598, "grad_norm": 0.6281137466430664, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7591948509216309, "reward_std": 0.14614709094166756, "rewards/accuracy_reward": 0.769398957490921, "rewards/format_reward": 0.9897959232330322, "step": 7766 }, { "completion_length": 266.0204086303711, "epoch": 0.7815849056603773, "grad_norm": 2.961968421936035, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.824291169643402, "reward_std": 0.1910664439201355, "rewards/accuracy_reward": 0.84469935297966, "rewards/format_reward": 0.9795918464660645, "step": 7767 }, { "completion_length": 254.18366241455078, "epoch": 0.7816855345911949, "grad_norm": 0.4650821089744568, "kl": 0.0762939453125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8460233807563782, "reward_std": 0.13790475577116013, "rewards/accuracy_reward": 0.8664315640926361, "rewards/format_reward": 0.9795918464660645, "step": 7768 }, { "completion_length": 261.7448959350586, "epoch": 0.7817861635220126, "grad_norm": 1.791130781173706, "kl": 0.1279296875, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.6655783653259277, "reward_std": 0.33704982697963715, "rewards/accuracy_reward": 0.7165988385677338, "rewards/format_reward": 0.9489795863628387, "step": 7769 }, { "completion_length": 315.1836700439453, "epoch": 0.7818867924528302, "grad_norm": 0.45186516642570496, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.840834140777588, "reward_std": 0.11937060579657555, "rewards/accuracy_reward": 0.8408341705799103, "rewards/format_reward": 1.0, "step": 7770 }, { "completion_length": 234.63265228271484, "epoch": 0.7819874213836479, "grad_norm": 0.8513142466545105, "kl": 0.121826171875, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.8284034729003906, "reward_std": 0.24599020183086395, "rewards/accuracy_reward": 0.8386076092720032, "rewards/format_reward": 0.9897959232330322, "step": 7771 }, { "completion_length": 213.26529693603516, "epoch": 0.7820880503144654, "grad_norm": 0.9501432180404663, "kl": 0.053955078125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.838192343711853, "reward_std": 0.13898664340376854, "rewards/accuracy_reward": 0.8381924033164978, "rewards/format_reward": 1.0, "step": 7772 }, { "completion_length": 297.3571319580078, "epoch": 0.782188679245283, "grad_norm": 0.39399832487106323, "kl": 0.106689453125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7144904136657715, "reward_std": 0.12332489341497421, "rewards/accuracy_reward": 0.7144903838634491, "rewards/format_reward": 1.0, "step": 7773 }, { "completion_length": 248.37754821777344, "epoch": 0.7822893081761007, "grad_norm": 0.8598456978797913, "kl": 0.100830078125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7815914154052734, "reward_std": 0.15108711272478104, "rewards/accuracy_reward": 0.7917955219745636, "rewards/format_reward": 0.9897959232330322, "step": 7774 }, { "completion_length": 231.1734619140625, "epoch": 0.7823899371069183, "grad_norm": 1.1494463682174683, "kl": 0.12255859375, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.842054843902588, "reward_std": 0.07869911193847656, "rewards/accuracy_reward": 0.8624630272388458, "rewards/format_reward": 0.9795918464660645, "step": 7775 }, { "completion_length": 296.32652282714844, "epoch": 0.7824905660377358, "grad_norm": 0.562525749206543, "kl": 0.10205078125, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8095237612724304, "reward_std": 0.05617848224937916, "rewards/accuracy_reward": 0.8095238208770752, "rewards/format_reward": 1.0, "step": 7776 }, { "completion_length": 233.29591369628906, "epoch": 0.7825911949685534, "grad_norm": 0.6102449893951416, "kl": 0.0711669921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.780103325843811, "reward_std": 0.11676709726452827, "rewards/accuracy_reward": 0.7801033556461334, "rewards/format_reward": 1.0, "step": 7777 }, { "completion_length": 247.21427154541016, "epoch": 0.7826918238993711, "grad_norm": 0.5881121754646301, "kl": 0.09326171875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.848612904548645, "reward_std": 0.06339415162801743, "rewards/accuracy_reward": 0.8588170409202576, "rewards/format_reward": 0.9897959232330322, "step": 7778 }, { "completion_length": 281.94898223876953, "epoch": 0.7827924528301887, "grad_norm": 0.6744735240936279, "kl": 0.0657958984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8638604879379272, "reward_std": 0.09224358107894659, "rewards/accuracy_reward": 0.8740646243095398, "rewards/format_reward": 0.9897959232330322, "step": 7779 }, { "completion_length": 258.21429443359375, "epoch": 0.7828930817610062, "grad_norm": 0.5058645009994507, "kl": 0.0875244140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7672850489616394, "reward_std": 0.19288692623376846, "rewards/accuracy_reward": 0.7978973388671875, "rewards/format_reward": 0.9693877398967743, "step": 7780 }, { "completion_length": 179.83673095703125, "epoch": 0.7829937106918239, "grad_norm": 0.29957395792007446, "kl": 0.1142578125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.829446017742157, "reward_std": 0.05744730681180954, "rewards/accuracy_reward": 0.8294460475444794, "rewards/format_reward": 1.0, "step": 7781 }, { "completion_length": 264.57141876220703, "epoch": 0.7830943396226415, "grad_norm": 0.3453691005706787, "kl": 0.0574951171875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.751356601715088, "reward_std": 0.08967086672782898, "rewards/accuracy_reward": 0.7615606784820557, "rewards/format_reward": 0.9897959232330322, "step": 7782 }, { "completion_length": 244.40816497802734, "epoch": 0.7831949685534592, "grad_norm": 0.5494592189788818, "kl": 0.073486328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7244897484779358, "reward_std": 0.2316918969154358, "rewards/accuracy_reward": 0.7551020085811615, "rewards/format_reward": 0.9693877398967743, "step": 7783 }, { "completion_length": 257.3571319580078, "epoch": 0.7832955974842767, "grad_norm": 1.4000585079193115, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.763649582862854, "reward_std": 0.12584927305579185, "rewards/accuracy_reward": 0.7636496126651764, "rewards/format_reward": 1.0, "step": 7784 }, { "completion_length": 258.27549743652344, "epoch": 0.7833962264150943, "grad_norm": 0.579458475112915, "kl": 0.093017578125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7990450263023376, "reward_std": 0.08004175499081612, "rewards/accuracy_reward": 0.7990449368953705, "rewards/format_reward": 1.0, "step": 7785 }, { "completion_length": 234.35713958740234, "epoch": 0.783496855345912, "grad_norm": 0.470738023519516, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8401978611946106, "reward_std": 0.14482539892196655, "rewards/accuracy_reward": 0.8504019677639008, "rewards/format_reward": 0.9897959232330322, "step": 7786 }, { "completion_length": 230.22447967529297, "epoch": 0.7835974842767296, "grad_norm": 1.0256327390670776, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7999998927116394, "reward_std": 0.14362181723117828, "rewards/accuracy_reward": 0.8306121826171875, "rewards/format_reward": 0.9693877398967743, "step": 7787 }, { "completion_length": 236.948974609375, "epoch": 0.7836981132075471, "grad_norm": 1.5002028942108154, "kl": 0.102294921875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6902727484703064, "reward_std": 0.24193386733531952, "rewards/accuracy_reward": 0.7106810212135315, "rewards/format_reward": 0.9795918166637421, "step": 7788 }, { "completion_length": 254.91836547851562, "epoch": 0.7837987421383648, "grad_norm": 0.37888407707214355, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8383543491363525, "reward_std": 0.07637457549571991, "rewards/accuracy_reward": 0.8383543789386749, "rewards/format_reward": 1.0, "step": 7789 }, { "completion_length": 340.89794921875, "epoch": 0.7838993710691824, "grad_norm": 1.4057937860488892, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6402292251586914, "reward_std": 0.20917758345603943, "rewards/accuracy_reward": 0.6606373488903046, "rewards/format_reward": 0.9795918464660645, "step": 7790 }, { "completion_length": 229.34693145751953, "epoch": 0.784, "grad_norm": 0.8386519551277161, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7921292781829834, "reward_std": 0.12588034197688103, "rewards/accuracy_reward": 0.8023333549499512, "rewards/format_reward": 0.9897959232330322, "step": 7791 }, { "completion_length": 230.71428680419922, "epoch": 0.7841006289308176, "grad_norm": 0.42242324352264404, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6152313947677612, "reward_std": 0.07426445558667183, "rewards/accuracy_reward": 0.6152313947677612, "rewards/format_reward": 1.0, "step": 7792 }, { "completion_length": 298.54080963134766, "epoch": 0.7842012578616352, "grad_norm": 1.103949785232544, "kl": 0.0689697265625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6909839510917664, "reward_std": 0.19924599304795265, "rewards/accuracy_reward": 0.7113921940326691, "rewards/format_reward": 0.9795918464660645, "step": 7793 }, { "completion_length": 266.05101776123047, "epoch": 0.7843018867924528, "grad_norm": 0.7619552612304688, "kl": 0.103271484375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.9049680829048157, "reward_std": 0.19964075088500977, "rewards/accuracy_reward": 0.9253761768341064, "rewards/format_reward": 0.9795918464660645, "step": 7794 }, { "completion_length": 299.10203552246094, "epoch": 0.7844025157232705, "grad_norm": 1.087610125541687, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.665720283985138, "reward_std": 0.34294672310352325, "rewards/accuracy_reward": 0.7065366804599762, "rewards/format_reward": 0.9591836333274841, "step": 7795 }, { "completion_length": 273.5816345214844, "epoch": 0.7845031446540881, "grad_norm": 1.361121416091919, "kl": 0.09326171875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6172254085540771, "reward_std": 0.18215397000312805, "rewards/accuracy_reward": 0.6478376686573029, "rewards/format_reward": 0.9693877398967743, "step": 7796 }, { "completion_length": 254.07141876220703, "epoch": 0.7846037735849056, "grad_norm": 0.9451029300689697, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6489383578300476, "reward_std": 0.2635158747434616, "rewards/accuracy_reward": 0.6693465113639832, "rewards/format_reward": 0.9795918464660645, "step": 7797 }, { "completion_length": 197.12244415283203, "epoch": 0.7847044025157233, "grad_norm": 1.2550898790359497, "kl": 0.09033203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8006232380867004, "reward_std": 0.08678585663437843, "rewards/accuracy_reward": 0.8006232380867004, "rewards/format_reward": 1.0, "step": 7798 }, { "completion_length": 241.67346954345703, "epoch": 0.7848050314465409, "grad_norm": 1.1697860956192017, "kl": 0.12744140625, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.6926767230033875, "reward_std": 0.2896737828850746, "rewards/accuracy_reward": 0.7130849063396454, "rewards/format_reward": 0.9795918166637421, "step": 7799 }, { "completion_length": 244.79590606689453, "epoch": 0.7849056603773585, "grad_norm": 0.9529379606246948, "kl": 0.11376953125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.6831786632537842, "reward_std": 0.17107074707746506, "rewards/accuracy_reward": 0.6933827996253967, "rewards/format_reward": 0.9897959232330322, "step": 7800 }, { "completion_length": 300.1326446533203, "epoch": 0.7850062893081761, "grad_norm": 0.8076686859130859, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6836734414100647, "reward_std": 0.22718360275030136, "rewards/accuracy_reward": 0.7244898080825806, "rewards/format_reward": 0.9591836631298065, "step": 7801 }, { "completion_length": 261.9387664794922, "epoch": 0.7851069182389937, "grad_norm": 3.2007193565368652, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7415329217910767, "reward_std": 0.10315298475325108, "rewards/accuracy_reward": 0.7619410753250122, "rewards/format_reward": 0.9795918166637421, "step": 7802 }, { "completion_length": 287.9693908691406, "epoch": 0.7852075471698113, "grad_norm": 1.7653615474700928, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6396358609199524, "reward_std": 0.2635946124792099, "rewards/accuracy_reward": 0.6702481210231781, "rewards/format_reward": 0.9693877398967743, "step": 7803 }, { "completion_length": 315.83673095703125, "epoch": 0.785308176100629, "grad_norm": 0.7501112818717957, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6896284818649292, "reward_std": 0.13663437217473984, "rewards/accuracy_reward": 0.7100367248058319, "rewards/format_reward": 0.9795918464660645, "step": 7804 }, { "completion_length": 322.1326446533203, "epoch": 0.7854088050314465, "grad_norm": 0.594287097454071, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7247269749641418, "reward_std": 0.14086651057004929, "rewards/accuracy_reward": 0.734931081533432, "rewards/format_reward": 0.9897959232330322, "step": 7805 }, { "completion_length": 201.75509643554688, "epoch": 0.7855094339622641, "grad_norm": 0.5481122732162476, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.873285174369812, "reward_std": 0.11294603161513805, "rewards/accuracy_reward": 0.8834893107414246, "rewards/format_reward": 0.9897959232330322, "step": 7806 }, { "completion_length": 286.46937561035156, "epoch": 0.7856100628930818, "grad_norm": 1.9527515172958374, "kl": 0.0855712890625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8440327048301697, "reward_std": 0.16501237452030182, "rewards/accuracy_reward": 0.8644409775733948, "rewards/format_reward": 0.9795918166637421, "step": 7807 }, { "completion_length": 243.60203552246094, "epoch": 0.7857106918238994, "grad_norm": 1.156246304512024, "kl": 0.114501953125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.7671395540237427, "reward_std": 0.23559930175542831, "rewards/accuracy_reward": 0.7875477969646454, "rewards/format_reward": 0.9795918464660645, "step": 7808 }, { "completion_length": 397.4897766113281, "epoch": 0.7858113207547169, "grad_norm": 0.5032167434692383, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8048582077026367, "reward_std": 0.12107054516673088, "rewards/accuracy_reward": 0.8252663910388947, "rewards/format_reward": 0.9795918166637421, "step": 7809 }, { "completion_length": 213.80611419677734, "epoch": 0.7859119496855346, "grad_norm": 0.9258098006248474, "kl": 0.126708984375, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.7641366124153137, "reward_std": 0.20896492153406143, "rewards/accuracy_reward": 0.7845447957515717, "rewards/format_reward": 0.9795918166637421, "step": 7810 }, { "completion_length": 226.49999237060547, "epoch": 0.7860125786163522, "grad_norm": 0.6162236928939819, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.793207049369812, "reward_std": 0.1229059100151062, "rewards/accuracy_reward": 0.8034111857414246, "rewards/format_reward": 0.9897959232330322, "step": 7811 }, { "completion_length": 325.83673095703125, "epoch": 0.7861132075471698, "grad_norm": 0.6615703701972961, "kl": 0.060302734375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7929785251617432, "reward_std": 0.2758469581604004, "rewards/accuracy_reward": 0.823590874671936, "rewards/format_reward": 0.9693877398967743, "step": 7812 }, { "completion_length": 299.4081573486328, "epoch": 0.7862138364779874, "grad_norm": 0.3991888761520386, "kl": 0.099365234375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.791196584701538, "reward_std": 0.14696108549833298, "rewards/accuracy_reward": 0.8116048276424408, "rewards/format_reward": 0.9795918464660645, "step": 7813 }, { "completion_length": 346.8673400878906, "epoch": 0.786314465408805, "grad_norm": 0.9499112963676453, "kl": 0.0684814453125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5886724591255188, "reward_std": 0.15115881711244583, "rewards/accuracy_reward": 0.5988765358924866, "rewards/format_reward": 0.9897959232330322, "step": 7814 }, { "completion_length": 235.2448959350586, "epoch": 0.7864150943396226, "grad_norm": 0.7357030510902405, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.756689429283142, "reward_std": 0.16830410063266754, "rewards/accuracy_reward": 0.7668933868408203, "rewards/format_reward": 0.9897959232330322, "step": 7815 }, { "completion_length": 300.68365478515625, "epoch": 0.7865157232704403, "grad_norm": 2.4233572483062744, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7716553211212158, "reward_std": 0.22962886095046997, "rewards/accuracy_reward": 0.7920635044574738, "rewards/format_reward": 0.9795918166637421, "step": 7816 }, { "completion_length": 299.4693908691406, "epoch": 0.7866163522012578, "grad_norm": 0.7039561867713928, "kl": 0.109130859375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.6324234008789062, "reward_std": 0.1397239752113819, "rewards/accuracy_reward": 0.6324233859777451, "rewards/format_reward": 1.0, "step": 7817 }, { "completion_length": 219.28570556640625, "epoch": 0.7867169811320754, "grad_norm": 0.5094603896141052, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8775509595870972, "reward_std": 0.1785494051873684, "rewards/accuracy_reward": 0.8979591429233551, "rewards/format_reward": 0.9795918464660645, "step": 7818 }, { "completion_length": 202.57141876220703, "epoch": 0.7868176100628931, "grad_norm": 1.1504000425338745, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8557822704315186, "reward_std": 0.16773433610796928, "rewards/accuracy_reward": 0.8659863770008087, "rewards/format_reward": 0.9897959232330322, "step": 7819 }, { "completion_length": 222.4795913696289, "epoch": 0.7869182389937107, "grad_norm": 1.1080236434936523, "kl": 0.109619140625, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.739315688610077, "reward_std": 0.14125122502446175, "rewards/accuracy_reward": 0.7495198249816895, "rewards/format_reward": 0.9897959232330322, "step": 7820 }, { "completion_length": 235.53060150146484, "epoch": 0.7870188679245284, "grad_norm": 0.9562625885009766, "kl": 0.131591796875, "learning_rate": 1e-06, "loss": 0.0053, "reward": 1.8273810148239136, "reward_std": 0.150734543800354, "rewards/accuracy_reward": 0.847789078950882, "rewards/format_reward": 0.9795918464660645, "step": 7821 }, { "completion_length": 244.74488830566406, "epoch": 0.7871194968553459, "grad_norm": 0.5789337158203125, "kl": 0.095458984375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7434441447257996, "reward_std": 0.16570672392845154, "rewards/accuracy_reward": 0.7536482810974121, "rewards/format_reward": 0.9897959232330322, "step": 7822 }, { "completion_length": 223.04080963134766, "epoch": 0.7872201257861635, "grad_norm": 2.68708872795105, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.767055332660675, "reward_std": 0.2129509449005127, "rewards/accuracy_reward": 0.787463515996933, "rewards/format_reward": 0.9795918464660645, "step": 7823 }, { "completion_length": 251.27550506591797, "epoch": 0.7873207547169812, "grad_norm": 0.639451265335083, "kl": 0.11279296875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.6703998446464539, "reward_std": 0.17350901663303375, "rewards/accuracy_reward": 0.690807968378067, "rewards/format_reward": 0.9795918166637421, "step": 7824 }, { "completion_length": 364.1428527832031, "epoch": 0.7874213836477988, "grad_norm": 0.7579257488250732, "kl": 0.12353515625, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.6351048350334167, "reward_std": 0.19745559990406036, "rewards/accuracy_reward": 0.6555129885673523, "rewards/format_reward": 0.9795918166637421, "step": 7825 }, { "completion_length": 235.36734008789062, "epoch": 0.7875220125786163, "grad_norm": 0.7991470098495483, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7452252507209778, "reward_std": 0.12479465082287788, "rewards/accuracy_reward": 0.7554293572902679, "rewards/format_reward": 0.9897959232330322, "step": 7826 }, { "completion_length": 250.6020278930664, "epoch": 0.787622641509434, "grad_norm": 0.790152370929718, "kl": 0.0592041015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7602269053459167, "reward_std": 0.1808992251753807, "rewards/accuracy_reward": 0.7704309523105621, "rewards/format_reward": 0.9897959232330322, "step": 7827 }, { "completion_length": 200.66326141357422, "epoch": 0.7877232704402516, "grad_norm": 0.516752302646637, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8845011591911316, "reward_std": 0.152975145727396, "rewards/accuracy_reward": 0.8947052955627441, "rewards/format_reward": 0.9897959232330322, "step": 7828 }, { "completion_length": 272.2959213256836, "epoch": 0.7878238993710692, "grad_norm": 1.1187487840652466, "kl": 0.112060546875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.6742976307868958, "reward_std": 0.11822887510061264, "rewards/accuracy_reward": 0.6845017075538635, "rewards/format_reward": 0.9897959232330322, "step": 7829 }, { "completion_length": 243.10203552246094, "epoch": 0.7879245283018868, "grad_norm": 1.6294587850570679, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7560389637947083, "reward_std": 0.23984107375144958, "rewards/accuracy_reward": 0.7968553006649017, "rewards/format_reward": 0.9591836631298065, "step": 7830 }, { "completion_length": 187.9285659790039, "epoch": 0.7880251572327044, "grad_norm": 0.47821879386901855, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.9220359921455383, "reward_std": 0.07170533388853073, "rewards/accuracy_reward": 0.9220359623432159, "rewards/format_reward": 1.0, "step": 7831 }, { "completion_length": 227.37754821777344, "epoch": 0.788125786163522, "grad_norm": 1.4681564569473267, "kl": 0.111328125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.6208274960517883, "reward_std": 0.23343275487422943, "rewards/accuracy_reward": 0.6412356197834015, "rewards/format_reward": 0.9795918166637421, "step": 7832 }, { "completion_length": 268.4285583496094, "epoch": 0.7882264150943397, "grad_norm": 0.8593980669975281, "kl": 0.063232421875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7340525388717651, "reward_std": 0.1947002038359642, "rewards/accuracy_reward": 0.7442565262317657, "rewards/format_reward": 0.9897959232330322, "step": 7833 }, { "completion_length": 286.1836700439453, "epoch": 0.7883270440251572, "grad_norm": 7.689232349395752, "kl": 0.11767578125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7458255887031555, "reward_std": 0.14588522166013718, "rewards/accuracy_reward": 0.7458255589008331, "rewards/format_reward": 1.0, "step": 7834 }, { "completion_length": 196.62244415283203, "epoch": 0.7884276729559748, "grad_norm": 0.925708532333374, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6877551078796387, "reward_std": 0.21545638889074326, "rewards/accuracy_reward": 0.6979591846466064, "rewards/format_reward": 0.9897959232330322, "step": 7835 }, { "completion_length": 239.78571319580078, "epoch": 0.7885283018867925, "grad_norm": 0.9052479863166809, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.916158676147461, "reward_std": 0.18005887418985367, "rewards/accuracy_reward": 0.9263627231121063, "rewards/format_reward": 0.9897959232330322, "step": 7836 }, { "completion_length": 306.2040710449219, "epoch": 0.7886289308176101, "grad_norm": 0.5621973872184753, "kl": 0.0670166015625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.606640338897705, "reward_std": 0.2555004432797432, "rewards/accuracy_reward": 0.6372525691986084, "rewards/format_reward": 0.9693877398967743, "step": 7837 }, { "completion_length": 308.2346878051758, "epoch": 0.7887295597484276, "grad_norm": 0.9808367490768433, "kl": 0.0565185546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.6572775840759277, "reward_std": 0.23900515586137772, "rewards/accuracy_reward": 0.6776857376098633, "rewards/format_reward": 0.9795918166637421, "step": 7838 }, { "completion_length": 282.15306091308594, "epoch": 0.7888301886792453, "grad_norm": 0.915693998336792, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8064624667167664, "reward_std": 0.21995287388563156, "rewards/accuracy_reward": 0.8268707394599915, "rewards/format_reward": 0.9795918464660645, "step": 7839 }, { "completion_length": 316.79591369628906, "epoch": 0.7889308176100629, "grad_norm": 0.3487623333930969, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7414966225624084, "reward_std": 0.0314970389008522, "rewards/accuracy_reward": 0.7414966225624084, "rewards/format_reward": 1.0, "step": 7840 }, { "completion_length": 274.8673400878906, "epoch": 0.7890314465408805, "grad_norm": 3.659757614135742, "kl": 0.1318359375, "learning_rate": 1e-06, "loss": 0.0053, "reward": 1.7558965682983398, "reward_std": 0.320618100464344, "rewards/accuracy_reward": 0.7865089178085327, "rewards/format_reward": 0.9693877398967743, "step": 7841 }, { "completion_length": 273.9693832397461, "epoch": 0.7891320754716981, "grad_norm": 0.6912102699279785, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7141767144203186, "reward_std": 0.13069745525717735, "rewards/accuracy_reward": 0.7243807315826416, "rewards/format_reward": 0.9897959232330322, "step": 7842 }, { "completion_length": 224.51019287109375, "epoch": 0.7892327044025157, "grad_norm": 1.1694464683532715, "kl": 0.1142578125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.82376229763031, "reward_std": 0.20349421352148056, "rewards/accuracy_reward": 0.8441705107688904, "rewards/format_reward": 0.9795918464660645, "step": 7843 }, { "completion_length": 254.76529693603516, "epoch": 0.7893333333333333, "grad_norm": 0.5087771415710449, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7786844968795776, "reward_std": 0.10750725492835045, "rewards/accuracy_reward": 0.7786845564842224, "rewards/format_reward": 1.0, "step": 7844 }, { "completion_length": 163.14285278320312, "epoch": 0.789433962264151, "grad_norm": 0.6031743288040161, "kl": 0.108642578125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8718820810317993, "reward_std": 0.13084455206990242, "rewards/accuracy_reward": 0.8922902047634125, "rewards/format_reward": 0.9795918166637421, "step": 7845 }, { "completion_length": 297.6326446533203, "epoch": 0.7895345911949686, "grad_norm": 2.3580305576324463, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7521953582763672, "reward_std": 0.1573365032672882, "rewards/accuracy_reward": 0.752195417881012, "rewards/format_reward": 1.0, "step": 7846 }, { "completion_length": 228.7448959350586, "epoch": 0.7896352201257861, "grad_norm": 0.836082398891449, "kl": 0.098388671875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7279089093208313, "reward_std": 0.23617777973413467, "rewards/accuracy_reward": 0.7585210502147675, "rewards/format_reward": 0.9693877398967743, "step": 7847 }, { "completion_length": 200.66326141357422, "epoch": 0.7897358490566038, "grad_norm": 1.044000506401062, "kl": 0.100341796875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.9107142090797424, "reward_std": 0.1759776771068573, "rewards/accuracy_reward": 0.920918345451355, "rewards/format_reward": 0.9897959232330322, "step": 7848 }, { "completion_length": 259.9183654785156, "epoch": 0.7898364779874214, "grad_norm": 0.6757360100746155, "kl": 0.0770263671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.898639440536499, "reward_std": 0.1000455766916275, "rewards/accuracy_reward": 0.898639440536499, "rewards/format_reward": 1.0, "step": 7849 }, { "completion_length": 254.71427154541016, "epoch": 0.789937106918239, "grad_norm": 0.6077635884284973, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.9081632494926453, "reward_std": 0.10903036221861839, "rewards/accuracy_reward": 0.9081631898880005, "rewards/format_reward": 1.0, "step": 7850 }, { "completion_length": 271.76529693603516, "epoch": 0.7900377358490566, "grad_norm": 1.608576774597168, "kl": 0.1455078125, "learning_rate": 1e-06, "loss": 0.0058, "reward": 1.6892341375350952, "reward_std": 0.1185067892074585, "rewards/accuracy_reward": 0.6994383037090302, "rewards/format_reward": 0.9897959232330322, "step": 7851 }, { "completion_length": 196.51020050048828, "epoch": 0.7901383647798742, "grad_norm": 2.981553554534912, "kl": 0.27783203125, "learning_rate": 1e-06, "loss": 0.0111, "reward": 1.7346231341362, "reward_std": 0.22381766140460968, "rewards/accuracy_reward": 0.7550313770771027, "rewards/format_reward": 0.9795918464660645, "step": 7852 }, { "completion_length": 228.59183502197266, "epoch": 0.7902389937106918, "grad_norm": 0.383612722158432, "kl": 0.103271484375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8414966464042664, "reward_std": 0.120016910135746, "rewards/accuracy_reward": 0.8414965867996216, "rewards/format_reward": 1.0, "step": 7853 }, { "completion_length": 272.84693908691406, "epoch": 0.7903396226415095, "grad_norm": 0.731005072593689, "kl": 0.107177734375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7343966364860535, "reward_std": 0.13730880990624428, "rewards/accuracy_reward": 0.7446007430553436, "rewards/format_reward": 0.9897959232330322, "step": 7854 }, { "completion_length": 221.80612182617188, "epoch": 0.790440251572327, "grad_norm": 1.312116026878357, "kl": 0.09423828125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8187384009361267, "reward_std": 0.18394411355257034, "rewards/accuracy_reward": 0.8391465246677399, "rewards/format_reward": 0.9795918166637421, "step": 7855 }, { "completion_length": 273.6836700439453, "epoch": 0.7905408805031446, "grad_norm": 3.626682996749878, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.927334487438202, "reward_std": 0.16037627682089806, "rewards/accuracy_reward": 0.9375386238098145, "rewards/format_reward": 0.9897959232330322, "step": 7856 }, { "completion_length": 226.87754821777344, "epoch": 0.7906415094339623, "grad_norm": 0.49757885932922363, "kl": 0.1025390625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8508409261703491, "reward_std": 0.18356400728225708, "rewards/accuracy_reward": 0.8814531862735748, "rewards/format_reward": 0.9693877398967743, "step": 7857 }, { "completion_length": 215.33673095703125, "epoch": 0.7907421383647799, "grad_norm": 2.166942596435547, "kl": 0.103515625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7461965084075928, "reward_std": 0.1325821578502655, "rewards/accuracy_reward": 0.7564007639884949, "rewards/format_reward": 0.9897959232330322, "step": 7858 }, { "completion_length": 308.02040100097656, "epoch": 0.7908427672955974, "grad_norm": 1.018695592880249, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6941854357719421, "reward_std": 0.25789594650268555, "rewards/accuracy_reward": 0.7043895125389099, "rewards/format_reward": 0.9897959232330322, "step": 7859 }, { "completion_length": 376.45916748046875, "epoch": 0.7909433962264151, "grad_norm": 0.895780622959137, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.5996598601341248, "reward_std": 0.16154487431049347, "rewards/accuracy_reward": 0.6302721202373505, "rewards/format_reward": 0.9693877398967743, "step": 7860 }, { "completion_length": 279.7346954345703, "epoch": 0.7910440251572327, "grad_norm": 0.7211007475852966, "kl": 0.0648193359375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7094781398773193, "reward_std": 0.12471513077616692, "rewards/accuracy_reward": 0.7094781696796417, "rewards/format_reward": 1.0, "step": 7861 }, { "completion_length": 342.3061065673828, "epoch": 0.7911446540880503, "grad_norm": 0.6483877301216125, "kl": 0.0531005859375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.778911530971527, "reward_std": 0.17632890492677689, "rewards/accuracy_reward": 0.7891156077384949, "rewards/format_reward": 0.9897959232330322, "step": 7862 }, { "completion_length": 223.14286041259766, "epoch": 0.7912452830188679, "grad_norm": 0.7463541030883789, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6896103024482727, "reward_std": 0.07510203309357166, "rewards/accuracy_reward": 0.6896103620529175, "rewards/format_reward": 1.0, "step": 7863 }, { "completion_length": 267.39795684814453, "epoch": 0.7913459119496855, "grad_norm": 0.6666275262832642, "kl": 0.103759765625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7580623626708984, "reward_std": 0.08853841572999954, "rewards/accuracy_reward": 0.7580623924732208, "rewards/format_reward": 1.0, "step": 7864 }, { "completion_length": 205.2244873046875, "epoch": 0.7914465408805031, "grad_norm": 1.1368138790130615, "kl": 0.124755859375, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.7020407319068909, "reward_std": 0.2279079481959343, "rewards/accuracy_reward": 0.732653021812439, "rewards/format_reward": 0.9693877398967743, "step": 7865 }, { "completion_length": 277.4387664794922, "epoch": 0.7915471698113208, "grad_norm": 0.7164583206176758, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8771541714668274, "reward_std": 0.09552393481135368, "rewards/accuracy_reward": 0.877154141664505, "rewards/format_reward": 1.0, "step": 7866 }, { "completion_length": 166.64285278320312, "epoch": 0.7916477987421384, "grad_norm": 0.118931345641613, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 7867 }, { "completion_length": 273.38775634765625, "epoch": 0.791748427672956, "grad_norm": 1.060279369354248, "kl": 0.135498046875, "learning_rate": 1e-06, "loss": 0.0054, "reward": 1.8220075368881226, "reward_std": 0.1944177784025669, "rewards/accuracy_reward": 0.8424157202243805, "rewards/format_reward": 0.9795918166637421, "step": 7868 }, { "completion_length": 344.6632537841797, "epoch": 0.7918490566037736, "grad_norm": 0.5626668334007263, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7533487677574158, "reward_std": 0.20025762170553207, "rewards/accuracy_reward": 0.7737569808959961, "rewards/format_reward": 0.9795918166637421, "step": 7869 }, { "completion_length": 210.2448959350586, "epoch": 0.7919496855345912, "grad_norm": 0.5751513242721558, "kl": 0.0816650390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.804251790046692, "reward_std": 0.13395484909415245, "rewards/accuracy_reward": 0.8042518198490143, "rewards/format_reward": 1.0, "step": 7870 }, { "completion_length": 181.77550506591797, "epoch": 0.7920503144654089, "grad_norm": 0.6067134737968445, "kl": 0.0677490234375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7843537330627441, "reward_std": 0.08422525599598885, "rewards/accuracy_reward": 0.7843537628650665, "rewards/format_reward": 1.0, "step": 7871 }, { "completion_length": 183.85713958740234, "epoch": 0.7921509433962264, "grad_norm": 0.9138347506523132, "kl": 0.1162109375, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.792673647403717, "reward_std": 0.04536732658743858, "rewards/accuracy_reward": 0.8028777837753296, "rewards/format_reward": 0.9897959232330322, "step": 7872 }, { "completion_length": 302.3877410888672, "epoch": 0.792251572327044, "grad_norm": 0.5942801833152771, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8135509490966797, "reward_std": 0.17134632542729378, "rewards/accuracy_reward": 0.8237550258636475, "rewards/format_reward": 0.9897959232330322, "step": 7873 }, { "completion_length": 375.91835021972656, "epoch": 0.7923522012578617, "grad_norm": 0.4128870964050293, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8857761025428772, "reward_std": 0.10854019597172737, "rewards/accuracy_reward": 0.8959802091121674, "rewards/format_reward": 0.9897959232330322, "step": 7874 }, { "completion_length": 272.3673400878906, "epoch": 0.7924528301886793, "grad_norm": 0.6221296787261963, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.844015896320343, "reward_std": 0.17733419872820377, "rewards/accuracy_reward": 0.8542200326919556, "rewards/format_reward": 0.9897959232330322, "step": 7875 }, { "completion_length": 280.01019287109375, "epoch": 0.7925534591194968, "grad_norm": 0.6423693299293518, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.875283420085907, "reward_std": 0.10216936841607094, "rewards/accuracy_reward": 0.8854875266551971, "rewards/format_reward": 0.9897959232330322, "step": 7876 }, { "completion_length": 328.33673095703125, "epoch": 0.7926540880503145, "grad_norm": 0.9182955026626587, "kl": 0.089111328125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7627793550491333, "reward_std": 0.25169894844293594, "rewards/accuracy_reward": 0.7729834318161011, "rewards/format_reward": 0.9897959232330322, "step": 7877 }, { "completion_length": 188.66326141357422, "epoch": 0.7927547169811321, "grad_norm": 0.8974019289016724, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7409558296203613, "reward_std": 0.12854330241680145, "rewards/accuracy_reward": 0.7409558594226837, "rewards/format_reward": 1.0, "step": 7878 }, { "completion_length": 261.61224365234375, "epoch": 0.7928553459119497, "grad_norm": 1.085959553718567, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7575944066047668, "reward_std": 0.06074336916208267, "rewards/accuracy_reward": 0.7677985429763794, "rewards/format_reward": 0.9897959232330322, "step": 7879 }, { "completion_length": 199.4897918701172, "epoch": 0.7929559748427673, "grad_norm": 1.3486188650131226, "kl": 0.16845703125, "learning_rate": 1e-06, "loss": 0.0067, "reward": 1.8414394855499268, "reward_std": 0.06182699277997017, "rewards/accuracy_reward": 0.8414395153522491, "rewards/format_reward": 1.0, "step": 7880 }, { "completion_length": 296.1428527832031, "epoch": 0.7930566037735849, "grad_norm": 0.7633287310600281, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7268349528312683, "reward_std": 0.17726555094122887, "rewards/accuracy_reward": 0.7370390892028809, "rewards/format_reward": 0.9897959232330322, "step": 7881 }, { "completion_length": 221.16326904296875, "epoch": 0.7931572327044025, "grad_norm": 0.6504783034324646, "kl": 0.0755615234375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.714763104915619, "reward_std": 0.12859038263559341, "rewards/accuracy_reward": 0.7249671816825867, "rewards/format_reward": 0.9897959232330322, "step": 7882 }, { "completion_length": 198.51019287109375, "epoch": 0.7932578616352202, "grad_norm": 0.7958909273147583, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.764028012752533, "reward_std": 0.10693705826997757, "rewards/accuracy_reward": 0.7640280425548553, "rewards/format_reward": 1.0, "step": 7883 }, { "completion_length": 284.2142791748047, "epoch": 0.7933584905660377, "grad_norm": 0.4893682897090912, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7963143587112427, "reward_std": 0.17200102657079697, "rewards/accuracy_reward": 0.8269266188144684, "rewards/format_reward": 0.9693877398967743, "step": 7884 }, { "completion_length": 341.948974609375, "epoch": 0.7934591194968553, "grad_norm": 0.8244376182556152, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.4327207803726196, "reward_std": 0.11959350109100342, "rewards/accuracy_reward": 0.4327207952737808, "rewards/format_reward": 1.0, "step": 7885 }, { "completion_length": 281.1224365234375, "epoch": 0.793559748427673, "grad_norm": 1.0731514692306519, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7889949083328247, "reward_std": 0.1865479312837124, "rewards/accuracy_reward": 0.7991989850997925, "rewards/format_reward": 0.9897959232330322, "step": 7886 }, { "completion_length": 282.3775405883789, "epoch": 0.7936603773584906, "grad_norm": 0.44917574524879456, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7191771268844604, "reward_std": 0.1308730710297823, "rewards/accuracy_reward": 0.7293812334537506, "rewards/format_reward": 0.9897959232330322, "step": 7887 }, { "completion_length": 268.23468017578125, "epoch": 0.7937610062893081, "grad_norm": 0.875003457069397, "kl": 0.10693359375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.8655404448509216, "reward_std": 0.08117014169692993, "rewards/accuracy_reward": 0.8757444620132446, "rewards/format_reward": 0.9897959232330322, "step": 7888 }, { "completion_length": 233.7551040649414, "epoch": 0.7938616352201258, "grad_norm": 2.3663172721862793, "kl": 0.0767822265625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.919851541519165, "reward_std": 0.1324072778224945, "rewards/accuracy_reward": 0.9300556480884552, "rewards/format_reward": 0.9897959232330322, "step": 7889 }, { "completion_length": 235.6530532836914, "epoch": 0.7939622641509434, "grad_norm": 0.34737157821655273, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8565038442611694, "reward_std": 0.0962742492556572, "rewards/accuracy_reward": 0.8667078614234924, "rewards/format_reward": 0.9897959232330322, "step": 7890 }, { "completion_length": 224.9183578491211, "epoch": 0.794062893081761, "grad_norm": 0.6461721658706665, "kl": 0.09814453125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.784450888633728, "reward_std": 0.06726531125605106, "rewards/accuracy_reward": 0.7844509184360504, "rewards/format_reward": 1.0, "step": 7891 }, { "completion_length": 337.5408020019531, "epoch": 0.7941635220125787, "grad_norm": 0.8810147047042847, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.641898512840271, "reward_std": 0.22782732546329498, "rewards/accuracy_reward": 0.6725108027458191, "rewards/format_reward": 0.9693877398967743, "step": 7892 }, { "completion_length": 219.7040786743164, "epoch": 0.7942641509433962, "grad_norm": 2.1129894256591797, "kl": 0.101806640625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.89565110206604, "reward_std": 0.16791991516947746, "rewards/accuracy_reward": 0.9058551490306854, "rewards/format_reward": 0.9897959232330322, "step": 7893 }, { "completion_length": 234.17346954345703, "epoch": 0.7943647798742138, "grad_norm": 2.2122068405151367, "kl": 0.093994140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7857142686843872, "reward_std": 0.21920377016067505, "rewards/accuracy_reward": 0.795918345451355, "rewards/format_reward": 0.9897959232330322, "step": 7894 }, { "completion_length": 280.84693145751953, "epoch": 0.7944654088050315, "grad_norm": 1.5493605136871338, "kl": 0.128173828125, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.6949002742767334, "reward_std": 0.1809174120426178, "rewards/accuracy_reward": 0.7255126535892487, "rewards/format_reward": 0.9693877398967743, "step": 7895 }, { "completion_length": 252.51019287109375, "epoch": 0.7945660377358491, "grad_norm": 0.6794633865356445, "kl": 0.0745849609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6632652282714844, "reward_std": 0.15402612835168839, "rewards/accuracy_reward": 0.6734693646430969, "rewards/format_reward": 0.9897959232330322, "step": 7896 }, { "completion_length": 214.68366241455078, "epoch": 0.7946666666666666, "grad_norm": 0.8446978330612183, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7357141971588135, "reward_std": 0.24785901606082916, "rewards/accuracy_reward": 0.7663265466690063, "rewards/format_reward": 0.9693877398967743, "step": 7897 }, { "completion_length": 251.1428451538086, "epoch": 0.7947672955974843, "grad_norm": 0.7317599654197693, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8269016742706299, "reward_std": 0.07429943233728409, "rewards/accuracy_reward": 0.8269016742706299, "rewards/format_reward": 1.0, "step": 7898 }, { "completion_length": 202.4591827392578, "epoch": 0.7948679245283019, "grad_norm": 0.9364070296287537, "kl": 0.09716796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7631725072860718, "reward_std": 0.08128525409847498, "rewards/accuracy_reward": 0.7631725370883942, "rewards/format_reward": 1.0, "step": 7899 }, { "completion_length": 239.3061180114746, "epoch": 0.7949685534591195, "grad_norm": 1.159745216369629, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6308274269104004, "reward_std": 0.16137294471263885, "rewards/accuracy_reward": 0.6614396721124649, "rewards/format_reward": 0.9693877398967743, "step": 7900 }, { "completion_length": 283.55101776123047, "epoch": 0.7950691823899371, "grad_norm": 0.4303363561630249, "kl": 0.13525390625, "learning_rate": 1e-06, "loss": 0.0054, "reward": 1.8334285020828247, "reward_std": 0.16304028779268265, "rewards/accuracy_reward": 0.8436325192451477, "rewards/format_reward": 0.9897959232330322, "step": 7901 }, { "completion_length": 268.74488830566406, "epoch": 0.7951698113207547, "grad_norm": 0.797153115272522, "kl": 0.12060546875, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.7906822562217712, "reward_std": 0.1471739113330841, "rewards/accuracy_reward": 0.8110905289649963, "rewards/format_reward": 0.9795918166637421, "step": 7902 }, { "completion_length": 188.56121826171875, "epoch": 0.7952704402515723, "grad_norm": 0.803905189037323, "kl": 0.10546875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8506492972373962, "reward_std": 0.13523094728589058, "rewards/accuracy_reward": 0.8608534038066864, "rewards/format_reward": 0.9897959232330322, "step": 7903 }, { "completion_length": 241.4387664794922, "epoch": 0.79537106918239, "grad_norm": 1.3000502586364746, "kl": 0.071533203125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6621871590614319, "reward_std": 0.25420869141817093, "rewards/accuracy_reward": 0.7030035555362701, "rewards/format_reward": 0.9591836631298065, "step": 7904 }, { "completion_length": 277.1632537841797, "epoch": 0.7954716981132075, "grad_norm": 0.3512396812438965, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.745562732219696, "reward_std": 0.1182071603834629, "rewards/accuracy_reward": 0.7659709453582764, "rewards/format_reward": 0.9795918464660645, "step": 7905 }, { "completion_length": 273.6428527832031, "epoch": 0.7955723270440251, "grad_norm": 0.7089250087738037, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8673468828201294, "reward_std": 0.06185103580355644, "rewards/accuracy_reward": 0.8775510191917419, "rewards/format_reward": 0.9897959232330322, "step": 7906 }, { "completion_length": 307.34693908691406, "epoch": 0.7956729559748428, "grad_norm": 0.8766037225723267, "kl": 0.099853515625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6897807717323303, "reward_std": 0.28205351531505585, "rewards/accuracy_reward": 0.7510051727294922, "rewards/format_reward": 0.9387754797935486, "step": 7907 }, { "completion_length": 298.5306167602539, "epoch": 0.7957735849056604, "grad_norm": 1.461783528327942, "kl": 0.11376953125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.8780143857002258, "reward_std": 0.15340344607830048, "rewards/accuracy_reward": 0.8984226882457733, "rewards/format_reward": 0.9795918166637421, "step": 7908 }, { "completion_length": 194.34693145751953, "epoch": 0.7958742138364779, "grad_norm": 1.0799835920333862, "kl": 0.11865234375, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.8514611721038818, "reward_std": 0.10634130239486694, "rewards/accuracy_reward": 0.8514612317085266, "rewards/format_reward": 1.0, "step": 7909 }, { "completion_length": 243.86734008789062, "epoch": 0.7959748427672956, "grad_norm": 0.6058510541915894, "kl": 0.11669921875, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.8348719477653503, "reward_std": 0.14944322034716606, "rewards/accuracy_reward": 0.8450760841369629, "rewards/format_reward": 0.9897959232330322, "step": 7910 }, { "completion_length": 267.34693145751953, "epoch": 0.7960754716981132, "grad_norm": 0.4979725480079651, "kl": 0.0716552734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.683867871761322, "reward_std": 0.15675782412290573, "rewards/accuracy_reward": 0.6940719485282898, "rewards/format_reward": 0.9897959232330322, "step": 7911 }, { "completion_length": 193.5408172607422, "epoch": 0.7961761006289308, "grad_norm": 1.1115434169769287, "kl": 0.1015625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7434405088424683, "reward_std": 0.1909308061003685, "rewards/accuracy_reward": 0.7536445260047913, "rewards/format_reward": 0.9897959232330322, "step": 7912 }, { "completion_length": 218.32652282714844, "epoch": 0.7962767295597484, "grad_norm": 0.7449159622192383, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7343537211418152, "reward_std": 0.18968834728002548, "rewards/accuracy_reward": 0.744557797908783, "rewards/format_reward": 0.9897959232330322, "step": 7913 }, { "completion_length": 240.57141876220703, "epoch": 0.796377358490566, "grad_norm": 1.0334595441818237, "kl": 0.14013671875, "learning_rate": 1e-06, "loss": 0.0057, "reward": 1.7065553069114685, "reward_std": 0.0834467001259327, "rewards/accuracy_reward": 0.7269634902477264, "rewards/format_reward": 0.9795918464660645, "step": 7914 }, { "completion_length": 252.85713958740234, "epoch": 0.7964779874213836, "grad_norm": 0.18998469412326813, "kl": 0.0570068359375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.836734652519226, "reward_std": 0.03485357388854027, "rewards/accuracy_reward": 0.8367346823215485, "rewards/format_reward": 1.0, "step": 7915 }, { "completion_length": 216.12244415283203, "epoch": 0.7965786163522013, "grad_norm": 0.8352606892585754, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.837971568107605, "reward_std": 0.14876049757003784, "rewards/accuracy_reward": 0.8583797216415405, "rewards/format_reward": 0.9795918166637421, "step": 7916 }, { "completion_length": 218.89795684814453, "epoch": 0.7966792452830189, "grad_norm": 1.0151679515838623, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8321670293807983, "reward_std": 0.16002566367387772, "rewards/accuracy_reward": 0.852575272321701, "rewards/format_reward": 0.9795918166637421, "step": 7917 }, { "completion_length": 175.27550506591797, "epoch": 0.7967798742138364, "grad_norm": 1.8989295959472656, "kl": 0.158203125, "learning_rate": 1e-06, "loss": 0.0063, "reward": 1.8662645816802979, "reward_std": 0.12940087541937828, "rewards/accuracy_reward": 0.8866726756095886, "rewards/format_reward": 0.9795918464660645, "step": 7918 }, { "completion_length": 186.9591827392578, "epoch": 0.7968805031446541, "grad_norm": 0.4768775403499603, "kl": 0.120849609375, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.7883867025375366, "reward_std": 0.12497687339782715, "rewards/accuracy_reward": 0.7985908389091492, "rewards/format_reward": 0.9897959232330322, "step": 7919 }, { "completion_length": 257.6530532836914, "epoch": 0.7969811320754717, "grad_norm": 0.6259143352508545, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.700259804725647, "reward_std": 0.15632296726107597, "rewards/accuracy_reward": 0.7206679880619049, "rewards/format_reward": 0.9795918464660645, "step": 7920 }, { "completion_length": 243.84693908691406, "epoch": 0.7970817610062894, "grad_norm": 1.0671017169952393, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8328149914741516, "reward_std": 0.22300418838858604, "rewards/accuracy_reward": 0.8532231748104095, "rewards/format_reward": 0.9795918166637421, "step": 7921 }, { "completion_length": 264.6836700439453, "epoch": 0.7971823899371069, "grad_norm": 1.230396032333374, "kl": 0.107421875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7154725193977356, "reward_std": 0.15838340669870377, "rewards/accuracy_reward": 0.7358807623386383, "rewards/format_reward": 0.9795918166637421, "step": 7922 }, { "completion_length": 233.13265228271484, "epoch": 0.7972830188679245, "grad_norm": 0.20553718507289886, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7969173789024353, "reward_std": 0.054849741980433464, "rewards/accuracy_reward": 0.8173255324363708, "rewards/format_reward": 0.9795918464660645, "step": 7923 }, { "completion_length": 195.86734771728516, "epoch": 0.7973836477987422, "grad_norm": 0.6151820421218872, "kl": 0.0498046875, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.905636489391327, "reward_std": 0.1286320984363556, "rewards/accuracy_reward": 0.9056365191936493, "rewards/format_reward": 1.0, "step": 7924 }, { "completion_length": 280.8163299560547, "epoch": 0.7974842767295598, "grad_norm": 1.1913460493087769, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7515184879302979, "reward_std": 0.1904759779572487, "rewards/accuracy_reward": 0.761722594499588, "rewards/format_reward": 0.9897959232330322, "step": 7925 }, { "completion_length": 274.0918426513672, "epoch": 0.7975849056603773, "grad_norm": 0.7872530817985535, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7508142590522766, "reward_std": 0.23630012199282646, "rewards/accuracy_reward": 0.7814265489578247, "rewards/format_reward": 0.9693877398967743, "step": 7926 }, { "completion_length": 283.07142639160156, "epoch": 0.797685534591195, "grad_norm": 0.5819805264472961, "kl": 0.122314453125, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.7739473581314087, "reward_std": 0.10920111741870642, "rewards/accuracy_reward": 0.7943556010723114, "rewards/format_reward": 0.9795918166637421, "step": 7927 }, { "completion_length": 252.43877410888672, "epoch": 0.7977861635220126, "grad_norm": 0.5883917808532715, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8051583766937256, "reward_std": 0.11548876762390137, "rewards/accuracy_reward": 0.8357706964015961, "rewards/format_reward": 0.9693877398967743, "step": 7928 }, { "completion_length": 340.6530456542969, "epoch": 0.7978867924528302, "grad_norm": 0.8110576272010803, "kl": 0.081787109375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7854906916618347, "reward_std": 0.1315140500664711, "rewards/accuracy_reward": 0.8058987855911255, "rewards/format_reward": 0.9795918464660645, "step": 7929 }, { "completion_length": 254.31632232666016, "epoch": 0.7979874213836478, "grad_norm": 0.7162452936172485, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.9376092553138733, "reward_std": 0.13051079213619232, "rewards/accuracy_reward": 0.9580174684524536, "rewards/format_reward": 0.9795918464660645, "step": 7930 }, { "completion_length": 283.82652282714844, "epoch": 0.7980880503144654, "grad_norm": 0.6118070483207703, "kl": 0.1103515625, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.6823475360870361, "reward_std": 0.17134982347488403, "rewards/accuracy_reward": 0.7027557790279388, "rewards/format_reward": 0.9795918166637421, "step": 7931 }, { "completion_length": 248.82653045654297, "epoch": 0.798188679245283, "grad_norm": 1.2359076738357544, "kl": 0.099853515625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7229914665222168, "reward_std": 0.17651191726326942, "rewards/accuracy_reward": 0.7433996796607971, "rewards/format_reward": 0.9795918464660645, "step": 7932 }, { "completion_length": 250.76529693603516, "epoch": 0.7982893081761007, "grad_norm": 0.5319507122039795, "kl": 0.13134765625, "learning_rate": 1e-06, "loss": 0.0052, "reward": 1.8102758526802063, "reward_std": 0.14645549654960632, "rewards/accuracy_reward": 0.8306840658187866, "rewards/format_reward": 0.9795918464660645, "step": 7933 }, { "completion_length": 244.75509643554688, "epoch": 0.7983899371069182, "grad_norm": 0.8727911710739136, "kl": 0.1171875, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.6662337183952332, "reward_std": 0.19180431962013245, "rewards/accuracy_reward": 0.6968460083007812, "rewards/format_reward": 0.9693877398967743, "step": 7934 }, { "completion_length": 245.8571319580078, "epoch": 0.7984905660377358, "grad_norm": 0.7820446491241455, "kl": 0.0986328125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8806284070014954, "reward_std": 0.21276842057704926, "rewards/accuracy_reward": 0.9010365903377533, "rewards/format_reward": 0.9795918464660645, "step": 7935 }, { "completion_length": 178.7244873046875, "epoch": 0.7985911949685535, "grad_norm": 0.9453347325325012, "kl": 0.102294921875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8445839285850525, "reward_std": 0.09563644602894783, "rewards/accuracy_reward": 0.8445839583873749, "rewards/format_reward": 1.0, "step": 7936 }, { "completion_length": 261.6632614135742, "epoch": 0.7986918238993711, "grad_norm": 0.8145498037338257, "kl": 0.10546875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.6876272559165955, "reward_std": 0.2228798344731331, "rewards/accuracy_reward": 0.7386476397514343, "rewards/format_reward": 0.9489795863628387, "step": 7937 }, { "completion_length": 282.3571319580078, "epoch": 0.7987924528301886, "grad_norm": 1.4724032878875732, "kl": 0.1729736328125, "learning_rate": 1e-06, "loss": 0.0069, "reward": 1.8369613885879517, "reward_std": 0.26660727709531784, "rewards/accuracy_reward": 0.8675736486911774, "rewards/format_reward": 0.9693877398967743, "step": 7938 }, { "completion_length": 219.448974609375, "epoch": 0.7988930817610063, "grad_norm": 0.6461840867996216, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8877550959587097, "reward_std": 0.08884849771857262, "rewards/accuracy_reward": 0.8979591429233551, "rewards/format_reward": 0.9897959232330322, "step": 7939 }, { "completion_length": 405.3163299560547, "epoch": 0.7989937106918239, "grad_norm": 0.5869724750518799, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.578408658504486, "reward_std": 0.1904999166727066, "rewards/accuracy_reward": 0.6294291168451309, "rewards/format_reward": 0.9489795565605164, "step": 7940 }, { "completion_length": 269.8061218261719, "epoch": 0.7990943396226415, "grad_norm": 0.9525020718574524, "kl": 0.104736328125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.782653033733368, "reward_std": 0.12894374504685402, "rewards/accuracy_reward": 0.7826530635356903, "rewards/format_reward": 1.0, "step": 7941 }, { "completion_length": 239.9591827392578, "epoch": 0.7991949685534592, "grad_norm": 0.18103502690792084, "kl": 0.114990234375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.7528859972953796, "reward_std": 0.007941973395645618, "rewards/accuracy_reward": 0.7528859972953796, "rewards/format_reward": 1.0, "step": 7942 }, { "completion_length": 316.6428527832031, "epoch": 0.7992955974842767, "grad_norm": 0.6039541363716125, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7732295393943787, "reward_std": 0.18580450862646103, "rewards/accuracy_reward": 0.7936377227306366, "rewards/format_reward": 0.9795918166637421, "step": 7943 }, { "completion_length": 283.47959899902344, "epoch": 0.7993962264150943, "grad_norm": 0.7929289937019348, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6475205421447754, "reward_std": 0.18198464065790176, "rewards/accuracy_reward": 0.6679287254810333, "rewards/format_reward": 0.9795918464660645, "step": 7944 }, { "completion_length": 285.5102005004883, "epoch": 0.799496855345912, "grad_norm": 0.6570061445236206, "kl": 0.135986328125, "learning_rate": 1e-06, "loss": 0.0054, "reward": 1.670960783958435, "reward_std": 0.1763375699520111, "rewards/accuracy_reward": 0.6913689374923706, "rewards/format_reward": 0.9795918464660645, "step": 7945 }, { "completion_length": 350.28570556640625, "epoch": 0.7995974842767296, "grad_norm": 0.6918259859085083, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8364430665969849, "reward_std": 0.12643982842564583, "rewards/accuracy_reward": 0.8568512797355652, "rewards/format_reward": 0.9795918166637421, "step": 7946 }, { "completion_length": 309.2653045654297, "epoch": 0.7996981132075471, "grad_norm": 1.0966538190841675, "kl": 0.122314453125, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.705143928527832, "reward_std": 0.1204509511590004, "rewards/accuracy_reward": 0.7255521714687347, "rewards/format_reward": 0.9795918464660645, "step": 7947 }, { "completion_length": 175.37754821777344, "epoch": 0.7997987421383648, "grad_norm": 1.2125691175460815, "kl": 0.10791015625, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7041600942611694, "reward_std": 0.09386255592107773, "rewards/accuracy_reward": 0.7245683073997498, "rewards/format_reward": 0.9795918166637421, "step": 7948 }, { "completion_length": 251.11224365234375, "epoch": 0.7998993710691824, "grad_norm": 0.7639866471290588, "kl": 0.098388671875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8492063283920288, "reward_std": 0.1368439793586731, "rewards/accuracy_reward": 0.8492062985897064, "rewards/format_reward": 1.0, "step": 7949 }, { "completion_length": 300.56121826171875, "epoch": 0.8, "grad_norm": 1.1410598754882812, "kl": 0.10107421875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8231911063194275, "reward_std": 0.1419210061430931, "rewards/accuracy_reward": 0.8333951830863953, "rewards/format_reward": 0.9897959232330322, "step": 7950 }, { "completion_length": 266.2142791748047, "epoch": 0.8001006289308176, "grad_norm": 0.7922556400299072, "kl": 0.0675048828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.740767776966095, "reward_std": 0.1536642163991928, "rewards/accuracy_reward": 0.7611758708953857, "rewards/format_reward": 0.9795918166637421, "step": 7951 }, { "completion_length": 265.9591751098633, "epoch": 0.8002012578616352, "grad_norm": 1.5069206953048706, "kl": 0.109375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7457364201545715, "reward_std": 0.2372603714466095, "rewards/accuracy_reward": 0.7865526676177979, "rewards/format_reward": 0.9591836631298065, "step": 7952 }, { "completion_length": 350.77549743652344, "epoch": 0.8003018867924528, "grad_norm": 0.7370877861976624, "kl": 0.077880859375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7903191447257996, "reward_std": 0.16393818706274033, "rewards/accuracy_reward": 0.8005232810974121, "rewards/format_reward": 0.9897959232330322, "step": 7953 }, { "completion_length": 167.33673095703125, "epoch": 0.8004025157232705, "grad_norm": 0.9833840131759644, "kl": 0.11181640625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7980091571807861, "reward_std": 0.1572873666882515, "rewards/accuracy_reward": 0.8184174001216888, "rewards/format_reward": 0.9795918464660645, "step": 7954 }, { "completion_length": 298.4285583496094, "epoch": 0.800503144654088, "grad_norm": 0.4961099922657013, "kl": 0.0584716796875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.79642254114151, "reward_std": 0.07486269623041153, "rewards/accuracy_reward": 0.8066267371177673, "rewards/format_reward": 0.9897959232330322, "step": 7955 }, { "completion_length": 286.42857360839844, "epoch": 0.8006037735849056, "grad_norm": 1.4787203073501587, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8472479581832886, "reward_std": 0.1477258764207363, "rewards/accuracy_reward": 0.8676561713218689, "rewards/format_reward": 0.9795918464660645, "step": 7956 }, { "completion_length": 364.84693908691406, "epoch": 0.8007044025157233, "grad_norm": 0.3702752888202667, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.5947752594947815, "reward_std": 0.14850166626274586, "rewards/accuracy_reward": 0.635591596364975, "rewards/format_reward": 0.9591836631298065, "step": 7957 }, { "completion_length": 306.05101776123047, "epoch": 0.8008050314465409, "grad_norm": 0.8659510612487793, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6740748882293701, "reward_std": 0.18814707547426224, "rewards/accuracy_reward": 0.6944830417633057, "rewards/format_reward": 0.9795918464660645, "step": 7958 }, { "completion_length": 185.2040786743164, "epoch": 0.8009056603773584, "grad_norm": 0.801776647567749, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8047088384628296, "reward_std": 0.039171066135168076, "rewards/accuracy_reward": 0.8149130344390869, "rewards/format_reward": 0.9897959232330322, "step": 7959 }, { "completion_length": 309.4387741088867, "epoch": 0.8010062893081761, "grad_norm": 0.5527673959732056, "kl": 0.059326171875, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.832793414592743, "reward_std": 0.14076830074191093, "rewards/accuracy_reward": 0.8634057641029358, "rewards/format_reward": 0.9693877398967743, "step": 7960 }, { "completion_length": 299.7142791748047, "epoch": 0.8011069182389937, "grad_norm": 1.2685118913650513, "kl": 0.139892578125, "learning_rate": 1e-06, "loss": 0.0056, "reward": 1.7075183391571045, "reward_std": 0.22300972044467926, "rewards/accuracy_reward": 0.7279264628887177, "rewards/format_reward": 0.9795918464660645, "step": 7961 }, { "completion_length": 228.37754821777344, "epoch": 0.8012075471698114, "grad_norm": 0.7712048888206482, "kl": 0.11328125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.715943694114685, "reward_std": 0.1425306797027588, "rewards/accuracy_reward": 0.7567600607872009, "rewards/format_reward": 0.9591836333274841, "step": 7962 }, { "completion_length": 248.18366241455078, "epoch": 0.8013081761006289, "grad_norm": 1.4619243144989014, "kl": 0.091552734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8242230415344238, "reward_std": 0.17259392887353897, "rewards/accuracy_reward": 0.8548353314399719, "rewards/format_reward": 0.9693877398967743, "step": 7963 }, { "completion_length": 344.19386291503906, "epoch": 0.8014088050314465, "grad_norm": 0.6743185520172119, "kl": 0.11669921875, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.6543094515800476, "reward_std": 0.12795424461364746, "rewards/accuracy_reward": 0.6747177541255951, "rewards/format_reward": 0.9795918166637421, "step": 7964 }, { "completion_length": 294.67345428466797, "epoch": 0.8015094339622642, "grad_norm": 0.6060325503349304, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.826128602027893, "reward_std": 0.18443337455391884, "rewards/accuracy_reward": 0.8363327085971832, "rewards/format_reward": 0.9897959232330322, "step": 7965 }, { "completion_length": 215.83673095703125, "epoch": 0.8016100628930818, "grad_norm": 0.7634598016738892, "kl": 0.065185546875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.9360118508338928, "reward_std": 0.06779027730226517, "rewards/accuracy_reward": 0.9360119104385376, "rewards/format_reward": 1.0, "step": 7966 }, { "completion_length": 315.7449035644531, "epoch": 0.8017106918238994, "grad_norm": 0.8558750152587891, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6101858615875244, "reward_std": 0.25958067178726196, "rewards/accuracy_reward": 0.6305940449237823, "rewards/format_reward": 0.9795918464660645, "step": 7967 }, { "completion_length": 308.10203552246094, "epoch": 0.801811320754717, "grad_norm": 0.5889390110969543, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7831844687461853, "reward_std": 0.11611752212047577, "rewards/accuracy_reward": 0.8035926520824432, "rewards/format_reward": 0.9795918464660645, "step": 7968 }, { "completion_length": 257.57141876220703, "epoch": 0.8019119496855346, "grad_norm": 0.5401339530944824, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8895366787910461, "reward_std": 0.16471900790929794, "rewards/accuracy_reward": 0.9099449217319489, "rewards/format_reward": 0.9795918166637421, "step": 7969 }, { "completion_length": 239.30611419677734, "epoch": 0.8020125786163522, "grad_norm": 1.1163432598114014, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7923338413238525, "reward_std": 0.1758110448718071, "rewards/accuracy_reward": 0.8025379478931427, "rewards/format_reward": 0.9897959232330322, "step": 7970 }, { "completion_length": 220.66326141357422, "epoch": 0.8021132075471699, "grad_norm": 0.4972301125526428, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.830676794052124, "reward_std": 0.06899712048470974, "rewards/accuracy_reward": 0.830676794052124, "rewards/format_reward": 1.0, "step": 7971 }, { "completion_length": 292.49998474121094, "epoch": 0.8022138364779874, "grad_norm": 0.9624027609825134, "kl": 0.10107421875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.6263520121574402, "reward_std": 0.1672021970152855, "rewards/accuracy_reward": 0.646760106086731, "rewards/format_reward": 0.9795918464660645, "step": 7972 }, { "completion_length": 238.4183578491211, "epoch": 0.802314465408805, "grad_norm": 1.2205827236175537, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.9049466848373413, "reward_std": 0.12546405009925365, "rewards/accuracy_reward": 0.9151507914066315, "rewards/format_reward": 0.9897959232330322, "step": 7973 }, { "completion_length": 233.63265228271484, "epoch": 0.8024150943396227, "grad_norm": 0.448772668838501, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8433106541633606, "reward_std": 0.110008604824543, "rewards/accuracy_reward": 0.8535147309303284, "rewards/format_reward": 0.9897959232330322, "step": 7974 }, { "completion_length": 205.08162689208984, "epoch": 0.8025157232704403, "grad_norm": 0.6242243051528931, "kl": 0.0789794921875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.870262324810028, "reward_std": 0.07098227739334106, "rewards/accuracy_reward": 0.8702623546123505, "rewards/format_reward": 1.0, "step": 7975 }, { "completion_length": 285.4081573486328, "epoch": 0.8026163522012578, "grad_norm": 0.5203565955162048, "kl": 0.0533447265625, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.763265311717987, "reward_std": 0.1486334353685379, "rewards/accuracy_reward": 0.7632652819156647, "rewards/format_reward": 1.0, "step": 7976 }, { "completion_length": 291.89795684814453, "epoch": 0.8027169811320755, "grad_norm": 0.4298199415206909, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8741496801376343, "reward_std": 0.11584595590829849, "rewards/accuracy_reward": 0.8741496503353119, "rewards/format_reward": 1.0, "step": 7977 }, { "completion_length": 194.2346954345703, "epoch": 0.8028176100628931, "grad_norm": 0.27874553203582764, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.9571428894996643, "reward_std": 0.05969300866127014, "rewards/accuracy_reward": 0.9571428298950195, "rewards/format_reward": 1.0, "step": 7978 }, { "completion_length": 235.8775405883789, "epoch": 0.8029182389937107, "grad_norm": 1.518792986869812, "kl": 0.1513671875, "learning_rate": 1e-06, "loss": 0.0061, "reward": 1.6507482528686523, "reward_std": 0.2228418067097664, "rewards/accuracy_reward": 0.6507482528686523, "rewards/format_reward": 1.0, "step": 7979 }, { "completion_length": 244.5102081298828, "epoch": 0.8030188679245283, "grad_norm": 0.9897779822349548, "kl": 0.14111328125, "learning_rate": 1e-06, "loss": 0.0056, "reward": 1.7445579171180725, "reward_std": 0.22003624588251114, "rewards/accuracy_reward": 0.7649660110473633, "rewards/format_reward": 0.9795918166637421, "step": 7980 }, { "completion_length": 249.33673095703125, "epoch": 0.8031194968553459, "grad_norm": 1.7939317226409912, "kl": 0.181884765625, "learning_rate": 1e-06, "loss": 0.0073, "reward": 1.7265151739120483, "reward_std": 0.17361964285373688, "rewards/accuracy_reward": 0.726515144109726, "rewards/format_reward": 1.0, "step": 7981 }, { "completion_length": 252.03060913085938, "epoch": 0.8032201257861635, "grad_norm": 0.6244293451309204, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8587087988853455, "reward_std": 0.16575097292661667, "rewards/accuracy_reward": 0.879116952419281, "rewards/format_reward": 0.9795918166637421, "step": 7982 }, { "completion_length": 295.7653045654297, "epoch": 0.8033207547169812, "grad_norm": 0.6915286779403687, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7471654415130615, "reward_std": 0.1807190179824829, "rewards/accuracy_reward": 0.7573695778846741, "rewards/format_reward": 0.9897959232330322, "step": 7983 }, { "completion_length": 212.42857360839844, "epoch": 0.8034213836477987, "grad_norm": 0.8100190162658691, "kl": 0.113037109375, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.9105571508407593, "reward_std": 0.101325623691082, "rewards/accuracy_reward": 0.9207611680030823, "rewards/format_reward": 0.9897959232330322, "step": 7984 }, { "completion_length": 288.51019287109375, "epoch": 0.8035220125786163, "grad_norm": 0.3967370092868805, "kl": 0.064208984375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8879265189170837, "reward_std": 0.08906300365924835, "rewards/accuracy_reward": 0.8981306552886963, "rewards/format_reward": 0.9897959232330322, "step": 7985 }, { "completion_length": 214.57142639160156, "epoch": 0.803622641509434, "grad_norm": 0.8076884150505066, "kl": 0.102294921875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.789703130722046, "reward_std": 0.12684667855501175, "rewards/accuracy_reward": 0.7999072074890137, "rewards/format_reward": 0.9897959232330322, "step": 7986 }, { "completion_length": 270.1938781738281, "epoch": 0.8037232704402516, "grad_norm": 0.49076688289642334, "kl": 0.0699462890625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8235410451889038, "reward_std": 0.1160498633980751, "rewards/accuracy_reward": 0.8337450921535492, "rewards/format_reward": 0.9897959232330322, "step": 7987 }, { "completion_length": 232.81632232666016, "epoch": 0.8038238993710691, "grad_norm": 0.4579102396965027, "kl": 0.114501953125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.86672842502594, "reward_std": 0.12625928968191147, "rewards/accuracy_reward": 0.8769326210021973, "rewards/format_reward": 0.9897959232330322, "step": 7988 }, { "completion_length": 267.94896697998047, "epoch": 0.8039245283018868, "grad_norm": 0.5156091451644897, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.9009482264518738, "reward_std": 0.12795773148536682, "rewards/accuracy_reward": 0.9315605163574219, "rewards/format_reward": 0.9693877398967743, "step": 7989 }, { "completion_length": 307.6734619140625, "epoch": 0.8040251572327044, "grad_norm": 0.6766165494918823, "kl": 0.107177734375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.73280268907547, "reward_std": 0.21740928292274475, "rewards/accuracy_reward": 0.7532109320163727, "rewards/format_reward": 0.9795918166637421, "step": 7990 }, { "completion_length": 188.49999237060547, "epoch": 0.804125786163522, "grad_norm": 1.0284090042114258, "kl": 0.11767578125, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7826685309410095, "reward_std": 0.10002943500876427, "rewards/accuracy_reward": 0.7826685011386871, "rewards/format_reward": 1.0, "step": 7991 }, { "completion_length": 235.52040481567383, "epoch": 0.8042264150943397, "grad_norm": 1.5760334730148315, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8775509595870972, "reward_std": 0.09670460596680641, "rewards/accuracy_reward": 0.8877550661563873, "rewards/format_reward": 0.9897959232330322, "step": 7992 }, { "completion_length": 270.68367767333984, "epoch": 0.8043270440251572, "grad_norm": 2.365926742553711, "kl": 0.0712890625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8886054158210754, "reward_std": 0.1212324220687151, "rewards/accuracy_reward": 0.9090135991573334, "rewards/format_reward": 0.9795918166637421, "step": 7993 }, { "completion_length": 242.34693145751953, "epoch": 0.8044276729559748, "grad_norm": 1.067046046257019, "kl": 0.0706787109375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.84950590133667, "reward_std": 0.19890104234218597, "rewards/accuracy_reward": 0.8699140548706055, "rewards/format_reward": 0.9795918166637421, "step": 7994 }, { "completion_length": 237.61224365234375, "epoch": 0.8045283018867925, "grad_norm": 0.3622324764728546, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7251376509666443, "reward_std": 0.08212713710963726, "rewards/accuracy_reward": 0.7353417277336121, "rewards/format_reward": 0.9897959232330322, "step": 7995 }, { "completion_length": 276.01019287109375, "epoch": 0.8046289308176101, "grad_norm": 0.48414549231529236, "kl": 0.067626953125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8695191144943237, "reward_std": 0.12721766158938408, "rewards/accuracy_reward": 0.8695192039012909, "rewards/format_reward": 1.0, "step": 7996 }, { "completion_length": 261.56121826171875, "epoch": 0.8047295597484276, "grad_norm": 0.6012353897094727, "kl": 0.0516357421875, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8939942121505737, "reward_std": 0.16750064492225647, "rewards/accuracy_reward": 0.8939942717552185, "rewards/format_reward": 1.0, "step": 7997 }, { "completion_length": 246.1020278930664, "epoch": 0.8048301886792453, "grad_norm": 0.5363362431526184, "kl": 0.0662841796875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8787171840667725, "reward_std": 0.09441244602203369, "rewards/accuracy_reward": 0.8889212906360626, "rewards/format_reward": 0.9897959232330322, "step": 7998 }, { "completion_length": 239.40816497802734, "epoch": 0.8049308176100629, "grad_norm": 1.1748318672180176, "kl": 0.1171875, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.8570456504821777, "reward_std": 0.10381132224574685, "rewards/accuracy_reward": 0.8672497570514679, "rewards/format_reward": 0.9897959232330322, "step": 7999 }, { "completion_length": 253.7653045654297, "epoch": 0.8050314465408805, "grad_norm": 1.1281253099441528, "kl": 0.125244140625, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.7139812111854553, "reward_std": 0.19090691208839417, "rewards/accuracy_reward": 0.7343893945217133, "rewards/format_reward": 0.9795918464660645, "step": 8000 }, { "completion_length": 268.3367385864258, "epoch": 0.8051320754716981, "grad_norm": 0.6085481643676758, "kl": 0.100341796875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8073371648788452, "reward_std": 0.16982878744602203, "rewards/accuracy_reward": 0.8175413012504578, "rewards/format_reward": 0.9897959232330322, "step": 8001 }, { "completion_length": 289.6632537841797, "epoch": 0.8052327044025157, "grad_norm": 1.5543402433395386, "kl": 0.102294921875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8060010075569153, "reward_std": 0.1607096865773201, "rewards/accuracy_reward": 0.8366132080554962, "rewards/format_reward": 0.9693877398967743, "step": 8002 }, { "completion_length": 189.55101776123047, "epoch": 0.8053333333333333, "grad_norm": 0.5712351202964783, "kl": 0.093994140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7760204076766968, "reward_std": 0.067827008664608, "rewards/accuracy_reward": 0.7862244844436646, "rewards/format_reward": 0.9897959232330322, "step": 8003 }, { "completion_length": 216.53060913085938, "epoch": 0.805433962264151, "grad_norm": 1.0773295164108276, "kl": 0.107666015625, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.9028756618499756, "reward_std": 0.13530784845352173, "rewards/accuracy_reward": 0.9130797684192657, "rewards/format_reward": 0.9897959232330322, "step": 8004 }, { "completion_length": 371.05101013183594, "epoch": 0.8055345911949685, "grad_norm": 3.9024949073791504, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6504130363464355, "reward_std": 0.17883528023958206, "rewards/accuracy_reward": 0.6504129767417908, "rewards/format_reward": 1.0, "step": 8005 }, { "completion_length": 241.15306091308594, "epoch": 0.8056352201257861, "grad_norm": 0.9852998852729797, "kl": 0.112060546875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7941043376922607, "reward_std": 0.20963869243860245, "rewards/accuracy_reward": 0.8043084144592285, "rewards/format_reward": 0.9897959232330322, "step": 8006 }, { "completion_length": 284.6938705444336, "epoch": 0.8057358490566038, "grad_norm": 2.900714635848999, "kl": 0.12646484375, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.5603004097938538, "reward_std": 0.11069322377443314, "rewards/accuracy_reward": 0.5705044567584991, "rewards/format_reward": 0.9897959232330322, "step": 8007 }, { "completion_length": 190.83673095703125, "epoch": 0.8058364779874214, "grad_norm": 4.547369956970215, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8517006039619446, "reward_std": 0.20413294434547424, "rewards/accuracy_reward": 0.8619047403335571, "rewards/format_reward": 0.9897959232330322, "step": 8008 }, { "completion_length": 272.5612106323242, "epoch": 0.8059371069182389, "grad_norm": 0.4368894696235657, "kl": 0.0692138671875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7179471254348755, "reward_std": 0.11058124899864197, "rewards/accuracy_reward": 0.7179470956325531, "rewards/format_reward": 1.0, "step": 8009 }, { "completion_length": 275.12245178222656, "epoch": 0.8060377358490566, "grad_norm": 0.51857590675354, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6160887479782104, "reward_std": 0.11812647059559822, "rewards/accuracy_reward": 0.6364968866109848, "rewards/format_reward": 0.9795918464660645, "step": 8010 }, { "completion_length": 185.86734008789062, "epoch": 0.8061383647798742, "grad_norm": 0.8666189312934875, "kl": 0.12255859375, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.7906587719917297, "reward_std": 0.1254262737929821, "rewards/accuracy_reward": 0.8212710618972778, "rewards/format_reward": 0.9693877398967743, "step": 8011 }, { "completion_length": 218.83673095703125, "epoch": 0.8062389937106919, "grad_norm": 0.3290432393550873, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.828385829925537, "reward_std": 0.06480317376554012, "rewards/accuracy_reward": 0.8283858597278595, "rewards/format_reward": 1.0, "step": 8012 }, { "completion_length": 320.35713958740234, "epoch": 0.8063396226415094, "grad_norm": 0.6031222343444824, "kl": 0.0616455078125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6200512051582336, "reward_std": 0.09985298663377762, "rewards/accuracy_reward": 0.6302552819252014, "rewards/format_reward": 0.9897959232330322, "step": 8013 }, { "completion_length": 209.2040786743164, "epoch": 0.806440251572327, "grad_norm": 1.1761873960494995, "kl": 0.10693359375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7937517166137695, "reward_std": 0.17377795279026031, "rewards/accuracy_reward": 0.8141599297523499, "rewards/format_reward": 0.9795918166637421, "step": 8014 }, { "completion_length": 300.1122283935547, "epoch": 0.8065408805031447, "grad_norm": 0.38889190554618835, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7977735996246338, "reward_std": 0.10161323845386505, "rewards/accuracy_reward": 0.807977706193924, "rewards/format_reward": 0.9897959232330322, "step": 8015 }, { "completion_length": 259.08162689208984, "epoch": 0.8066415094339623, "grad_norm": 0.4624440371990204, "kl": 0.08740234375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8861638307571411, "reward_std": 0.05298402765765786, "rewards/accuracy_reward": 0.8861638009548187, "rewards/format_reward": 1.0, "step": 8016 }, { "completion_length": 233.9285659790039, "epoch": 0.8067421383647799, "grad_norm": 0.7012978196144104, "kl": 0.0863037109375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.748690903186798, "reward_std": 0.1145763136446476, "rewards/accuracy_reward": 0.7690991163253784, "rewards/format_reward": 0.9795918464660645, "step": 8017 }, { "completion_length": 234.88775634765625, "epoch": 0.8068427672955975, "grad_norm": 0.41357746720314026, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8569926619529724, "reward_std": 0.05649940110743046, "rewards/accuracy_reward": 0.85699263215065, "rewards/format_reward": 1.0, "step": 8018 }, { "completion_length": 183.19387817382812, "epoch": 0.8069433962264151, "grad_norm": 0.6502061486244202, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.869387686252594, "reward_std": 0.08340440317988396, "rewards/accuracy_reward": 0.8795917928218842, "rewards/format_reward": 0.9897959232330322, "step": 8019 }, { "completion_length": 233.21428680419922, "epoch": 0.8070440251572327, "grad_norm": 0.5195059180259705, "kl": 0.05859375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8246991038322449, "reward_std": 0.12216731533408165, "rewards/accuracy_reward": 0.8349031805992126, "rewards/format_reward": 0.9897959232330322, "step": 8020 }, { "completion_length": 216.91836547851562, "epoch": 0.8071446540880504, "grad_norm": 1.4864768981933594, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8692233562469482, "reward_std": 0.13774882815778255, "rewards/accuracy_reward": 0.8896315395832062, "rewards/format_reward": 0.9795918166637421, "step": 8021 }, { "completion_length": 252.79591369628906, "epoch": 0.8072452830188679, "grad_norm": 0.2878163158893585, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8542273044586182, "reward_std": 0.019283898174762726, "rewards/accuracy_reward": 0.8542273938655853, "rewards/format_reward": 1.0, "step": 8022 }, { "completion_length": 293.38775634765625, "epoch": 0.8073459119496855, "grad_norm": 1.311699390411377, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7346237897872925, "reward_std": 0.16387461125850677, "rewards/accuracy_reward": 0.7448278963565826, "rewards/format_reward": 0.9897959232330322, "step": 8023 }, { "completion_length": 236.64285278320312, "epoch": 0.8074465408805032, "grad_norm": 0.382508784532547, "kl": 0.0626220703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8586415648460388, "reward_std": 0.09131418727338314, "rewards/accuracy_reward": 0.8790497779846191, "rewards/format_reward": 0.9795918166637421, "step": 8024 }, { "completion_length": 244.19387817382812, "epoch": 0.8075471698113208, "grad_norm": 0.24771611392498016, "kl": 0.09716796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7617974281311035, "reward_std": 0.04875069484114647, "rewards/accuracy_reward": 0.7617973685264587, "rewards/format_reward": 1.0, "step": 8025 }, { "completion_length": 222.51019287109375, "epoch": 0.8076477987421383, "grad_norm": 1.0428751707077026, "kl": 0.0914306640625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8317717909812927, "reward_std": 0.1917886659502983, "rewards/accuracy_reward": 0.8419758677482605, "rewards/format_reward": 0.9897959232330322, "step": 8026 }, { "completion_length": 238.77549743652344, "epoch": 0.807748427672956, "grad_norm": 0.9816984534263611, "kl": 0.09130859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7114739418029785, "reward_std": 0.2689538896083832, "rewards/accuracy_reward": 0.742086261510849, "rewards/format_reward": 0.9693877398967743, "step": 8027 }, { "completion_length": 197.4285659790039, "epoch": 0.8078490566037736, "grad_norm": 0.41870948672294617, "kl": 0.0703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.9203514456748962, "reward_std": 0.061478788033127785, "rewards/accuracy_reward": 0.9203514754772186, "rewards/format_reward": 1.0, "step": 8028 }, { "completion_length": 267.33673095703125, "epoch": 0.8079496855345912, "grad_norm": 0.43580982089042664, "kl": 0.0762939453125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8685086369514465, "reward_std": 0.13100330159068108, "rewards/accuracy_reward": 0.8787127733230591, "rewards/format_reward": 0.9897959232330322, "step": 8029 }, { "completion_length": 179.2959213256836, "epoch": 0.8080503144654088, "grad_norm": 0.8818239569664001, "kl": 0.0673828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.926397979259491, "reward_std": 0.1122853234410286, "rewards/accuracy_reward": 0.9366020560264587, "rewards/format_reward": 0.9897959232330322, "step": 8030 }, { "completion_length": 278.84693908691406, "epoch": 0.8081509433962264, "grad_norm": 1.2052860260009766, "kl": 0.0655517578125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6380510926246643, "reward_std": 0.14131474494934082, "rewards/accuracy_reward": 0.6380510628223419, "rewards/format_reward": 1.0, "step": 8031 }, { "completion_length": 218.32652282714844, "epoch": 0.808251572327044, "grad_norm": 0.482927531003952, "kl": 0.1142578125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.838448703289032, "reward_std": 0.046655936166644096, "rewards/accuracy_reward": 0.8384487330913544, "rewards/format_reward": 1.0, "step": 8032 }, { "completion_length": 248.28571319580078, "epoch": 0.8083522012578617, "grad_norm": 0.7556018829345703, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8488875031471252, "reward_std": 0.18322931975126266, "rewards/accuracy_reward": 0.8692956566810608, "rewards/format_reward": 0.9795918166637421, "step": 8033 }, { "completion_length": 267.62245178222656, "epoch": 0.8084528301886792, "grad_norm": 0.8834694027900696, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7065432667732239, "reward_std": 0.2293337807059288, "rewards/accuracy_reward": 0.7167474031448364, "rewards/format_reward": 0.9897959232330322, "step": 8034 }, { "completion_length": 310.1836700439453, "epoch": 0.8085534591194968, "grad_norm": 2.3185877799987793, "kl": 0.19775390625, "learning_rate": 1e-06, "loss": 0.0079, "reward": 1.8483559489250183, "reward_std": 0.08844101428985596, "rewards/accuracy_reward": 0.8585600852966309, "rewards/format_reward": 0.9897959232330322, "step": 8035 }, { "completion_length": 215.5102081298828, "epoch": 0.8086540880503145, "grad_norm": 2.0543911457061768, "kl": 0.111572265625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.8357142806053162, "reward_std": 0.07405722513794899, "rewards/accuracy_reward": 0.8357143104076385, "rewards/format_reward": 1.0, "step": 8036 }, { "completion_length": 201.9081573486328, "epoch": 0.8087547169811321, "grad_norm": 0.8463276624679565, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8636539578437805, "reward_std": 0.1235092356801033, "rewards/accuracy_reward": 0.8636540174484253, "rewards/format_reward": 1.0, "step": 8037 }, { "completion_length": 255.84693908691406, "epoch": 0.8088553459119496, "grad_norm": 0.7412976026535034, "kl": 0.10888671875, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.6884883046150208, "reward_std": 0.1874675303697586, "rewards/accuracy_reward": 0.6986924111843109, "rewards/format_reward": 0.9897959232330322, "step": 8038 }, { "completion_length": 276.551025390625, "epoch": 0.8089559748427673, "grad_norm": 1.2035084962844849, "kl": 0.0655517578125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7285041213035583, "reward_std": 0.2810870558023453, "rewards/accuracy_reward": 0.7387082278728485, "rewards/format_reward": 0.9897959232330322, "step": 8039 }, { "completion_length": 240.75509643554688, "epoch": 0.8090566037735849, "grad_norm": 0.47550061345100403, "kl": 0.091064453125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8761904835700989, "reward_std": 0.10243984311819077, "rewards/accuracy_reward": 0.8761904537677765, "rewards/format_reward": 1.0, "step": 8040 }, { "completion_length": 291.24488830566406, "epoch": 0.8091572327044025, "grad_norm": 0.6035317778587341, "kl": 0.0614013671875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.794557809829712, "reward_std": 0.10607128217816353, "rewards/accuracy_reward": 0.8047619163990021, "rewards/format_reward": 0.9897959232330322, "step": 8041 }, { "completion_length": 209.63265228271484, "epoch": 0.8092578616352202, "grad_norm": 0.5680051445960999, "kl": 0.096923828125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8153546452522278, "reward_std": 0.12278426066040993, "rewards/accuracy_reward": 0.8153547048568726, "rewards/format_reward": 1.0, "step": 8042 }, { "completion_length": 209.45917510986328, "epoch": 0.8093584905660377, "grad_norm": 0.5931038856506348, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7929356694221497, "reward_std": 0.15530585870146751, "rewards/accuracy_reward": 0.803139716386795, "rewards/format_reward": 0.9897959232330322, "step": 8043 }, { "completion_length": 222.1632537841797, "epoch": 0.8094591194968553, "grad_norm": 0.41526687145233154, "kl": 0.103515625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.824829876422882, "reward_std": 0.06936390325427055, "rewards/accuracy_reward": 0.8452380895614624, "rewards/format_reward": 0.9795918464660645, "step": 8044 }, { "completion_length": 241.01019287109375, "epoch": 0.809559748427673, "grad_norm": 0.3697513937950134, "kl": 0.09814453125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.775404930114746, "reward_std": 0.06874345988035202, "rewards/accuracy_reward": 0.7754048705101013, "rewards/format_reward": 1.0, "step": 8045 }, { "completion_length": 227.10203552246094, "epoch": 0.8096603773584906, "grad_norm": 1.0219371318817139, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.748191773891449, "reward_std": 0.12234445288777351, "rewards/accuracy_reward": 0.7481918036937714, "rewards/format_reward": 1.0, "step": 8046 }, { "completion_length": 261.46937561035156, "epoch": 0.8097610062893081, "grad_norm": 0.4680333435535431, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7476433515548706, "reward_std": 0.08156401058658957, "rewards/accuracy_reward": 0.7578474283218384, "rewards/format_reward": 0.9897959232330322, "step": 8047 }, { "completion_length": 258.4897918701172, "epoch": 0.8098616352201258, "grad_norm": 1.411044716835022, "kl": 0.0596923828125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7033527493476868, "reward_std": 0.23688528686761856, "rewards/accuracy_reward": 0.7237609028816223, "rewards/format_reward": 0.9795918166637421, "step": 8048 }, { "completion_length": 221.01020050048828, "epoch": 0.8099622641509434, "grad_norm": 1.075502872467041, "kl": 0.0863037109375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8063322305679321, "reward_std": 0.16915472596883774, "rewards/accuracy_reward": 0.8267404437065125, "rewards/format_reward": 0.9795918166637421, "step": 8049 }, { "completion_length": 332.8673324584961, "epoch": 0.810062893081761, "grad_norm": 0.6722110509872437, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6955163478851318, "reward_std": 0.14390695095062256, "rewards/accuracy_reward": 0.705720454454422, "rewards/format_reward": 0.9897959232330322, "step": 8050 }, { "completion_length": 149.38774871826172, "epoch": 0.8101635220125786, "grad_norm": 1.0031987428665161, "kl": 0.0489501953125, "learning_rate": 1e-06, "loss": 0.002, "reward": 1.938070833683014, "reward_std": 0.06789182871580124, "rewards/accuracy_reward": 0.9380708634853363, "rewards/format_reward": 1.0, "step": 8051 }, { "completion_length": 196.4693832397461, "epoch": 0.8102641509433962, "grad_norm": 0.7167384624481201, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7299113273620605, "reward_std": 0.1562339961528778, "rewards/accuracy_reward": 0.7605235874652863, "rewards/format_reward": 0.9693877398967743, "step": 8052 }, { "completion_length": 256.44898223876953, "epoch": 0.8103647798742138, "grad_norm": 1.123447060585022, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7999789118766785, "reward_std": 0.09396784193813801, "rewards/accuracy_reward": 0.8101829588413239, "rewards/format_reward": 0.9897959232330322, "step": 8053 }, { "completion_length": 271.05101013183594, "epoch": 0.8104654088050315, "grad_norm": 0.7787932753562927, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8536796569824219, "reward_std": 0.09974608570337296, "rewards/accuracy_reward": 0.8536796569824219, "rewards/format_reward": 1.0, "step": 8054 }, { "completion_length": 254.8775405883789, "epoch": 0.810566037735849, "grad_norm": 0.4099096655845642, "kl": 0.0615234375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8244898319244385, "reward_std": 0.06725053116679192, "rewards/accuracy_reward": 0.8551020324230194, "rewards/format_reward": 0.9693877398967743, "step": 8055 }, { "completion_length": 209.39794921875, "epoch": 0.8106666666666666, "grad_norm": 0.24633437395095825, "kl": 0.075927734375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.9149492979049683, "reward_std": 0.03276148438453674, "rewards/accuracy_reward": 0.9149492979049683, "rewards/format_reward": 1.0, "step": 8056 }, { "completion_length": 239.76529693603516, "epoch": 0.8107672955974843, "grad_norm": 1.7790437936782837, "kl": 0.0826416015625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.649958610534668, "reward_std": 0.2197212055325508, "rewards/accuracy_reward": 0.6601626574993134, "rewards/format_reward": 0.9897959232330322, "step": 8057 }, { "completion_length": 246.33673095703125, "epoch": 0.8108679245283019, "grad_norm": 0.7032144069671631, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8877145648002625, "reward_std": 0.1551523581147194, "rewards/accuracy_reward": 0.8979186713695526, "rewards/format_reward": 0.9897959232330322, "step": 8058 }, { "completion_length": 212.9387664794922, "epoch": 0.8109685534591194, "grad_norm": 0.6871019601821899, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.860038697719574, "reward_std": 0.13376543670892715, "rewards/accuracy_reward": 0.8600387871265411, "rewards/format_reward": 1.0, "step": 8059 }, { "completion_length": 245.06121063232422, "epoch": 0.8110691823899371, "grad_norm": 0.7280361652374268, "kl": 0.115234375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.72184157371521, "reward_std": 0.24990606307983398, "rewards/accuracy_reward": 0.7626579403877258, "rewards/format_reward": 0.9591836333274841, "step": 8060 }, { "completion_length": 288.15306091308594, "epoch": 0.8111698113207547, "grad_norm": 1.2785533666610718, "kl": 0.0550537109375, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7835600972175598, "reward_std": 0.17235170304775238, "rewards/accuracy_reward": 0.7835600674152374, "rewards/format_reward": 1.0, "step": 8061 }, { "completion_length": 288.0102081298828, "epoch": 0.8112704402515724, "grad_norm": 0.4437963664531708, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7239928245544434, "reward_std": 0.0679582729935646, "rewards/accuracy_reward": 0.7239928841590881, "rewards/format_reward": 1.0, "step": 8062 }, { "completion_length": 268.18367767333984, "epoch": 0.81137106918239, "grad_norm": 0.4046877920627594, "kl": 0.10009765625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8076249361038208, "reward_std": 0.1070986445993185, "rewards/accuracy_reward": 0.8280331790447235, "rewards/format_reward": 0.9795918166637421, "step": 8063 }, { "completion_length": 208.9285659790039, "epoch": 0.8114716981132075, "grad_norm": 0.5368456840515137, "kl": 0.10302734375, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.9285714030265808, "reward_std": 0.1079898476600647, "rewards/accuracy_reward": 0.9489795565605164, "rewards/format_reward": 0.9795918166637421, "step": 8064 }, { "completion_length": 314.3673400878906, "epoch": 0.8115723270440252, "grad_norm": 1.1060417890548706, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7106369137763977, "reward_std": 0.3050285130739212, "rewards/accuracy_reward": 0.731045126914978, "rewards/format_reward": 0.9795918166637421, "step": 8065 }, { "completion_length": 209.4081573486328, "epoch": 0.8116729559748428, "grad_norm": 1.8233736753463745, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7987634539604187, "reward_std": 0.20465455949306488, "rewards/accuracy_reward": 0.8089675307273865, "rewards/format_reward": 0.9897959232330322, "step": 8066 }, { "completion_length": 173.06122589111328, "epoch": 0.8117735849056604, "grad_norm": 0.5522665977478027, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8260358572006226, "reward_std": 0.07050710171461105, "rewards/accuracy_reward": 0.8260358572006226, "rewards/format_reward": 1.0, "step": 8067 }, { "completion_length": 260.4387664794922, "epoch": 0.811874213836478, "grad_norm": 0.5538409948348999, "kl": 0.071044921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7120807766914368, "reward_std": 0.1777130737900734, "rewards/accuracy_reward": 0.7222848534584045, "rewards/format_reward": 0.9897959232330322, "step": 8068 }, { "completion_length": 225.16326141357422, "epoch": 0.8119748427672956, "grad_norm": 0.6626809239387512, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.9331064820289612, "reward_std": 0.17698336392641068, "rewards/accuracy_reward": 0.9535146951675415, "rewards/format_reward": 0.9795918464660645, "step": 8069 }, { "completion_length": 210.85714721679688, "epoch": 0.8120754716981132, "grad_norm": 0.7473193407058716, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8929498195648193, "reward_std": 0.16159871965646744, "rewards/accuracy_reward": 0.9031539857387543, "rewards/format_reward": 0.9897959232330322, "step": 8070 }, { "completion_length": 231.6632537841797, "epoch": 0.8121761006289309, "grad_norm": 0.987694501876831, "kl": 0.101318359375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.774768054485321, "reward_std": 0.11241679266095161, "rewards/accuracy_reward": 0.7747680842876434, "rewards/format_reward": 1.0, "step": 8071 }, { "completion_length": 212.75509643554688, "epoch": 0.8122767295597484, "grad_norm": 1.0209728479385376, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.9165772199630737, "reward_std": 0.009420452872291207, "rewards/accuracy_reward": 0.9165773093700409, "rewards/format_reward": 1.0, "step": 8072 }, { "completion_length": 256.6632614135742, "epoch": 0.812377358490566, "grad_norm": 0.5759257674217224, "kl": 0.0655517578125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8685385584831238, "reward_std": 0.12252606451511383, "rewards/accuracy_reward": 0.8787426650524139, "rewards/format_reward": 0.9897959232330322, "step": 8073 }, { "completion_length": 199.64285278320312, "epoch": 0.8124779874213837, "grad_norm": 0.5711973309516907, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7323567271232605, "reward_std": 0.09739743173122406, "rewards/accuracy_reward": 0.7527649104595184, "rewards/format_reward": 0.9795918464660645, "step": 8074 }, { "completion_length": 175.16325759887695, "epoch": 0.8125786163522013, "grad_norm": 0.43683019280433655, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.9076125025749207, "reward_std": 0.0570375956594944, "rewards/accuracy_reward": 0.9076125919818878, "rewards/format_reward": 1.0, "step": 8075 }, { "completion_length": 285.27549743652344, "epoch": 0.8126792452830188, "grad_norm": 0.6760589480400085, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7339311242103577, "reward_std": 0.20240944623947144, "rewards/accuracy_reward": 0.7441352307796478, "rewards/format_reward": 0.9897959232330322, "step": 8076 }, { "completion_length": 276.4387664794922, "epoch": 0.8127798742138365, "grad_norm": 0.32977908849716187, "kl": 0.0992431640625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.9226552248001099, "reward_std": 0.04398280009627342, "rewards/accuracy_reward": 0.9226552546024323, "rewards/format_reward": 1.0, "step": 8077 }, { "completion_length": 237.88774871826172, "epoch": 0.8128805031446541, "grad_norm": 0.9302973747253418, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8833075761795044, "reward_std": 0.130623921751976, "rewards/accuracy_reward": 0.9037157297134399, "rewards/format_reward": 0.9795918464660645, "step": 8078 }, { "completion_length": 200.04080963134766, "epoch": 0.8129811320754717, "grad_norm": 0.9820109009742737, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.768384039402008, "reward_std": 0.2708384841680527, "rewards/accuracy_reward": 0.7887921333312988, "rewards/format_reward": 0.9795918166637421, "step": 8079 }, { "completion_length": 242.74490356445312, "epoch": 0.8130817610062893, "grad_norm": 0.5328105688095093, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7957968711853027, "reward_std": 0.16105738282203674, "rewards/accuracy_reward": 0.8264091312885284, "rewards/format_reward": 0.9693877398967743, "step": 8080 }, { "completion_length": 208.16326141357422, "epoch": 0.8131823899371069, "grad_norm": 0.6921793818473816, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8129251599311829, "reward_std": 0.13241545855998993, "rewards/accuracy_reward": 0.8129251599311829, "rewards/format_reward": 1.0, "step": 8081 }, { "completion_length": 238.15306091308594, "epoch": 0.8132830188679245, "grad_norm": 0.46709901094436646, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.765242874622345, "reward_std": 0.0978953130543232, "rewards/accuracy_reward": 0.7856510877609253, "rewards/format_reward": 0.9795918464660645, "step": 8082 }, { "completion_length": 243.81632232666016, "epoch": 0.8133836477987422, "grad_norm": 0.36840564012527466, "kl": 0.0682373046875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8285095691680908, "reward_std": 0.10267258435487747, "rewards/accuracy_reward": 0.838713675737381, "rewards/format_reward": 0.9897959232330322, "step": 8083 }, { "completion_length": 275.5816192626953, "epoch": 0.8134842767295597, "grad_norm": 0.7367089986801147, "kl": 0.066650390625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.861564576625824, "reward_std": 0.17400379478931427, "rewards/accuracy_reward": 0.8717686533927917, "rewards/format_reward": 0.9897959232330322, "step": 8084 }, { "completion_length": 284.51019287109375, "epoch": 0.8135849056603773, "grad_norm": 0.5439666509628296, "kl": 0.076171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6937427520751953, "reward_std": 0.13493815436959267, "rewards/accuracy_reward": 0.7243550419807434, "rewards/format_reward": 0.9693877398967743, "step": 8085 }, { "completion_length": 241.72447967529297, "epoch": 0.813685534591195, "grad_norm": 0.4716106653213501, "kl": 0.064453125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.7296317219734192, "reward_std": 0.09025579690933228, "rewards/accuracy_reward": 0.7296317517757416, "rewards/format_reward": 1.0, "step": 8086 }, { "completion_length": 252.12245178222656, "epoch": 0.8137861635220126, "grad_norm": 0.49016159772872925, "kl": 0.09521484375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7116721868515015, "reward_std": 0.10384368523955345, "rewards/accuracy_reward": 0.721876323223114, "rewards/format_reward": 0.9897959232330322, "step": 8087 }, { "completion_length": 283.07141876220703, "epoch": 0.8138867924528302, "grad_norm": 1.1267449855804443, "kl": 0.124267578125, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.7895264029502869, "reward_std": 0.17356517538428307, "rewards/accuracy_reward": 0.8099345862865448, "rewards/format_reward": 0.9795918166637421, "step": 8088 }, { "completion_length": 196.21428680419922, "epoch": 0.8139874213836478, "grad_norm": 0.8210443258285522, "kl": 0.10009765625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7641929388046265, "reward_std": 0.2037542760372162, "rewards/accuracy_reward": 0.784601092338562, "rewards/format_reward": 0.9795918166637421, "step": 8089 }, { "completion_length": 183.60203552246094, "epoch": 0.8140880503144654, "grad_norm": 0.638996958732605, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8928349614143372, "reward_std": 0.06837860587984324, "rewards/accuracy_reward": 0.8928349912166595, "rewards/format_reward": 1.0, "step": 8090 }, { "completion_length": 252.93877410888672, "epoch": 0.814188679245283, "grad_norm": 1.1096478700637817, "kl": 0.0667724609375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.7629955410957336, "reward_std": 0.14569170027971268, "rewards/accuracy_reward": 0.7731996476650238, "rewards/format_reward": 0.9897959232330322, "step": 8091 }, { "completion_length": 324.5306091308594, "epoch": 0.8142893081761007, "grad_norm": 1.2205842733383179, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7052627205848694, "reward_std": 0.22594021260738373, "rewards/accuracy_reward": 0.7460790872573853, "rewards/format_reward": 0.9591836333274841, "step": 8092 }, { "completion_length": 212.8163299560547, "epoch": 0.8143899371069182, "grad_norm": 0.4550818204879761, "kl": 0.11279296875, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7705215811729431, "reward_std": 0.07835924997925758, "rewards/accuracy_reward": 0.7807256281375885, "rewards/format_reward": 0.9897959232330322, "step": 8093 }, { "completion_length": 157.82653045654297, "epoch": 0.8144905660377358, "grad_norm": 1.162607192993164, "kl": 0.101806640625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7828230261802673, "reward_std": 0.15896600112318993, "rewards/accuracy_reward": 0.7828231155872345, "rewards/format_reward": 1.0, "step": 8094 }, { "completion_length": 237.81632232666016, "epoch": 0.8145911949685535, "grad_norm": 0.619110643863678, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.798550307750702, "reward_std": 0.1210513561964035, "rewards/accuracy_reward": 0.8087544441223145, "rewards/format_reward": 0.9897959232330322, "step": 8095 }, { "completion_length": 245.51020050048828, "epoch": 0.8146918238993711, "grad_norm": 0.8549579977989197, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.814372956752777, "reward_std": 0.22081707045435905, "rewards/accuracy_reward": 0.8245770931243896, "rewards/format_reward": 0.9897959232330322, "step": 8096 }, { "completion_length": 242.5408172607422, "epoch": 0.8147924528301886, "grad_norm": 0.7430897355079651, "kl": 0.12451171875, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.8672541379928589, "reward_std": 0.1737082526087761, "rewards/accuracy_reward": 0.8978663682937622, "rewards/format_reward": 0.9693877398967743, "step": 8097 }, { "completion_length": 247.97958374023438, "epoch": 0.8148930817610063, "grad_norm": 0.7155874371528625, "kl": 0.072265625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8185940384864807, "reward_std": 0.10199041105806828, "rewards/accuracy_reward": 0.8287981450557709, "rewards/format_reward": 0.9897959232330322, "step": 8098 }, { "completion_length": 236.87754821777344, "epoch": 0.8149937106918239, "grad_norm": 0.5389293432235718, "kl": 0.108154296875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.727782130241394, "reward_std": 0.13676812127232552, "rewards/accuracy_reward": 0.7583944201469421, "rewards/format_reward": 0.9693877398967743, "step": 8099 }, { "completion_length": 251.05101776123047, "epoch": 0.8150943396226416, "grad_norm": 0.5774082541465759, "kl": 0.08740234375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.737370789051056, "reward_std": 0.1321662962436676, "rewards/accuracy_reward": 0.7475748658180237, "rewards/format_reward": 0.9897959232330322, "step": 8100 }, { "completion_length": 206.12244415283203, "epoch": 0.8151949685534591, "grad_norm": 0.5206725001335144, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.9453548789024353, "reward_std": 0.06104092299938202, "rewards/accuracy_reward": 0.9555589854717255, "rewards/format_reward": 0.9897959232330322, "step": 8101 }, { "completion_length": 218.948974609375, "epoch": 0.8152955974842767, "grad_norm": 0.7036762833595276, "kl": 0.0750732421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8952927589416504, "reward_std": 0.15533236786723137, "rewards/accuracy_reward": 0.9157009720802307, "rewards/format_reward": 0.9795918166637421, "step": 8102 }, { "completion_length": 285.2142791748047, "epoch": 0.8153962264150943, "grad_norm": 0.8348444104194641, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.6673182249069214, "reward_std": 0.13899778202176094, "rewards/accuracy_reward": 0.6775223612785339, "rewards/format_reward": 0.9897959232330322, "step": 8103 }, { "completion_length": 206.59183502197266, "epoch": 0.815496855345912, "grad_norm": 0.4430575370788574, "kl": 0.105712890625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7929498553276062, "reward_std": 0.08551008068025112, "rewards/accuracy_reward": 0.8031539916992188, "rewards/format_reward": 0.9897959232330322, "step": 8104 }, { "completion_length": 289.4387741088867, "epoch": 0.8155974842767295, "grad_norm": 0.7021031975746155, "kl": 0.1025390625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7585914134979248, "reward_std": 0.15618648752570152, "rewards/accuracy_reward": 0.7687955498695374, "rewards/format_reward": 0.9897959232330322, "step": 8105 }, { "completion_length": 284.9795837402344, "epoch": 0.8156981132075471, "grad_norm": 0.7687286734580994, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7810855507850647, "reward_std": 0.14135928079485893, "rewards/accuracy_reward": 0.8014938533306122, "rewards/format_reward": 0.9795918464660645, "step": 8106 }, { "completion_length": 321.12245178222656, "epoch": 0.8157987421383648, "grad_norm": 1.0039719343185425, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.721017301082611, "reward_std": 0.25227902084589005, "rewards/accuracy_reward": 0.7618336379528046, "rewards/format_reward": 0.9591836333274841, "step": 8107 }, { "completion_length": 222.07142639160156, "epoch": 0.8158993710691824, "grad_norm": 0.7501165270805359, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.9080533981323242, "reward_std": 0.10249796509742737, "rewards/accuracy_reward": 0.9080534279346466, "rewards/format_reward": 1.0, "step": 8108 }, { "completion_length": 305.7142868041992, "epoch": 0.816, "grad_norm": 0.43101102113723755, "kl": 0.099365234375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7718359231948853, "reward_std": 0.21018575131893158, "rewards/accuracy_reward": 0.8024482429027557, "rewards/format_reward": 0.9693877398967743, "step": 8109 }, { "completion_length": 267.77549743652344, "epoch": 0.8161006289308176, "grad_norm": 1.2891857624053955, "kl": 0.164794921875, "learning_rate": 1e-06, "loss": 0.0066, "reward": 1.706127941608429, "reward_std": 0.21348517388105392, "rewards/accuracy_reward": 0.7571483850479126, "rewards/format_reward": 0.9489795863628387, "step": 8110 }, { "completion_length": 260.73468017578125, "epoch": 0.8162012578616352, "grad_norm": 0.7625797390937805, "kl": 0.09130859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.726277232170105, "reward_std": 0.1568518839776516, "rewards/accuracy_reward": 0.7364812791347504, "rewards/format_reward": 0.9897959232330322, "step": 8111 }, { "completion_length": 243.6938705444336, "epoch": 0.8163018867924529, "grad_norm": 0.515538215637207, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.802721083164215, "reward_std": 0.12484510987997055, "rewards/accuracy_reward": 0.8129251599311829, "rewards/format_reward": 0.9897959232330322, "step": 8112 }, { "completion_length": 211.99999237060547, "epoch": 0.8164025157232705, "grad_norm": 1.8016289472579956, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8017168641090393, "reward_std": 0.13658741675317287, "rewards/accuracy_reward": 0.8119209408760071, "rewards/format_reward": 0.9897959232330322, "step": 8113 }, { "completion_length": 287.77549743652344, "epoch": 0.816503144654088, "grad_norm": 0.5643607974052429, "kl": 0.086181640625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.712649166584015, "reward_std": 0.09238797007128596, "rewards/accuracy_reward": 0.7126492261886597, "rewards/format_reward": 1.0, "step": 8114 }, { "completion_length": 271.06121826171875, "epoch": 0.8166037735849057, "grad_norm": 0.4215463101863861, "kl": 0.11279296875, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.7810468077659607, "reward_std": 0.11582650989294052, "rewards/accuracy_reward": 0.7912509441375732, "rewards/format_reward": 0.9897959232330322, "step": 8115 }, { "completion_length": 173.45917510986328, "epoch": 0.8167044025157233, "grad_norm": 1.0709638595581055, "kl": 0.126953125, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.8163264989852905, "reward_std": 0.11917255818843842, "rewards/accuracy_reward": 0.8163265287876129, "rewards/format_reward": 1.0, "step": 8116 }, { "completion_length": 292.38775634765625, "epoch": 0.8168050314465409, "grad_norm": 0.5378186106681824, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8805784583091736, "reward_std": 0.10866820625960827, "rewards/accuracy_reward": 0.9009865820407867, "rewards/format_reward": 0.9795918166637421, "step": 8117 }, { "completion_length": 241.93878173828125, "epoch": 0.8169056603773585, "grad_norm": 0.9979655742645264, "kl": 0.108642578125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.6376176476478577, "reward_std": 0.2556503415107727, "rewards/accuracy_reward": 0.6580258905887604, "rewards/format_reward": 0.9795918166637421, "step": 8118 }, { "completion_length": 183.9081573486328, "epoch": 0.8170062893081761, "grad_norm": 0.594315230846405, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8907235264778137, "reward_std": 0.10706343874335289, "rewards/accuracy_reward": 0.9111317098140717, "rewards/format_reward": 0.9795918464660645, "step": 8119 }, { "completion_length": 193.23468780517578, "epoch": 0.8171069182389937, "grad_norm": 0.5763953328132629, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8474128246307373, "reward_std": 0.17526395618915558, "rewards/accuracy_reward": 0.8678210377693176, "rewards/format_reward": 0.9795918166637421, "step": 8120 }, { "completion_length": 262.11224365234375, "epoch": 0.8172075471698114, "grad_norm": 0.608119785785675, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8054885268211365, "reward_std": 0.15036308765411377, "rewards/accuracy_reward": 0.815692663192749, "rewards/format_reward": 0.9897959232330322, "step": 8121 }, { "completion_length": 215.51020050048828, "epoch": 0.8173081761006289, "grad_norm": 0.7437085509300232, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.9443250894546509, "reward_std": 0.12804801017045975, "rewards/accuracy_reward": 0.9647332727909088, "rewards/format_reward": 0.9795918166637421, "step": 8122 }, { "completion_length": 179.31632232666016, "epoch": 0.8174088050314465, "grad_norm": 0.9806138277053833, "kl": 0.097412109375, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8954080939292908, "reward_std": 0.08774175215512514, "rewards/accuracy_reward": 0.9056122303009033, "rewards/format_reward": 0.9897959232330322, "step": 8123 }, { "completion_length": 186.15306091308594, "epoch": 0.8175094339622642, "grad_norm": 0.4724993407726288, "kl": 0.102294921875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.9102787971496582, "reward_std": 0.09934856276959181, "rewards/accuracy_reward": 0.920482873916626, "rewards/format_reward": 0.9897959232330322, "step": 8124 }, { "completion_length": 196.64285278320312, "epoch": 0.8176100628930818, "grad_norm": 1.1353976726531982, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8053935766220093, "reward_std": 0.23325789719820023, "rewards/accuracy_reward": 0.8360057771205902, "rewards/format_reward": 0.9693877398967743, "step": 8125 }, { "completion_length": 225.26529693603516, "epoch": 0.8177106918238993, "grad_norm": 0.3681904673576355, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7849854230880737, "reward_std": 0.12141092866659164, "rewards/accuracy_reward": 0.7951894998550415, "rewards/format_reward": 0.9897959232330322, "step": 8126 }, { "completion_length": 236.5, "epoch": 0.817811320754717, "grad_norm": 0.9944829344749451, "kl": 0.097900390625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8823128938674927, "reward_std": 0.1369277462363243, "rewards/accuracy_reward": 0.8823129236698151, "rewards/format_reward": 1.0, "step": 8127 }, { "completion_length": 266.42857360839844, "epoch": 0.8179119496855346, "grad_norm": 0.43708324432373047, "kl": 0.0654296875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.713799774646759, "reward_std": 0.14281179755926132, "rewards/accuracy_reward": 0.7342079281806946, "rewards/format_reward": 0.9795918464660645, "step": 8128 }, { "completion_length": 226.25509643554688, "epoch": 0.8180125786163522, "grad_norm": 0.8299222588539124, "kl": 0.117919921875, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.615483582019806, "reward_std": 0.15455443412065506, "rewards/accuracy_reward": 0.6154835373163223, "rewards/format_reward": 1.0, "step": 8129 }, { "completion_length": 227.17346954345703, "epoch": 0.8181132075471698, "grad_norm": 0.5400635004043579, "kl": 0.0814208984375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7293135523796082, "reward_std": 0.1351943574845791, "rewards/accuracy_reward": 0.7497217059135437, "rewards/format_reward": 0.9795918166637421, "step": 8130 }, { "completion_length": 223.65306091308594, "epoch": 0.8182138364779874, "grad_norm": 0.4552214443683624, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.9021823406219482, "reward_std": 0.07387144863605499, "rewards/accuracy_reward": 0.9123864769935608, "rewards/format_reward": 0.9897959232330322, "step": 8131 }, { "completion_length": 224.49999237060547, "epoch": 0.818314465408805, "grad_norm": 0.36143752932548523, "kl": 0.126953125, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.83222895860672, "reward_std": 0.10717221722006798, "rewards/accuracy_reward": 0.8424330651760101, "rewards/format_reward": 0.9897959232330322, "step": 8132 }, { "completion_length": 186.83673095703125, "epoch": 0.8184150943396227, "grad_norm": 1.2796704769134521, "kl": 0.162109375, "learning_rate": 1e-06, "loss": 0.0065, "reward": 1.700316071510315, "reward_std": 0.1447504535317421, "rewards/accuracy_reward": 0.7309283912181854, "rewards/format_reward": 0.9693877398967743, "step": 8133 }, { "completion_length": 209.02040100097656, "epoch": 0.8185157232704402, "grad_norm": 0.7865212559700012, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.853703796863556, "reward_std": 0.19959130883216858, "rewards/accuracy_reward": 0.8741121292114258, "rewards/format_reward": 0.9795918464660645, "step": 8134 }, { "completion_length": 183.32652282714844, "epoch": 0.8186163522012578, "grad_norm": 0.9685254096984863, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.856369137763977, "reward_std": 0.11868245154619217, "rewards/accuracy_reward": 0.856369137763977, "rewards/format_reward": 1.0, "step": 8135 }, { "completion_length": 252.91836547851562, "epoch": 0.8187169811320755, "grad_norm": 1.3708829879760742, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6332408785820007, "reward_std": 0.1928611546754837, "rewards/accuracy_reward": 0.6740573048591614, "rewards/format_reward": 0.9591836631298065, "step": 8136 }, { "completion_length": 214.08162689208984, "epoch": 0.8188176100628931, "grad_norm": 0.8647743463516235, "kl": 0.0889892578125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.723761260509491, "reward_std": 0.21920707076787949, "rewards/accuracy_reward": 0.7543734908103943, "rewards/format_reward": 0.9693877398967743, "step": 8137 }, { "completion_length": 257.79591369628906, "epoch": 0.8189182389937107, "grad_norm": 0.9943005442619324, "kl": 0.072509765625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7074940204620361, "reward_std": 0.24881191551685333, "rewards/accuracy_reward": 0.7381062507629395, "rewards/format_reward": 0.9693877398967743, "step": 8138 }, { "completion_length": 219.68366241455078, "epoch": 0.8190188679245283, "grad_norm": 3.0293447971343994, "kl": 0.098876953125, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7106987833976746, "reward_std": 0.20037377253174782, "rewards/accuracy_reward": 0.710698813199997, "rewards/format_reward": 1.0, "step": 8139 }, { "completion_length": 224.60203552246094, "epoch": 0.8191194968553459, "grad_norm": 1.3213791847229004, "kl": 0.1083984375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7762404084205627, "reward_std": 0.1312425099313259, "rewards/accuracy_reward": 0.7864445149898529, "rewards/format_reward": 0.9897959232330322, "step": 8140 }, { "completion_length": 185.08162689208984, "epoch": 0.8192201257861635, "grad_norm": 0.8961305022239685, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8133253455162048, "reward_std": 0.2665098309516907, "rewards/accuracy_reward": 0.8439375758171082, "rewards/format_reward": 0.9693877398967743, "step": 8141 }, { "completion_length": 199.64285278320312, "epoch": 0.8193207547169812, "grad_norm": 1.0467227697372437, "kl": 0.162109375, "learning_rate": 1e-06, "loss": 0.0065, "reward": 1.7185086607933044, "reward_std": 0.12484848126769066, "rewards/accuracy_reward": 0.7287128269672394, "rewards/format_reward": 0.9897959232330322, "step": 8142 }, { "completion_length": 195.78570556640625, "epoch": 0.8194213836477987, "grad_norm": 0.479914128780365, "kl": 0.099853515625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8986173272132874, "reward_std": 0.09220623597502708, "rewards/accuracy_reward": 0.8986172676086426, "rewards/format_reward": 1.0, "step": 8143 }, { "completion_length": 148.42857360839844, "epoch": 0.8195220125786163, "grad_norm": 0.5529284477233887, "kl": 0.0814208984375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8265305757522583, "reward_std": 0.06970714777708054, "rewards/accuracy_reward": 0.8265306055545807, "rewards/format_reward": 1.0, "step": 8144 }, { "completion_length": 217.8775405883789, "epoch": 0.819622641509434, "grad_norm": 0.9692100286483765, "kl": 0.0770263671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8564273118972778, "reward_std": 0.13331250846385956, "rewards/accuracy_reward": 0.8666312992572784, "rewards/format_reward": 0.9897959232330322, "step": 8145 }, { "completion_length": 174.11224365234375, "epoch": 0.8197232704402516, "grad_norm": 1.856506109237671, "kl": 0.126220703125, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.7896015048027039, "reward_std": 0.10967937856912613, "rewards/accuracy_reward": 0.7896015346050262, "rewards/format_reward": 1.0, "step": 8146 }, { "completion_length": 249.12244415283203, "epoch": 0.8198238993710691, "grad_norm": 0.5105123519897461, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6051587462425232, "reward_std": 0.19366078078746796, "rewards/accuracy_reward": 0.6459750533103943, "rewards/format_reward": 0.9591836333274841, "step": 8147 }, { "completion_length": 275.7448959350586, "epoch": 0.8199245283018868, "grad_norm": 0.84047931432724, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6940618753433228, "reward_std": 0.1658065766096115, "rewards/accuracy_reward": 0.7246741354465485, "rewards/format_reward": 0.9693877398967743, "step": 8148 }, { "completion_length": 185.67346954345703, "epoch": 0.8200251572327044, "grad_norm": 0.41844016313552856, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8163264989852905, "reward_std": 0.03818017616868019, "rewards/accuracy_reward": 0.8163264989852905, "rewards/format_reward": 1.0, "step": 8149 }, { "completion_length": 153.9285659790039, "epoch": 0.820125786163522, "grad_norm": 8.744664192199707, "kl": 0.10498046875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.851010024547577, "reward_std": 0.17640934884548187, "rewards/accuracy_reward": 0.8510101139545441, "rewards/format_reward": 1.0, "step": 8150 }, { "completion_length": 275.08162689208984, "epoch": 0.8202264150943396, "grad_norm": 0.6684731841087341, "kl": 0.109619140625, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8032774329185486, "reward_std": 0.1744624450802803, "rewards/accuracy_reward": 0.8338897228240967, "rewards/format_reward": 0.9693877398967743, "step": 8151 }, { "completion_length": 259.6530456542969, "epoch": 0.8203270440251572, "grad_norm": 0.5718665719032288, "kl": 0.111328125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.5617689490318298, "reward_std": 0.16516976431012154, "rewards/accuracy_reward": 0.5821771025657654, "rewards/format_reward": 0.9795918464660645, "step": 8152 }, { "completion_length": 161.02040100097656, "epoch": 0.8204276729559749, "grad_norm": 1.011971116065979, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7783887386322021, "reward_std": 0.13648933172225952, "rewards/accuracy_reward": 0.7885929048061371, "rewards/format_reward": 0.9897959232330322, "step": 8153 }, { "completion_length": 178.07142639160156, "epoch": 0.8205283018867925, "grad_norm": 0.5801910758018494, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8557823300361633, "reward_std": 0.07892763987183571, "rewards/accuracy_reward": 0.8659864068031311, "rewards/format_reward": 0.9897959232330322, "step": 8154 }, { "completion_length": 176.75509643554688, "epoch": 0.82062893081761, "grad_norm": 0.43229731917381287, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8600582480430603, "reward_std": 0.0617084875702858, "rewards/accuracy_reward": 0.8804664611816406, "rewards/format_reward": 0.9795918166637421, "step": 8155 }, { "completion_length": 205.03060913085938, "epoch": 0.8207295597484277, "grad_norm": 1.7910099029541016, "kl": 0.126220703125, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.7538814544677734, "reward_std": 0.14490798115730286, "rewards/accuracy_reward": 0.764085590839386, "rewards/format_reward": 0.9897959232330322, "step": 8156 }, { "completion_length": 190.53060913085938, "epoch": 0.8208301886792453, "grad_norm": 1.6109966039657593, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.6642901301383972, "reward_std": 0.12707144021987915, "rewards/accuracy_reward": 0.6642901301383972, "rewards/format_reward": 1.0, "step": 8157 }, { "completion_length": 248.36734008789062, "epoch": 0.8209308176100629, "grad_norm": 0.544887363910675, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7333333492279053, "reward_std": 0.19323867559432983, "rewards/accuracy_reward": 0.7639455795288086, "rewards/format_reward": 0.9693877398967743, "step": 8158 }, { "completion_length": 195.0, "epoch": 0.8210314465408804, "grad_norm": 0.6609986424446106, "kl": 0.108642578125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.9603058099746704, "reward_std": 0.072426101192832, "rewards/accuracy_reward": 0.960305780172348, "rewards/format_reward": 1.0, "step": 8159 }, { "completion_length": 259.9693832397461, "epoch": 0.8211320754716981, "grad_norm": 0.6240255832672119, "kl": 0.112548828125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.7770575881004333, "reward_std": 0.1602904498577118, "rewards/accuracy_reward": 0.8076698780059814, "rewards/format_reward": 0.9693877398967743, "step": 8160 }, { "completion_length": 234.10203552246094, "epoch": 0.8212327044025157, "grad_norm": 0.6992682814598083, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7410790920257568, "reward_std": 0.1762738898396492, "rewards/accuracy_reward": 0.7614873349666595, "rewards/format_reward": 0.9795918464660645, "step": 8161 }, { "completion_length": 256.46937561035156, "epoch": 0.8213333333333334, "grad_norm": 1.2431296110153198, "kl": 0.096923828125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7174035906791687, "reward_std": 0.16566764563322067, "rewards/accuracy_reward": 0.7378117442131042, "rewards/format_reward": 0.9795918464660645, "step": 8162 }, { "completion_length": 177.67346954345703, "epoch": 0.821433962264151, "grad_norm": 1.2103344202041626, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7184622287750244, "reward_std": 0.1757541447877884, "rewards/accuracy_reward": 0.7286662757396698, "rewards/format_reward": 0.9897959232330322, "step": 8163 }, { "completion_length": 204.85713958740234, "epoch": 0.8215345911949685, "grad_norm": 0.6592879891395569, "kl": 0.0714111328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.8754498958587646, "reward_std": 0.18934328109025955, "rewards/accuracy_reward": 0.8856540024280548, "rewards/format_reward": 0.9897959232330322, "step": 8164 }, { "completion_length": 225.28570556640625, "epoch": 0.8216352201257862, "grad_norm": 0.8881369233131409, "kl": 0.127197265625, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.7380046844482422, "reward_std": 0.26196179538965225, "rewards/accuracy_reward": 0.7584128975868225, "rewards/format_reward": 0.9795918166637421, "step": 8165 }, { "completion_length": 196.04080963134766, "epoch": 0.8217358490566038, "grad_norm": 0.5526814460754395, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7589626908302307, "reward_std": 0.13806133717298508, "rewards/accuracy_reward": 0.7589627802371979, "rewards/format_reward": 1.0, "step": 8166 }, { "completion_length": 151.03060913085938, "epoch": 0.8218364779874214, "grad_norm": 0.7727159857749939, "kl": 0.100341796875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7150955200195312, "reward_std": 0.11915627866983414, "rewards/accuracy_reward": 0.725299596786499, "rewards/format_reward": 0.9897959232330322, "step": 8167 }, { "completion_length": 160.32653045654297, "epoch": 0.821937106918239, "grad_norm": 1.1962392330169678, "kl": 0.097900390625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8004329204559326, "reward_std": 0.11879347264766693, "rewards/accuracy_reward": 0.8208410739898682, "rewards/format_reward": 0.9795918464660645, "step": 8168 }, { "completion_length": 234.17346954345703, "epoch": 0.8220377358490566, "grad_norm": 0.5930505394935608, "kl": 0.14404296875, "learning_rate": 1e-06, "loss": 0.0058, "reward": 1.8300549387931824, "reward_std": 0.13697662577033043, "rewards/accuracy_reward": 0.8300549685955048, "rewards/format_reward": 1.0, "step": 8169 }, { "completion_length": 177.34693908691406, "epoch": 0.8221383647798742, "grad_norm": 1.1597687005996704, "kl": 0.103759765625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7282129526138306, "reward_std": 0.1157775092869997, "rewards/accuracy_reward": 0.7282130420207977, "rewards/format_reward": 1.0, "step": 8170 }, { "completion_length": 218.6734619140625, "epoch": 0.8222389937106919, "grad_norm": 0.25534749031066895, "kl": 0.10693359375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.891774833202362, "reward_std": 0.01145346648991108, "rewards/accuracy_reward": 0.8917748928070068, "rewards/format_reward": 1.0, "step": 8171 }, { "completion_length": 213.78570556640625, "epoch": 0.8223396226415094, "grad_norm": 0.4013763666152954, "kl": 0.103515625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7559857368469238, "reward_std": 0.1121826320886612, "rewards/accuracy_reward": 0.7865980863571167, "rewards/format_reward": 0.9693877398967743, "step": 8172 }, { "completion_length": 177.03060913085938, "epoch": 0.822440251572327, "grad_norm": 0.6397393345832825, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.850000023841858, "reward_std": 0.11028805375099182, "rewards/accuracy_reward": 0.8499999642372131, "rewards/format_reward": 1.0, "step": 8173 }, { "completion_length": 244.8775405883789, "epoch": 0.8225408805031447, "grad_norm": 0.9011102318763733, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7658605575561523, "reward_std": 0.1901024505496025, "rewards/accuracy_reward": 0.776064544916153, "rewards/format_reward": 0.9897959232330322, "step": 8174 }, { "completion_length": 194.2448959350586, "epoch": 0.8226415094339623, "grad_norm": 0.48762112855911255, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7656319737434387, "reward_std": 0.10567882284522057, "rewards/accuracy_reward": 0.7758360207080841, "rewards/format_reward": 0.9897959232330322, "step": 8175 }, { "completion_length": 196.2653045654297, "epoch": 0.8227421383647798, "grad_norm": 0.4732377827167511, "kl": 0.13232421875, "learning_rate": 1e-06, "loss": 0.0053, "reward": 1.8214166760444641, "reward_std": 0.05388933606445789, "rewards/accuracy_reward": 0.8214167654514313, "rewards/format_reward": 1.0, "step": 8176 }, { "completion_length": 246.54080963134766, "epoch": 0.8228427672955975, "grad_norm": 0.9003240466117859, "kl": 0.0750732421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.846691370010376, "reward_std": 0.15746963024139404, "rewards/accuracy_reward": 0.8568954765796661, "rewards/format_reward": 0.9897959232330322, "step": 8177 }, { "completion_length": 226.2653045654297, "epoch": 0.8229433962264151, "grad_norm": 2.4633126258850098, "kl": 0.060546875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7489795088768005, "reward_std": 0.1759442836046219, "rewards/accuracy_reward": 0.7489795982837677, "rewards/format_reward": 1.0, "step": 8178 }, { "completion_length": 232.66326141357422, "epoch": 0.8230440251572327, "grad_norm": 0.4285210371017456, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8328679203987122, "reward_std": 0.0520162433385849, "rewards/accuracy_reward": 0.8328680396080017, "rewards/format_reward": 1.0, "step": 8179 }, { "completion_length": 238.60203552246094, "epoch": 0.8231446540880503, "grad_norm": 0.858905017375946, "kl": 0.08251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7856332659721375, "reward_std": 0.17351023852825165, "rewards/accuracy_reward": 0.806041419506073, "rewards/format_reward": 0.9795918464660645, "step": 8180 }, { "completion_length": 247.5408172607422, "epoch": 0.8232452830188679, "grad_norm": 0.4735754430294037, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8178107142448425, "reward_std": 0.12543493509292603, "rewards/accuracy_reward": 0.8178107440471649, "rewards/format_reward": 1.0, "step": 8181 }, { "completion_length": 220.59183502197266, "epoch": 0.8233459119496855, "grad_norm": 0.6106826066970825, "kl": 0.09375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7320555448532104, "reward_std": 0.09374591335654259, "rewards/accuracy_reward": 0.7320555746555328, "rewards/format_reward": 1.0, "step": 8182 }, { "completion_length": 245.53060150146484, "epoch": 0.8234465408805032, "grad_norm": 0.5439995527267456, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.9027210474014282, "reward_std": 0.11057436652481556, "rewards/accuracy_reward": 0.9027210772037506, "rewards/format_reward": 1.0, "step": 8183 }, { "completion_length": 290.06121826171875, "epoch": 0.8235471698113207, "grad_norm": 0.5918445587158203, "kl": 0.05712890625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8431044220924377, "reward_std": 0.183366559445858, "rewards/accuracy_reward": 0.8431044816970825, "rewards/format_reward": 1.0, "step": 8184 }, { "completion_length": 181.7040786743164, "epoch": 0.8236477987421383, "grad_norm": 1.2963082790374756, "kl": 0.079833984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8204081654548645, "reward_std": 0.18053733184933662, "rewards/accuracy_reward": 0.8306122422218323, "rewards/format_reward": 0.9897959232330322, "step": 8185 }, { "completion_length": 247.4387664794922, "epoch": 0.823748427672956, "grad_norm": 0.8616045713424683, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.686488687992096, "reward_std": 0.1761203557252884, "rewards/accuracy_reward": 0.686488687992096, "rewards/format_reward": 1.0, "step": 8186 }, { "completion_length": 234.34693145751953, "epoch": 0.8238490566037736, "grad_norm": 0.8551561832427979, "kl": 0.107421875, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.761760175228119, "reward_std": 0.102378292940557, "rewards/accuracy_reward": 0.7617602944374084, "rewards/format_reward": 1.0, "step": 8187 }, { "completion_length": 178.34693145751953, "epoch": 0.8239496855345912, "grad_norm": 0.3343687355518341, "kl": 0.0748291015625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.9268044829368591, "reward_std": 0.028781231492757797, "rewards/accuracy_reward": 0.9268044829368591, "rewards/format_reward": 1.0, "step": 8188 }, { "completion_length": 258.1530456542969, "epoch": 0.8240503144654088, "grad_norm": 0.4142351746559143, "kl": 0.09228515625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8177040219306946, "reward_std": 0.08765485137701035, "rewards/accuracy_reward": 0.8381121754646301, "rewards/format_reward": 0.9795918464660645, "step": 8189 }, { "completion_length": 205.31632232666016, "epoch": 0.8241509433962264, "grad_norm": 0.6353245973587036, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8700602054595947, "reward_std": 0.10657485574483871, "rewards/accuracy_reward": 0.8700602352619171, "rewards/format_reward": 1.0, "step": 8190 }, { "completion_length": 201.2040786743164, "epoch": 0.824251572327044, "grad_norm": 0.6173679232597351, "kl": 0.0794677734375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8468537330627441, "reward_std": 0.11844188719987869, "rewards/accuracy_reward": 0.8570578396320343, "rewards/format_reward": 0.9897959232330322, "step": 8191 }, { "completion_length": 287.7652893066406, "epoch": 0.8243522012578617, "grad_norm": 0.7869815826416016, "kl": 0.1162109375, "learning_rate": 1e-06, "loss": 0.0047, "reward": 1.7865141034126282, "reward_std": 0.14559153467416763, "rewards/accuracy_reward": 0.8069222867488861, "rewards/format_reward": 0.9795918464660645, "step": 8192 }, { "completion_length": 274.17346954345703, "epoch": 0.8244528301886792, "grad_norm": 0.43337559700012207, "kl": 0.07421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.800207793712616, "reward_std": 0.11255700141191483, "rewards/accuracy_reward": 0.8104119002819061, "rewards/format_reward": 0.9897959232330322, "step": 8193 }, { "completion_length": 233.30611419677734, "epoch": 0.8245534591194968, "grad_norm": 0.4706260561943054, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7303206324577332, "reward_std": 0.09667400270700455, "rewards/accuracy_reward": 0.7405247986316681, "rewards/format_reward": 0.9897959232330322, "step": 8194 }, { "completion_length": 228.07141876220703, "epoch": 0.8246540880503145, "grad_norm": 0.5176210999488831, "kl": 0.0689697265625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8823128938674927, "reward_std": 0.10851360484957695, "rewards/accuracy_reward": 0.8925170004367828, "rewards/format_reward": 0.9897959232330322, "step": 8195 }, { "completion_length": 246.6530532836914, "epoch": 0.8247547169811321, "grad_norm": 2.457756757736206, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8307050466537476, "reward_std": 0.06692745629698038, "rewards/accuracy_reward": 0.8307049870491028, "rewards/format_reward": 1.0, "step": 8196 }, { "completion_length": 201.62244415283203, "epoch": 0.8248553459119496, "grad_norm": 1.1183267831802368, "kl": 0.0750732421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.803243339061737, "reward_std": 0.04845372587442398, "rewards/accuracy_reward": 0.8032434284687042, "rewards/format_reward": 1.0, "step": 8197 }, { "completion_length": 236.06122589111328, "epoch": 0.8249559748427673, "grad_norm": 0.6860868334770203, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.6938774585723877, "reward_std": 0.20226720720529556, "rewards/accuracy_reward": 0.7040815949440002, "rewards/format_reward": 0.9897959232330322, "step": 8198 }, { "completion_length": 365.06121826171875, "epoch": 0.8250566037735849, "grad_norm": 0.40582820773124695, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.646031677722931, "reward_std": 0.11812880262732506, "rewards/accuracy_reward": 0.6562357842922211, "rewards/format_reward": 0.9897959232330322, "step": 8199 }, { "completion_length": 201.67346954345703, "epoch": 0.8251572327044026, "grad_norm": 6.634585857391357, "kl": 0.101318359375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.6538549661636353, "reward_std": 0.1438702642917633, "rewards/accuracy_reward": 0.6742631793022156, "rewards/format_reward": 0.9795918166637421, "step": 8200 }, { "completion_length": 335.95916748046875, "epoch": 0.8252578616352201, "grad_norm": 1.1239186525344849, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.6658081412315369, "reward_std": 0.2015044018626213, "rewards/accuracy_reward": 0.7066245675086975, "rewards/format_reward": 0.9591836333274841, "step": 8201 }, { "completion_length": 264.52040100097656, "epoch": 0.8253584905660377, "grad_norm": 0.490607351064682, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8333284854888916, "reward_std": 0.08363010361790657, "rewards/accuracy_reward": 0.8333285450935364, "rewards/format_reward": 1.0, "step": 8202 }, { "completion_length": 214.22447967529297, "epoch": 0.8254591194968554, "grad_norm": 0.385132372379303, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8408163189888, "reward_std": 0.0431959368288517, "rewards/accuracy_reward": 0.8510204255580902, "rewards/format_reward": 0.9897959232330322, "step": 8203 }, { "completion_length": 290.89794921875, "epoch": 0.825559748427673, "grad_norm": 0.7833883166313171, "kl": 0.10009765625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.740010917186737, "reward_std": 0.20805353671312332, "rewards/accuracy_reward": 0.7604190409183502, "rewards/format_reward": 0.9795918166637421, "step": 8204 }, { "completion_length": 247.60204315185547, "epoch": 0.8256603773584905, "grad_norm": 0.9271578788757324, "kl": 0.12890625, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.8356852531433105, "reward_std": 0.2112177386879921, "rewards/accuracy_reward": 0.8560934066772461, "rewards/format_reward": 0.9795918464660645, "step": 8205 }, { "completion_length": 300.9387741088867, "epoch": 0.8257610062893082, "grad_norm": 1.6997885704040527, "kl": 0.0894775390625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.581257700920105, "reward_std": 0.23084279894828796, "rewards/accuracy_reward": 0.6016659438610077, "rewards/format_reward": 0.9795918464660645, "step": 8206 }, { "completion_length": 322.9285583496094, "epoch": 0.8258616352201258, "grad_norm": 0.5482646226882935, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8514841198921204, "reward_std": 0.1354348547756672, "rewards/accuracy_reward": 0.8718923628330231, "rewards/format_reward": 0.9795918464660645, "step": 8207 }, { "completion_length": 232.01020050048828, "epoch": 0.8259622641509434, "grad_norm": 1.0096840858459473, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7265647649765015, "reward_std": 0.16697891056537628, "rewards/accuracy_reward": 0.7367689311504364, "rewards/format_reward": 0.9897959232330322, "step": 8208 }, { "completion_length": 204.7448959350586, "epoch": 0.826062893081761, "grad_norm": 0.6522997617721558, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.845392644405365, "reward_std": 0.07608375512063503, "rewards/accuracy_reward": 0.845392644405365, "rewards/format_reward": 1.0, "step": 8209 }, { "completion_length": 213.01020050048828, "epoch": 0.8261635220125786, "grad_norm": 1.3819996118545532, "kl": 0.0850830078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7886054515838623, "reward_std": 0.11628149077296257, "rewards/accuracy_reward": 0.7886054217815399, "rewards/format_reward": 1.0, "step": 8210 }, { "completion_length": 310.30611419677734, "epoch": 0.8262641509433962, "grad_norm": 0.8975456953048706, "kl": 0.092041015625, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.76451975107193, "reward_std": 0.23697929456830025, "rewards/accuracy_reward": 0.8155401349067688, "rewards/format_reward": 0.9489795863628387, "step": 8211 }, { "completion_length": 187.6836700439453, "epoch": 0.8263647798742139, "grad_norm": 0.7248384356498718, "kl": 0.083740234375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7823129296302795, "reward_std": 0.07211756519973278, "rewards/accuracy_reward": 0.7925169765949249, "rewards/format_reward": 0.9897959232330322, "step": 8212 }, { "completion_length": 207.95917510986328, "epoch": 0.8264654088050315, "grad_norm": 0.9380499124526978, "kl": 0.106201171875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8156565427780151, "reward_std": 0.14697201177477837, "rewards/accuracy_reward": 0.8156565725803375, "rewards/format_reward": 1.0, "step": 8213 }, { "completion_length": 267.9693832397461, "epoch": 0.826566037735849, "grad_norm": 0.6653892993927002, "kl": 0.0704345703125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7551020383834839, "reward_std": 0.15069952607154846, "rewards/accuracy_reward": 0.7653061151504517, "rewards/format_reward": 0.9897959232330322, "step": 8214 }, { "completion_length": 197.2244873046875, "epoch": 0.8266666666666667, "grad_norm": 1.1612951755523682, "kl": 0.152099609375, "learning_rate": 1e-06, "loss": 0.0061, "reward": 1.6736637353897095, "reward_std": 0.18147621676325798, "rewards/accuracy_reward": 0.6940719187259674, "rewards/format_reward": 0.9795918464660645, "step": 8215 }, { "completion_length": 295.28570556640625, "epoch": 0.8267672955974843, "grad_norm": 0.596169114112854, "kl": 0.0714111328125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6999999284744263, "reward_std": 0.22329312562942505, "rewards/accuracy_reward": 0.7510203719139099, "rewards/format_reward": 0.9489795565605164, "step": 8216 }, { "completion_length": 277.05101013183594, "epoch": 0.8268679245283019, "grad_norm": 0.6737810373306274, "kl": 0.0601806640625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.721088469028473, "reward_std": 0.12378795817494392, "rewards/accuracy_reward": 0.7312924563884735, "rewards/format_reward": 0.9897959232330322, "step": 8217 }, { "completion_length": 278.2653045654297, "epoch": 0.8269685534591195, "grad_norm": 0.8349868655204773, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6979297399520874, "reward_std": 0.11279077082872391, "rewards/accuracy_reward": 0.697929710149765, "rewards/format_reward": 1.0, "step": 8218 }, { "completion_length": 270.948974609375, "epoch": 0.8270691823899371, "grad_norm": 1.4934972524642944, "kl": 0.0728759765625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6914886236190796, "reward_std": 0.1625114604830742, "rewards/accuracy_reward": 0.7016927599906921, "rewards/format_reward": 0.9897959232330322, "step": 8219 }, { "completion_length": 277.0408172607422, "epoch": 0.8271698113207547, "grad_norm": 0.8144669532775879, "kl": 0.0589599609375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.739455759525299, "reward_std": 0.200858935713768, "rewards/accuracy_reward": 0.7496598660945892, "rewards/format_reward": 0.9897959232330322, "step": 8220 }, { "completion_length": 162.85713958740234, "epoch": 0.8272704402515724, "grad_norm": 0.9618061184883118, "kl": 0.105224609375, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7993197441101074, "reward_std": 0.19787004590034485, "rewards/accuracy_reward": 0.8401360809803009, "rewards/format_reward": 0.9591836631298065, "step": 8221 }, { "completion_length": 277.448974609375, "epoch": 0.8273710691823899, "grad_norm": 1.1589158773422241, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6234829425811768, "reward_std": 0.20926641672849655, "rewards/accuracy_reward": 0.6336870491504669, "rewards/format_reward": 0.9897959232330322, "step": 8222 }, { "completion_length": 267.9795913696289, "epoch": 0.8274716981132075, "grad_norm": 0.5422345399856567, "kl": 0.080810546875, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.594928801059723, "reward_std": 0.15484391897916794, "rewards/accuracy_reward": 0.6153370141983032, "rewards/format_reward": 0.9795918166637421, "step": 8223 }, { "completion_length": 236.80611419677734, "epoch": 0.8275723270440252, "grad_norm": 1.0397425889968872, "kl": 0.0621337890625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.833914041519165, "reward_std": 0.17645451426506042, "rewards/accuracy_reward": 0.8645263016223907, "rewards/format_reward": 0.9693877398967743, "step": 8224 }, { "completion_length": 236.30612182617188, "epoch": 0.8276729559748428, "grad_norm": 0.432912677526474, "kl": 0.080322265625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8062048554420471, "reward_std": 0.0647068815305829, "rewards/accuracy_reward": 0.8062048852443695, "rewards/format_reward": 1.0, "step": 8225 }, { "completion_length": 187.52040481567383, "epoch": 0.8277735849056603, "grad_norm": 1.874612808227539, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.918367326259613, "reward_std": 0.026997461915016174, "rewards/accuracy_reward": 0.918367326259613, "rewards/format_reward": 1.0, "step": 8226 }, { "completion_length": 270.5816192626953, "epoch": 0.827874213836478, "grad_norm": 0.5996285080909729, "kl": 0.0775146484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6746854186058044, "reward_std": 0.1644263006746769, "rewards/accuracy_reward": 0.6848895847797394, "rewards/format_reward": 0.9897959232330322, "step": 8227 }, { "completion_length": 204.97958374023438, "epoch": 0.8279748427672956, "grad_norm": 1.2317005395889282, "kl": 0.076416015625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8268963098526, "reward_std": 0.1811583936214447, "rewards/accuracy_reward": 0.8268963396549225, "rewards/format_reward": 1.0, "step": 8228 }, { "completion_length": 219.4081573486328, "epoch": 0.8280754716981132, "grad_norm": 3.3268730640411377, "kl": 0.09375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7076029181480408, "reward_std": 0.11102988570928574, "rewards/accuracy_reward": 0.7178071141242981, "rewards/format_reward": 0.9897959232330322, "step": 8229 }, { "completion_length": 262.4387741088867, "epoch": 0.8281761006289308, "grad_norm": 1.1466434001922607, "kl": 0.1279296875, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.7452049851417542, "reward_std": 0.11741030495613813, "rewards/accuracy_reward": 0.7554090023040771, "rewards/format_reward": 0.9897959232330322, "step": 8230 }, { "completion_length": 207.6734619140625, "epoch": 0.8282767295597484, "grad_norm": 0.9888952374458313, "kl": 0.11865234375, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.7891333103179932, "reward_std": 0.16368908435106277, "rewards/accuracy_reward": 0.7993374168872833, "rewards/format_reward": 0.9897959232330322, "step": 8231 }, { "completion_length": 218.7040786743164, "epoch": 0.828377358490566, "grad_norm": 0.7131022214889526, "kl": 0.096923828125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.823501706123352, "reward_std": 0.12443014979362488, "rewards/accuracy_reward": 0.8337058424949646, "rewards/format_reward": 0.9897959232330322, "step": 8232 }, { "completion_length": 315.1734619140625, "epoch": 0.8284779874213837, "grad_norm": 0.5093013048171997, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.801437795162201, "reward_std": 0.12450187653303146, "rewards/accuracy_reward": 0.8218459784984589, "rewards/format_reward": 0.9795918464660645, "step": 8233 }, { "completion_length": 189.25509643554688, "epoch": 0.8285786163522012, "grad_norm": 0.6475445628166199, "kl": 0.0743408203125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8061224222183228, "reward_std": 0.06517763435840607, "rewards/accuracy_reward": 0.8061224520206451, "rewards/format_reward": 1.0, "step": 8234 }, { "completion_length": 210.2142791748047, "epoch": 0.8286792452830188, "grad_norm": 0.675713300704956, "kl": 0.124267578125, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.8061224222183228, "reward_std": 0.14146549627184868, "rewards/accuracy_reward": 0.8265306055545807, "rewards/format_reward": 0.9795918464660645, "step": 8235 }, { "completion_length": 224.01020050048828, "epoch": 0.8287798742138365, "grad_norm": 1.2130722999572754, "kl": 0.10595703125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7385361790657043, "reward_std": 0.3255530446767807, "rewards/accuracy_reward": 0.7793525457382202, "rewards/format_reward": 0.9591836631298065, "step": 8236 }, { "completion_length": 245.03060913085938, "epoch": 0.8288805031446541, "grad_norm": 1.0428787469863892, "kl": 0.11083984375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.6887754797935486, "reward_std": 0.16474898904561996, "rewards/accuracy_reward": 0.7193877398967743, "rewards/format_reward": 0.9693877398967743, "step": 8237 }, { "completion_length": 254.6632537841797, "epoch": 0.8289811320754717, "grad_norm": 0.7435166239738464, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7042638659477234, "reward_std": 0.17531772702932358, "rewards/accuracy_reward": 0.7348760664463043, "rewards/format_reward": 0.9693877398967743, "step": 8238 }, { "completion_length": 246.30612182617188, "epoch": 0.8290817610062893, "grad_norm": 0.33164817094802856, "kl": 0.079345703125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8678478002548218, "reward_std": 0.1482987403869629, "rewards/accuracy_reward": 0.8882559537887573, "rewards/format_reward": 0.9795918166637421, "step": 8239 }, { "completion_length": 316.4081573486328, "epoch": 0.8291823899371069, "grad_norm": 0.7916126251220703, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6099762916564941, "reward_std": 0.22235523909330368, "rewards/accuracy_reward": 0.6405884772539139, "rewards/format_reward": 0.9693877398967743, "step": 8240 }, { "completion_length": 223.64285278320312, "epoch": 0.8292830188679245, "grad_norm": 0.5533948540687561, "kl": 0.124755859375, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.78212970495224, "reward_std": 0.1321867611259222, "rewards/accuracy_reward": 0.8229460120201111, "rewards/format_reward": 0.9591836631298065, "step": 8241 }, { "completion_length": 207.36734008789062, "epoch": 0.8293836477987422, "grad_norm": 0.8783348202705383, "kl": 0.14208984375, "learning_rate": 1e-06, "loss": 0.0057, "reward": 1.7603510618209839, "reward_std": 0.11770624294877052, "rewards/accuracy_reward": 0.770555168390274, "rewards/format_reward": 0.9897959232330322, "step": 8242 }, { "completion_length": 272.67346954345703, "epoch": 0.8294842767295597, "grad_norm": 0.409773588180542, "kl": 0.109130859375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8351885676383972, "reward_std": 0.059351859614253044, "rewards/accuracy_reward": 0.8351885974407196, "rewards/format_reward": 1.0, "step": 8243 }, { "completion_length": 169.02040100097656, "epoch": 0.8295849056603773, "grad_norm": 1.0412436723709106, "kl": 0.068359375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.9484935998916626, "reward_std": 0.07633893471211195, "rewards/accuracy_reward": 0.9689018130302429, "rewards/format_reward": 0.9795918166637421, "step": 8244 }, { "completion_length": 192.30612182617188, "epoch": 0.829685534591195, "grad_norm": 2.4246833324432373, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6917903423309326, "reward_std": 0.20118918269872665, "rewards/accuracy_reward": 0.7121984958648682, "rewards/format_reward": 0.9795918166637421, "step": 8245 }, { "completion_length": 199.2142791748047, "epoch": 0.8297861635220126, "grad_norm": 2.8371410369873047, "kl": 0.0966796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7227890491485596, "reward_std": 0.12367213517427444, "rewards/accuracy_reward": 0.7431972622871399, "rewards/format_reward": 0.9795918464660645, "step": 8246 }, { "completion_length": 276.28570556640625, "epoch": 0.8298867924528301, "grad_norm": 0.5224232077598572, "kl": 0.056396484375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8181817531585693, "reward_std": 0.1580786406993866, "rewards/accuracy_reward": 0.8283859193325043, "rewards/format_reward": 0.9897959232330322, "step": 8247 }, { "completion_length": 244.4081573486328, "epoch": 0.8299874213836478, "grad_norm": 1.2639981508255005, "kl": 0.115478515625, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.6046792268753052, "reward_std": 0.3867039382457733, "rewards/accuracy_reward": 0.6659037321805954, "rewards/format_reward": 0.9387754797935486, "step": 8248 }, { "completion_length": 261.6938705444336, "epoch": 0.8300880503144654, "grad_norm": 2.884336233139038, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7167750597000122, "reward_std": 0.24078793823719025, "rewards/accuracy_reward": 0.72697913646698, "rewards/format_reward": 0.9897959232330322, "step": 8249 }, { "completion_length": 272.33673095703125, "epoch": 0.8301886792452831, "grad_norm": 1.2446963787078857, "kl": 0.103271484375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7870438694953918, "reward_std": 0.18019897490739822, "rewards/accuracy_reward": 0.8482683897018433, "rewards/format_reward": 0.938775509595871, "step": 8250 }, { "completion_length": 307.24488830566406, "epoch": 0.8302893081761006, "grad_norm": 0.5459966063499451, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.702567219734192, "reward_std": 0.2586570009589195, "rewards/accuracy_reward": 0.7433835864067078, "rewards/format_reward": 0.9591836333274841, "step": 8251 }, { "completion_length": 279.0408172607422, "epoch": 0.8303899371069182, "grad_norm": 0.7667818665504456, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8499886393547058, "reward_std": 0.2103484831750393, "rewards/accuracy_reward": 0.9010090231895447, "rewards/format_reward": 0.9489795863628387, "step": 8252 }, { "completion_length": 276.4387664794922, "epoch": 0.8304905660377359, "grad_norm": 0.5913642644882202, "kl": 0.1044921875, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.646988868713379, "reward_std": 0.23082385957241058, "rewards/accuracy_reward": 0.6980092823505402, "rewards/format_reward": 0.9489795863628387, "step": 8253 }, { "completion_length": 188.32652282714844, "epoch": 0.8305911949685535, "grad_norm": 2.040498971939087, "kl": 0.10107421875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8098085522651672, "reward_std": 0.1142445057630539, "rewards/accuracy_reward": 0.8098086416721344, "rewards/format_reward": 1.0, "step": 8254 }, { "completion_length": 248.6836700439453, "epoch": 0.830691823899371, "grad_norm": 0.7467203140258789, "kl": 0.0567626953125, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.787755012512207, "reward_std": 0.19649925082921982, "rewards/accuracy_reward": 0.8081632554531097, "rewards/format_reward": 0.9795918464660645, "step": 8255 }, { "completion_length": 224.4897918701172, "epoch": 0.8307924528301887, "grad_norm": 0.9211542010307312, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6694943308830261, "reward_std": 0.22584842145442963, "rewards/accuracy_reward": 0.7001065909862518, "rewards/format_reward": 0.9693877398967743, "step": 8256 }, { "completion_length": 298.9591827392578, "epoch": 0.8308930817610063, "grad_norm": 0.4612695276737213, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.5752310156822205, "reward_std": 0.11337093636393547, "rewards/accuracy_reward": 0.5854350477457047, "rewards/format_reward": 0.9897959232330322, "step": 8257 }, { "completion_length": 285.12245178222656, "epoch": 0.8309937106918239, "grad_norm": 1.358024001121521, "kl": 0.0867919921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7663264870643616, "reward_std": 0.2527212053537369, "rewards/accuracy_reward": 0.8173469007015228, "rewards/format_reward": 0.9489795565605164, "step": 8258 }, { "completion_length": 257.2040786743164, "epoch": 0.8310943396226416, "grad_norm": 0.5049421787261963, "kl": 0.0836181640625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.792154610157013, "reward_std": 0.1164766363799572, "rewards/accuracy_reward": 0.8023587167263031, "rewards/format_reward": 0.9897959232330322, "step": 8259 }, { "completion_length": 301.8775329589844, "epoch": 0.8311949685534591, "grad_norm": 0.6662769317626953, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.795918345451355, "reward_std": 0.21808192878961563, "rewards/accuracy_reward": 0.8265305757522583, "rewards/format_reward": 0.9693877398967743, "step": 8260 }, { "completion_length": 321.1326446533203, "epoch": 0.8312955974842767, "grad_norm": 0.8142345547676086, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6557584404945374, "reward_std": 0.22754217684268951, "rewards/accuracy_reward": 0.6659625172615051, "rewards/format_reward": 0.9897959232330322, "step": 8261 }, { "completion_length": 230.8775405883789, "epoch": 0.8313962264150944, "grad_norm": 0.4997763931751251, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8548776507377625, "reward_std": 0.19894862174987793, "rewards/accuracy_reward": 0.8752858340740204, "rewards/format_reward": 0.9795918464660645, "step": 8262 }, { "completion_length": 213.61224365234375, "epoch": 0.831496855345912, "grad_norm": 1.5124845504760742, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7938775420188904, "reward_std": 0.18279437720775604, "rewards/accuracy_reward": 0.8142856657505035, "rewards/format_reward": 0.9795918166637421, "step": 8263 }, { "completion_length": 306.9183578491211, "epoch": 0.8315974842767295, "grad_norm": 0.5597595572471619, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.7721757888793945, "reward_std": 0.15410274267196655, "rewards/accuracy_reward": 0.7925840020179749, "rewards/format_reward": 0.9795918464660645, "step": 8264 }, { "completion_length": 275.1632614135742, "epoch": 0.8316981132075472, "grad_norm": 0.3917880952358246, "kl": 0.115234375, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.763080358505249, "reward_std": 0.10521206632256508, "rewards/accuracy_reward": 0.7834885120391846, "rewards/format_reward": 0.9795918464660645, "step": 8265 }, { "completion_length": 358.6836700439453, "epoch": 0.8317987421383648, "grad_norm": 0.5387508273124695, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6681638360023499, "reward_std": 0.1887584999203682, "rewards/accuracy_reward": 0.6987760365009308, "rewards/format_reward": 0.9693877398967743, "step": 8266 }, { "completion_length": 256.0204086303711, "epoch": 0.8318993710691824, "grad_norm": 0.5609199404716492, "kl": 0.09521484375, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7937560081481934, "reward_std": 0.12930070608854294, "rewards/accuracy_reward": 0.8039601147174835, "rewards/format_reward": 0.9897959232330322, "step": 8267 }, { "completion_length": 230.03060913085938, "epoch": 0.832, "grad_norm": 1.8278239965438843, "kl": 0.09326171875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8791080713272095, "reward_std": 0.184371218085289, "rewards/accuracy_reward": 0.8995162844657898, "rewards/format_reward": 0.9795918464660645, "step": 8268 }, { "completion_length": 278.24488830566406, "epoch": 0.8321006289308176, "grad_norm": 0.32813891768455505, "kl": 0.089111328125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6548823118209839, "reward_std": 0.11672877008095384, "rewards/accuracy_reward": 0.6752904653549194, "rewards/format_reward": 0.9795918166637421, "step": 8269 }, { "completion_length": 250.37754821777344, "epoch": 0.8322012578616352, "grad_norm": 0.5785277485847473, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.6228598356246948, "reward_std": 0.1434844397008419, "rewards/accuracy_reward": 0.6432679444551468, "rewards/format_reward": 0.9795918166637421, "step": 8270 }, { "completion_length": 231.2244873046875, "epoch": 0.8323018867924529, "grad_norm": 1.1345908641815186, "kl": 0.1328125, "learning_rate": 1e-06, "loss": 0.0053, "reward": 1.6464422941207886, "reward_std": 0.13351625204086304, "rewards/accuracy_reward": 0.6464423537254333, "rewards/format_reward": 1.0, "step": 8271 }, { "completion_length": 197.05101776123047, "epoch": 0.8324025157232704, "grad_norm": 0.6901242733001709, "kl": 0.129150390625, "learning_rate": 1e-06, "loss": 0.0052, "reward": 1.9171363711357117, "reward_std": 0.12923453003168106, "rewards/accuracy_reward": 0.9375445246696472, "rewards/format_reward": 0.9795918464660645, "step": 8272 }, { "completion_length": 242.87754821777344, "epoch": 0.832503144654088, "grad_norm": 0.6929619908332825, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8988255858421326, "reward_std": 0.08153385668992996, "rewards/accuracy_reward": 0.9090296924114227, "rewards/format_reward": 0.9897959232330322, "step": 8273 }, { "completion_length": 294.9591827392578, "epoch": 0.8326037735849057, "grad_norm": 0.9891514182090759, "kl": 0.103759765625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.5211811661720276, "reward_std": 0.1872970312833786, "rewards/accuracy_reward": 0.5415893197059631, "rewards/format_reward": 0.9795918166637421, "step": 8274 }, { "completion_length": 193.33673095703125, "epoch": 0.8327044025157233, "grad_norm": 1.2030824422836304, "kl": 0.089599609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.831383764743805, "reward_std": 0.113313689827919, "rewards/accuracy_reward": 0.8517920076847076, "rewards/format_reward": 0.9795918166637421, "step": 8275 }, { "completion_length": 256.4081573486328, "epoch": 0.8328050314465408, "grad_norm": 0.6524789929389954, "kl": 0.0736083984375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7160113453865051, "reward_std": 0.13944415003061295, "rewards/accuracy_reward": 0.7364194393157959, "rewards/format_reward": 0.9795918166637421, "step": 8276 }, { "completion_length": 265.2142791748047, "epoch": 0.8329056603773585, "grad_norm": 0.7823127508163452, "kl": 0.091064453125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.661252200603485, "reward_std": 0.1710667386651039, "rewards/accuracy_reward": 0.6714563965797424, "rewards/format_reward": 0.9897959232330322, "step": 8277 }, { "completion_length": 206.9081573486328, "epoch": 0.8330062893081761, "grad_norm": 0.5652687549591064, "kl": 0.07763671875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.785026490688324, "reward_std": 0.1587294638156891, "rewards/accuracy_reward": 0.7952305972576141, "rewards/format_reward": 0.9897959232330322, "step": 8278 }, { "completion_length": 297.39794921875, "epoch": 0.8331069182389937, "grad_norm": 0.8826209902763367, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.7312924265861511, "reward_std": 0.20972592383623123, "rewards/accuracy_reward": 0.7517006397247314, "rewards/format_reward": 0.9795918166637421, "step": 8279 }, { "completion_length": 240.16326904296875, "epoch": 0.8332075471698113, "grad_norm": 2.1632468700408936, "kl": 0.0845947265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8750653862953186, "reward_std": 0.12277885526418686, "rewards/accuracy_reward": 0.8954735994338989, "rewards/format_reward": 0.9795918166637421, "step": 8280 }, { "completion_length": 283.8571319580078, "epoch": 0.8333081761006289, "grad_norm": 0.5465570688247681, "kl": 0.103515625, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7869021892547607, "reward_std": 0.08839914947748184, "rewards/accuracy_reward": 0.7869021594524384, "rewards/format_reward": 1.0, "step": 8281 }, { "completion_length": 186.5306053161621, "epoch": 0.8334088050314465, "grad_norm": 0.9703532457351685, "kl": 0.1134033203125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.8166688680648804, "reward_std": 0.13922057300806046, "rewards/accuracy_reward": 0.8268728852272034, "rewards/format_reward": 0.9897959232330322, "step": 8282 }, { "completion_length": 214.27550506591797, "epoch": 0.8335094339622642, "grad_norm": 4.499970436096191, "kl": 0.123291015625, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.7855905890464783, "reward_std": 0.18702014535665512, "rewards/accuracy_reward": 0.7855905592441559, "rewards/format_reward": 1.0, "step": 8283 }, { "completion_length": 298.79591369628906, "epoch": 0.8336100628930818, "grad_norm": 1.0736957788467407, "kl": 0.10302734375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7323386073112488, "reward_std": 0.22958476096391678, "rewards/accuracy_reward": 0.7527468502521515, "rewards/format_reward": 0.9795918166637421, "step": 8284 }, { "completion_length": 229.12244415283203, "epoch": 0.8337106918238993, "grad_norm": 0.7742533087730408, "kl": 0.10302734375, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.7859165668487549, "reward_std": 0.15477920323610306, "rewards/accuracy_reward": 0.7961206436157227, "rewards/format_reward": 0.9897959232330322, "step": 8285 }, { "completion_length": 287.99999237060547, "epoch": 0.833811320754717, "grad_norm": 0.30239996314048767, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.713682770729065, "reward_std": 0.1057063490152359, "rewards/accuracy_reward": 0.7340909242630005, "rewards/format_reward": 0.9795918464660645, "step": 8286 }, { "completion_length": 198.79591369628906, "epoch": 0.8339119496855346, "grad_norm": 0.8621310591697693, "kl": 0.09326171875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7151154279708862, "reward_std": 0.1468476951122284, "rewards/accuracy_reward": 0.7151154577732086, "rewards/format_reward": 1.0, "step": 8287 }, { "completion_length": 236.8775405883789, "epoch": 0.8340125786163523, "grad_norm": 1.082733154296875, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.5890670418739319, "reward_std": 0.26340894401073456, "rewards/accuracy_reward": 0.629883348941803, "rewards/format_reward": 0.9591836631298065, "step": 8288 }, { "completion_length": 331.7142791748047, "epoch": 0.8341132075471698, "grad_norm": 1.858343482017517, "kl": 0.17724609375, "learning_rate": 1e-06, "loss": 0.0071, "reward": 1.6570810675621033, "reward_std": 0.3071891814470291, "rewards/accuracy_reward": 0.6978974342346191, "rewards/format_reward": 0.9591836333274841, "step": 8289 }, { "completion_length": 218.0, "epoch": 0.8342138364779874, "grad_norm": 0.4190845787525177, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8758835792541504, "reward_std": 0.09118317440152168, "rewards/accuracy_reward": 0.8860876858234406, "rewards/format_reward": 0.9897959232330322, "step": 8290 }, { "completion_length": 204.0204086303711, "epoch": 0.834314465408805, "grad_norm": 0.5809005498886108, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8804664611816406, "reward_std": 0.11144223064184189, "rewards/accuracy_reward": 0.8906705379486084, "rewards/format_reward": 0.9897959232330322, "step": 8291 }, { "completion_length": 217.52040100097656, "epoch": 0.8344150943396227, "grad_norm": 0.7198696732521057, "kl": 0.115966796875, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.6609944701194763, "reward_std": 0.11500485986471176, "rewards/accuracy_reward": 0.6609944403171539, "rewards/format_reward": 1.0, "step": 8292 }, { "completion_length": 279.61224365234375, "epoch": 0.8345157232704402, "grad_norm": 0.5949930548667908, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.5637156963348389, "reward_std": 0.10585499182343483, "rewards/accuracy_reward": 0.5943280160427094, "rewards/format_reward": 0.9693877398967743, "step": 8293 }, { "completion_length": 282.46937561035156, "epoch": 0.8346163522012578, "grad_norm": 0.5037947297096252, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.718991994857788, "reward_std": 0.13790279626846313, "rewards/accuracy_reward": 0.7291961014270782, "rewards/format_reward": 0.9897959232330322, "step": 8294 }, { "completion_length": 267.9285659790039, "epoch": 0.8347169811320755, "grad_norm": 1.0772356986999512, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.6530745029449463, "reward_std": 0.22535961866378784, "rewards/accuracy_reward": 0.6836867928504944, "rewards/format_reward": 0.9693877398967743, "step": 8295 }, { "completion_length": 225.34693145751953, "epoch": 0.8348176100628931, "grad_norm": 0.5991337299346924, "kl": 0.08349609375, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8187031149864197, "reward_std": 0.062096216250211, "rewards/accuracy_reward": 0.8289071321487427, "rewards/format_reward": 0.9897959232330322, "step": 8296 }, { "completion_length": 219.8775405883789, "epoch": 0.8349182389937106, "grad_norm": 0.5551031231880188, "kl": 0.0711669921875, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.81426203250885, "reward_std": 0.12754375860095024, "rewards/accuracy_reward": 0.8244661390781403, "rewards/format_reward": 0.9897959232330322, "step": 8297 }, { "completion_length": 188.2244873046875, "epoch": 0.8350188679245283, "grad_norm": 0.7761598825454712, "kl": 0.12158203125, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.8348329663276672, "reward_std": 0.1657058596611023, "rewards/accuracy_reward": 0.8552411496639252, "rewards/format_reward": 0.9795918464660645, "step": 8298 }, { "completion_length": 259.60204315185547, "epoch": 0.8351194968553459, "grad_norm": 0.9658054113388062, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.626166045665741, "reward_std": 0.20647913962602615, "rewards/accuracy_reward": 0.6567783951759338, "rewards/format_reward": 0.9693877398967743, "step": 8299 }, { "completion_length": 228.82652282714844, "epoch": 0.8352201257861636, "grad_norm": 0.7497379183769226, "kl": 0.0537109375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8653473854064941, "reward_std": 0.16493427753448486, "rewards/accuracy_reward": 0.8653474152088165, "rewards/format_reward": 1.0, "step": 8300 }, { "completion_length": 226.53060150146484, "epoch": 0.8353207547169811, "grad_norm": 0.8594685196876526, "kl": 0.099365234375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.8751881122589111, "reward_std": 0.10182258486747742, "rewards/accuracy_reward": 0.8751880824565887, "rewards/format_reward": 1.0, "step": 8301 }, { "completion_length": 211.15306091308594, "epoch": 0.8354213836477987, "grad_norm": 0.8585649132728577, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7284271121025085, "reward_std": 0.15995457768440247, "rewards/accuracy_reward": 0.7386312186717987, "rewards/format_reward": 0.9897959232330322, "step": 8302 }, { "completion_length": 301.24488830566406, "epoch": 0.8355220125786164, "grad_norm": 0.6476083993911743, "kl": 0.077392578125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8352503776550293, "reward_std": 0.18176454305648804, "rewards/accuracy_reward": 0.8454545140266418, "rewards/format_reward": 0.9897959232330322, "step": 8303 }, { "completion_length": 194.60203552246094, "epoch": 0.835622641509434, "grad_norm": 0.6704359650611877, "kl": 0.0660400390625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.767006754875183, "reward_std": 0.09639645926654339, "rewards/accuracy_reward": 0.7670067846775055, "rewards/format_reward": 1.0, "step": 8304 }, { "completion_length": 218.63265228271484, "epoch": 0.8357232704402515, "grad_norm": 1.165984034538269, "kl": 0.094482421875, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7625272274017334, "reward_std": 0.23594611138105392, "rewards/accuracy_reward": 0.7931395769119263, "rewards/format_reward": 0.9693877398967743, "step": 8305 }, { "completion_length": 244.83673095703125, "epoch": 0.8358238993710692, "grad_norm": 0.9473458528518677, "kl": 0.072998046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.905490756034851, "reward_std": 0.20723570883274078, "rewards/accuracy_reward": 0.9156948626041412, "rewards/format_reward": 0.9897959232330322, "step": 8306 }, { "completion_length": 204.77550506591797, "epoch": 0.8359245283018868, "grad_norm": 5.497875690460205, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7450833320617676, "reward_std": 0.23514901101589203, "rewards/accuracy_reward": 0.7858997881412506, "rewards/format_reward": 0.9591836631298065, "step": 8307 }, { "completion_length": 206.59183502197266, "epoch": 0.8360251572327044, "grad_norm": 0.5204489827156067, "kl": 0.0806884765625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8001375794410706, "reward_std": 0.1073145754635334, "rewards/accuracy_reward": 0.8103417158126831, "rewards/format_reward": 0.9897959232330322, "step": 8308 }, { "completion_length": 219.4693832397461, "epoch": 0.8361257861635221, "grad_norm": 1.3529610633850098, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.834121823310852, "reward_std": 0.1401081271469593, "rewards/accuracy_reward": 0.8443259298801422, "rewards/format_reward": 0.9897959232330322, "step": 8309 }, { "completion_length": 246.15306091308594, "epoch": 0.8362264150943396, "grad_norm": 0.9642765522003174, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7481821775436401, "reward_std": 0.195109024643898, "rewards/accuracy_reward": 0.7787944376468658, "rewards/format_reward": 0.9693877398967743, "step": 8310 }, { "completion_length": 250.71427154541016, "epoch": 0.8363270440251572, "grad_norm": 1.146234154701233, "kl": 0.140380859375, "learning_rate": 1e-06, "loss": 0.0056, "reward": 1.7143201231956482, "reward_std": 0.26431144028902054, "rewards/accuracy_reward": 0.7551364898681641, "rewards/format_reward": 0.9591836631298065, "step": 8311 }, { "completion_length": 280.9183654785156, "epoch": 0.8364276729559749, "grad_norm": 0.7632768750190735, "kl": 0.0751953125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.7506492733955383, "reward_std": 0.1635778620839119, "rewards/accuracy_reward": 0.7812615633010864, "rewards/format_reward": 0.9693877398967743, "step": 8312 }, { "completion_length": 237.54080963134766, "epoch": 0.8365283018867925, "grad_norm": 0.8438541889190674, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7201098799705505, "reward_std": 0.13973984494805336, "rewards/accuracy_reward": 0.7405181527137756, "rewards/format_reward": 0.9795918464660645, "step": 8313 }, { "completion_length": 223.07141876220703, "epoch": 0.83662893081761, "grad_norm": 2.3953332901000977, "kl": 0.10888671875, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7270379662513733, "reward_std": 0.25504330545663834, "rewards/accuracy_reward": 0.7678543329238892, "rewards/format_reward": 0.9591836631298065, "step": 8314 }, { "completion_length": 228.0204086303711, "epoch": 0.8367295597484277, "grad_norm": 0.8522708415985107, "kl": 0.0966796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7490105032920837, "reward_std": 0.18054603040218353, "rewards/accuracy_reward": 0.7694187164306641, "rewards/format_reward": 0.9795918464660645, "step": 8315 }, { "completion_length": 189.38774871826172, "epoch": 0.8368301886792453, "grad_norm": 1.2013165950775146, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7848175764083862, "reward_std": 0.20307853072881699, "rewards/accuracy_reward": 0.8052257001399994, "rewards/format_reward": 0.9795918464660645, "step": 8316 }, { "completion_length": 204.14285278320312, "epoch": 0.8369308176100629, "grad_norm": 1.7317967414855957, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7889769077301025, "reward_std": 0.15088098123669624, "rewards/accuracy_reward": 0.7991810441017151, "rewards/format_reward": 0.9897959232330322, "step": 8317 }, { "completion_length": 176.41836547851562, "epoch": 0.8370314465408805, "grad_norm": 0.6761236190795898, "kl": 0.097900390625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.9505831003189087, "reward_std": 0.10016470402479172, "rewards/accuracy_reward": 0.9709912538528442, "rewards/format_reward": 0.9795918464660645, "step": 8318 }, { "completion_length": 202.79591369628906, "epoch": 0.8371320754716981, "grad_norm": 0.7045959234237671, "kl": 0.0615234375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.791836678981781, "reward_std": 0.12614328041672707, "rewards/accuracy_reward": 0.7918367087841034, "rewards/format_reward": 1.0, "step": 8319 }, { "completion_length": 240.60204315185547, "epoch": 0.8372327044025157, "grad_norm": 0.9332455992698669, "kl": 0.101318359375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7983478903770447, "reward_std": 0.23220933228731155, "rewards/accuracy_reward": 0.828960120677948, "rewards/format_reward": 0.9693877398967743, "step": 8320 }, { "completion_length": 169.89795684814453, "epoch": 0.8373333333333334, "grad_norm": 0.9447906613349915, "kl": 0.110595703125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8084062337875366, "reward_std": 0.17855913192033768, "rewards/accuracy_reward": 0.8186102211475372, "rewards/format_reward": 0.9897959232330322, "step": 8321 }, { "completion_length": 244.14285278320312, "epoch": 0.8374339622641509, "grad_norm": 0.7751469016075134, "kl": 0.1142578125, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.646594226360321, "reward_std": 0.12098002433776855, "rewards/accuracy_reward": 0.6465941965579987, "rewards/format_reward": 1.0, "step": 8322 }, { "completion_length": 146.1836700439453, "epoch": 0.8375345911949685, "grad_norm": 0.4970473647117615, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.9589406251907349, "reward_std": 0.02969132363796234, "rewards/accuracy_reward": 0.9589406847953796, "rewards/format_reward": 1.0, "step": 8323 }, { "completion_length": 222.43877410888672, "epoch": 0.8376352201257862, "grad_norm": 0.8256581425666809, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7357351183891296, "reward_std": 0.15343868359923363, "rewards/accuracy_reward": 0.7459392547607422, "rewards/format_reward": 0.9897959232330322, "step": 8324 }, { "completion_length": 207.9693832397461, "epoch": 0.8377358490566038, "grad_norm": 0.5789111852645874, "kl": 0.07080078125, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.806122362613678, "reward_std": 0.11584596335887909, "rewards/accuracy_reward": 0.8265305757522583, "rewards/format_reward": 0.9795918464660645, "step": 8325 }, { "completion_length": 253.6326446533203, "epoch": 0.8378364779874213, "grad_norm": 0.6131464838981628, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.767272174358368, "reward_std": 0.1285870224237442, "rewards/accuracy_reward": 0.7876803278923035, "rewards/format_reward": 0.9795918464660645, "step": 8326 }, { "completion_length": 160.2244873046875, "epoch": 0.837937106918239, "grad_norm": 0.43853920698165894, "kl": 0.119384765625, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.854713261127472, "reward_std": 0.04802003130316734, "rewards/accuracy_reward": 0.8547132909297943, "rewards/format_reward": 1.0, "step": 8327 }, { "completion_length": 245.53060913085938, "epoch": 0.8380377358490566, "grad_norm": 1.074096441268921, "kl": 0.078369140625, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6125260591506958, "reward_std": 0.2459537759423256, "rewards/accuracy_reward": 0.622730165719986, "rewards/format_reward": 0.9897959232330322, "step": 8328 }, { "completion_length": 242.8775405883789, "epoch": 0.8381383647798742, "grad_norm": 0.5315381288528442, "kl": 0.06494140625, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8238773941993713, "reward_std": 0.13747763819992542, "rewards/accuracy_reward": 0.8238774538040161, "rewards/format_reward": 1.0, "step": 8329 }, { "completion_length": 216.62244415283203, "epoch": 0.8382389937106918, "grad_norm": 0.7659041881561279, "kl": 0.091552734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8079777359962463, "reward_std": 0.17129744589328766, "rewards/accuracy_reward": 0.8181818127632141, "rewards/format_reward": 0.9897959232330322, "step": 8330 }, { "completion_length": 181.84693908691406, "epoch": 0.8383396226415094, "grad_norm": 0.6171925067901611, "kl": 0.12744140625, "learning_rate": 1e-06, "loss": 0.0051, "reward": 1.8081302642822266, "reward_std": 0.10396432876586914, "rewards/accuracy_reward": 0.8081303238868713, "rewards/format_reward": 1.0, "step": 8331 }, { "completion_length": 194.2040786743164, "epoch": 0.838440251572327, "grad_norm": 0.9664922952651978, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6159480810165405, "reward_std": 0.2575441673398018, "rewards/accuracy_reward": 0.6567644476890564, "rewards/format_reward": 0.9591836631298065, "step": 8332 }, { "completion_length": 184.12244415283203, "epoch": 0.8385408805031447, "grad_norm": 0.5312291979789734, "kl": 0.067138671875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8856292963027954, "reward_std": 0.12298675999045372, "rewards/accuracy_reward": 0.9162415266036987, "rewards/format_reward": 0.9693877398967743, "step": 8333 }, { "completion_length": 277.15306091308594, "epoch": 0.8386415094339623, "grad_norm": 0.692351758480072, "kl": 0.0655517578125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.655832290649414, "reward_std": 0.1583195924758911, "rewards/accuracy_reward": 0.6762405335903168, "rewards/format_reward": 0.9795918166637421, "step": 8334 }, { "completion_length": 234.948974609375, "epoch": 0.8387421383647798, "grad_norm": 0.7639099359512329, "kl": 0.10595703125, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7029696106910706, "reward_std": 0.18918729200959206, "rewards/accuracy_reward": 0.7335818707942963, "rewards/format_reward": 0.9693877398967743, "step": 8335 }, { "completion_length": 236.61224365234375, "epoch": 0.8388427672955975, "grad_norm": 1.2958399057388306, "kl": 0.09765625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.6584261655807495, "reward_std": 0.17648964002728462, "rewards/accuracy_reward": 0.6686302125453949, "rewards/format_reward": 0.9897959232330322, "step": 8336 }, { "completion_length": 227.34693145751953, "epoch": 0.8389433962264151, "grad_norm": 1.5590286254882812, "kl": 0.070068359375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.7754945158958435, "reward_std": 0.2070973962545395, "rewards/accuracy_reward": 0.7754945456981659, "rewards/format_reward": 1.0, "step": 8337 }, { "completion_length": 204.94896697998047, "epoch": 0.8390440251572328, "grad_norm": 0.6524980068206787, "kl": 0.084716796875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8609336614608765, "reward_std": 0.15465989708900452, "rewards/accuracy_reward": 0.8609336614608765, "rewards/format_reward": 1.0, "step": 8338 }, { "completion_length": 225.76529693603516, "epoch": 0.8391446540880503, "grad_norm": 24.43747329711914, "kl": 0.36572265625, "learning_rate": 1e-06, "loss": 0.0147, "reward": 1.7252371311187744, "reward_std": 0.07077646069228649, "rewards/accuracy_reward": 0.735441118478775, "rewards/format_reward": 0.9897959232330322, "step": 8339 }, { "completion_length": 227.54080963134766, "epoch": 0.8392452830188679, "grad_norm": 0.6434046626091003, "kl": 0.0902099609375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8156061172485352, "reward_std": 0.10763397440314293, "rewards/accuracy_reward": 0.8156061470508575, "rewards/format_reward": 1.0, "step": 8340 }, { "completion_length": 233.9795913696289, "epoch": 0.8393459119496856, "grad_norm": 1.3630117177963257, "kl": 0.09814453125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8700250387191772, "reward_std": 0.1379258930683136, "rewards/accuracy_reward": 0.8802291750907898, "rewards/format_reward": 0.9897959232330322, "step": 8341 }, { "completion_length": 285.65306091308594, "epoch": 0.8394465408805032, "grad_norm": 1.429068684577942, "kl": 0.0672607421875, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.762168526649475, "reward_std": 0.2283492460846901, "rewards/accuracy_reward": 0.7927807569503784, "rewards/format_reward": 0.9693877398967743, "step": 8342 }, { "completion_length": 237.6530532836914, "epoch": 0.8395471698113207, "grad_norm": 2.1834280490875244, "kl": 0.10009765625, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.5645499229431152, "reward_std": 0.23880191147327423, "rewards/accuracy_reward": 0.5747540295124054, "rewards/format_reward": 0.9897959232330322, "step": 8343 }, { "completion_length": 247.3571319580078, "epoch": 0.8396477987421384, "grad_norm": 0.7716801762580872, "kl": 0.087890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.5495626330375671, "reward_std": 0.1959376037120819, "rewards/accuracy_reward": 0.5597667396068573, "rewards/format_reward": 0.9897959232330322, "step": 8344 }, { "completion_length": 251.02040100097656, "epoch": 0.839748427672956, "grad_norm": 1.2839616537094116, "kl": 0.14404296875, "learning_rate": 1e-06, "loss": 0.0058, "reward": 1.758985161781311, "reward_std": 0.24747180938720703, "rewards/accuracy_reward": 0.7895973920822144, "rewards/format_reward": 0.9693877398967743, "step": 8345 }, { "completion_length": 212.99999237060547, "epoch": 0.8398490566037736, "grad_norm": 0.6889054179191589, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.720919907093048, "reward_std": 0.09553883224725723, "rewards/accuracy_reward": 0.7311239838600159, "rewards/format_reward": 0.9897959232330322, "step": 8346 }, { "completion_length": 271.57142639160156, "epoch": 0.8399496855345912, "grad_norm": 0.7481138110160828, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8204081654548645, "reward_std": 0.199046041816473, "rewards/accuracy_reward": 0.8510204255580902, "rewards/format_reward": 0.9693877398967743, "step": 8347 }, { "completion_length": 258.8061065673828, "epoch": 0.8400503144654088, "grad_norm": 0.49481290578842163, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.9632651805877686, "reward_std": 0.08862842246890068, "rewards/accuracy_reward": 0.9632652699947357, "rewards/format_reward": 1.0, "step": 8348 }, { "completion_length": 248.85713958740234, "epoch": 0.8401509433962264, "grad_norm": 0.5657550096511841, "kl": 0.0675048828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8400970101356506, "reward_std": 0.15347694233059883, "rewards/accuracy_reward": 0.8503009974956512, "rewards/format_reward": 0.9897959232330322, "step": 8349 }, { "completion_length": 302.4591827392578, "epoch": 0.8402515723270441, "grad_norm": 0.7829546332359314, "kl": 0.0645751953125, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.8392857313156128, "reward_std": 0.11123361438512802, "rewards/accuracy_reward": 0.8494898080825806, "rewards/format_reward": 0.9897959232330322, "step": 8350 }, { "completion_length": 173.9591827392578, "epoch": 0.8403522012578616, "grad_norm": 1.2138681411743164, "kl": 0.09326171875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7304826974868774, "reward_std": 0.15074514597654343, "rewards/accuracy_reward": 0.7304826378822327, "rewards/format_reward": 1.0, "step": 8351 }, { "completion_length": 222.71427154541016, "epoch": 0.8404528301886792, "grad_norm": 0.6529236435890198, "kl": 0.093017578125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8083819150924683, "reward_std": 0.1473628729581833, "rewards/accuracy_reward": 0.818585991859436, "rewards/format_reward": 0.9897959232330322, "step": 8352 }, { "completion_length": 272.33673095703125, "epoch": 0.8405534591194969, "grad_norm": 0.5258228778839111, "kl": 0.0601806640625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.6068082451820374, "reward_std": 0.21713291108608246, "rewards/accuracy_reward": 0.6272164285182953, "rewards/format_reward": 0.9795918464660645, "step": 8353 }, { "completion_length": 183.35713958740234, "epoch": 0.8406540880503145, "grad_norm": 0.35036224126815796, "kl": 0.0927734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8469387292861938, "reward_std": 0.026997461915016174, "rewards/accuracy_reward": 0.8571428656578064, "rewards/format_reward": 0.9897959232330322, "step": 8354 }, { "completion_length": 276.2448959350586, "epoch": 0.840754716981132, "grad_norm": 0.6764702200889587, "kl": 0.087646484375, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.762239933013916, "reward_std": 0.20846083760261536, "rewards/accuracy_reward": 0.7928521931171417, "rewards/format_reward": 0.9693877398967743, "step": 8355 }, { "completion_length": 278.11224365234375, "epoch": 0.8408553459119497, "grad_norm": 0.6192647814750671, "kl": 0.11328125, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.8252209424972534, "reward_std": 0.145945243537426, "rewards/accuracy_reward": 0.8354251384735107, "rewards/format_reward": 0.9897959232330322, "step": 8356 }, { "completion_length": 162.91836547851562, "epoch": 0.8409559748427673, "grad_norm": 0.9223223924636841, "kl": 0.121826171875, "learning_rate": 1e-06, "loss": 0.0049, "reward": 1.8354257941246033, "reward_std": 0.21479222178459167, "rewards/accuracy_reward": 0.8660381138324738, "rewards/format_reward": 0.9693877398967743, "step": 8357 }, { "completion_length": 294.24488830566406, "epoch": 0.8410566037735849, "grad_norm": 0.6200594902038574, "kl": 0.09765625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8409809470176697, "reward_std": 0.1447349712252617, "rewards/accuracy_reward": 0.8511850833892822, "rewards/format_reward": 0.9897959232330322, "step": 8358 }, { "completion_length": 205.25509643554688, "epoch": 0.8411572327044026, "grad_norm": 0.4541822373867035, "kl": 0.076904296875, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.8540984392166138, "reward_std": 0.10734142735600471, "rewards/accuracy_reward": 0.8540984988212585, "rewards/format_reward": 1.0, "step": 8359 }, { "completion_length": 160.4081573486328, "epoch": 0.8412578616352201, "grad_norm": 1.1841429471969604, "kl": 0.082763671875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.8529855608940125, "reward_std": 0.099847462028265, "rewards/accuracy_reward": 0.8631896674633026, "rewards/format_reward": 0.9897959232330322, "step": 8360 }, { "completion_length": 251.2244873046875, "epoch": 0.8413584905660377, "grad_norm": 0.45586150884628296, "kl": 0.075439453125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.6852040886878967, "reward_std": 0.10687036067247391, "rewards/accuracy_reward": 0.6852040588855743, "rewards/format_reward": 1.0, "step": 8361 }, { "completion_length": 185.2551040649414, "epoch": 0.8414591194968554, "grad_norm": 1.0814961194992065, "kl": 0.093017578125, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.876190423965454, "reward_std": 0.20568659901618958, "rewards/accuracy_reward": 0.8863945305347443, "rewards/format_reward": 0.9897959232330322, "step": 8362 }, { "completion_length": 241.81632232666016, "epoch": 0.841559748427673, "grad_norm": 0.2300650030374527, "kl": 0.0626220703125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8741496801376343, "reward_std": 0.061851032078266144, "rewards/accuracy_reward": 0.8945578038692474, "rewards/format_reward": 0.9795918166637421, "step": 8363 }, { "completion_length": 294.9081573486328, "epoch": 0.8416603773584905, "grad_norm": 0.3074820339679718, "kl": 0.0633544921875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7686004042625427, "reward_std": 0.08256405219435692, "rewards/accuracy_reward": 0.7890085875988007, "rewards/format_reward": 0.9795918464660645, "step": 8364 }, { "completion_length": 188.59183502197266, "epoch": 0.8417610062893082, "grad_norm": 0.7757441997528076, "kl": 0.095947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8799630999565125, "reward_std": 0.11110893078148365, "rewards/accuracy_reward": 0.8901671171188354, "rewards/format_reward": 0.9897959232330322, "step": 8365 }, { "completion_length": 266.2550964355469, "epoch": 0.8418616352201258, "grad_norm": 0.6497303247451782, "kl": 0.0726318359375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.868511140346527, "reward_std": 0.1264425441622734, "rewards/accuracy_reward": 0.8685110509395599, "rewards/format_reward": 1.0, "step": 8366 }, { "completion_length": 231.12244415283203, "epoch": 0.8419622641509434, "grad_norm": 0.5629957318305969, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.9591836333274841, "reward_std": 0.1079898476600647, "rewards/accuracy_reward": 0.9795918464660645, "rewards/format_reward": 0.9795918166637421, "step": 8367 }, { "completion_length": 252.37754821777344, "epoch": 0.842062893081761, "grad_norm": 1.4161555767059326, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7863966822624207, "reward_std": 0.21934301406145096, "rewards/accuracy_reward": 0.7966007888317108, "rewards/format_reward": 0.9897959232330322, "step": 8368 }, { "completion_length": 229.8775405883789, "epoch": 0.8421635220125786, "grad_norm": 0.7172925472259521, "kl": 0.08837890625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.833024024963379, "reward_std": 0.13366283103823662, "rewards/accuracy_reward": 0.8534322679042816, "rewards/format_reward": 0.9795918464660645, "step": 8369 }, { "completion_length": 261.96937561035156, "epoch": 0.8422641509433962, "grad_norm": 0.7633312940597534, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.928571343421936, "reward_std": 0.18898223340511322, "rewards/accuracy_reward": 0.9591836631298065, "rewards/format_reward": 0.9693877398967743, "step": 8370 }, { "completion_length": 330.24488830566406, "epoch": 0.8423647798742139, "grad_norm": 0.8313148617744446, "kl": 0.0665283203125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.861427664756775, "reward_std": 0.12311432138085365, "rewards/accuracy_reward": 0.8614276647567749, "rewards/format_reward": 1.0, "step": 8371 }, { "completion_length": 221.34693908691406, "epoch": 0.8424654088050314, "grad_norm": 0.9179728031158447, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8965014219284058, "reward_std": 0.23091693967580795, "rewards/accuracy_reward": 0.9373177587985992, "rewards/format_reward": 0.9591836631298065, "step": 8372 }, { "completion_length": 229.26529693603516, "epoch": 0.842566037735849, "grad_norm": 1.1916399002075195, "kl": 0.106689453125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.7401522397994995, "reward_std": 0.18487005308270454, "rewards/accuracy_reward": 0.7401522397994995, "rewards/format_reward": 1.0, "step": 8373 }, { "completion_length": 187.1530532836914, "epoch": 0.8426666666666667, "grad_norm": 43.08123016357422, "kl": 0.1748046875, "learning_rate": 1e-06, "loss": 0.007, "reward": 1.8829531073570251, "reward_std": 0.17904620617628098, "rewards/accuracy_reward": 0.9033613204956055, "rewards/format_reward": 0.9795918464660645, "step": 8374 }, { "completion_length": 233.3571319580078, "epoch": 0.8427672955974843, "grad_norm": 0.6988670229911804, "kl": 0.100341796875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.871795892715454, "reward_std": 0.10354794934391975, "rewards/accuracy_reward": 0.8820000290870667, "rewards/format_reward": 0.9897959232330322, "step": 8375 }, { "completion_length": 219.39794921875, "epoch": 0.8428679245283018, "grad_norm": 0.38793131709098816, "kl": 0.090576171875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7778837084770203, "reward_std": 0.10790753364562988, "rewards/accuracy_reward": 0.798291802406311, "rewards/format_reward": 0.9795918166637421, "step": 8376 }, { "completion_length": 261.1122360229492, "epoch": 0.8429685534591195, "grad_norm": 1.4394241571426392, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7445603609085083, "reward_std": 0.17195682972669601, "rewards/accuracy_reward": 0.7445603907108307, "rewards/format_reward": 1.0, "step": 8377 }, { "completion_length": 258.0102005004883, "epoch": 0.8430691823899371, "grad_norm": 0.6307600736618042, "kl": 0.078125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6609864234924316, "reward_std": 0.19542250409722328, "rewards/accuracy_reward": 0.6813946962356567, "rewards/format_reward": 0.9795918166637421, "step": 8378 }, { "completion_length": 316.1428527832031, "epoch": 0.8431698113207547, "grad_norm": 0.9251564145088196, "kl": 0.080078125, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7611923813819885, "reward_std": 0.19594888389110565, "rewards/accuracy_reward": 0.7713965475559235, "rewards/format_reward": 0.9897959232330322, "step": 8379 }, { "completion_length": 222.12244415283203, "epoch": 0.8432704402515723, "grad_norm": 1.239492416381836, "kl": 0.120849609375, "learning_rate": 1e-06, "loss": 0.0048, "reward": 1.7895371317863464, "reward_std": 0.16131166368722916, "rewards/accuracy_reward": 0.809945285320282, "rewards/format_reward": 0.9795918166637421, "step": 8380 }, { "completion_length": 214.78571319580078, "epoch": 0.8433710691823899, "grad_norm": 0.6872162818908691, "kl": 0.09765625, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8720393180847168, "reward_std": 0.15090690180659294, "rewards/accuracy_reward": 0.882243424654007, "rewards/format_reward": 0.9897959232330322, "step": 8381 }, { "completion_length": 263.4387664794922, "epoch": 0.8434716981132075, "grad_norm": 0.5912057757377625, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7289865612983704, "reward_std": 0.1149715781211853, "rewards/accuracy_reward": 0.7289866507053375, "rewards/format_reward": 1.0, "step": 8382 }, { "completion_length": 269.9183654785156, "epoch": 0.8435723270440252, "grad_norm": 1.029304027557373, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.805724859237671, "reward_std": 0.1532926857471466, "rewards/accuracy_reward": 0.8159289360046387, "rewards/format_reward": 0.9897959232330322, "step": 8383 }, { "completion_length": 178.27550888061523, "epoch": 0.8436729559748428, "grad_norm": 0.8025779128074646, "kl": 0.073974609375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8452866673469543, "reward_std": 0.07754398882389069, "rewards/accuracy_reward": 0.8452866971492767, "rewards/format_reward": 1.0, "step": 8384 }, { "completion_length": 257.1734619140625, "epoch": 0.8437735849056603, "grad_norm": 0.7261537313461304, "kl": 0.107666015625, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.8716964721679688, "reward_std": 0.09295441955327988, "rewards/accuracy_reward": 0.8716965913772583, "rewards/format_reward": 1.0, "step": 8385 }, { "completion_length": 198.72447967529297, "epoch": 0.843874213836478, "grad_norm": 0.7292105555534363, "kl": 0.083984375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.9079247117042542, "reward_std": 0.10986179765313864, "rewards/accuracy_reward": 0.9181287884712219, "rewards/format_reward": 0.9897959232330322, "step": 8386 }, { "completion_length": 188.1224479675293, "epoch": 0.8439748427672956, "grad_norm": 0.4163362681865692, "kl": 0.0908203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.903554618358612, "reward_std": 0.04089367017149925, "rewards/accuracy_reward": 0.9035545885562897, "rewards/format_reward": 1.0, "step": 8387 }, { "completion_length": 186.28570556640625, "epoch": 0.8440754716981133, "grad_norm": 1.2340537309646606, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.9452381134033203, "reward_std": 0.07623940706253052, "rewards/accuracy_reward": 0.9554421901702881, "rewards/format_reward": 0.9897959232330322, "step": 8388 }, { "completion_length": 273.4591751098633, "epoch": 0.8441761006289308, "grad_norm": 0.7565408945083618, "kl": 0.0758056640625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.9019380807876587, "reward_std": 0.15209456160664558, "rewards/accuracy_reward": 0.9223461747169495, "rewards/format_reward": 0.9795918464660645, "step": 8389 }, { "completion_length": 293.9795837402344, "epoch": 0.8442767295597484, "grad_norm": 0.7792186737060547, "kl": 0.113037109375, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.8497323393821716, "reward_std": 0.12108337506651878, "rewards/accuracy_reward": 0.870140552520752, "rewards/format_reward": 0.9795918464660645, "step": 8390 }, { "completion_length": 305.2244873046875, "epoch": 0.844377358490566, "grad_norm": 0.7903297543525696, "kl": 0.07275390625, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.6064736247062683, "reward_std": 0.13554061204195023, "rewards/accuracy_reward": 0.6166777312755585, "rewards/format_reward": 0.9897959232330322, "step": 8391 }, { "completion_length": 225.52040100097656, "epoch": 0.8444779874213837, "grad_norm": 0.39694303274154663, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7076295018196106, "reward_std": 0.07253762520849705, "rewards/accuracy_reward": 0.7076295018196106, "rewards/format_reward": 1.0, "step": 8392 }, { "completion_length": 193.84693145751953, "epoch": 0.8445786163522012, "grad_norm": 1.1925561428070068, "kl": 0.113037109375, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.6744223833084106, "reward_std": 0.2216980904340744, "rewards/accuracy_reward": 0.694830596446991, "rewards/format_reward": 0.9795918464660645, "step": 8393 }, { "completion_length": 201.03060150146484, "epoch": 0.8446792452830189, "grad_norm": 0.9328656196594238, "kl": 0.08544921875, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8518143892288208, "reward_std": 0.11947184428572655, "rewards/accuracy_reward": 0.8518144190311432, "rewards/format_reward": 1.0, "step": 8394 }, { "completion_length": 178.6530532836914, "epoch": 0.8447798742138365, "grad_norm": 0.5223297476768494, "kl": 0.109130859375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.8627705574035645, "reward_std": 0.09366884455084801, "rewards/accuracy_reward": 0.8627705574035645, "rewards/format_reward": 1.0, "step": 8395 }, { "completion_length": 214.53060913085938, "epoch": 0.8448805031446541, "grad_norm": 0.5402434468269348, "kl": 0.091552734375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.9266984462738037, "reward_std": 0.055610405281186104, "rewards/accuracy_reward": 0.9266984164714813, "rewards/format_reward": 1.0, "step": 8396 }, { "completion_length": 263.39795684814453, "epoch": 0.8449811320754717, "grad_norm": 0.4921088218688965, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.799988329410553, "reward_std": 0.10740010067820549, "rewards/accuracy_reward": 0.8101924359798431, "rewards/format_reward": 0.9897959232330322, "step": 8397 }, { "completion_length": 190.9081573486328, "epoch": 0.8450817610062893, "grad_norm": 0.22470742464065552, "kl": 0.07958984375, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.816223382949829, "reward_std": 0.021651891991496086, "rewards/accuracy_reward": 0.8162234425544739, "rewards/format_reward": 1.0, "step": 8398 }, { "completion_length": 285.01019287109375, "epoch": 0.8451823899371069, "grad_norm": 1.077001929283142, "kl": 0.0716552734375, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.756619155406952, "reward_std": 0.14665735512971878, "rewards/accuracy_reward": 0.7668233513832092, "rewards/format_reward": 0.9897959232330322, "step": 8399 }, { "completion_length": 235.7448959350586, "epoch": 0.8452830188679246, "grad_norm": 1.1203930377960205, "kl": 0.110595703125, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.7818270325660706, "reward_std": 0.26935532689094543, "rewards/accuracy_reward": 0.8430514931678772, "rewards/format_reward": 0.9387754797935486, "step": 8400 }, { "completion_length": 214.27550888061523, "epoch": 0.8453836477987421, "grad_norm": 0.7935632467269897, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7334388494491577, "reward_std": 0.1555185541510582, "rewards/accuracy_reward": 0.7538469731807709, "rewards/format_reward": 0.9795918166637421, "step": 8401 }, { "completion_length": 252.6632537841797, "epoch": 0.8454842767295597, "grad_norm": 0.6452399492263794, "kl": 0.0687255859375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.697392225265503, "reward_std": 0.13095785677433014, "rewards/accuracy_reward": 0.6973922550678253, "rewards/format_reward": 1.0, "step": 8402 }, { "completion_length": 127.85714340209961, "epoch": 0.8455849056603774, "grad_norm": 0.7229364514350891, "kl": 0.09716796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8979591131210327, "reward_std": 0.06517763808369637, "rewards/accuracy_reward": 0.8979591727256775, "rewards/format_reward": 1.0, "step": 8403 }, { "completion_length": 324.5306091308594, "epoch": 0.845685534591195, "grad_norm": 1.0671114921569824, "kl": 0.085205078125, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.5803399682044983, "reward_std": 0.26170819997787476, "rewards/accuracy_reward": 0.6007481813430786, "rewards/format_reward": 0.9795918464660645, "step": 8404 }, { "completion_length": 253.6020278930664, "epoch": 0.8457861635220125, "grad_norm": 0.5643013715744019, "kl": 0.06005859375, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7717848420143127, "reward_std": 0.18713746219873428, "rewards/accuracy_reward": 0.7921930253505707, "rewards/format_reward": 0.9795918166637421, "step": 8405 }, { "completion_length": 300.5918273925781, "epoch": 0.8458867924528302, "grad_norm": 0.4794688820838928, "kl": 0.09033203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7500081658363342, "reward_std": 0.14959155023097992, "rewards/accuracy_reward": 0.7602122724056244, "rewards/format_reward": 0.9897959232330322, "step": 8406 }, { "completion_length": 273.58162689208984, "epoch": 0.8459874213836478, "grad_norm": 0.48583316802978516, "kl": 0.111572265625, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.5896722078323364, "reward_std": 0.20354831218719482, "rewards/accuracy_reward": 0.6304885745048523, "rewards/format_reward": 0.9591836333274841, "step": 8407 }, { "completion_length": 276.79591369628906, "epoch": 0.8460880503144654, "grad_norm": 0.7990583777427673, "kl": 0.0819091796875, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.9542436003684998, "reward_std": 0.07906399294734001, "rewards/accuracy_reward": 0.9542436003684998, "rewards/format_reward": 1.0, "step": 8408 }, { "completion_length": 228.02040100097656, "epoch": 0.8461886792452831, "grad_norm": 0.4189535677433014, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.9166990518569946, "reward_std": 0.08640783280134201, "rewards/accuracy_reward": 0.9166990518569946, "rewards/format_reward": 1.0, "step": 8409 }, { "completion_length": 259.3061218261719, "epoch": 0.8462893081761006, "grad_norm": 1.3301712274551392, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.8281005024909973, "reward_std": 0.13672539591789246, "rewards/accuracy_reward": 0.8485085666179657, "rewards/format_reward": 0.9795918464660645, "step": 8410 }, { "completion_length": 300.76529693603516, "epoch": 0.8463899371069182, "grad_norm": 1.3766286373138428, "kl": 0.09423828125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.790192723274231, "reward_std": 0.15401286631822586, "rewards/accuracy_reward": 0.8003967702388763, "rewards/format_reward": 0.9897959232330322, "step": 8411 }, { "completion_length": 230.1428451538086, "epoch": 0.8464905660377359, "grad_norm": 0.2195308357477188, "kl": 0.0675048828125, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.842229187488556, "reward_std": 0.06019715406000614, "rewards/accuracy_reward": 0.8524332642555237, "rewards/format_reward": 0.9897959232330322, "step": 8412 }, { "completion_length": 292.32652282714844, "epoch": 0.8465911949685535, "grad_norm": 0.739592969417572, "kl": 0.0869140625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.6097568273544312, "reward_std": 0.18821844458580017, "rewards/accuracy_reward": 0.6199609935283661, "rewards/format_reward": 0.9897959232330322, "step": 8413 }, { "completion_length": 266.2346878051758, "epoch": 0.846691823899371, "grad_norm": 1.0487720966339111, "kl": 0.115966796875, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.6970446109771729, "reward_std": 0.18464822322130203, "rewards/accuracy_reward": 0.7276569306850433, "rewards/format_reward": 0.9693877398967743, "step": 8414 }, { "completion_length": 275.35713958740234, "epoch": 0.8467924528301887, "grad_norm": 0.6220316290855408, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.826252043247223, "reward_std": 0.15741922706365585, "rewards/accuracy_reward": 0.8568642735481262, "rewards/format_reward": 0.9693877398967743, "step": 8415 }, { "completion_length": 285.68365478515625, "epoch": 0.8468930817610063, "grad_norm": 1.0892152786254883, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7888924479484558, "reward_std": 0.19668275490403175, "rewards/accuracy_reward": 0.8093006610870361, "rewards/format_reward": 0.9795918464660645, "step": 8416 }, { "completion_length": 223.9795913696289, "epoch": 0.8469937106918239, "grad_norm": 0.5371505618095398, "kl": 0.06396484375, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.922092616558075, "reward_std": 0.0879280362278223, "rewards/accuracy_reward": 0.9322966635227203, "rewards/format_reward": 0.9897959232330322, "step": 8417 }, { "completion_length": 260.21427154541016, "epoch": 0.8470943396226415, "grad_norm": 0.8796730041503906, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7522675395011902, "reward_std": 0.1586177721619606, "rewards/accuracy_reward": 0.7726756930351257, "rewards/format_reward": 0.9795918166637421, "step": 8418 }, { "completion_length": 355.38775634765625, "epoch": 0.8471949685534591, "grad_norm": 0.8083921074867249, "kl": 0.0557861328125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.911564588546753, "reward_std": 0.14502697438001633, "rewards/accuracy_reward": 0.9421768486499786, "rewards/format_reward": 0.9693877398967743, "step": 8419 }, { "completion_length": 258.5306091308594, "epoch": 0.8472955974842767, "grad_norm": 0.18983575701713562, "kl": 0.0743408203125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.9897959232330322, "reward_std": 0.026997461915016174, "rewards/accuracy_reward": 0.9897959232330322, "rewards/format_reward": 1.0, "step": 8420 }, { "completion_length": 209.4387664794922, "epoch": 0.8473962264150944, "grad_norm": 0.353103905916214, "kl": 0.096435546875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.9407190680503845, "reward_std": 0.07828805409371853, "rewards/accuracy_reward": 0.9509231746196747, "rewards/format_reward": 0.9897959232330322, "step": 8421 }, { "completion_length": 291.5816192626953, "epoch": 0.8474968553459119, "grad_norm": 0.7436977624893188, "kl": 0.08935546875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7539799809455872, "reward_std": 0.1463182047009468, "rewards/accuracy_reward": 0.7539799809455872, "rewards/format_reward": 1.0, "step": 8422 }, { "completion_length": 351.3673400878906, "epoch": 0.8475974842767295, "grad_norm": 1.1334699392318726, "kl": 0.13671875, "learning_rate": 1e-06, "loss": 0.0055, "reward": 1.774402678012848, "reward_std": 0.2753680944442749, "rewards/accuracy_reward": 0.7948108613491058, "rewards/format_reward": 0.9795918464660645, "step": 8423 }, { "completion_length": 238.1530532836914, "epoch": 0.8476981132075472, "grad_norm": 0.7224227786064148, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8820897340774536, "reward_std": 0.1089751161634922, "rewards/accuracy_reward": 0.8922939002513885, "rewards/format_reward": 0.9897959232330322, "step": 8424 }, { "completion_length": 183.91836547851562, "epoch": 0.8477987421383648, "grad_norm": 0.4529702961444855, "kl": 0.088134765625, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8173468708992004, "reward_std": 0.08189894258975983, "rewards/accuracy_reward": 0.827551007270813, "rewards/format_reward": 0.9897959232330322, "step": 8425 }, { "completion_length": 297.7142791748047, "epoch": 0.8478993710691823, "grad_norm": 0.9808045625686646, "kl": 0.0947265625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.8163264989852905, "reward_std": 0.09670460969209671, "rewards/accuracy_reward": 0.8163265287876129, "rewards/format_reward": 1.0, "step": 8426 }, { "completion_length": 360.29591369628906, "epoch": 0.848, "grad_norm": 1.2532057762145996, "kl": 0.1455078125, "learning_rate": 1e-06, "loss": 0.0058, "reward": 1.5330455303192139, "reward_std": 0.2525502070784569, "rewards/accuracy_reward": 0.6044740974903107, "rewards/format_reward": 0.9285714030265808, "step": 8427 }, { "completion_length": 285.8367156982422, "epoch": 0.8481006289308176, "grad_norm": 0.6519376635551453, "kl": 0.101318359375, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.677416443824768, "reward_std": 0.1470552645623684, "rewards/accuracy_reward": 0.697824627161026, "rewards/format_reward": 0.9795918464660645, "step": 8428 }, { "completion_length": 256.9897918701172, "epoch": 0.8482012578616352, "grad_norm": 0.5016907453536987, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.8246204257011414, "reward_std": 0.07391232997179031, "rewards/accuracy_reward": 0.8246204257011414, "rewards/format_reward": 1.0, "step": 8429 }, { "completion_length": 267.0, "epoch": 0.8483018867924528, "grad_norm": 0.4788660705089569, "kl": 0.071533203125, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7731198072433472, "reward_std": 0.1083965003490448, "rewards/accuracy_reward": 0.7935280799865723, "rewards/format_reward": 0.9795918464660645, "step": 8430 }, { "completion_length": 279.0918273925781, "epoch": 0.8484025157232704, "grad_norm": 1.0527293682098389, "kl": 0.090087890625, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.7611626386642456, "reward_std": 0.21184512227773666, "rewards/accuracy_reward": 0.7815707921981812, "rewards/format_reward": 0.9795918166637421, "step": 8431 }, { "completion_length": 243.97958374023438, "epoch": 0.848503144654088, "grad_norm": 0.7244870662689209, "kl": 0.095703125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.7761285901069641, "reward_std": 0.07231643050909042, "rewards/accuracy_reward": 0.7761286199092865, "rewards/format_reward": 1.0, "step": 8432 }, { "completion_length": 160.70407104492188, "epoch": 0.8486037735849057, "grad_norm": 1.3130656480789185, "kl": 0.074462890625, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8537415266036987, "reward_std": 0.15184257179498672, "rewards/accuracy_reward": 0.8741496503353119, "rewards/format_reward": 0.9795918464660645, "step": 8433 }, { "completion_length": 248.1734619140625, "epoch": 0.8487044025157233, "grad_norm": 1.5919514894485474, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.554225742816925, "reward_std": 0.3003913313150406, "rewards/accuracy_reward": 0.6256543099880219, "rewards/format_reward": 0.9285714328289032, "step": 8434 }, { "completion_length": 284.7856979370117, "epoch": 0.8488050314465408, "grad_norm": 0.8637515306472778, "kl": 0.0928955078125, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.5328738689422607, "reward_std": 0.23142919689416885, "rewards/accuracy_reward": 0.5736902356147766, "rewards/format_reward": 0.9591836631298065, "step": 8435 }, { "completion_length": 318.89794921875, "epoch": 0.8489056603773585, "grad_norm": 0.3747035562992096, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.9039195775985718, "reward_std": 0.11691141128540039, "rewards/accuracy_reward": 0.9243277907371521, "rewards/format_reward": 0.9795918464660645, "step": 8436 }, { "completion_length": 259.7244873046875, "epoch": 0.8490062893081761, "grad_norm": 1.0593699216842651, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7916131019592285, "reward_std": 0.3160351812839508, "rewards/accuracy_reward": 0.8528376519680023, "rewards/format_reward": 0.9387754797935486, "step": 8437 }, { "completion_length": 287.82652282714844, "epoch": 0.8491069182389938, "grad_norm": 0.6463584899902344, "kl": 0.0582275390625, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.805067002773285, "reward_std": 0.2315424382686615, "rewards/accuracy_reward": 0.8254752159118652, "rewards/format_reward": 0.9795918464660645, "step": 8438 }, { "completion_length": 238.99998474121094, "epoch": 0.8492075471698113, "grad_norm": 1.12686026096344, "kl": 0.09375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.8364430665969849, "reward_std": 0.24857326596975327, "rewards/accuracy_reward": 0.8874635100364685, "rewards/format_reward": 0.9489795565605164, "step": 8439 }, { "completion_length": 288.89794921875, "epoch": 0.8493081761006289, "grad_norm": 0.9878806471824646, "kl": 0.092529296875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.748474359512329, "reward_std": 0.2573932111263275, "rewards/accuracy_reward": 0.8301071226596832, "rewards/format_reward": 0.918367326259613, "step": 8440 }, { "completion_length": 272.3877410888672, "epoch": 0.8494088050314466, "grad_norm": 0.6215547323226929, "kl": 0.0771484375, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.6882895231246948, "reward_std": 0.14820346236228943, "rewards/accuracy_reward": 0.6984936892986298, "rewards/format_reward": 0.9897959232330322, "step": 8441 }, { "completion_length": 235.07141876220703, "epoch": 0.8495094339622642, "grad_norm": 1.1554497480392456, "kl": 0.114013671875, "learning_rate": 1e-06, "loss": 0.0046, "reward": 1.662234604358673, "reward_std": 0.16138693690299988, "rewards/accuracy_reward": 0.6622346043586731, "rewards/format_reward": 1.0, "step": 8442 }, { "completion_length": 270.98978424072266, "epoch": 0.8496100628930817, "grad_norm": 1.0708339214324951, "kl": 0.091796875, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.6141100525856018, "reward_std": 0.2165824994444847, "rewards/accuracy_reward": 0.6651305556297302, "rewards/format_reward": 0.9489795565605164, "step": 8443 }, { "completion_length": 241.92855834960938, "epoch": 0.8497106918238994, "grad_norm": 1.5026702880859375, "kl": 0.098388671875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7176570296287537, "reward_std": 0.23801124840974808, "rewards/accuracy_reward": 0.748269259929657, "rewards/format_reward": 0.9693877398967743, "step": 8444 }, { "completion_length": 226.26529693603516, "epoch": 0.849811320754717, "grad_norm": 0.8160476684570312, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.743457853794098, "reward_std": 0.1275019496679306, "rewards/accuracy_reward": 0.7638660967350006, "rewards/format_reward": 0.9795918464660645, "step": 8445 }, { "completion_length": 252.30611419677734, "epoch": 0.8499119496855346, "grad_norm": 0.7174353003501892, "kl": 0.09619140625, "learning_rate": 1e-06, "loss": 0.0038, "reward": 1.6810314655303955, "reward_std": 0.18567930534482002, "rewards/accuracy_reward": 0.7218478620052338, "rewards/format_reward": 0.9591836631298065, "step": 8446 }, { "completion_length": 221.55101013183594, "epoch": 0.8500125786163522, "grad_norm": 1.4352657794952393, "kl": 0.089111328125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.741702675819397, "reward_std": 0.14609765587374568, "rewards/accuracy_reward": 0.7621108591556549, "rewards/format_reward": 0.9795918166637421, "step": 8447 }, { "completion_length": 171.83673095703125, "epoch": 0.8501132075471698, "grad_norm": 0.9490923285484314, "kl": 0.10400390625, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.8229613304138184, "reward_std": 0.09074169211089611, "rewards/accuracy_reward": 0.8331654667854309, "rewards/format_reward": 0.9897959232330322, "step": 8448 }, { "completion_length": 252.8775405883789, "epoch": 0.8502138364779874, "grad_norm": 0.5120487213134766, "kl": 0.0791015625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.7973759770393372, "reward_std": 0.14247356727719307, "rewards/accuracy_reward": 0.8075801432132721, "rewards/format_reward": 0.9897959232330322, "step": 8449 }, { "completion_length": 297.62245178222656, "epoch": 0.8503144654088051, "grad_norm": 0.4733385145664215, "kl": 0.08056640625, "learning_rate": 1e-06, "loss": 0.0032, "reward": 1.8261412382125854, "reward_std": 0.16382814571261406, "rewards/accuracy_reward": 0.8567535877227783, "rewards/format_reward": 0.9693877398967743, "step": 8450 }, { "completion_length": 229.8163299560547, "epoch": 0.8504150943396226, "grad_norm": 0.6846742033958435, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.87484610080719, "reward_std": 0.14015286415815353, "rewards/accuracy_reward": 0.8952543437480927, "rewards/format_reward": 0.9795918166637421, "step": 8451 }, { "completion_length": 272.1836700439453, "epoch": 0.8505157232704402, "grad_norm": 0.4370485544204712, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6966140270233154, "reward_std": 0.16349217668175697, "rewards/accuracy_reward": 0.7272263467311859, "rewards/format_reward": 0.9693877398967743, "step": 8452 }, { "completion_length": 283.76529693603516, "epoch": 0.8506163522012579, "grad_norm": 0.5230646133422852, "kl": 0.055908203125, "learning_rate": 1e-06, "loss": 0.0022, "reward": 1.889893114566803, "reward_std": 0.13580353558063507, "rewards/accuracy_reward": 0.9103012382984161, "rewards/format_reward": 0.9795918464660645, "step": 8453 }, { "completion_length": 251.34693908691406, "epoch": 0.8507169811320755, "grad_norm": 1.1028779745101929, "kl": 0.09033203125, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.690452516078949, "reward_std": 0.20347905904054642, "rewards/accuracy_reward": 0.7210648953914642, "rewards/format_reward": 0.9693877398967743, "step": 8454 }, { "completion_length": 227.61224365234375, "epoch": 0.8508176100628931, "grad_norm": 0.689452588558197, "kl": 0.102294921875, "learning_rate": 1e-06, "loss": 0.0041, "reward": 1.8809959292411804, "reward_std": 0.13023748248815536, "rewards/accuracy_reward": 0.9116080701351166, "rewards/format_reward": 0.9693877398967743, "step": 8455 }, { "completion_length": 228.98978424072266, "epoch": 0.8509182389937107, "grad_norm": 0.32628515362739563, "kl": 0.087158203125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7970521450042725, "reward_std": 0.03465702384710312, "rewards/accuracy_reward": 0.7970521450042725, "rewards/format_reward": 1.0, "step": 8456 }, { "completion_length": 290.9081573486328, "epoch": 0.8510188679245283, "grad_norm": 0.7845795154571533, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.7975468039512634, "reward_std": 0.13321276009082794, "rewards/accuracy_reward": 0.807750940322876, "rewards/format_reward": 0.9897959232330322, "step": 8457 }, { "completion_length": 311.3571319580078, "epoch": 0.8511194968553459, "grad_norm": 1.1470016241073608, "kl": 0.1259765625, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.7924981117248535, "reward_std": 0.12707066163420677, "rewards/accuracy_reward": 0.8027022182941437, "rewards/format_reward": 0.9897959232330322, "step": 8458 }, { "completion_length": 285.73468017578125, "epoch": 0.8512201257861636, "grad_norm": 0.5244858860969543, "kl": 0.058349609375, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.877788782119751, "reward_std": 0.19034896790981293, "rewards/accuracy_reward": 0.8981969952583313, "rewards/format_reward": 0.9795918464660645, "step": 8459 }, { "completion_length": 243.9897918701172, "epoch": 0.8513207547169811, "grad_norm": 6.329129695892334, "kl": 0.1241455078125, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.8265305757522583, "reward_std": 0.16302528232336044, "rewards/accuracy_reward": 0.8469386994838715, "rewards/format_reward": 0.9795918166637421, "step": 8460 }, { "completion_length": 318.7346954345703, "epoch": 0.8514213836477987, "grad_norm": 0.8540952801704407, "kl": 0.06787109375, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.5687747597694397, "reward_std": 0.12753700464963913, "rewards/accuracy_reward": 0.5687747299671173, "rewards/format_reward": 1.0, "step": 8461 }, { "completion_length": 292.6632537841797, "epoch": 0.8515220125786164, "grad_norm": 0.5733097195625305, "kl": 0.06201171875, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.7241711616516113, "reward_std": 0.2121816948056221, "rewards/accuracy_reward": 0.7547833919525146, "rewards/format_reward": 0.9693877398967743, "step": 8462 }, { "completion_length": 266.57141876220703, "epoch": 0.851622641509434, "grad_norm": 1.5910314321517944, "kl": 0.0592041015625, "learning_rate": 1e-06, "loss": 0.0024, "reward": 1.7539357542991638, "reward_std": 0.11679880321025848, "rewards/accuracy_reward": 0.753935843706131, "rewards/format_reward": 1.0, "step": 8463 }, { "completion_length": 309.2346878051758, "epoch": 0.8517232704402515, "grad_norm": 0.771267294883728, "kl": 0.070556640625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.6973844766616821, "reward_std": 0.16466697305440903, "rewards/accuracy_reward": 0.7075886130332947, "rewards/format_reward": 0.9897959232330322, "step": 8464 }, { "completion_length": 325.1734619140625, "epoch": 0.8518238993710692, "grad_norm": 0.6062584519386292, "kl": 0.07470703125, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.685692846775055, "reward_std": 0.17550204694271088, "rewards/accuracy_reward": 0.7061009109020233, "rewards/format_reward": 0.9795918464660645, "step": 8465 }, { "completion_length": 293.10203552246094, "epoch": 0.8519245283018868, "grad_norm": 0.7071529030799866, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.728304922580719, "reward_std": 0.20026419311761856, "rewards/accuracy_reward": 0.7589172422885895, "rewards/format_reward": 0.9693877398967743, "step": 8466 }, { "completion_length": 252.9183578491211, "epoch": 0.8520251572327044, "grad_norm": 0.723406195640564, "kl": 0.07373046875, "learning_rate": 1e-06, "loss": 0.0029, "reward": 1.7833120822906494, "reward_std": 0.09842443466186523, "rewards/accuracy_reward": 0.7833121418952942, "rewards/format_reward": 1.0, "step": 8467 }, { "completion_length": 202.18366241455078, "epoch": 0.852125786163522, "grad_norm": 0.5690511465072632, "kl": 0.09814453125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.912694275379181, "reward_std": 0.09043445531278849, "rewards/accuracy_reward": 0.9331024885177612, "rewards/format_reward": 0.9795918166637421, "step": 8468 }, { "completion_length": 199.63264846801758, "epoch": 0.8522264150943396, "grad_norm": 0.4902692139148712, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8363235592842102, "reward_std": 0.018377890810370445, "rewards/accuracy_reward": 0.8363235592842102, "rewards/format_reward": 1.0, "step": 8469 }, { "completion_length": 268.75508880615234, "epoch": 0.8523270440251572, "grad_norm": 0.4987592399120331, "kl": 0.074951171875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8610300421714783, "reward_std": 0.0628388524055481, "rewards/accuracy_reward": 0.861030101776123, "rewards/format_reward": 1.0, "step": 8470 }, { "completion_length": 249.2959213256836, "epoch": 0.8524276729559749, "grad_norm": 0.961799681186676, "kl": 0.082275390625, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.884183645248413, "reward_std": 0.10755090788006783, "rewards/accuracy_reward": 0.8943877220153809, "rewards/format_reward": 0.9897959232330322, "step": 8471 }, { "completion_length": 275.28570556640625, "epoch": 0.8525283018867924, "grad_norm": 0.47653254866600037, "kl": 0.06591796875, "learning_rate": 1e-06, "loss": 0.0026, "reward": 1.9233810305595398, "reward_std": 0.11189838871359825, "rewards/accuracy_reward": 0.9335851073265076, "rewards/format_reward": 0.9897959232330322, "step": 8472 }, { "completion_length": 208.60204315185547, "epoch": 0.85262893081761, "grad_norm": 5.909135341644287, "kl": 0.0732421875, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.678898274898529, "reward_std": 0.12453709170222282, "rewards/accuracy_reward": 0.689102441072464, "rewards/format_reward": 0.9897959232330322, "step": 8473 }, { "completion_length": 187.7959213256836, "epoch": 0.8527295597484277, "grad_norm": 1.2697218656539917, "kl": 0.1097412109375, "learning_rate": 1e-06, "loss": 0.0044, "reward": 1.968415915966034, "reward_std": 0.07641606405377388, "rewards/accuracy_reward": 0.9786200225353241, "rewards/format_reward": 0.9897959232330322, "step": 8474 }, { "completion_length": 257.87754821777344, "epoch": 0.8528301886792453, "grad_norm": 0.48944664001464844, "kl": 0.0537109375, "learning_rate": 1e-06, "loss": 0.0021, "reward": 1.8846629858016968, "reward_std": 0.11754046380519867, "rewards/accuracy_reward": 0.9050710797309875, "rewards/format_reward": 0.9795918166637421, "step": 8475 }, { "completion_length": 257.77550506591797, "epoch": 0.8529308176100628, "grad_norm": 0.5836052894592285, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.9036383628845215, "reward_std": 0.1055346392095089, "rewards/accuracy_reward": 0.9036383926868439, "rewards/format_reward": 1.0, "step": 8476 }, { "completion_length": 237.11224365234375, "epoch": 0.8530314465408805, "grad_norm": 3.328221559524536, "kl": 0.1240234375, "learning_rate": 1e-06, "loss": 0.005, "reward": 1.6769598126411438, "reward_std": 0.16960744932293892, "rewards/accuracy_reward": 0.687163919210434, "rewards/format_reward": 0.9897959232330322, "step": 8477 }, { "completion_length": 258.0408020019531, "epoch": 0.8531320754716981, "grad_norm": 0.657045841217041, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": 0.0027, "reward": 1.8150152564048767, "reward_std": 0.14050616323947906, "rewards/accuracy_reward": 0.8150152564048767, "rewards/format_reward": 1.0, "step": 8478 }, { "completion_length": 221.08163452148438, "epoch": 0.8532327044025158, "grad_norm": 0.7586154937744141, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.7476190328598022, "reward_std": 0.16685125976800919, "rewards/accuracy_reward": 0.7578230798244476, "rewards/format_reward": 0.9897959232330322, "step": 8479 }, { "completion_length": 216.90816497802734, "epoch": 0.8533333333333334, "grad_norm": 0.5600801110267639, "kl": 0.05810546875, "learning_rate": 1e-06, "loss": 0.0023, "reward": 1.8943633437156677, "reward_std": 0.13291273638606071, "rewards/accuracy_reward": 0.904567539691925, "rewards/format_reward": 0.9897959232330322, "step": 8480 }, { "completion_length": 228.6836700439453, "epoch": 0.8534339622641509, "grad_norm": 0.5347253680229187, "kl": 0.0615234375, "learning_rate": 1e-06, "loss": 0.0025, "reward": 1.8347836136817932, "reward_std": 0.0652907881885767, "rewards/accuracy_reward": 0.834783673286438, "rewards/format_reward": 1.0, "step": 8481 }, { "completion_length": 216.86734771728516, "epoch": 0.8535345911949686, "grad_norm": 1.5780795812606812, "kl": 0.08203125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.7513453364372253, "reward_std": 0.1289401762187481, "rewards/accuracy_reward": 0.7819575369358063, "rewards/format_reward": 0.9693877398967743, "step": 8482 }, { "completion_length": 297.83673095703125, "epoch": 0.8536352201257862, "grad_norm": 0.9310901761054993, "kl": 0.0849609375, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6993643641471863, "reward_std": 0.18212373554706573, "rewards/accuracy_reward": 0.7197725772857666, "rewards/format_reward": 0.9795918464660645, "step": 8483 }, { "completion_length": 253.59182739257812, "epoch": 0.8537358490566038, "grad_norm": 0.4393068253993988, "kl": 0.0709228515625, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8290662169456482, "reward_std": 0.07974952086806297, "rewards/accuracy_reward": 0.829066127538681, "rewards/format_reward": 1.0, "step": 8484 }, { "completion_length": 180.2142791748047, "epoch": 0.8538364779874213, "grad_norm": 0.7040023803710938, "kl": 0.09130859375, "learning_rate": 1e-06, "loss": 0.0037, "reward": 1.9048641920089722, "reward_std": 0.1000928021967411, "rewards/accuracy_reward": 0.9048641324043274, "rewards/format_reward": 1.0, "step": 8485 }, { "completion_length": 210.2448959350586, "epoch": 0.853937106918239, "grad_norm": 1.0359981060028076, "kl": 0.10107421875, "learning_rate": 1e-06, "loss": 0.004, "reward": 1.7704344391822815, "reward_std": 0.19588620960712433, "rewards/accuracy_reward": 0.7908426225185394, "rewards/format_reward": 0.9795918464660645, "step": 8486 }, { "completion_length": 229.23468780517578, "epoch": 0.8540377358490566, "grad_norm": 0.9476510882377625, "kl": 0.0966796875, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7858842611312866, "reward_std": 0.18227489292621613, "rewards/accuracy_reward": 0.7960884273052216, "rewards/format_reward": 0.9897959232330322, "step": 8487 }, { "completion_length": 191.08162689208984, "epoch": 0.8541383647798743, "grad_norm": 0.5801463723182678, "kl": 0.086669921875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.8643250465393066, "reward_std": 0.05068320129066706, "rewards/accuracy_reward": 0.8643251359462738, "rewards/format_reward": 1.0, "step": 8488 }, { "completion_length": 235.2244873046875, "epoch": 0.8542389937106918, "grad_norm": 2.20169734954834, "kl": 0.088623046875, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7883520126342773, "reward_std": 0.19797097891569138, "rewards/accuracy_reward": 0.8087601661682129, "rewards/format_reward": 0.9795918166637421, "step": 8489 }, { "completion_length": 192.40816497802734, "epoch": 0.8543396226415094, "grad_norm": 1.3334912061691284, "kl": 0.0986328125, "learning_rate": 1e-06, "loss": 0.0039, "reward": 1.7574520111083984, "reward_std": 0.26629751920700073, "rewards/accuracy_reward": 0.7778601944446564, "rewards/format_reward": 0.9795918464660645, "step": 8490 }, { "completion_length": 224.39794921875, "epoch": 0.8544402515723271, "grad_norm": 0.46027451753616333, "kl": 0.07568359375, "learning_rate": 1e-06, "loss": 0.003, "reward": 1.8276923298835754, "reward_std": 0.08960902690887451, "rewards/accuracy_reward": 0.8378964364528656, "rewards/format_reward": 0.9897959232330322, "step": 8491 }, { "completion_length": 220.1734619140625, "epoch": 0.8545408805031447, "grad_norm": 1.0257421731948853, "kl": 0.1064453125, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.765437662601471, "reward_std": 0.17191039770841599, "rewards/accuracy_reward": 0.7858457565307617, "rewards/format_reward": 0.9795918166637421, "step": 8492 }, { "completion_length": 270.0102005004883, "epoch": 0.8546415094339622, "grad_norm": 0.8608911633491516, "kl": 0.1048583984375, "learning_rate": 1e-06, "loss": 0.0042, "reward": 1.7593323588371277, "reward_std": 0.1226922795176506, "rewards/accuracy_reward": 0.7695364952087402, "rewards/format_reward": 0.9897959232330322, "step": 8493 }, { "completion_length": 237.3163299560547, "epoch": 0.8547421383647799, "grad_norm": 0.6628730297088623, "kl": 0.08642578125, "learning_rate": 1e-06, "loss": 0.0035, "reward": 1.7829607129096985, "reward_std": 0.19583790749311447, "rewards/accuracy_reward": 0.8033689260482788, "rewards/format_reward": 0.9795918464660645, "step": 8494 }, { "completion_length": 221.75509643554688, "epoch": 0.8548427672955975, "grad_norm": 0.45860153436660767, "kl": 0.08447265625, "learning_rate": 1e-06, "loss": 0.0034, "reward": 1.6520408391952515, "reward_std": 0.1370668113231659, "rewards/accuracy_reward": 0.6826530396938324, "rewards/format_reward": 0.9693877398967743, "step": 8495 }, { "completion_length": 176.38775634765625, "epoch": 0.8549433962264151, "grad_norm": 0.8189226388931274, "kl": 0.107177734375, "learning_rate": 1e-06, "loss": 0.0043, "reward": 1.8336167335510254, "reward_std": 0.22052157297730446, "rewards/accuracy_reward": 0.8744330704212189, "rewards/format_reward": 0.9591836631298065, "step": 8496 }, { "completion_length": 193.93877410888672, "epoch": 0.8550440251572327, "grad_norm": 0.7332956790924072, "kl": 0.07861328125, "learning_rate": 1e-06, "loss": 0.0031, "reward": 1.875551462173462, "reward_std": 0.17264509946107864, "rewards/accuracy_reward": 0.8857555389404297, "rewards/format_reward": 0.9897959232330322, "step": 8497 }, { "completion_length": 180.02040100097656, "epoch": 0.8551446540880503, "grad_norm": 0.9196969866752625, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": 0.0033, "reward": 1.9285714030265808, "reward_std": 0.15069952234625816, "rewards/accuracy_reward": 0.9489795565605164, "rewards/format_reward": 0.9795918166637421, "step": 8498 }, { "completion_length": 171.4693832397461, "epoch": 0.8552452830188679, "grad_norm": 0.7968505620956421, "kl": 0.0888671875, "learning_rate": 1e-06, "loss": 0.0036, "reward": 1.8442177176475525, "reward_std": 0.10310907661914825, "rewards/accuracy_reward": 0.8442176878452301, "rewards/format_reward": 1.0, "step": 8499 }, { "completion_length": 271.82652282714844, "epoch": 0.8553459119496856, "grad_norm": 0.6009476780891418, "kl": 0.0706787109375, "learning_rate": 1e-06, "loss": 0.0028, "reward": 1.8979591131210327, "reward_std": 0.18102359771728516, "rewards/accuracy_reward": 0.938775509595871, "rewards/format_reward": 0.9591836631298065, "step": 8500 } ], "logging_steps": 1.0, "max_steps": 19874, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }