{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997759689343589, "eval_steps": 500, "global_step": 3347, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 1731.712158203125, "epoch": 0.00029870808752146963, "grad_norm": 0.09788785129785538, "kl": 0.0, "learning_rate": 2.9850746268656717e-09, "loss": 0.0, "reward": 0.4743303805589676, "reward_std": 0.19470911845564842, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.380580373108387, "step": 1 }, { "completion_length": 1652.8371276855469, "epoch": 0.0005974161750429393, "grad_norm": 0.1114732101559639, "kl": 0.0, "learning_rate": 5.970149253731343e-09, "loss": 0.0, "reward": 0.522879496216774, "reward_std": 0.20094318687915802, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3867187723517418, "step": 2 }, { "completion_length": 1721.6853637695312, "epoch": 0.0008961242625644089, "grad_norm": 0.09151389449834824, "kl": 2.4110078811645508e-05, "learning_rate": 8.955223880597015e-09, "loss": 0.0, "reward": 0.439732164144516, "reward_std": 0.15008491091430187, "rewards/accuracy_reward": 0.05580357322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.383928582072258, "step": 3 }, { "completion_length": 1734.0380554199219, "epoch": 0.0011948323500858785, "grad_norm": 0.09528391063213348, "kl": 3.8623809814453125e-05, "learning_rate": 1.1940298507462687e-08, "loss": 0.0, "reward": 0.5000000149011612, "reward_std": 0.18512177467346191, "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3549107387661934, "step": 4 }, { "completion_length": 1620.9420471191406, "epoch": 0.0014935404376073482, "grad_norm": 0.10255090892314911, "kl": 4.172325134277344e-05, "learning_rate": 1.4925373134328357e-08, "loss": 0.0, "reward": 0.4927455633878708, "reward_std": 0.1358396727591753, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4012276977300644, "step": 5 }, { "completion_length": 1566.665283203125, "epoch": 0.0017922485251288178, "grad_norm": 0.106447733938694, "kl": 4.208087921142578e-05, "learning_rate": 1.791044776119403e-08, "loss": 0.0, "reward": 0.5429687723517418, "reward_std": 0.2521444857120514, "rewards/accuracy_reward": 0.12723214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.415736623108387, "step": 6 }, { "completion_length": 1733.1741943359375, "epoch": 0.0020909566126502874, "grad_norm": 0.12672096490859985, "kl": 4.202127456665039e-05, "learning_rate": 2.08955223880597e-08, "loss": 0.0, "reward": 0.404017873108387, "reward_std": 0.1595960035920143, "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.357142873108387, "step": 7 }, { "completion_length": 1706.6027526855469, "epoch": 0.002389664700171757, "grad_norm": 0.0969104915857315, "kl": 4.0411949157714844e-05, "learning_rate": 2.3880597014925373e-08, "loss": 0.0, "reward": 0.4921875149011612, "reward_std": 0.1547120325267315, "rewards/accuracy_reward": 0.10937500465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3828125149011612, "step": 8 }, { "completion_length": 1730.0960388183594, "epoch": 0.0026883727876932267, "grad_norm": 0.09655702114105225, "kl": 4.07099723815918e-05, "learning_rate": 2.6865671641791042e-08, "loss": 0.0, "reward": 0.5340401977300644, "reward_std": 0.2513280287384987, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4023437723517418, "step": 9 }, { "completion_length": 1733.7009887695312, "epoch": 0.0029870808752146963, "grad_norm": 0.1060670018196106, "kl": 4.166364669799805e-05, "learning_rate": 2.9850746268656714e-08, "loss": 0.0, "reward": 0.5111607387661934, "reward_std": 0.220967099070549, "rewards/accuracy_reward": 0.13839286100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3727678805589676, "step": 10 }, { "completion_length": 1699.6875610351562, "epoch": 0.003285788962736166, "grad_norm": 0.0981292650103569, "kl": 4.363059997558594e-05, "learning_rate": 3.2835820895522386e-08, "loss": 0.0, "reward": 0.4698660969734192, "reward_std": 0.2001437172293663, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.380580373108387, "step": 11 }, { "completion_length": 1682.30810546875, "epoch": 0.0035844970502576356, "grad_norm": 0.10234557092189789, "kl": 3.5434961318969727e-05, "learning_rate": 3.582089552238806e-08, "loss": 0.0, "reward": 0.5251116380095482, "reward_std": 0.17725737020373344, "rewards/accuracy_reward": 0.14062500605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.384486623108387, "step": 12 }, { "completion_length": 1752.2835388183594, "epoch": 0.003883205137779105, "grad_norm": 0.099834144115448, "kl": 4.279613494873047e-05, "learning_rate": 3.880597014925373e-08, "loss": 0.0, "reward": 0.4715401977300644, "reward_std": 0.18836596608161926, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3710937649011612, "step": 13 }, { "completion_length": 1728.7679443359375, "epoch": 0.004181913225300575, "grad_norm": 0.9206768870353699, "kl": 4.0143728256225586e-05, "learning_rate": 4.17910447761194e-08, "loss": 0.0, "reward": 0.5457589402794838, "reward_std": 0.23335089907050133, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3850446566939354, "step": 14 }, { "completion_length": 1619.3861999511719, "epoch": 0.004480621312822045, "grad_norm": 0.11546648293733597, "kl": 4.8279762268066406e-05, "learning_rate": 4.477611940298507e-08, "loss": 0.0, "reward": 0.557477705180645, "reward_std": 0.24492931738495827, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3878348395228386, "step": 15 }, { "completion_length": 1746.80810546875, "epoch": 0.004779329400343514, "grad_norm": 0.0968732014298439, "kl": 4.51207160949707e-05, "learning_rate": 4.776119402985075e-08, "loss": 0.0, "reward": 0.4581473395228386, "reward_std": 0.19553445652127266, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3844866156578064, "step": 16 }, { "completion_length": 1639.7076721191406, "epoch": 0.005078037487864984, "grad_norm": 0.08759453147649765, "kl": 3.439188003540039e-05, "learning_rate": 5.074626865671641e-08, "loss": 0.0, "reward": 0.5212053880095482, "reward_std": 0.20874397084116936, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3984375149011612, "step": 17 }, { "completion_length": 1690.1161499023438, "epoch": 0.005376745575386453, "grad_norm": 0.09248816967010498, "kl": 4.667043685913086e-05, "learning_rate": 5.3731343283582085e-08, "loss": 0.0, "reward": 0.510044664144516, "reward_std": 0.1873716525733471, "rewards/accuracy_reward": 0.12723215040750802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3828125223517418, "step": 18 }, { "completion_length": 1568.1741638183594, "epoch": 0.0056754536629079234, "grad_norm": 0.10534868389368057, "kl": 4.845857620239258e-05, "learning_rate": 5.671641791044776e-08, "loss": 0.0, "reward": 0.5753348469734192, "reward_std": 0.2201700620353222, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3900669813156128, "step": 19 }, { "completion_length": 1759.2121276855469, "epoch": 0.005974161750429393, "grad_norm": 0.09074649214744568, "kl": 4.57763671875e-05, "learning_rate": 5.970149253731343e-08, "loss": 0.0, "reward": 0.550781287252903, "reward_std": 0.23554643988609314, "rewards/accuracy_reward": 0.17410715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3766741305589676, "step": 20 }, { "completion_length": 1704.1831359863281, "epoch": 0.006272869837950863, "grad_norm": 0.09509158134460449, "kl": 4.494190216064453e-05, "learning_rate": 6.26865671641791e-08, "loss": 0.0, "reward": 0.503348246216774, "reward_std": 0.18247173726558685, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3828125074505806, "step": 21 }, { "completion_length": 1712.0112609863281, "epoch": 0.006571577925472332, "grad_norm": 0.08644744008779526, "kl": 4.6372413635253906e-05, "learning_rate": 6.567164179104477e-08, "loss": 0.0, "reward": 0.4425223395228386, "reward_std": 0.16477284207940102, "rewards/accuracy_reward": 0.06919643026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3733259066939354, "step": 22 }, { "completion_length": 1770.8081359863281, "epoch": 0.006870286012993802, "grad_norm": 0.09731516987085342, "kl": 4.2438507080078125e-05, "learning_rate": 6.865671641791045e-08, "loss": 0.0, "reward": 0.484375037252903, "reward_std": 0.22544516623020172, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3705357313156128, "step": 23 }, { "completion_length": 1642.0023193359375, "epoch": 0.007168994100515271, "grad_norm": 0.09602506458759308, "kl": 3.6597251892089844e-05, "learning_rate": 7.164179104477612e-08, "loss": 0.0, "reward": 0.6071428805589676, "reward_std": 0.22998524084687233, "rewards/accuracy_reward": 0.212053582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3950893059372902, "step": 24 }, { "completion_length": 1772.8036804199219, "epoch": 0.007467702188036741, "grad_norm": 0.08349844813346863, "kl": 4.088878631591797e-05, "learning_rate": 7.462686567164178e-08, "loss": 0.0, "reward": 0.4977678880095482, "reward_std": 0.206555787473917, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.372767873108387, "step": 25 }, { "completion_length": 1729.7679138183594, "epoch": 0.00776641027555821, "grad_norm": 0.09636993706226349, "kl": 4.559755325317383e-05, "learning_rate": 7.761194029850746e-08, "loss": 0.0, "reward": 0.5027901977300644, "reward_std": 0.23796116560697556, "rewards/accuracy_reward": 0.1227678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3800223469734192, "step": 26 }, { "completion_length": 1684.16748046875, "epoch": 0.00806511836307968, "grad_norm": 0.10422854125499725, "kl": 5.2988529205322266e-05, "learning_rate": 8.059701492537313e-08, "loss": 0.0, "reward": 0.5848214477300644, "reward_std": 0.1551265586167574, "rewards/accuracy_reward": 0.21205358440056443, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.372767873108387, "step": 27 }, { "completion_length": 1678.1072387695312, "epoch": 0.00836382645060115, "grad_norm": 0.08989456295967102, "kl": 3.7729740142822266e-05, "learning_rate": 8.35820895522388e-08, "loss": 0.0, "reward": 0.5027901977300644, "reward_std": 0.18517549708485603, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4045759066939354, "step": 28 }, { "completion_length": 1617.3772888183594, "epoch": 0.008662534538122619, "grad_norm": 0.10604710131883621, "kl": 4.7266483306884766e-05, "learning_rate": 8.656716417910448e-08, "loss": 0.0, "reward": 0.4614955484867096, "reward_std": 0.18813891895115376, "rewards/accuracy_reward": 0.07589286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3856026902794838, "step": 29 }, { "completion_length": 1672.2098999023438, "epoch": 0.00896124262564409, "grad_norm": 0.10479862987995148, "kl": 3.999471664428711e-05, "learning_rate": 8.955223880597014e-08, "loss": 0.0, "reward": 0.6205357313156128, "reward_std": 0.290972713381052, "rewards/accuracy_reward": 0.2299107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3906250149011612, "step": 30 }, { "completion_length": 1589.4866638183594, "epoch": 0.009259950713165559, "grad_norm": 0.1230715662240982, "kl": 4.151463508605957e-05, "learning_rate": 9.253731343283581e-08, "loss": 0.0, "reward": 0.4620535895228386, "reward_std": 0.16982948780059814, "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4017857313156128, "step": 31 }, { "completion_length": 1699.5380249023438, "epoch": 0.009558658800687028, "grad_norm": 0.09880749136209488, "kl": 4.363059997558594e-05, "learning_rate": 9.55223880597015e-08, "loss": 0.0, "reward": 0.5055803656578064, "reward_std": 0.19491765089333057, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3671875223517418, "step": 32 }, { "completion_length": 1744.4107971191406, "epoch": 0.009857366888208497, "grad_norm": 0.08696916699409485, "kl": 4.363059997558594e-05, "learning_rate": 9.850746268656717e-08, "loss": 0.0, "reward": 0.4575892984867096, "reward_std": 0.19077984616160393, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.368303582072258, "step": 33 }, { "completion_length": 1705.6630249023438, "epoch": 0.010156074975729968, "grad_norm": 0.0873563140630722, "kl": 4.011392593383789e-05, "learning_rate": 1.0149253731343282e-07, "loss": 0.0, "reward": 0.4843750223517418, "reward_std": 0.16615015268325806, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3638393059372902, "step": 34 }, { "completion_length": 1695.2567749023438, "epoch": 0.010454783063251438, "grad_norm": 0.09463194012641907, "kl": 4.07099723815918e-05, "learning_rate": 1.044776119402985e-07, "loss": 0.0, "reward": 0.5290178656578064, "reward_std": 0.2458721473813057, "rewards/accuracy_reward": 0.1294642873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.399553582072258, "step": 35 }, { "completion_length": 1661.5648193359375, "epoch": 0.010753491150772907, "grad_norm": 0.0998857393860817, "kl": 4.9591064453125e-05, "learning_rate": 1.0746268656716417e-07, "loss": 0.0, "reward": 0.4553571715950966, "reward_std": 0.19854752533137798, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3928571566939354, "step": 36 }, { "completion_length": 1698.774658203125, "epoch": 0.011052199238294378, "grad_norm": 0.09051767736673355, "kl": 4.13060188293457e-05, "learning_rate": 1.1044776119402985e-07, "loss": 0.0, "reward": 0.5312500149011612, "reward_std": 0.23656188696622849, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3906250149011612, "step": 37 }, { "completion_length": 1716.5447082519531, "epoch": 0.011350907325815847, "grad_norm": 0.10254305601119995, "kl": 4.83393669128418e-05, "learning_rate": 1.1343283582089553e-07, "loss": 0.0, "reward": 0.4068080559372902, "reward_std": 0.11078167520463467, "rewards/accuracy_reward": 0.0379464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3688616156578064, "step": 38 }, { "completion_length": 1693.0625610351562, "epoch": 0.011649615413337316, "grad_norm": 0.11015334725379944, "kl": 5.048513412475586e-05, "learning_rate": 1.1641791044776119e-07, "loss": 0.0, "reward": 0.4704241305589676, "reward_std": 0.2572677992284298, "rewards/accuracy_reward": 0.08482143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3856026902794838, "step": 39 }, { "completion_length": 1540.2500610351562, "epoch": 0.011948323500858785, "grad_norm": 0.10758980363607407, "kl": 3.784894943237305e-05, "learning_rate": 1.1940298507462686e-07, "loss": 0.0, "reward": 0.6367187723517418, "reward_std": 0.2426343448460102, "rewards/accuracy_reward": 0.2254464402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4112723395228386, "step": 40 }, { "completion_length": 1728.399658203125, "epoch": 0.012247031588380256, "grad_norm": 0.08851924538612366, "kl": 4.6372413635253906e-05, "learning_rate": 1.2238805970149254e-07, "loss": 0.0, "reward": 0.5412946715950966, "reward_std": 0.18982231989502907, "rewards/accuracy_reward": 0.16517857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3761160895228386, "step": 41 }, { "completion_length": 1788.1540832519531, "epoch": 0.012545739675901725, "grad_norm": 0.10405179113149643, "kl": 4.51207160949707e-05, "learning_rate": 1.253731343283582e-07, "loss": 0.0, "reward": 0.4397321566939354, "reward_std": 0.1698441468179226, "rewards/accuracy_reward": 0.07142857369035482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.368303582072258, "step": 42 }, { "completion_length": 1749.1116638183594, "epoch": 0.012844447763423195, "grad_norm": 0.1624377965927124, "kl": 4.00543212890625e-05, "learning_rate": 1.2835820895522386e-07, "loss": 0.0, "reward": 0.4776785895228386, "reward_std": 0.22758764028549194, "rewards/accuracy_reward": 0.09151785937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3861607313156128, "step": 43 }, { "completion_length": 1646.5782165527344, "epoch": 0.013143155850944664, "grad_norm": 0.09527420997619629, "kl": 4.4286251068115234e-05, "learning_rate": 1.3134328358208955e-07, "loss": 0.0, "reward": 0.5290178880095482, "reward_std": 0.2314729280769825, "rewards/accuracy_reward": 0.14955358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3794643059372902, "step": 44 }, { "completion_length": 1750.0379943847656, "epoch": 0.013441863938466135, "grad_norm": 0.07447757571935654, "kl": 4.166364669799805e-05, "learning_rate": 1.343283582089552e-07, "loss": 0.0, "reward": 0.4408482387661934, "reward_std": 0.13501118356361985, "rewards/accuracy_reward": 0.06919643189758062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3716518059372902, "step": 45 }, { "completion_length": 1768.9978637695312, "epoch": 0.013740572025987604, "grad_norm": 0.09661541134119034, "kl": 4.380941390991211e-05, "learning_rate": 1.373134328358209e-07, "loss": 0.0, "reward": 0.489397332072258, "reward_std": 0.2091980166733265, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3599330559372902, "step": 46 }, { "completion_length": 1737.4085693359375, "epoch": 0.014039280113509073, "grad_norm": 0.08969924598932266, "kl": 3.796815872192383e-05, "learning_rate": 1.4029850746268658e-07, "loss": 0.0, "reward": 0.489955373108387, "reward_std": 0.17258083261549473, "rewards/accuracy_reward": 0.11607143329456449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3738839477300644, "step": 47 }, { "completion_length": 1827.5692749023438, "epoch": 0.014337988201030542, "grad_norm": 0.11278776824474335, "kl": 4.3392181396484375e-05, "learning_rate": 1.4328358208955223e-07, "loss": 0.0, "reward": 0.3761160969734192, "reward_std": 0.16061189025640488, "rewards/accuracy_reward": 0.04241071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.333705373108387, "step": 48 }, { "completion_length": 1793.9442749023438, "epoch": 0.014636696288552013, "grad_norm": 0.08285469561815262, "kl": 3.635883331298828e-05, "learning_rate": 1.4626865671641792e-07, "loss": 0.0, "reward": 0.463169664144516, "reward_std": 0.18783023953437805, "rewards/accuracy_reward": 0.10491071874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3582589477300644, "step": 49 }, { "completion_length": 1682.509033203125, "epoch": 0.014935404376073482, "grad_norm": 1.8306422233581543, "kl": 3.534555435180664e-05, "learning_rate": 1.4925373134328355e-07, "loss": 0.0, "reward": 0.4810268133878708, "reward_std": 0.21270304918289185, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.391741082072258, "step": 50 }, { "completion_length": 1662.8661499023438, "epoch": 0.015234112463594952, "grad_norm": 0.0911092683672905, "kl": 4.64022159576416e-05, "learning_rate": 1.5223880597014924e-07, "loss": 0.0, "reward": 0.5976562798023224, "reward_std": 0.1816614232957363, "rewards/accuracy_reward": 0.2053571566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3922991305589676, "step": 51 }, { "completion_length": 1720.8527526855469, "epoch": 0.01553282055111642, "grad_norm": 0.08785812556743622, "kl": 3.463029861450195e-05, "learning_rate": 1.5522388059701492e-07, "loss": 0.0, "reward": 0.4497767984867096, "reward_std": 0.15873262286186218, "rewards/accuracy_reward": 0.06696428661234677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3828125074505806, "step": 52 }, { "completion_length": 1789.1228332519531, "epoch": 0.01583152863863789, "grad_norm": 0.08890232443809509, "kl": 4.4405460357666016e-05, "learning_rate": 1.5820895522388058e-07, "loss": 0.0, "reward": 0.5379464477300644, "reward_std": 0.20846235752105713, "rewards/accuracy_reward": 0.176339291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.361607164144516, "step": 53 }, { "completion_length": 1741.29248046875, "epoch": 0.01613023672615936, "grad_norm": 0.09099102020263672, "kl": 3.9964914321899414e-05, "learning_rate": 1.6119402985074627e-07, "loss": 0.0, "reward": 0.4994419813156128, "reward_std": 0.15593552216887474, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3811384066939354, "step": 54 }, { "completion_length": 1784.1451416015625, "epoch": 0.016428944813680832, "grad_norm": 0.0919862911105156, "kl": 4.011392593383789e-05, "learning_rate": 1.6417910447761193e-07, "loss": 0.0, "reward": 0.4140625149011612, "reward_std": 0.1503710001707077, "rewards/accuracy_reward": 0.05133928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3627232313156128, "step": 55 }, { "completion_length": 1683.4129943847656, "epoch": 0.0167276529012023, "grad_norm": 0.10855896025896072, "kl": 4.026293754577637e-05, "learning_rate": 1.671641791044776e-07, "loss": 0.0, "reward": 0.4168526902794838, "reward_std": 0.1604956705123186, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3744419738650322, "step": 56 }, { "completion_length": 1624.5536499023438, "epoch": 0.01702636098872377, "grad_norm": 0.09681912511587143, "kl": 4.1425228118896484e-05, "learning_rate": 1.701492537313433e-07, "loss": 0.0, "reward": 0.4687500298023224, "reward_std": 0.21658793464303017, "rewards/accuracy_reward": 0.07366071967408061, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3950892984867096, "step": 57 }, { "completion_length": 1721.727783203125, "epoch": 0.017325069076245238, "grad_norm": 0.08706501126289368, "kl": 3.993511199951172e-05, "learning_rate": 1.7313432835820896e-07, "loss": 0.0, "reward": 0.546875037252903, "reward_std": 0.2532654255628586, "rewards/accuracy_reward": 0.16517857951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3816964402794838, "step": 58 }, { "completion_length": 1654.5335693359375, "epoch": 0.01762377716376671, "grad_norm": 2.2248997688293457, "kl": 4.166364669799805e-05, "learning_rate": 1.7611940298507461e-07, "loss": 0.0, "reward": 0.4966518059372902, "reward_std": 0.1712179072201252, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3962053805589676, "step": 59 }, { "completion_length": 1768.5558776855469, "epoch": 0.01792248525128818, "grad_norm": 0.09939350932836533, "kl": 3.7044286727905273e-05, "learning_rate": 1.7910447761194027e-07, "loss": 0.0, "reward": 0.525669664144516, "reward_std": 0.23858246952295303, "rewards/accuracy_reward": 0.1495535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3761160895228386, "step": 60 }, { "completion_length": 1703.8304748535156, "epoch": 0.018221193338809647, "grad_norm": 0.103017158806324, "kl": 4.273653030395508e-05, "learning_rate": 1.8208955223880596e-07, "loss": 0.0, "reward": 0.4877232238650322, "reward_std": 0.21207648888230324, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3783482238650322, "step": 61 }, { "completion_length": 1592.9688110351562, "epoch": 0.018519901426331118, "grad_norm": 0.11344347149133682, "kl": 4.458427429199219e-05, "learning_rate": 1.8507462686567162e-07, "loss": 0.0, "reward": 0.4804687649011612, "reward_std": 0.2328757457435131, "rewards/accuracy_reward": 0.07366071688011289, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4068080559372902, "step": 62 }, { "completion_length": 1690.96435546875, "epoch": 0.01881860951385259, "grad_norm": 0.09457223117351532, "kl": 3.784894943237305e-05, "learning_rate": 1.880597014925373e-07, "loss": 0.0, "reward": 0.4503348469734192, "reward_std": 0.21534838154911995, "rewards/accuracy_reward": 0.0535714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.396763414144516, "step": 63 }, { "completion_length": 1569.2969360351562, "epoch": 0.019117317601374056, "grad_norm": 0.10470817983150482, "kl": 4.482269287109375e-05, "learning_rate": 1.91044776119403e-07, "loss": 0.0, "reward": 0.6233259066939354, "reward_std": 0.2182648740708828, "rewards/accuracy_reward": 0.2031250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4202009066939354, "step": 64 }, { "completion_length": 1781.5447692871094, "epoch": 0.019416025688895527, "grad_norm": 0.09510259330272675, "kl": 3.647804260253906e-05, "learning_rate": 1.9402985074626865e-07, "loss": 0.0, "reward": 0.415736623108387, "reward_std": 0.21049348078668118, "rewards/accuracy_reward": 0.05803571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3577009066939354, "step": 65 }, { "completion_length": 1790.4286499023438, "epoch": 0.019714733776416995, "grad_norm": 0.09395762532949448, "kl": 4.4226646423339844e-05, "learning_rate": 1.9701492537313433e-07, "loss": 0.0, "reward": 0.4369419887661934, "reward_std": 0.15210401080548763, "rewards/accuracy_reward": 0.08705357369035482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3498883992433548, "step": 66 }, { "completion_length": 1832.7523193359375, "epoch": 0.020013441863938466, "grad_norm": 0.08337168395519257, "kl": 3.24249267578125e-05, "learning_rate": 2e-07, "loss": 0.0, "reward": 0.4464285969734192, "reward_std": 0.1982463076710701, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.357142873108387, "step": 67 }, { "completion_length": 1608.3728637695312, "epoch": 0.020312149951459937, "grad_norm": 0.08835510164499283, "kl": 3.725290298461914e-05, "learning_rate": 2.0298507462686565e-07, "loss": 0.0, "reward": 0.5625000223517418, "reward_std": 0.17978400364518166, "rewards/accuracy_reward": 0.15178572479635477, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4107143059372902, "step": 68 }, { "completion_length": 1651.6853637695312, "epoch": 0.020610858038981404, "grad_norm": 0.0891580879688263, "kl": 3.802776336669922e-05, "learning_rate": 2.0597014925373134e-07, "loss": 0.0, "reward": 0.4659598395228386, "reward_std": 0.20213166624307632, "rewards/accuracy_reward": 0.0714285762514919, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3945312649011612, "step": 69 }, { "completion_length": 1673.9822082519531, "epoch": 0.020909566126502875, "grad_norm": 0.08729001134634018, "kl": 3.600120544433594e-05, "learning_rate": 2.08955223880597e-07, "loss": 0.0, "reward": 0.530691996216774, "reward_std": 0.260345745831728, "rewards/accuracy_reward": 0.13169643515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3989955559372902, "step": 70 }, { "completion_length": 1589.4085693359375, "epoch": 0.021208274214024346, "grad_norm": 0.09430055320262909, "kl": 3.272294998168945e-05, "learning_rate": 2.1194029850746268e-07, "loss": 0.0, "reward": 0.6294642984867096, "reward_std": 0.2932361327111721, "rewards/accuracy_reward": 0.2053571566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4241071566939354, "step": 71 }, { "completion_length": 1574.5023193359375, "epoch": 0.021506982301545814, "grad_norm": 0.10432137548923492, "kl": 4.488229751586914e-05, "learning_rate": 2.1492537313432834e-07, "loss": 0.0, "reward": 0.5340401977300644, "reward_std": 0.17076869308948517, "rewards/accuracy_reward": 0.12276786239817739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4112723395228386, "step": 72 }, { "completion_length": 1553.7991333007812, "epoch": 0.021805690389067284, "grad_norm": 0.11849450320005417, "kl": 3.695487976074219e-05, "learning_rate": 2.1791044776119402e-07, "loss": 0.0, "reward": 0.6456473469734192, "reward_std": 0.22900506481528282, "rewards/accuracy_reward": 0.2075892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4380580484867096, "step": 73 }, { "completion_length": 1652.6161499023438, "epoch": 0.022104398476588755, "grad_norm": 0.0865560919046402, "kl": 3.796815872192383e-05, "learning_rate": 2.208955223880597e-07, "loss": 0.0, "reward": 0.4296875149011612, "reward_std": 0.16279350128024817, "rewards/accuracy_reward": 0.04017857392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3895089477300644, "step": 74 }, { "completion_length": 1667.4085388183594, "epoch": 0.022403106564110223, "grad_norm": 0.19182462990283966, "kl": 3.814697265625e-05, "learning_rate": 2.2388059701492537e-07, "loss": 0.0, "reward": 0.488281287252903, "reward_std": 0.20007126033306122, "rewards/accuracy_reward": 0.09821429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3900669813156128, "step": 75 }, { "completion_length": 1667.8170471191406, "epoch": 0.022701814651631694, "grad_norm": 0.10130313038825989, "kl": 4.106760025024414e-05, "learning_rate": 2.2686567164179105e-07, "loss": 0.0, "reward": 0.4609375149011612, "reward_std": 0.18744872137904167, "rewards/accuracy_reward": 0.06473214505240321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.396205373108387, "step": 76 }, { "completion_length": 1712.6831359863281, "epoch": 0.02300052273915316, "grad_norm": 0.08452694863080978, "kl": 3.3527612686157227e-05, "learning_rate": 2.2985074626865669e-07, "loss": 0.0, "reward": 0.482142873108387, "reward_std": 0.1867868173867464, "rewards/accuracy_reward": 0.11830357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3638392984867096, "step": 77 }, { "completion_length": 1842.7590026855469, "epoch": 0.023299230826674632, "grad_norm": 0.09354701638221741, "kl": 3.5434961318969727e-05, "learning_rate": 2.3283582089552237e-07, "loss": 0.0, "reward": 0.4921875149011612, "reward_std": 0.15936844982206821, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.356026791036129, "step": 78 }, { "completion_length": 1822.2411499023438, "epoch": 0.023597938914196103, "grad_norm": 0.10105336457490921, "kl": 3.8683414459228516e-05, "learning_rate": 2.3582089552238803e-07, "loss": 0.0, "reward": 0.415736623108387, "reward_std": 0.12063143402338028, "rewards/accuracy_reward": 0.06696428707800806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3487723395228386, "step": 79 }, { "completion_length": 1645.2545471191406, "epoch": 0.02389664700171757, "grad_norm": 0.10076173394918442, "kl": 3.725290298461914e-05, "learning_rate": 2.388059701492537e-07, "loss": 0.0, "reward": 0.4927455484867096, "reward_std": 0.21939844265580177, "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.396763414144516, "step": 80 }, { "completion_length": 1723.8438415527344, "epoch": 0.02419535508923904, "grad_norm": 0.09291433542966843, "kl": 3.6776065826416016e-05, "learning_rate": 2.417910447761194e-07, "loss": 0.0, "reward": 0.5005580633878708, "reward_std": 0.15327144600450993, "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3777901977300644, "step": 81 }, { "completion_length": 1720.7188415527344, "epoch": 0.024494063176760512, "grad_norm": 0.10621005296707153, "kl": 3.629922866821289e-05, "learning_rate": 2.447761194029851e-07, "loss": 0.0, "reward": 0.4430803656578064, "reward_std": 0.20475571043789387, "rewards/accuracy_reward": 0.0602678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3828125149011612, "step": 82 }, { "completion_length": 1632.3148193359375, "epoch": 0.02479277126428198, "grad_norm": 0.09364974498748779, "kl": 3.409385681152344e-05, "learning_rate": 2.4776119402985074e-07, "loss": 0.0, "reward": 0.4235491380095482, "reward_std": 0.14354151487350464, "rewards/accuracy_reward": 0.04241071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.381138414144516, "step": 83 }, { "completion_length": 1617.1786804199219, "epoch": 0.02509147935180345, "grad_norm": 0.10703151673078537, "kl": 3.4421682357788086e-05, "learning_rate": 2.507462686567164e-07, "loss": 0.0, "reward": 0.5334821715950966, "reward_std": 0.20099983550608158, "rewards/accuracy_reward": 0.1227678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4107142984867096, "step": 84 }, { "completion_length": 1679.1318054199219, "epoch": 0.02539018743932492, "grad_norm": 0.09163054823875427, "kl": 3.331899642944336e-05, "learning_rate": 2.537313432835821e-07, "loss": 0.0, "reward": 0.576450914144516, "reward_std": 0.21376424469053745, "rewards/accuracy_reward": 0.20089286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3755580484867096, "step": 85 }, { "completion_length": 1744.6898193359375, "epoch": 0.02568889552684639, "grad_norm": 0.09814386069774628, "kl": 3.4421682357788086e-05, "learning_rate": 2.567164179104477e-07, "loss": 0.0, "reward": 0.4765625149011612, "reward_std": 0.2085466831922531, "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3671875149011612, "step": 86 }, { "completion_length": 1651.1027526855469, "epoch": 0.02598760361436786, "grad_norm": 0.08685163408517838, "kl": 3.591179847717285e-05, "learning_rate": 2.5970149253731343e-07, "loss": 0.0, "reward": 0.6104910969734192, "reward_std": 0.26690854877233505, "rewards/accuracy_reward": 0.1986607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.411830373108387, "step": 87 }, { "completion_length": 1791.8482971191406, "epoch": 0.026286311701889328, "grad_norm": 0.08842063695192337, "kl": 3.5881996154785156e-05, "learning_rate": 2.626865671641791e-07, "loss": 0.0, "reward": 0.400669664144516, "reward_std": 0.15479809790849686, "rewards/accuracy_reward": 0.040178573690354824, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3604910895228386, "step": 88 }, { "completion_length": 1639.4755249023438, "epoch": 0.0265850197894108, "grad_norm": 0.1283779740333557, "kl": 4.363059997558594e-05, "learning_rate": 2.656716417910448e-07, "loss": 0.0, "reward": 0.6015625149011612, "reward_std": 0.17994916625320911, "rewards/accuracy_reward": 0.2053571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3962053805589676, "step": 89 }, { "completion_length": 1766.7880249023438, "epoch": 0.02688372787693227, "grad_norm": 0.10726066678762436, "kl": 3.546476364135742e-05, "learning_rate": 2.686567164179104e-07, "loss": 0.0, "reward": 0.4547991305589676, "reward_std": 0.18674319423735142, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3722098395228386, "step": 90 }, { "completion_length": 1578.8705749511719, "epoch": 0.027182435964453737, "grad_norm": 0.1145951896905899, "kl": 4.202127456665039e-05, "learning_rate": 2.7164179104477607e-07, "loss": 0.0, "reward": 0.4642857313156128, "reward_std": 0.24205880239605904, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.397321455180645, "step": 91 }, { "completion_length": 1597.2902526855469, "epoch": 0.027481144051975208, "grad_norm": 0.09421837329864502, "kl": 2.8401613235473633e-05, "learning_rate": 2.746268656716418e-07, "loss": 0.0, "reward": 0.4916294887661934, "reward_std": 0.19976810738444328, "rewards/accuracy_reward": 0.09375000605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3978794887661934, "step": 92 }, { "completion_length": 1611.5335693359375, "epoch": 0.027779852139496675, "grad_norm": 0.11520054191350937, "kl": 2.810359001159668e-05, "learning_rate": 2.7761194029850744e-07, "loss": 0.0, "reward": 0.6283482238650322, "reward_std": 0.29316865652799606, "rewards/accuracy_reward": 0.20312500605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4252232313156128, "step": 93 }, { "completion_length": 1766.9264221191406, "epoch": 0.028078560227018146, "grad_norm": 0.09128382802009583, "kl": 3.406405448913574e-05, "learning_rate": 2.8059701492537315e-07, "loss": 0.0, "reward": 0.411830373108387, "reward_std": 0.2072727084159851, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3582589477300644, "step": 94 }, { "completion_length": 1758.2902526855469, "epoch": 0.028377268314539617, "grad_norm": 0.10039829462766647, "kl": 3.841519355773926e-05, "learning_rate": 2.8358208955223876e-07, "loss": 0.0, "reward": 0.4430803805589676, "reward_std": 0.17539134249091148, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3604910895228386, "step": 95 }, { "completion_length": 1713.6898193359375, "epoch": 0.028675976402061085, "grad_norm": 0.0873020738363266, "kl": 2.9131770133972168e-05, "learning_rate": 2.8656716417910447e-07, "loss": 0.0, "reward": 0.481584832072258, "reward_std": 0.1658504232764244, "rewards/accuracy_reward": 0.09821429406292737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3833705559372902, "step": 96 }, { "completion_length": 1807.3527526855469, "epoch": 0.028974684489582556, "grad_norm": 0.10078015178442001, "kl": 2.8878450393676758e-05, "learning_rate": 2.8955223880597013e-07, "loss": 0.0, "reward": 0.3973214477300644, "reward_std": 0.17818782106041908, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.361607164144516, "step": 97 }, { "completion_length": 1750.8036499023438, "epoch": 0.029273392577104027, "grad_norm": 0.09035319834947586, "kl": 3.388524055480957e-05, "learning_rate": 2.9253731343283584e-07, "loss": 0.0, "reward": 0.5697544887661934, "reward_std": 0.17090018093585968, "rewards/accuracy_reward": 0.2098214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3599330559372902, "step": 98 }, { "completion_length": 1544.9844665527344, "epoch": 0.029572100664625494, "grad_norm": 0.09495872259140015, "kl": 3.6716461181640625e-05, "learning_rate": 2.955223880597015e-07, "loss": 0.0, "reward": 0.4665178805589676, "reward_std": 0.20586640387773514, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.392857164144516, "step": 99 }, { "completion_length": 1667.9465026855469, "epoch": 0.029870808752146965, "grad_norm": 0.10436111688613892, "kl": 4.488229751586914e-05, "learning_rate": 2.985074626865671e-07, "loss": 0.0, "reward": 0.5072544813156128, "reward_std": 0.21826307103037834, "rewards/accuracy_reward": 0.11830357951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.388950914144516, "step": 100 }, { "completion_length": 1649.5357666015625, "epoch": 0.030169516839668432, "grad_norm": 0.09146342426538467, "kl": 3.0517578125e-05, "learning_rate": 3.014925373134328e-07, "loss": 0.0, "reward": 0.5357143059372902, "reward_std": 0.21302506513893604, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3906250223517418, "step": 101 }, { "completion_length": 1730.4152526855469, "epoch": 0.030468224927189903, "grad_norm": 0.09710928052663803, "kl": 2.442300319671631e-05, "learning_rate": 3.044776119402985e-07, "loss": 0.0, "reward": 0.4832589477300644, "reward_std": 0.1666710190474987, "rewards/accuracy_reward": 0.10714285844005644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3761160895228386, "step": 102 }, { "completion_length": 1592.6406860351562, "epoch": 0.030766933014711374, "grad_norm": 0.6133878827095032, "kl": 3.622472286224365e-05, "learning_rate": 3.074626865671642e-07, "loss": 0.0, "reward": 0.5546875298023224, "reward_std": 0.24835949018597603, "rewards/accuracy_reward": 0.14955358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4051339477300644, "step": 103 }, { "completion_length": 1649.3795166015625, "epoch": 0.03106564110223284, "grad_norm": 0.10204572230577469, "kl": 2.6851892471313477e-05, "learning_rate": 3.1044776119402985e-07, "loss": 0.0, "reward": 0.565290205180645, "reward_std": 0.2323456034064293, "rewards/accuracy_reward": 0.1696428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3956473395228386, "step": 104 }, { "completion_length": 1839.6585693359375, "epoch": 0.03136434918975431, "grad_norm": 0.12458612024784088, "kl": 3.3289194107055664e-05, "learning_rate": 3.134328358208955e-07, "loss": 0.0, "reward": 0.4609375149011612, "reward_std": 0.16889125108718872, "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3493303656578064, "step": 105 }, { "completion_length": 1584.01123046875, "epoch": 0.03166305727727578, "grad_norm": 0.20422856509685516, "kl": 4.553794860839844e-05, "learning_rate": 3.1641791044776116e-07, "loss": 0.0, "reward": 0.5016741380095482, "reward_std": 0.24019799008965492, "rewards/accuracy_reward": 0.09598215017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4056919738650322, "step": 106 }, { "completion_length": 1669.5804443359375, "epoch": 0.03196176536479725, "grad_norm": 0.09646247327327728, "kl": 3.650784492492676e-05, "learning_rate": 3.194029850746269e-07, "loss": 0.0, "reward": 0.452008955180645, "reward_std": 0.15100442804396152, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3828125149011612, "step": 107 }, { "completion_length": 1746.4889221191406, "epoch": 0.03226047345231872, "grad_norm": 0.09981522709131241, "kl": 3.3795833587646484e-05, "learning_rate": 3.2238805970149253e-07, "loss": 0.0, "reward": 0.447544664144516, "reward_std": 0.2124188169836998, "rewards/accuracy_reward": 0.07589286123402417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3716518059372902, "step": 108 }, { "completion_length": 1644.8907165527344, "epoch": 0.03255918153984019, "grad_norm": 0.11548830568790436, "kl": 4.470348358154297e-05, "learning_rate": 3.253731343283582e-07, "loss": 0.0, "reward": 0.5837053805589676, "reward_std": 0.24576372280716896, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4073660895228386, "step": 109 }, { "completion_length": 1665.07373046875, "epoch": 0.032857889627361664, "grad_norm": 0.10769415646791458, "kl": 4.214048385620117e-05, "learning_rate": 3.2835820895522385e-07, "loss": 0.0, "reward": 0.4441964477300644, "reward_std": 0.18239102885127068, "rewards/accuracy_reward": 0.04464286030270159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.399553582072258, "step": 110 }, { "completion_length": 1749.2367248535156, "epoch": 0.03315659771488313, "grad_norm": 0.09705200791358948, "kl": 5.030632019042969e-05, "learning_rate": 3.313432835820895e-07, "loss": 0.0, "reward": 0.446986623108387, "reward_std": 0.17424746416509151, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3666294887661934, "step": 111 }, { "completion_length": 1717.6429138183594, "epoch": 0.0334553058024046, "grad_norm": 0.09485188126564026, "kl": 4.571676254272461e-05, "learning_rate": 3.343283582089552e-07, "loss": 0.0, "reward": 0.4402901977300644, "reward_std": 0.16322433575987816, "rewards/accuracy_reward": 0.07589286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3643973395228386, "step": 112 }, { "completion_length": 1665.5514221191406, "epoch": 0.03375401388992607, "grad_norm": 0.18508239090442657, "kl": 7.367134094238281e-05, "learning_rate": 3.373134328358209e-07, "loss": 0.0, "reward": 0.530691996216774, "reward_std": 0.16950234584510326, "rewards/accuracy_reward": 0.14062500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3900669813156128, "step": 113 }, { "completion_length": 1616.2232971191406, "epoch": 0.03405272197744754, "grad_norm": 0.09916747361421585, "kl": 4.76837158203125e-05, "learning_rate": 3.402985074626866e-07, "loss": 0.0, "reward": 0.5066964477300644, "reward_std": 0.17245000414550304, "rewards/accuracy_reward": 0.11383929336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3928571566939354, "step": 114 }, { "completion_length": 1838.8773193359375, "epoch": 0.03435143006496901, "grad_norm": 0.09292024374008179, "kl": 4.392862319946289e-05, "learning_rate": 3.432835820895522e-07, "loss": 0.0, "reward": 0.4118303880095482, "reward_std": 0.17887437716126442, "rewards/accuracy_reward": 0.04910714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3627232238650322, "step": 115 }, { "completion_length": 1662.7478332519531, "epoch": 0.034650138152490476, "grad_norm": 1.5603234767913818, "kl": 0.00029659271240234375, "learning_rate": 3.462686567164179e-07, "loss": 0.0, "reward": 0.430803582072258, "reward_std": 0.16717659123241901, "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3906250149011612, "step": 116 }, { "completion_length": 1749.8036499023438, "epoch": 0.034948846240011947, "grad_norm": 0.08466564863920212, "kl": 4.2557716369628906e-05, "learning_rate": 3.4925373134328357e-07, "loss": 0.0, "reward": 0.4441964477300644, "reward_std": 0.11986074782907963, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3638393059372902, "step": 117 }, { "completion_length": 1662.7969665527344, "epoch": 0.03524755432753342, "grad_norm": 0.1252823919057846, "kl": 5.036592483520508e-05, "learning_rate": 3.5223880597014923e-07, "loss": 0.0, "reward": 0.5915178656578064, "reward_std": 0.1934937946498394, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4084821566939354, "step": 118 }, { "completion_length": 1555.0491943359375, "epoch": 0.03554626241505489, "grad_norm": 0.08959536999464035, "kl": 6.35981559753418e-05, "learning_rate": 3.552238805970149e-07, "loss": 0.0, "reward": 0.5982143133878708, "reward_std": 0.2847423031926155, "rewards/accuracy_reward": 0.165178582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4330357387661934, "step": 119 }, { "completion_length": 1742.5223999023438, "epoch": 0.03584497050257636, "grad_norm": 0.11460559070110321, "kl": 5.793571472167969e-05, "learning_rate": 3.5820895522388055e-07, "loss": 0.0, "reward": 0.4285714477300644, "reward_std": 0.16908713802695274, "rewards/accuracy_reward": 0.06919643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3593750149011612, "step": 120 }, { "completion_length": 1606.4085388183594, "epoch": 0.03614367859009783, "grad_norm": 0.11421116441488266, "kl": 0.0001093149185180664, "learning_rate": 3.6119402985074626e-07, "loss": 0.0, "reward": 0.5055803805589676, "reward_std": 0.24104154109954834, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4051339402794838, "step": 121 }, { "completion_length": 1608.0960388183594, "epoch": 0.036442386677619294, "grad_norm": 0.10718186944723129, "kl": 7.224082946777344e-05, "learning_rate": 3.641791044776119e-07, "loss": 0.0, "reward": 0.4860491380095482, "reward_std": 0.21168795228004456, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4101562649011612, "step": 122 }, { "completion_length": 1644.5201721191406, "epoch": 0.036741094765140765, "grad_norm": 0.08423773944377899, "kl": 6.765127182006836e-05, "learning_rate": 3.6716417910447763e-07, "loss": 0.0, "reward": 0.482700914144516, "reward_std": 0.16482926905155182, "rewards/accuracy_reward": 0.0870535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3956473469734192, "step": 123 }, { "completion_length": 1669.6272888183594, "epoch": 0.037039802852662236, "grad_norm": 0.10820944607257843, "kl": 7.772445678710938e-05, "learning_rate": 3.7014925373134323e-07, "loss": 0.0, "reward": 0.5122767984867096, "reward_std": 0.23525218293070793, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3939732313156128, "step": 124 }, { "completion_length": 1579.6585388183594, "epoch": 0.03733851094018371, "grad_norm": 0.09345671534538269, "kl": 9.816884994506836e-05, "learning_rate": 3.7313432835820895e-07, "loss": 0.0, "reward": 0.5747767984867096, "reward_std": 0.17410613782703876, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4185268133878708, "step": 125 }, { "completion_length": 1728.57373046875, "epoch": 0.03763721902770518, "grad_norm": 0.09918724000453949, "kl": 0.00010466575622558594, "learning_rate": 3.761194029850746e-07, "loss": 0.0, "reward": 0.5731027126312256, "reward_std": 0.20254946313798428, "rewards/accuracy_reward": 0.18750000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3856026977300644, "step": 126 }, { "completion_length": 1664.7456359863281, "epoch": 0.03793592711522664, "grad_norm": 0.1100236102938652, "kl": 0.00010132789611816406, "learning_rate": 3.7910447761194026e-07, "loss": 0.0, "reward": 0.553571455180645, "reward_std": 0.2087230309844017, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3950893059372902, "step": 127 }, { "completion_length": 1586.6607971191406, "epoch": 0.03823463520274811, "grad_norm": 0.11955131590366364, "kl": 0.00016760826110839844, "learning_rate": 3.82089552238806e-07, "loss": 0.0, "reward": 0.5011160895228386, "reward_std": 0.21755006164312363, "rewards/accuracy_reward": 0.07589285913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4252232313156128, "step": 128 }, { "completion_length": 1581.6763916015625, "epoch": 0.038533343290269584, "grad_norm": 0.09760059416294098, "kl": 0.00010478496551513672, "learning_rate": 3.850746268656716e-07, "loss": 0.0, "reward": 0.6093750149011612, "reward_std": 0.2287341132760048, "rewards/accuracy_reward": 0.1919642984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4174107313156128, "step": 129 }, { "completion_length": 1399.1317749023438, "epoch": 0.038832051377791055, "grad_norm": 0.1142062395811081, "kl": 0.0001634359359741211, "learning_rate": 3.880597014925373e-07, "loss": 0.0, "reward": 0.698660746216774, "reward_std": 0.2904626838862896, "rewards/accuracy_reward": 0.2053571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4933035895228386, "step": 130 }, { "completion_length": 1741.05810546875, "epoch": 0.039130759465312526, "grad_norm": 0.09741470962762833, "kl": 0.00014138221740722656, "learning_rate": 3.9104477611940295e-07, "loss": 0.0, "reward": 0.4771205484867096, "reward_std": 0.24382667243480682, "rewards/accuracy_reward": 0.10714286426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3699776977300644, "step": 131 }, { "completion_length": 1633.4532165527344, "epoch": 0.03942946755283399, "grad_norm": 0.102176733314991, "kl": 0.0001437664031982422, "learning_rate": 3.9402985074626866e-07, "loss": 0.0, "reward": 0.4547991305589676, "reward_std": 0.21192003786563873, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4123884066939354, "step": 132 }, { "completion_length": 1549.1540832519531, "epoch": 0.03972817564035546, "grad_norm": 0.12706615030765533, "kl": 0.00017571449279785156, "learning_rate": 3.970149253731343e-07, "loss": 0.0, "reward": 0.6004464477300644, "reward_std": 0.22355956584215164, "rewards/accuracy_reward": 0.17633928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4241071566939354, "step": 133 }, { "completion_length": 1541.4732666015625, "epoch": 0.04002688372787693, "grad_norm": 0.11232905834913254, "kl": 0.0001919269561767578, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.5217634066939354, "reward_std": 0.17277025431394577, "rewards/accuracy_reward": 0.08928571850992739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4324776902794838, "step": 134 }, { "completion_length": 1614.0759887695312, "epoch": 0.0403255918153984, "grad_norm": 0.1062486320734024, "kl": 0.00020456314086914062, "learning_rate": 4.0298507462686564e-07, "loss": 0.0, "reward": 0.5106026977300644, "reward_std": 0.19221326895058155, "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4190848469734192, "step": 135 }, { "completion_length": 1786.4241638183594, "epoch": 0.04062429990291987, "grad_norm": 0.09834981709718704, "kl": 0.00018715858459472656, "learning_rate": 4.059701492537313e-07, "loss": 0.0, "reward": 0.424107164144516, "reward_std": 0.1763131394982338, "rewards/accuracy_reward": 0.046875002793967724, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.377232164144516, "step": 136 }, { "completion_length": 1679.0581359863281, "epoch": 0.040923007990441344, "grad_norm": 0.09654120355844498, "kl": 0.00017118453979492188, "learning_rate": 4.08955223880597e-07, "loss": 0.0, "reward": 0.4698660895228386, "reward_std": 0.20757461339235306, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4051339477300644, "step": 137 }, { "completion_length": 1645.9978332519531, "epoch": 0.04122171607796281, "grad_norm": 0.10565762966871262, "kl": 0.00020599365234375, "learning_rate": 4.1194029850746267e-07, "loss": 0.0, "reward": 0.5005580559372902, "reward_std": 0.20559922233223915, "rewards/accuracy_reward": 0.09821429057046771, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4023437574505806, "step": 138 }, { "completion_length": 1658.3170471191406, "epoch": 0.04152042416548428, "grad_norm": 0.10835479944944382, "kl": 0.00022077560424804688, "learning_rate": 4.1492537313432833e-07, "loss": 0.0, "reward": 0.623325914144516, "reward_std": 0.2049240544438362, "rewards/accuracy_reward": 0.2366071566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3867187649011612, "step": 139 }, { "completion_length": 1575.2813110351562, "epoch": 0.04181913225300575, "grad_norm": 0.10850559920072556, "kl": 0.00022792816162109375, "learning_rate": 4.17910447761194e-07, "loss": 0.0, "reward": 0.5502232313156128, "reward_std": 0.19880719482898712, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4319196566939354, "step": 140 }, { "completion_length": 1658.0960693359375, "epoch": 0.04211784034052722, "grad_norm": 0.09431391209363937, "kl": 0.0002562999725341797, "learning_rate": 4.208955223880597e-07, "loss": 0.0, "reward": 0.5496651902794838, "reward_std": 0.2672116383910179, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3956473395228386, "step": 141 }, { "completion_length": 1473.8460388183594, "epoch": 0.04241654842804869, "grad_norm": 0.09856575727462769, "kl": 0.0003666877746582031, "learning_rate": 4.2388059701492536e-07, "loss": 0.0, "reward": 0.5636161044239998, "reward_std": 0.22439932823181152, "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.467633955180645, "step": 142 }, { "completion_length": 1594.3415832519531, "epoch": 0.042715256515570156, "grad_norm": 0.1780392974615097, "kl": 0.0003223419189453125, "learning_rate": 4.2686567164179107e-07, "loss": 0.0, "reward": 0.5295759215950966, "reward_std": 0.21716298907995224, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4090401902794838, "step": 143 }, { "completion_length": 1672.1630249023438, "epoch": 0.04301396460309163, "grad_norm": 0.11662290245294571, "kl": 0.00032138824462890625, "learning_rate": 4.298507462686567e-07, "loss": 0.0, "reward": 0.4425223469734192, "reward_std": 0.20117394253611565, "rewards/accuracy_reward": 0.0401785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4023437798023224, "step": 144 }, { "completion_length": 1415.3460388183594, "epoch": 0.0433126726906131, "grad_norm": 0.11631136387586594, "kl": 0.0003008842468261719, "learning_rate": 4.3283582089552234e-07, "loss": 0.0, "reward": 0.5747768208384514, "reward_std": 0.24364250525832176, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4631696566939354, "step": 145 }, { "completion_length": 1640.6429138183594, "epoch": 0.04361138077813457, "grad_norm": 0.11498496681451797, "kl": 0.0003190040588378906, "learning_rate": 4.3582089552238805e-07, "loss": 0.0, "reward": 0.4581473395228386, "reward_std": 0.18868930637836456, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4001116156578064, "step": 146 }, { "completion_length": 1584.2232666015625, "epoch": 0.04391008886565604, "grad_norm": 0.11958017200231552, "kl": 0.000301361083984375, "learning_rate": 4.388059701492537e-07, "loss": 0.0, "reward": 0.6099330633878708, "reward_std": 0.2426753081381321, "rewards/accuracy_reward": 0.2053571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4045759066939354, "step": 147 }, { "completion_length": 1605.5090026855469, "epoch": 0.04420879695317751, "grad_norm": 0.10188373178243637, "kl": 0.0003604888916015625, "learning_rate": 4.417910447761194e-07, "loss": 0.0, "reward": 0.5195312798023224, "reward_std": 0.2138482667505741, "rewards/accuracy_reward": 0.10044643515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4190848395228386, "step": 148 }, { "completion_length": 1549.3594360351562, "epoch": 0.044507505040698975, "grad_norm": 0.11765708774328232, "kl": 0.00040721893310546875, "learning_rate": 4.44776119402985e-07, "loss": 0.0, "reward": 0.5825893059372902, "reward_std": 0.2697291225194931, "rewards/accuracy_reward": 0.13839286100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4441964477300644, "step": 149 }, { "completion_length": 1786.5022888183594, "epoch": 0.044806213128220446, "grad_norm": 0.09118742495775223, "kl": 0.0002875328063964844, "learning_rate": 4.4776119402985074e-07, "loss": 0.0, "reward": 0.4737723544239998, "reward_std": 0.235809076577425, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3643973395228386, "step": 150 }, { "completion_length": 1649.1607666015625, "epoch": 0.04510492121574192, "grad_norm": 0.11287020146846771, "kl": 0.0004096031188964844, "learning_rate": 4.507462686567164e-07, "loss": 0.0, "reward": 0.4570312649011612, "reward_std": 0.20643066242337227, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4213169813156128, "step": 151 }, { "completion_length": 1717.0156860351562, "epoch": 0.04540362930326339, "grad_norm": 0.11617482453584671, "kl": 0.0004100799560546875, "learning_rate": 4.537313432835821e-07, "loss": 0.0, "reward": 0.5111607238650322, "reward_std": 0.18071607314050198, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3906250223517418, "step": 152 }, { "completion_length": 1584.8393859863281, "epoch": 0.04570233739078486, "grad_norm": 0.10379640758037567, "kl": 0.0004353523254394531, "learning_rate": 4.567164179104477e-07, "loss": 0.0, "reward": 0.6702009290456772, "reward_std": 0.24500708654522896, "rewards/accuracy_reward": 0.2142857313156128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.455915205180645, "step": 153 }, { "completion_length": 1563.44873046875, "epoch": 0.04600104547830632, "grad_norm": 0.11173922568559647, "kl": 0.0004868507385253906, "learning_rate": 4.5970149253731337e-07, "loss": 0.0, "reward": 0.5329241305589676, "reward_std": 0.2591385617852211, "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4414062723517418, "step": 154 }, { "completion_length": 1514.9554443359375, "epoch": 0.04629975356582779, "grad_norm": 0.10911253094673157, "kl": 0.0005884170532226562, "learning_rate": 4.626865671641791e-07, "loss": 0.0, "reward": 0.6768973469734192, "reward_std": 0.262552373111248, "rewards/accuracy_reward": 0.2410714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.435825914144516, "step": 155 }, { "completion_length": 1882.1295471191406, "epoch": 0.046598461653349264, "grad_norm": 0.13059338927268982, "kl": 0.0005574226379394531, "learning_rate": 4.6567164179104474e-07, "loss": 0.0, "reward": 0.4614955708384514, "reward_std": 0.16418969258666039, "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.345424123108387, "step": 156 }, { "completion_length": 1556.1451416015625, "epoch": 0.046897169740870735, "grad_norm": 0.10782728344202042, "kl": 0.0005626678466796875, "learning_rate": 4.6865671641791045e-07, "loss": 0.0, "reward": 0.5591518133878708, "reward_std": 0.22835727035999298, "rewards/accuracy_reward": 0.13839286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4207589402794838, "step": 157 }, { "completion_length": 1574.0313110351562, "epoch": 0.047195877828392206, "grad_norm": 0.10796396434307098, "kl": 0.0005588531494140625, "learning_rate": 4.7164179104477606e-07, "loss": 0.0, "reward": 0.4910714477300644, "reward_std": 0.22960058599710464, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4330357313156128, "step": 158 }, { "completion_length": 1704.247802734375, "epoch": 0.04749458591591367, "grad_norm": 0.12359526008367538, "kl": 0.0006213188171386719, "learning_rate": 4.7462686567164177e-07, "loss": 0.0, "reward": 0.5239955633878708, "reward_std": 0.16867272555828094, "rewards/accuracy_reward": 0.1339285767171532, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3900669813156128, "step": 159 }, { "completion_length": 1560.0447082519531, "epoch": 0.04779329400343514, "grad_norm": 0.12023913860321045, "kl": 0.00067901611328125, "learning_rate": 4.776119402985074e-07, "loss": 0.0, "reward": 0.6696428954601288, "reward_std": 0.23205870389938354, "rewards/accuracy_reward": 0.2209821566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4486607313156128, "step": 160 }, { "completion_length": 1570.5670471191406, "epoch": 0.04809200209095661, "grad_norm": 0.14560416340827942, "kl": 0.0006780624389648438, "learning_rate": 4.805970149253731e-07, "loss": 0.0, "reward": 0.5323660895228386, "reward_std": 0.24525152891874313, "rewards/accuracy_reward": 0.09821428963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4341518059372902, "step": 161 }, { "completion_length": 1558.1920471191406, "epoch": 0.04839071017847808, "grad_norm": 0.12767083942890167, "kl": 0.0008420944213867188, "learning_rate": 4.835820895522387e-07, "loss": 0.0, "reward": 0.5831473469734192, "reward_std": 0.2786228824406862, "rewards/accuracy_reward": 0.1473214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4358259066939354, "step": 162 }, { "completion_length": 1616.7210693359375, "epoch": 0.048689418265999554, "grad_norm": 0.10763923823833466, "kl": 0.00069427490234375, "learning_rate": 4.865671641791044e-07, "loss": 0.0, "reward": 0.5223214626312256, "reward_std": 0.24353572726249695, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4151785895228386, "step": 163 }, { "completion_length": 1612.5156860351562, "epoch": 0.048988126353521025, "grad_norm": 0.10262947529554367, "kl": 0.0006885528564453125, "learning_rate": 4.895522388059702e-07, "loss": 0.0, "reward": 0.5797991156578064, "reward_std": 0.23679952695965767, "rewards/accuracy_reward": 0.17410715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4056919738650322, "step": 164 }, { "completion_length": 1565.6875610351562, "epoch": 0.04928683444104249, "grad_norm": 0.12141844630241394, "kl": 0.000911712646484375, "learning_rate": 4.925373134328357e-07, "loss": 0.0, "reward": 0.5747767984867096, "reward_std": 0.3075713813304901, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4430803805589676, "step": 165 }, { "completion_length": 1336.9129943847656, "epoch": 0.04958554252856396, "grad_norm": 0.1332685947418213, "kl": 0.0011615753173828125, "learning_rate": 4.955223880597015e-07, "loss": 0.0, "reward": 0.6841518133878708, "reward_std": 0.2911071442067623, "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5011160969734192, "step": 166 }, { "completion_length": 1511.0447082519531, "epoch": 0.04988425061608543, "grad_norm": 0.18312767148017883, "kl": 0.0013856887817382812, "learning_rate": 4.985074626865671e-07, "loss": 0.0001, "reward": 0.6311384066939354, "reward_std": 0.22322136908769608, "rewards/accuracy_reward": 0.17857143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4525669738650322, "step": 167 }, { "completion_length": 1594.43310546875, "epoch": 0.0501829587036069, "grad_norm": 0.11108998209238052, "kl": 0.0008707046508789062, "learning_rate": 5.014925373134328e-07, "loss": 0.0, "reward": 0.5139509215950966, "reward_std": 0.22339818999171257, "rewards/accuracy_reward": 0.08482142956927419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4291294887661934, "step": 168 }, { "completion_length": 1541.2567443847656, "epoch": 0.05048166679112837, "grad_norm": 0.10337429493665695, "kl": 0.0010366439819335938, "learning_rate": 5.044776119402985e-07, "loss": 0.0, "reward": 0.5820312947034836, "reward_std": 0.23566661402583122, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4570312723517418, "step": 169 }, { "completion_length": 1724.2210693359375, "epoch": 0.05078037487864984, "grad_norm": 0.18144862353801727, "kl": 0.0012559890747070312, "learning_rate": 5.074626865671642e-07, "loss": 0.0001, "reward": 0.6026785969734192, "reward_std": 0.2605712376534939, "rewards/accuracy_reward": 0.2031250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3995535895228386, "step": 170 }, { "completion_length": 1540.6451721191406, "epoch": 0.05107908296617131, "grad_norm": 0.1291673183441162, "kl": 0.001331329345703125, "learning_rate": 5.104477611940299e-07, "loss": 0.0001, "reward": 0.5150669887661934, "reward_std": 0.22825362533330917, "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4347098469734192, "step": 171 }, { "completion_length": 1528.5513916015625, "epoch": 0.05137779105369278, "grad_norm": 0.12473644316196442, "kl": 0.0013484954833984375, "learning_rate": 5.134328358208954e-07, "loss": 0.0001, "reward": 0.5452009215950966, "reward_std": 0.237271286547184, "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4626116305589676, "step": 172 }, { "completion_length": 1586.2813415527344, "epoch": 0.05167649914121425, "grad_norm": 0.12337406724691391, "kl": 0.0012607574462890625, "learning_rate": 5.164179104477612e-07, "loss": 0.0001, "reward": 0.5669643059372902, "reward_std": 0.26265063509345055, "rewards/accuracy_reward": 0.12276786053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4441964477300644, "step": 173 }, { "completion_length": 1684.0938110351562, "epoch": 0.05197520722873572, "grad_norm": 0.10951121151447296, "kl": 0.0011444091796875, "learning_rate": 5.194029850746269e-07, "loss": 0.0, "reward": 0.5094866305589676, "reward_std": 0.24980808421969414, "rewards/accuracy_reward": 0.07812500349245965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.431361623108387, "step": 174 }, { "completion_length": 1634.3594360351562, "epoch": 0.052273915316257184, "grad_norm": 0.12181877344846725, "kl": 0.0011138916015625, "learning_rate": 5.223880597014924e-07, "loss": 0.0, "reward": 0.527343787252903, "reward_std": 0.23291247338056564, "rewards/accuracy_reward": 0.10267857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4246651902794838, "step": 175 }, { "completion_length": 1575.8929138183594, "epoch": 0.052572623403778655, "grad_norm": 0.10959277302026749, "kl": 0.0015106201171875, "learning_rate": 5.253731343283582e-07, "loss": 0.0001, "reward": 0.631138414144516, "reward_std": 0.2849116362631321, "rewards/accuracy_reward": 0.1741071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4570312723517418, "step": 176 }, { "completion_length": 1525.3773193359375, "epoch": 0.052871331491300126, "grad_norm": 0.1901940405368805, "kl": 0.0045185089111328125, "learning_rate": 5.283582089552238e-07, "loss": 0.0002, "reward": 0.6640625298023224, "reward_std": 0.27322958782315254, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625223517418, "step": 177 }, { "completion_length": 1473.07373046875, "epoch": 0.0531700395788216, "grad_norm": 0.14421828091144562, "kl": 0.00183868408203125, "learning_rate": 5.313432835820896e-07, "loss": 0.0001, "reward": 0.6328125149011612, "reward_std": 0.26656974479556084, "rewards/accuracy_reward": 0.1830357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4497768208384514, "step": 178 }, { "completion_length": 1512.1072082519531, "epoch": 0.05346874766634307, "grad_norm": 0.15023967623710632, "kl": 0.0018138885498046875, "learning_rate": 5.343283582089552e-07, "loss": 0.0001, "reward": 0.631138414144516, "reward_std": 0.27349531650543213, "rewards/accuracy_reward": 0.16517858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4659598469734192, "step": 179 }, { "completion_length": 1574.1741638183594, "epoch": 0.05376745575386454, "grad_norm": 0.11440388858318329, "kl": 0.0015544891357421875, "learning_rate": 5.373134328358208e-07, "loss": 0.0001, "reward": 0.5909598544239998, "reward_std": 0.25212647020816803, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4547991380095482, "step": 180 }, { "completion_length": 1580.3929443359375, "epoch": 0.054066163841386, "grad_norm": 0.10884010046720505, "kl": 0.0020923614501953125, "learning_rate": 5.402985074626866e-07, "loss": 0.0001, "reward": 0.5373884215950966, "reward_std": 0.2551763877272606, "rewards/accuracy_reward": 0.08258928707800806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4547991305589676, "step": 181 }, { "completion_length": 1561.0268859863281, "epoch": 0.054364871928907474, "grad_norm": 0.13236317038536072, "kl": 0.00182342529296875, "learning_rate": 5.432835820895521e-07, "loss": 0.0001, "reward": 0.7142857313156128, "reward_std": 0.2829049304127693, "rewards/accuracy_reward": 0.2477678693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466517873108387, "step": 182 }, { "completion_length": 1474.52685546875, "epoch": 0.054663580016428945, "grad_norm": 0.12894752621650696, "kl": 0.001903533935546875, "learning_rate": 5.462686567164179e-07, "loss": 0.0001, "reward": 0.6579241305589676, "reward_std": 0.28933998197317123, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4771205559372902, "step": 183 }, { "completion_length": 1454.4978332519531, "epoch": 0.054962288103950416, "grad_norm": 0.3658800721168518, "kl": 0.0033664703369140625, "learning_rate": 5.492537313432836e-07, "loss": 0.0001, "reward": 0.6099330484867096, "reward_std": 0.21772530302405357, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4715401977300644, "step": 184 }, { "completion_length": 1472.5491943359375, "epoch": 0.05526099619147189, "grad_norm": 0.13115745782852173, "kl": 0.002277374267578125, "learning_rate": 5.522388059701492e-07, "loss": 0.0001, "reward": 0.6065848618745804, "reward_std": 0.27465086802840233, "rewards/accuracy_reward": 0.12276786100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4838169813156128, "step": 185 }, { "completion_length": 1481.888427734375, "epoch": 0.05555970427899335, "grad_norm": 0.12489021569490433, "kl": 0.002216339111328125, "learning_rate": 5.552238805970149e-07, "loss": 0.0001, "reward": 0.6640625223517418, "reward_std": 0.2997805327177048, "rewards/accuracy_reward": 0.17857143934816122, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4854911044239998, "step": 186 }, { "completion_length": 1602.1875305175781, "epoch": 0.05585841236651482, "grad_norm": 0.11913973838090897, "kl": 0.0019168853759765625, "learning_rate": 5.582089552238805e-07, "loss": 0.0001, "reward": 0.6138393133878708, "reward_std": 0.24115237593650818, "rewards/accuracy_reward": 0.1674107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4464285895228386, "step": 187 }, { "completion_length": 1488.2009887695312, "epoch": 0.05615712045403629, "grad_norm": 0.16364610195159912, "kl": 0.0031986236572265625, "learning_rate": 5.611940298507463e-07, "loss": 0.0001, "reward": 0.658482164144516, "reward_std": 0.2570516802370548, "rewards/accuracy_reward": 0.17857144121080637, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4799107387661934, "step": 188 }, { "completion_length": 1572.2790832519531, "epoch": 0.056455828541557763, "grad_norm": 0.12233714759349823, "kl": 0.0022068023681640625, "learning_rate": 5.64179104477612e-07, "loss": 0.0001, "reward": 0.6171875298023224, "reward_std": 0.285783015191555, "rewards/accuracy_reward": 0.12946428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4877232313156128, "step": 189 }, { "completion_length": 1441.2656860351562, "epoch": 0.056754536629079234, "grad_norm": 0.12394828349351883, "kl": 0.00252532958984375, "learning_rate": 5.671641791044775e-07, "loss": 0.0001, "reward": 0.6897321790456772, "reward_std": 0.3268979415297508, "rewards/accuracy_reward": 0.18303572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5066964477300644, "step": 190 }, { "completion_length": 1528.6541137695312, "epoch": 0.057053244716600705, "grad_norm": 0.1391298919916153, "kl": 0.002780914306640625, "learning_rate": 5.701492537313433e-07, "loss": 0.0001, "reward": 0.6160714775323868, "reward_std": 0.28208761662244797, "rewards/accuracy_reward": 0.15178572572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4642857313156128, "step": 191 }, { "completion_length": 1595.6920471191406, "epoch": 0.05735195280412217, "grad_norm": 0.1443793624639511, "kl": 0.002567291259765625, "learning_rate": 5.731343283582089e-07, "loss": 0.0001, "reward": 0.6010044813156128, "reward_std": 0.26129278913140297, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4581473544239998, "step": 192 }, { "completion_length": 1381.7500610351562, "epoch": 0.05765066089164364, "grad_norm": 0.1285824328660965, "kl": 0.003200531005859375, "learning_rate": 5.761194029850746e-07, "loss": 0.0001, "reward": 0.698660746216774, "reward_std": 0.25494490563869476, "rewards/accuracy_reward": 0.20089287124574184, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4977678880095482, "step": 193 }, { "completion_length": 1527.7255249023438, "epoch": 0.05794936897916511, "grad_norm": 0.12871329486370087, "kl": 0.00298309326171875, "learning_rate": 5.791044776119403e-07, "loss": 0.0001, "reward": 0.537388414144516, "reward_std": 0.2527937553822994, "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.470424123108387, "step": 194 }, { "completion_length": 1436.9643249511719, "epoch": 0.05824807706668658, "grad_norm": 0.16672013700008392, "kl": 0.00359344482421875, "learning_rate": 5.820895522388059e-07, "loss": 0.0001, "reward": 0.6026785969734192, "reward_std": 0.3201080448925495, "rewards/accuracy_reward": 0.09821428870782256, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5044643059372902, "step": 195 }, { "completion_length": 1371.29248046875, "epoch": 0.05854678515420805, "grad_norm": 0.1427551656961441, "kl": 0.004100799560546875, "learning_rate": 5.850746268656717e-07, "loss": 0.0002, "reward": 0.647879496216774, "reward_std": 0.27486152946949005, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5094866305589676, "step": 196 }, { "completion_length": 1494.9933776855469, "epoch": 0.05884549324172952, "grad_norm": 0.13985468447208405, "kl": 0.0040283203125, "learning_rate": 5.880597014925372e-07, "loss": 0.0002, "reward": 0.576450914144516, "reward_std": 0.28847233206033707, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4871651977300644, "step": 197 }, { "completion_length": 1465.7991638183594, "epoch": 0.05914420132925099, "grad_norm": 0.20387613773345947, "kl": 0.004711151123046875, "learning_rate": 5.91044776119403e-07, "loss": 0.0002, "reward": 0.6121652126312256, "reward_std": 0.27360785752534866, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4804687723517418, "step": 198 }, { "completion_length": 1409.9620971679688, "epoch": 0.05944290941677246, "grad_norm": 0.16919183731079102, "kl": 0.0041351318359375, "learning_rate": 5.940298507462687e-07, "loss": 0.0002, "reward": 0.6032366454601288, "reward_std": 0.29441192001104355, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5005580708384514, "step": 199 }, { "completion_length": 1343.05810546875, "epoch": 0.05974161750429393, "grad_norm": 0.1505434811115265, "kl": 0.005046844482421875, "learning_rate": 5.970149253731342e-07, "loss": 0.0002, "reward": 0.6250000298023224, "reward_std": 0.2936793640255928, "rewards/accuracy_reward": 0.07142857532016933, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5535714700818062, "step": 200 }, { "completion_length": 1321.2678833007812, "epoch": 0.0600403255918154, "grad_norm": 0.16912706196308136, "kl": 0.00548553466796875, "learning_rate": 6e-07, "loss": 0.0002, "reward": 0.6685268133878708, "reward_std": 0.3188823387026787, "rewards/accuracy_reward": 0.12500000791624188, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5435268059372902, "step": 201 }, { "completion_length": 1372.2723693847656, "epoch": 0.060339033679336865, "grad_norm": 0.20725470781326294, "kl": 0.00530242919921875, "learning_rate": 6.029850746268656e-07, "loss": 0.0002, "reward": 0.701450914144516, "reward_std": 0.300742469727993, "rewards/accuracy_reward": 0.176339291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5251116305589676, "step": 202 }, { "completion_length": 1469.9107971191406, "epoch": 0.060637741766858336, "grad_norm": 0.1649634838104248, "kl": 0.004520416259765625, "learning_rate": 6.059701492537314e-07, "loss": 0.0002, "reward": 0.6188616305589676, "reward_std": 0.3240756429731846, "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5228794738650322, "step": 203 }, { "completion_length": 1359.6786193847656, "epoch": 0.06093644985437981, "grad_norm": 0.14997589588165283, "kl": 0.0056304931640625, "learning_rate": 6.08955223880597e-07, "loss": 0.0002, "reward": 0.6969866454601288, "reward_std": 0.287303127348423, "rewards/accuracy_reward": 0.16071429220028222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5362723395228386, "step": 204 }, { "completion_length": 1278.9955749511719, "epoch": 0.06123515794190128, "grad_norm": 0.17116151750087738, "kl": 0.0063323974609375, "learning_rate": 6.119402985074626e-07, "loss": 0.0003, "reward": 0.7137277200818062, "reward_std": 0.3095347359776497, "rewards/accuracy_reward": 0.1674107275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5463169887661934, "step": 205 }, { "completion_length": 1432.2679138183594, "epoch": 0.06153386602942275, "grad_norm": 0.28548645973205566, "kl": 0.00678253173828125, "learning_rate": 6.149253731343284e-07, "loss": 0.0003, "reward": 0.5664062798023224, "reward_std": 0.277177382260561, "rewards/accuracy_reward": 0.060267861699685454, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.506138414144516, "step": 206 }, { "completion_length": 1229.3616485595703, "epoch": 0.06183257411694422, "grad_norm": 0.18746012449264526, "kl": 0.0072021484375, "learning_rate": 6.17910447761194e-07, "loss": 0.0003, "reward": 0.8320313096046448, "reward_std": 0.3444887585937977, "rewards/accuracy_reward": 0.2299107313156128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6021205633878708, "step": 207 }, { "completion_length": 1480.3928833007812, "epoch": 0.06213128220446568, "grad_norm": 0.1562262326478958, "kl": 0.005558013916015625, "learning_rate": 6.208955223880597e-07, "loss": 0.0002, "reward": 0.5636161044239998, "reward_std": 0.2675609663128853, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125223517418, "step": 208 }, { "completion_length": 1224.3482666015625, "epoch": 0.062429990291987154, "grad_norm": 0.4371950328350067, "kl": 0.01300811767578125, "learning_rate": 6.238805970149253e-07, "loss": 0.0005, "reward": 0.737723246216774, "reward_std": 0.3353317938745022, "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5837053805589676, "step": 209 }, { "completion_length": 1387.4688415527344, "epoch": 0.06272869837950862, "grad_norm": 0.13533955812454224, "kl": 0.00604248046875, "learning_rate": 6.26865671641791e-07, "loss": 0.0002, "reward": 0.643973246216774, "reward_std": 0.2798839583992958, "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5479910895228386, "step": 210 }, { "completion_length": 1324.16748046875, "epoch": 0.06302740646703009, "grad_norm": 0.16900187730789185, "kl": 0.00873565673828125, "learning_rate": 6.298507462686567e-07, "loss": 0.0003, "reward": 0.7589285969734192, "reward_std": 0.28099749982357025, "rewards/accuracy_reward": 0.1919642984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5669643133878708, "step": 211 }, { "completion_length": 1352.9375915527344, "epoch": 0.06332611455455156, "grad_norm": 0.1635826677083969, "kl": 0.00756072998046875, "learning_rate": 6.328358208955223e-07, "loss": 0.0003, "reward": 0.701450914144516, "reward_std": 0.28838997706770897, "rewards/accuracy_reward": 0.1696428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5318080633878708, "step": 212 }, { "completion_length": 1324.3371276855469, "epoch": 0.06362482264207303, "grad_norm": 0.15075698494911194, "kl": 0.00772857666015625, "learning_rate": 6.358208955223881e-07, "loss": 0.0003, "reward": 0.736607164144516, "reward_std": 0.34223324060440063, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5736607387661934, "step": 213 }, { "completion_length": 1194.5871276855469, "epoch": 0.0639235307295945, "grad_norm": 0.17338323593139648, "kl": 0.0109710693359375, "learning_rate": 6.388059701492537e-07, "loss": 0.0004, "reward": 0.8392857611179352, "reward_std": 0.34494439512491226, "rewards/accuracy_reward": 0.2187500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.620535746216774, "step": 214 }, { "completion_length": 1340.7165832519531, "epoch": 0.06422223881711597, "grad_norm": 0.15607264637947083, "kl": 0.00772857666015625, "learning_rate": 6.417910447761193e-07, "loss": 0.0003, "reward": 0.667410746216774, "reward_std": 0.2824026867747307, "rewards/accuracy_reward": 0.0825892873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5848214477300644, "step": 215 }, { "completion_length": 1143.9844665527344, "epoch": 0.06452094690463744, "grad_norm": 0.1686001867055893, "kl": 0.0121612548828125, "learning_rate": 6.447761194029851e-07, "loss": 0.0005, "reward": 0.7444196790456772, "reward_std": 0.34773679822683334, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.635044664144516, "step": 216 }, { "completion_length": 1149.6250610351562, "epoch": 0.06481965499215891, "grad_norm": 0.1927129328250885, "kl": 0.0115509033203125, "learning_rate": 6.477611940298507e-07, "loss": 0.0005, "reward": 0.7940848618745804, "reward_std": 0.32051873207092285, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6222098469734192, "step": 217 }, { "completion_length": 1165.10498046875, "epoch": 0.06511836307968039, "grad_norm": 0.1944277584552765, "kl": 0.01129150390625, "learning_rate": 6.507462686567164e-07, "loss": 0.0005, "reward": 0.7232143133878708, "reward_std": 0.32150761410593987, "rewards/accuracy_reward": 0.10714286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6160714626312256, "step": 218 }, { "completion_length": 1333.6250610351562, "epoch": 0.06541707116720186, "grad_norm": 0.15580515563488007, "kl": 0.010009765625, "learning_rate": 6.53731343283582e-07, "loss": 0.0004, "reward": 0.7382812798023224, "reward_std": 0.3412829414010048, "rewards/accuracy_reward": 0.14285715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5954241305589676, "step": 219 }, { "completion_length": 1340.9286193847656, "epoch": 0.06571577925472333, "grad_norm": 0.16867880523204803, "kl": 0.0110015869140625, "learning_rate": 6.567164179104477e-07, "loss": 0.0004, "reward": 0.7070312947034836, "reward_std": 0.28838157653808594, "rewards/accuracy_reward": 0.11830357555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5887277126312256, "step": 220 }, { "completion_length": 1197.88623046875, "epoch": 0.06601448734224478, "grad_norm": 0.17371873557567596, "kl": 0.012725830078125, "learning_rate": 6.597014925373135e-07, "loss": 0.0005, "reward": 0.7343750298023224, "reward_std": 0.2726942002773285, "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6495536118745804, "step": 221 }, { "completion_length": 962.8125305175781, "epoch": 0.06631319542976626, "grad_norm": 0.20173048973083496, "kl": 0.0165252685546875, "learning_rate": 6.62686567164179e-07, "loss": 0.0007, "reward": 0.886160746216774, "reward_std": 0.3159068673849106, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7165178954601288, "step": 222 }, { "completion_length": 1158.4866790771484, "epoch": 0.06661190351728773, "grad_norm": 0.36521288752555847, "kl": 0.01702880859375, "learning_rate": 6.656716417910448e-07, "loss": 0.0007, "reward": 0.827566996216774, "reward_std": 0.3042806535959244, "rewards/accuracy_reward": 0.176339291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6512276977300644, "step": 223 }, { "completion_length": 1130.49560546875, "epoch": 0.0669106116048092, "grad_norm": 0.15882663428783417, "kl": 0.0147857666015625, "learning_rate": 6.686567164179104e-07, "loss": 0.0006, "reward": 0.9034598469734192, "reward_std": 0.3012891598045826, "rewards/accuracy_reward": 0.2321428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.671316996216774, "step": 224 }, { "completion_length": 1097.7991485595703, "epoch": 0.06720931969233067, "grad_norm": 0.18464437127113342, "kl": 0.0144500732421875, "learning_rate": 6.716417910447761e-07, "loss": 0.0006, "reward": 0.788504496216774, "reward_std": 0.30130813270807266, "rewards/accuracy_reward": 0.10937500838190317, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.679129496216774, "step": 225 }, { "completion_length": 900.4911041259766, "epoch": 0.06750802777985214, "grad_norm": 0.19357509911060333, "kl": 0.0206298828125, "learning_rate": 6.746268656716418e-07, "loss": 0.0008, "reward": 0.820870578289032, "reward_std": 0.31263313442468643, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7472098618745804, "step": 226 }, { "completion_length": 961.9643402099609, "epoch": 0.06780673586737361, "grad_norm": 0.18347138166427612, "kl": 0.020111083984375, "learning_rate": 6.776119402985074e-07, "loss": 0.0008, "reward": 0.8632812947034836, "reward_std": 0.32562802359461784, "rewards/accuracy_reward": 0.14062500488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7226562798023224, "step": 227 }, { "completion_length": 1032.4844207763672, "epoch": 0.06810544395489508, "grad_norm": 0.1712559461593628, "kl": 0.018157958984375, "learning_rate": 6.805970149253732e-07, "loss": 0.0007, "reward": 0.8878348618745804, "reward_std": 0.33772359788417816, "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7047991454601288, "step": 228 }, { "completion_length": 970.5045166015625, "epoch": 0.06840415204241655, "grad_norm": 0.2262369990348816, "kl": 0.018707275390625, "learning_rate": 6.835820895522387e-07, "loss": 0.0007, "reward": 0.823660746216774, "reward_std": 0.28983018919825554, "rewards/accuracy_reward": 0.08035714784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7433035969734192, "step": 229 }, { "completion_length": 1069.464340209961, "epoch": 0.06870286012993802, "grad_norm": 0.17583560943603516, "kl": 0.020294189453125, "learning_rate": 6.865671641791044e-07, "loss": 0.0008, "reward": 0.8253348618745804, "reward_std": 0.30967529118061066, "rewards/accuracy_reward": 0.14285715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6824777126312256, "step": 230 }, { "completion_length": 1022.1674652099609, "epoch": 0.0690015682174595, "grad_norm": 0.2375975251197815, "kl": 0.024444580078125, "learning_rate": 6.895522388059702e-07, "loss": 0.001, "reward": 0.9335937649011612, "reward_std": 0.3707151561975479, "rewards/accuracy_reward": 0.22321429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.710379496216774, "step": 231 }, { "completion_length": 1047.5625305175781, "epoch": 0.06930027630498095, "grad_norm": 0.23702803254127502, "kl": 0.022186279296875, "learning_rate": 6.925373134328358e-07, "loss": 0.0009, "reward": 0.8582589626312256, "reward_std": 0.31339147686958313, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7243303954601288, "step": 232 }, { "completion_length": 812.6964797973633, "epoch": 0.06959898439250242, "grad_norm": 0.41636043787002563, "kl": 0.03936767578125, "learning_rate": 6.955223880597014e-07, "loss": 0.0016, "reward": 0.8811384290456772, "reward_std": 0.26799363642930984, "rewards/accuracy_reward": 0.10044643492437899, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.780691996216774, "step": 233 }, { "completion_length": 947.3437957763672, "epoch": 0.06989769248002389, "grad_norm": 0.22805839776992798, "kl": 0.026214599609375, "learning_rate": 6.985074626865671e-07, "loss": 0.001, "reward": 0.9492187798023224, "reward_std": 0.3153246343135834, "rewards/accuracy_reward": 0.18526786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7639509290456772, "step": 234 }, { "completion_length": 1062.0201416015625, "epoch": 0.07019640056754536, "grad_norm": 0.22195197641849518, "kl": 0.024871826171875, "learning_rate": 7.014925373134328e-07, "loss": 0.001, "reward": 0.8219866454601288, "reward_std": 0.3215860649943352, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.734933078289032, "step": 235 }, { "completion_length": 915.2321624755859, "epoch": 0.07049510865506683, "grad_norm": 0.22285960614681244, "kl": 0.028045654296875, "learning_rate": 7.044776119402985e-07, "loss": 0.0011, "reward": 0.9575893133878708, "reward_std": 0.30197346210479736, "rewards/accuracy_reward": 0.18526786658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7723214626312256, "step": 236 }, { "completion_length": 855.7321624755859, "epoch": 0.0707938167425883, "grad_norm": 0.2375722974538803, "kl": 0.031646728515625, "learning_rate": 7.074626865671641e-07, "loss": 0.0013, "reward": 0.9079241454601288, "reward_std": 0.3001979626715183, "rewards/accuracy_reward": 0.1205357164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.787388414144516, "step": 237 }, { "completion_length": 798.7455749511719, "epoch": 0.07109252483010978, "grad_norm": 0.2473047524690628, "kl": 0.04052734375, "learning_rate": 7.104477611940298e-07, "loss": 0.0016, "reward": 0.9397321790456772, "reward_std": 0.28343552350997925, "rewards/accuracy_reward": 0.13839286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.801339328289032, "step": 238 }, { "completion_length": 957.8393402099609, "epoch": 0.07139123291763125, "grad_norm": 0.2373952567577362, "kl": 0.030853271484375, "learning_rate": 7.134328358208955e-07, "loss": 0.0012, "reward": 0.9520089477300644, "reward_std": 0.3290429413318634, "rewards/accuracy_reward": 0.20089286752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7511160969734192, "step": 239 }, { "completion_length": 887.6027374267578, "epoch": 0.07168994100515272, "grad_norm": 0.27610161900520325, "kl": 0.040771484375, "learning_rate": 7.164179104477611e-07, "loss": 0.0016, "reward": 0.8755580633878708, "reward_std": 0.3097836412489414, "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7907366454601288, "step": 240 }, { "completion_length": 786.3013916015625, "epoch": 0.07198864909267419, "grad_norm": 0.24617484211921692, "kl": 0.03619384765625, "learning_rate": 7.194029850746269e-07, "loss": 0.0014, "reward": 1.0563616454601288, "reward_std": 0.36723170429468155, "rewards/accuracy_reward": 0.2366071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8197545111179352, "step": 241 }, { "completion_length": 963.3683471679688, "epoch": 0.07228735718019566, "grad_norm": 0.24794961512088776, "kl": 0.0362548828125, "learning_rate": 7.223880597014925e-07, "loss": 0.0014, "reward": 0.8917411118745804, "reward_std": 0.30830957368016243, "rewards/accuracy_reward": 0.14508929220028222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.746651828289032, "step": 242 }, { "completion_length": 1143.024642944336, "epoch": 0.07258606526771712, "grad_norm": 0.27705350518226624, "kl": 0.03033447265625, "learning_rate": 7.253731343283582e-07, "loss": 0.0012, "reward": 0.8331473469734192, "reward_std": 0.3886076286435127, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6902902126312256, "step": 243 }, { "completion_length": 1040.6250610351562, "epoch": 0.07288477335523859, "grad_norm": 0.23521064221858978, "kl": 0.031219482421875, "learning_rate": 7.283582089552238e-07, "loss": 0.0013, "reward": 0.902901828289032, "reward_std": 0.31987424939870834, "rewards/accuracy_reward": 0.1986607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7042410969734192, "step": 244 }, { "completion_length": 890.7656860351562, "epoch": 0.07318348144276006, "grad_norm": 0.24313679337501526, "kl": 0.03411865234375, "learning_rate": 7.313432835820895e-07, "loss": 0.0014, "reward": 0.8409598618745804, "reward_std": 0.2721391096711159, "rewards/accuracy_reward": 0.04241071571595967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.79854916036129, "step": 245 }, { "completion_length": 991.8214721679688, "epoch": 0.07348218953028153, "grad_norm": 0.3283228874206543, "kl": 0.03729248046875, "learning_rate": 7.343283582089553e-07, "loss": 0.0015, "reward": 1.027901828289032, "reward_std": 0.3455253168940544, "rewards/accuracy_reward": 0.2611607275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7667410969734192, "step": 246 }, { "completion_length": 930.0915679931641, "epoch": 0.073780897617803, "grad_norm": 0.25138550996780396, "kl": 0.0390625, "learning_rate": 7.373134328358208e-07, "loss": 0.0016, "reward": 0.9380580931901932, "reward_std": 0.3052614890038967, "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7795759439468384, "step": 247 }, { "completion_length": 870.5625457763672, "epoch": 0.07407960570532447, "grad_norm": 0.25149625539779663, "kl": 0.03631591796875, "learning_rate": 7.402985074626865e-07, "loss": 0.0015, "reward": 0.9140625447034836, "reward_std": 0.31608083471655846, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8113839775323868, "step": 248 }, { "completion_length": 997.9576263427734, "epoch": 0.07437831379284594, "grad_norm": 0.3233741819858551, "kl": 0.04364013671875, "learning_rate": 7.432835820895522e-07, "loss": 0.0017, "reward": 0.9095982611179352, "reward_std": 0.3534677103161812, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.737723246216774, "step": 249 }, { "completion_length": 889.0625457763672, "epoch": 0.07467702188036741, "grad_norm": 0.3512294888496399, "kl": 0.0445556640625, "learning_rate": 7.462686567164179e-07, "loss": 0.0018, "reward": 0.863839328289032, "reward_std": 0.31356821209192276, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7857143133878708, "step": 250 }, { "completion_length": 1084.8192596435547, "epoch": 0.07497572996788888, "grad_norm": 0.2696700692176819, "kl": 0.03643798828125, "learning_rate": 7.492537313432836e-07, "loss": 0.0015, "reward": 0.835379496216774, "reward_std": 0.3249930515885353, "rewards/accuracy_reward": 0.10714286100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7282366305589676, "step": 251 }, { "completion_length": 1093.7590026855469, "epoch": 0.07527443805541036, "grad_norm": 0.3355739712715149, "kl": 0.04107666015625, "learning_rate": 7.522388059701492e-07, "loss": 0.0016, "reward": 0.823660746216774, "reward_std": 0.3369763568043709, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7388393133878708, "step": 252 }, { "completion_length": 861.4129791259766, "epoch": 0.07557314614293181, "grad_norm": 0.43197500705718994, "kl": 0.050537109375, "learning_rate": 7.552238805970149e-07, "loss": 0.002, "reward": 1.0982143580913544, "reward_std": 0.32622935622930527, "rewards/accuracy_reward": 0.2767857313156128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8214286118745804, "step": 253 }, { "completion_length": 957.9621124267578, "epoch": 0.07587185423045328, "grad_norm": 0.42714792490005493, "kl": 0.04486083984375, "learning_rate": 7.582089552238805e-07, "loss": 0.0018, "reward": 0.9196429252624512, "reward_std": 0.3307085707783699, "rewards/accuracy_reward": 0.13616071734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7834821939468384, "step": 254 }, { "completion_length": 964.607177734375, "epoch": 0.07617056231797475, "grad_norm": 0.28734493255615234, "kl": 0.04437255859375, "learning_rate": 7.611940298507462e-07, "loss": 0.0018, "reward": 0.9760045260190964, "reward_std": 0.3666205555200577, "rewards/accuracy_reward": 0.20312500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.772879496216774, "step": 255 }, { "completion_length": 827.8036041259766, "epoch": 0.07646927040549623, "grad_norm": 0.35414305329322815, "kl": 0.04815673828125, "learning_rate": 7.64179104477612e-07, "loss": 0.0019, "reward": 0.9614955633878708, "reward_std": 0.32179535180330276, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8253348469734192, "step": 256 }, { "completion_length": 929.4018096923828, "epoch": 0.0767679784930177, "grad_norm": 0.3888576626777649, "kl": 0.0491943359375, "learning_rate": 7.671641791044776e-07, "loss": 0.002, "reward": 0.8995536118745804, "reward_std": 0.32228659093379974, "rewards/accuracy_reward": 0.12946428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7700893133878708, "step": 257 }, { "completion_length": 838.341552734375, "epoch": 0.07706668658053917, "grad_norm": 0.3943783640861511, "kl": 0.0596923828125, "learning_rate": 7.701492537313432e-07, "loss": 0.0024, "reward": 1.0898437798023224, "reward_std": 0.3411543220281601, "rewards/accuracy_reward": 0.2611607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.828683078289032, "step": 258 }, { "completion_length": 887.1027221679688, "epoch": 0.07736539466806064, "grad_norm": 0.4431280791759491, "kl": 0.04974365234375, "learning_rate": 7.731343283582089e-07, "loss": 0.002, "reward": 0.9363839775323868, "reward_std": 0.3403652086853981, "rewards/accuracy_reward": 0.13839286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7979910969734192, "step": 259 }, { "completion_length": 931.4308624267578, "epoch": 0.07766410275558211, "grad_norm": 0.4595802426338196, "kl": 0.0543212890625, "learning_rate": 7.761194029850746e-07, "loss": 0.0022, "reward": 0.9475446790456772, "reward_std": 0.3834797367453575, "rewards/accuracy_reward": 0.15178572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7957589775323868, "step": 260 }, { "completion_length": 776.5558319091797, "epoch": 0.07796281084310358, "grad_norm": 0.32563984394073486, "kl": 0.0535888671875, "learning_rate": 7.791044776119404e-07, "loss": 0.0021, "reward": 0.963727742433548, "reward_std": 0.2971910759806633, "rewards/accuracy_reward": 0.10714286426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8565848618745804, "step": 261 }, { "completion_length": 902.6361999511719, "epoch": 0.07826151893062505, "grad_norm": 16.813087463378906, "kl": 0.22808837890625, "learning_rate": 7.820895522388059e-07, "loss": 0.0091, "reward": 0.9280134439468384, "reward_std": 0.32230929285287857, "rewards/accuracy_reward": 0.1473214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7806920111179352, "step": 262 }, { "completion_length": 953.560302734375, "epoch": 0.07856022701814652, "grad_norm": 0.506950318813324, "kl": 0.0599365234375, "learning_rate": 7.850746268656716e-07, "loss": 0.0024, "reward": 0.8950893431901932, "reward_std": 0.3102293461561203, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.767857164144516, "step": 263 }, { "completion_length": 918.310302734375, "epoch": 0.07885893510566798, "grad_norm": 0.4997296929359436, "kl": 0.0552978515625, "learning_rate": 7.880597014925373e-07, "loss": 0.0022, "reward": 0.9737723618745804, "reward_std": 0.3345959410071373, "rewards/accuracy_reward": 0.16964286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8041295111179352, "step": 264 }, { "completion_length": 820.3661193847656, "epoch": 0.07915764319318945, "grad_norm": 26.551298141479492, "kl": 0.6507568359375, "learning_rate": 7.910447761194029e-07, "loss": 0.0261, "reward": 0.9073661118745804, "reward_std": 0.2818864658474922, "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8225446790456772, "step": 265 }, { "completion_length": 886.5491638183594, "epoch": 0.07945635128071092, "grad_norm": 0.5615645051002502, "kl": 0.06573486328125, "learning_rate": 7.940298507462686e-07, "loss": 0.0026, "reward": 0.926339328289032, "reward_std": 0.33697639405727386, "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8214286118745804, "step": 266 }, { "completion_length": 966.1361999511719, "epoch": 0.07975505936823239, "grad_norm": 0.5273261070251465, "kl": 0.06048583984375, "learning_rate": 7.970149253731343e-07, "loss": 0.0024, "reward": 0.8872768133878708, "reward_std": 0.3385256305336952, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7600446790456772, "step": 267 }, { "completion_length": 907.3705902099609, "epoch": 0.08005376745575386, "grad_norm": 0.58387690782547, "kl": 0.07080078125, "learning_rate": 8e-07, "loss": 0.0028, "reward": 0.9090402126312256, "reward_std": 0.29424942284822464, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.788504496216774, "step": 268 }, { "completion_length": 948.2544860839844, "epoch": 0.08035247554327533, "grad_norm": 0.7119466066360474, "kl": 0.073486328125, "learning_rate": 8.029850746268656e-07, "loss": 0.0029, "reward": 0.97433041036129, "reward_std": 0.3452170193195343, "rewards/accuracy_reward": 0.1919642984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7823660969734192, "step": 269 }, { "completion_length": 926.4442291259766, "epoch": 0.0806511836307968, "grad_norm": 0.6976232528686523, "kl": 0.0760498046875, "learning_rate": 8.059701492537313e-07, "loss": 0.003, "reward": 1.0189732760190964, "reward_std": 0.32481062412261963, "rewards/accuracy_reward": 0.22991071874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7890625447034836, "step": 270 }, { "completion_length": 1042.5268249511719, "epoch": 0.08094989171831828, "grad_norm": 0.7255579233169556, "kl": 0.0850830078125, "learning_rate": 8.08955223880597e-07, "loss": 0.0034, "reward": 0.9620535969734192, "reward_std": 0.36870668083429337, "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7522321790456772, "step": 271 }, { "completion_length": 1019.5089721679688, "epoch": 0.08124859980583975, "grad_norm": 6.428774356842041, "kl": 0.1597900390625, "learning_rate": 8.119402985074626e-07, "loss": 0.0064, "reward": 0.8426339477300644, "reward_std": 0.29865841567516327, "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7354910969734192, "step": 272 }, { "completion_length": 916.7165679931641, "epoch": 0.08154730789336122, "grad_norm": 0.8199161291122437, "kl": 0.0965576171875, "learning_rate": 8.149253731343283e-07, "loss": 0.0039, "reward": 0.9174107313156128, "reward_std": 0.36216193437576294, "rewards/accuracy_reward": 0.14508929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7723214626312256, "step": 273 }, { "completion_length": 806.3393249511719, "epoch": 0.08184601598088269, "grad_norm": 1.1163442134857178, "kl": 0.099609375, "learning_rate": 8.17910447761194e-07, "loss": 0.004, "reward": 1.0686384290456772, "reward_std": 0.3036719933152199, "rewards/accuracy_reward": 0.243303582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8253348469734192, "step": 274 }, { "completion_length": 843.8594055175781, "epoch": 0.08214472406840415, "grad_norm": 0.8757635354995728, "kl": 0.11376953125, "learning_rate": 8.208955223880597e-07, "loss": 0.0045, "reward": 1.0011161267757416, "reward_std": 0.39344770461320877, "rewards/accuracy_reward": 0.2053571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7957589626312256, "step": 275 }, { "completion_length": 816.2455749511719, "epoch": 0.08244343215592562, "grad_norm": 1.0250307321548462, "kl": 0.12060546875, "learning_rate": 8.238805970149253e-07, "loss": 0.0048, "reward": 0.929129496216774, "reward_std": 0.33202407136559486, "rewards/accuracy_reward": 0.13392857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7952009290456772, "step": 276 }, { "completion_length": 887.8281707763672, "epoch": 0.08274214024344709, "grad_norm": 1.2605938911437988, "kl": 0.129638671875, "learning_rate": 8.26865671641791e-07, "loss": 0.0052, "reward": 0.9274553805589676, "reward_std": 0.33730924874544144, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.7801339626312256, "step": 277 }, { "completion_length": 809.8214569091797, "epoch": 0.08304084833096856, "grad_norm": 0.9439035654067993, "kl": 0.124267578125, "learning_rate": 8.298507462686567e-07, "loss": 0.005, "reward": 0.9268973618745804, "reward_std": 0.3121991902589798, "rewards/accuracy_reward": 0.10267857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8242187947034836, "step": 278 }, { "completion_length": 711.6942138671875, "epoch": 0.08333955641849003, "grad_norm": 0.8930893540382385, "kl": 0.1424560546875, "learning_rate": 8.328358208955224e-07, "loss": 0.0057, "reward": 0.9893973767757416, "reward_std": 0.30987589806318283, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8577009290456772, "step": 279 }, { "completion_length": 850.0938110351562, "epoch": 0.0836382645060115, "grad_norm": 1.7441383600234985, "kl": 0.179931640625, "learning_rate": 8.35820895522388e-07, "loss": 0.0072, "reward": 0.888950914144516, "reward_std": 0.31583722308278084, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7952009290456772, "step": 280 }, { "completion_length": 770.6339569091797, "epoch": 0.08393697259353297, "grad_norm": 3.7019577026367188, "kl": 0.25732421875, "learning_rate": 8.388059701492537e-07, "loss": 0.0103, "reward": 0.9642857313156128, "reward_std": 0.3170778900384903, "rewards/accuracy_reward": 0.13839286658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8258928954601288, "step": 281 }, { "completion_length": 968.763427734375, "epoch": 0.08423568068105444, "grad_norm": 1.755023717880249, "kl": 0.2431640625, "learning_rate": 8.417910447761194e-07, "loss": 0.0097, "reward": 0.9587053954601288, "reward_std": 0.3463284969329834, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7756696939468384, "step": 282 }, { "completion_length": 839.2969055175781, "epoch": 0.08453438876857591, "grad_norm": 1.7507258653640747, "kl": 0.2578125, "learning_rate": 8.44776119402985e-07, "loss": 0.0103, "reward": 0.8945312798023224, "reward_std": 0.2956850156188011, "rewards/accuracy_reward": 0.07142857671715319, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8231027126312256, "step": 283 }, { "completion_length": 670.7277069091797, "epoch": 0.08483309685609738, "grad_norm": 1.7746942043304443, "kl": 0.214599609375, "learning_rate": 8.477611940298507e-07, "loss": 0.0086, "reward": 0.998325914144516, "reward_std": 0.28632770851254463, "rewards/accuracy_reward": 0.14285715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8554687947034836, "step": 284 }, { "completion_length": 845.6183166503906, "epoch": 0.08513180494361886, "grad_norm": 1.9679439067840576, "kl": 0.274658203125, "learning_rate": 8.507462686567164e-07, "loss": 0.011, "reward": 0.8995536267757416, "reward_std": 0.32051240652799606, "rewards/accuracy_reward": 0.10044643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7991071939468384, "step": 285 }, { "completion_length": 606.4419860839844, "epoch": 0.08543051303114031, "grad_norm": 2.082448959350586, "kl": 0.276611328125, "learning_rate": 8.537313432835821e-07, "loss": 0.0111, "reward": 1.0474330931901932, "reward_std": 0.28509248048067093, "rewards/accuracy_reward": 0.17410714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8733259439468384, "step": 286 }, { "completion_length": 866.0268096923828, "epoch": 0.08572922111866178, "grad_norm": 2.771641254425049, "kl": 0.47705078125, "learning_rate": 8.567164179104477e-07, "loss": 0.0191, "reward": 0.9453125447034836, "reward_std": 0.31955815851688385, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7801339626312256, "step": 287 }, { "completion_length": 682.7969055175781, "epoch": 0.08602792920618325, "grad_norm": 1.6097460985183716, "kl": 0.381103515625, "learning_rate": 8.597014925373134e-07, "loss": 0.0152, "reward": 0.8856027126312256, "reward_std": 0.2517332583665848, "rewards/accuracy_reward": 0.040178574388846755, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8454241454601288, "step": 288 }, { "completion_length": 727.1161041259766, "epoch": 0.08632663729370472, "grad_norm": 3.0123565196990967, "kl": 0.56396484375, "learning_rate": 8.626865671641791e-07, "loss": 0.0226, "reward": 0.9921875447034836, "reward_std": 0.30435463413596153, "rewards/accuracy_reward": 0.1629464402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8292411118745804, "step": 289 }, { "completion_length": 614.8794784545898, "epoch": 0.0866253453812262, "grad_norm": 2.14713191986084, "kl": 0.4736328125, "learning_rate": 8.656716417910447e-07, "loss": 0.0189, "reward": 1.0524554252624512, "reward_std": 0.296479769051075, "rewards/accuracy_reward": 0.176339291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8761161267757416, "step": 290 }, { "completion_length": 613.4107208251953, "epoch": 0.08692405346874767, "grad_norm": 2.5508530139923096, "kl": 0.56689453125, "learning_rate": 8.686567164179104e-07, "loss": 0.0227, "reward": 1.0117188096046448, "reward_std": 0.326076403260231, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8398437947034836, "step": 291 }, { "completion_length": 495.5111846923828, "epoch": 0.08722276155626914, "grad_norm": 2.20749568939209, "kl": 0.43994140625, "learning_rate": 8.716417910447761e-07, "loss": 0.0176, "reward": 0.9207589775323868, "reward_std": 0.281779408454895, "rewards/accuracy_reward": 0.04241071571595967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.878348246216774, "step": 292 }, { "completion_length": 514.0223541259766, "epoch": 0.08752146964379061, "grad_norm": 2.3525404930114746, "kl": 0.6357421875, "learning_rate": 8.746268656716418e-07, "loss": 0.0254, "reward": 1.02511166036129, "reward_std": 0.25326941534876823, "rewards/accuracy_reward": 0.15848214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.866629496216774, "step": 293 }, { "completion_length": 581.0290374755859, "epoch": 0.08782017773131208, "grad_norm": 1.9360665082931519, "kl": 0.8125, "learning_rate": 8.776119402985074e-07, "loss": 0.0325, "reward": 0.9073661267757416, "reward_std": 0.3237651214003563, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.824776828289032, "step": 294 }, { "completion_length": 625.3594055175781, "epoch": 0.08811888581883355, "grad_norm": 2.11419939994812, "kl": 0.8447265625, "learning_rate": 8.805970149253731e-07, "loss": 0.0338, "reward": 0.8532366454601288, "reward_std": 0.2828482538461685, "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.7907366305589676, "step": 295 }, { "completion_length": 575.9955596923828, "epoch": 0.08841759390635502, "grad_norm": 2.3190066814422607, "kl": 0.83984375, "learning_rate": 8.835820895522388e-07, "loss": 0.0336, "reward": 0.9425223767757416, "reward_std": 0.3221564143896103, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8286830633878708, "step": 296 }, { "completion_length": 573.2768096923828, "epoch": 0.08871630199387648, "grad_norm": 3.0915167331695557, "kl": 0.9560546875, "learning_rate": 8.865671641791045e-07, "loss": 0.0383, "reward": 0.9285714626312256, "reward_std": 0.28945494443178177, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.808035746216774, "step": 297 }, { "completion_length": 587.8705596923828, "epoch": 0.08901501008139795, "grad_norm": 2.6132864952087402, "kl": 1.068359375, "learning_rate": 8.8955223880597e-07, "loss": 0.0428, "reward": 0.8504464626312256, "reward_std": 0.2913218215107918, "rewards/accuracy_reward": 0.04464285750873387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8058035969734192, "step": 298 }, { "completion_length": 547.3080444335938, "epoch": 0.08931371816891942, "grad_norm": 1.7895452976226807, "kl": 0.869140625, "learning_rate": 8.925373134328358e-07, "loss": 0.0348, "reward": 0.8883928954601288, "reward_std": 0.28627971559762955, "rewards/accuracy_reward": 0.06250000209547579, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.823660746216774, "step": 299 }, { "completion_length": 459.82144927978516, "epoch": 0.08961242625644089, "grad_norm": 2.1383309364318848, "kl": 0.7529296875, "learning_rate": 8.955223880597015e-07, "loss": 0.0301, "reward": 1.0714286267757416, "reward_std": 0.2720639519393444, "rewards/accuracy_reward": 0.20089286658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8705357611179352, "step": 300 }, { "completion_length": 549.9844055175781, "epoch": 0.08991113434396236, "grad_norm": 13.322843551635742, "kl": 0.837890625, "learning_rate": 8.98507462686567e-07, "loss": 0.0335, "reward": 0.9514509290456772, "reward_std": 0.29133379459381104, "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.004464285913854837, "rewards/tag_count_reward": 0.8777902275323868, "step": 301 }, { "completion_length": 482.74778747558594, "epoch": 0.09020984243148383, "grad_norm": 1.7197874784469604, "kl": 0.62890625, "learning_rate": 9.014925373134328e-07, "loss": 0.0251, "reward": 1.0083705633878708, "reward_std": 0.2790837213397026, "rewards/accuracy_reward": 0.11383929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8945313096046448, "step": 302 }, { "completion_length": 461.10047149658203, "epoch": 0.0905085505190053, "grad_norm": 1.2922329902648926, "kl": 0.579345703125, "learning_rate": 9.044776119402984e-07, "loss": 0.0231, "reward": 1.0117187947034836, "reward_std": 0.25325537100434303, "rewards/accuracy_reward": 0.10267857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9090402275323868, "step": 303 }, { "completion_length": 573.3616256713867, "epoch": 0.09080725860652678, "grad_norm": 1.820452332496643, "kl": 0.76904296875, "learning_rate": 9.074626865671642e-07, "loss": 0.0307, "reward": 0.9436384439468384, "reward_std": 0.29183412343263626, "rewards/accuracy_reward": 0.06250000465661287, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.8789062947034836, "step": 304 }, { "completion_length": 513.912971496582, "epoch": 0.09110596669404825, "grad_norm": 1.9214797019958496, "kl": 0.7392578125, "learning_rate": 9.104477611940298e-07, "loss": 0.0296, "reward": 0.9581473618745804, "reward_std": 0.25060325115919113, "rewards/accuracy_reward": 0.0602678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.897879496216774, "step": 305 }, { "completion_length": 613.6027069091797, "epoch": 0.09140467478156972, "grad_norm": 2.4161040782928467, "kl": 0.91796875, "learning_rate": 9.134328358208954e-07, "loss": 0.0367, "reward": 0.9592634439468384, "reward_std": 0.3131456635892391, "rewards/accuracy_reward": 0.10937500675208867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8498884290456772, "step": 306 }, { "completion_length": 502.69422912597656, "epoch": 0.09170338286909117, "grad_norm": 1.34571373462677, "kl": 0.6826171875, "learning_rate": 9.164179104477612e-07, "loss": 0.0273, "reward": 1.0306920111179352, "reward_std": 0.2767278626561165, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.8878348618745804, "step": 307 }, { "completion_length": 493.654052734375, "epoch": 0.09200209095661264, "grad_norm": 0.9290879964828491, "kl": 0.42919921875, "learning_rate": 9.194029850746267e-07, "loss": 0.0172, "reward": 1.0111607611179352, "reward_std": 0.2370922826230526, "rewards/accuracy_reward": 0.09151786379516125, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.917410746216774, "step": 308 }, { "completion_length": 465.94644927978516, "epoch": 0.09230079904413412, "grad_norm": 1.608235239982605, "kl": 0.439453125, "learning_rate": 9.223880597014925e-07, "loss": 0.0176, "reward": 1.013950914144516, "reward_std": 0.2821110785007477, "rewards/accuracy_reward": 0.10714286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.906808078289032, "step": 309 }, { "completion_length": 540.3951110839844, "epoch": 0.09259950713165559, "grad_norm": 1.2861707210540771, "kl": 0.51025390625, "learning_rate": 9.253731343283582e-07, "loss": 0.0204, "reward": 1.0167411118745804, "reward_std": 0.24189509078860283, "rewards/accuracy_reward": 0.13839286495931447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.878348246216774, "step": 310 }, { "completion_length": 623.7277069091797, "epoch": 0.09289821521917706, "grad_norm": 1.13076913356781, "kl": 0.5517578125, "learning_rate": 9.283582089552238e-07, "loss": 0.0221, "reward": 0.9040178954601288, "reward_std": 0.25855884701013565, "rewards/accuracy_reward": 0.0424107164144516, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.8593750596046448, "step": 311 }, { "completion_length": 557.8370819091797, "epoch": 0.09319692330669853, "grad_norm": 1.6536146402359009, "kl": 0.5517578125, "learning_rate": 9.313432835820895e-07, "loss": 0.0221, "reward": 1.093191996216774, "reward_std": 0.30593252927064896, "rewards/accuracy_reward": 0.20758929662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8856027126312256, "step": 312 }, { "completion_length": 652.3549346923828, "epoch": 0.09349563139422, "grad_norm": 2.1762278079986572, "kl": 0.73779296875, "learning_rate": 9.343283582089551e-07, "loss": 0.0295, "reward": 0.9520089626312256, "reward_std": 0.2799290716648102, "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8560268133878708, "step": 313 }, { "completion_length": 676.9397735595703, "epoch": 0.09379433948174147, "grad_norm": 2.94340443611145, "kl": 0.7373046875, "learning_rate": 9.373134328358209e-07, "loss": 0.0295, "reward": 0.8917411118745804, "reward_std": 0.28018655627965927, "rewards/accuracy_reward": 0.06473214505240321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8270089626312256, "step": 314 }, { "completion_length": 785.2143249511719, "epoch": 0.09409304756926294, "grad_norm": 8.931953430175781, "kl": 1.84765625, "learning_rate": 9.402985074626866e-07, "loss": 0.0738, "reward": 0.8364955633878708, "reward_std": 0.2974684610962868, "rewards/accuracy_reward": 0.03571428661234677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8007812798023224, "step": 315 }, { "completion_length": 862.2366333007812, "epoch": 0.09439175565678441, "grad_norm": 47.71930694580078, "kl": 3.099609375, "learning_rate": 9.432835820895521e-07, "loss": 0.124, "reward": 0.9799107611179352, "reward_std": 0.3470269665122032, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8013393133878708, "step": 316 }, { "completion_length": 818.9397735595703, "epoch": 0.09469046374430588, "grad_norm": 57.180572509765625, "kl": 3.248046875, "learning_rate": 9.462686567164179e-07, "loss": 0.1298, "reward": 0.9146205931901932, "reward_std": 0.3036615513265133, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.8007812798023224, "step": 317 }, { "completion_length": 974.4487152099609, "epoch": 0.09498917183182734, "grad_norm": 6.06685209274292, "kl": 1.408203125, "learning_rate": 9.492537313432835e-07, "loss": 0.0563, "reward": 0.8046875447034836, "reward_std": 0.30171678215265274, "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.753348246216774, "step": 318 }, { "completion_length": 822.5402221679688, "epoch": 0.09528787991934881, "grad_norm": 8.978278160095215, "kl": 0.8486328125, "learning_rate": 9.522388059701492e-07, "loss": 0.034, "reward": 0.9084821939468384, "reward_std": 0.32547464221715927, "rewards/accuracy_reward": 0.11607143748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.792410746216774, "step": 319 }, { "completion_length": 910.8928985595703, "epoch": 0.09558658800687028, "grad_norm": 7.897408962249756, "kl": 1.1279296875, "learning_rate": 9.552238805970149e-07, "loss": 0.0451, "reward": 0.806919664144516, "reward_std": 0.3116587996482849, "rewards/accuracy_reward": 0.03571428754366934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7712053954601288, "step": 320 }, { "completion_length": 892.9308319091797, "epoch": 0.09588529609439175, "grad_norm": 7.168715000152588, "kl": 0.6396484375, "learning_rate": 9.582089552238805e-07, "loss": 0.0256, "reward": 0.8560268133878708, "reward_std": 0.33947332948446274, "rewards/accuracy_reward": 0.0982142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7578125298023224, "step": 321 }, { "completion_length": 991.3326416015625, "epoch": 0.09618400418191322, "grad_norm": 10.877771377563477, "kl": 0.7822265625, "learning_rate": 9.611940298507462e-07, "loss": 0.0313, "reward": 0.8833705931901932, "reward_std": 0.3454548195004463, "rewards/accuracy_reward": 0.11383928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7695312947034836, "step": 322 }, { "completion_length": 1022.4420318603516, "epoch": 0.0964827122694347, "grad_norm": 29.091175079345703, "kl": 2.6328125, "learning_rate": 9.641791044776118e-07, "loss": 0.1051, "reward": 0.7985491454601288, "reward_std": 0.2883128374814987, "rewards/accuracy_reward": 0.0825892873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7159598767757416, "step": 323 }, { "completion_length": 939.3705596923828, "epoch": 0.09678142035695617, "grad_norm": 198.4163818359375, "kl": 8.44921875, "learning_rate": 9.671641791044775e-07, "loss": 0.3382, "reward": 0.8030134290456772, "reward_std": 0.3363336995244026, "rewards/accuracy_reward": 0.06919643376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.733816996216774, "step": 324 }, { "completion_length": 1049.5580596923828, "epoch": 0.09708012844447764, "grad_norm": 62.468788146972656, "kl": 3.591796875, "learning_rate": 9.701492537313434e-07, "loss": 0.1435, "reward": 0.8292410969734192, "reward_std": 0.35294970870018005, "rewards/accuracy_reward": 0.10491071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7243303805589676, "step": 325 }, { "completion_length": 1056.0335388183594, "epoch": 0.09737883653199911, "grad_norm": 8.6136474609375, "kl": 1.30078125, "learning_rate": 9.731343283582088e-07, "loss": 0.052, "reward": 0.8476562947034836, "reward_std": 0.33781056106090546, "rewards/accuracy_reward": 0.13616071944124997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.711495578289032, "step": 326 }, { "completion_length": 1062.216552734375, "epoch": 0.09767754461952058, "grad_norm": 7.482530117034912, "kl": 1.28125, "learning_rate": 9.761194029850745e-07, "loss": 0.0512, "reward": 0.8537946939468384, "reward_std": 0.33956485241651535, "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7243303954601288, "step": 327 }, { "completion_length": 997.2210235595703, "epoch": 0.09797625270704205, "grad_norm": 7.585559368133545, "kl": 1.505859375, "learning_rate": 9.791044776119403e-07, "loss": 0.0603, "reward": 0.8409598767757416, "reward_std": 0.32539038360118866, "rewards/accuracy_reward": 0.11383929150179029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.727120578289032, "step": 328 }, { "completion_length": 1137.52685546875, "epoch": 0.0982749607945635, "grad_norm": 31.401552200317383, "kl": 3.57421875, "learning_rate": 9.82089552238806e-07, "loss": 0.1431, "reward": 0.8041295111179352, "reward_std": 0.3440176025032997, "rewards/accuracy_reward": 0.12500000605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6791295111179352, "step": 329 }, { "completion_length": 1018.8058624267578, "epoch": 0.09857366888208498, "grad_norm": 49.80457305908203, "kl": 3.5, "learning_rate": 9.850746268656714e-07, "loss": 0.1399, "reward": 0.7762277126312256, "reward_std": 0.34160156548023224, "rewards/accuracy_reward": 0.05133928870782256, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7248884290456772, "step": 330 }, { "completion_length": 942.1719055175781, "epoch": 0.09887237696960645, "grad_norm": 8.197003364562988, "kl": 1.7041015625, "learning_rate": 9.880597014925373e-07, "loss": 0.0683, "reward": 0.8476562947034836, "reward_std": 0.33648477494716644, "rewards/accuracy_reward": 0.10714286100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.740513414144516, "step": 331 }, { "completion_length": 1129.4620666503906, "epoch": 0.09917108505712792, "grad_norm": 220.84298706054688, "kl": 3.828125, "learning_rate": 9.91044776119403e-07, "loss": 0.1533, "reward": 0.7823661118745804, "reward_std": 0.30339415371418, "rewards/accuracy_reward": 0.08705357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6953125149011612, "step": 332 }, { "completion_length": 924.1919860839844, "epoch": 0.09946979314464939, "grad_norm": 945.56396484375, "kl": 9.8408203125, "learning_rate": 9.940298507462686e-07, "loss": 0.394, "reward": 0.8738839775323868, "reward_std": 0.3767590820789337, "rewards/accuracy_reward": 0.12276786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7511161118745804, "step": 333 }, { "completion_length": 851.6116485595703, "epoch": 0.09976850123217086, "grad_norm": 9.404172897338867, "kl": 0.60302734375, "learning_rate": 9.970149253731343e-07, "loss": 0.0241, "reward": 0.8973214626312256, "reward_std": 0.36599206924438477, "rewards/accuracy_reward": 0.12276786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7745535969734192, "step": 334 }, { "completion_length": 845.9152069091797, "epoch": 0.10006720931969233, "grad_norm": 6.976377487182617, "kl": 0.49755859375, "learning_rate": 1e-06, "loss": 0.0199, "reward": 0.88058041036129, "reward_std": 0.36095599085092545, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7667410969734192, "step": 335 }, { "completion_length": 966.4821624755859, "epoch": 0.1003659174072138, "grad_norm": 86.86793518066406, "kl": 2.576171875, "learning_rate": 9.999997552220525e-07, "loss": 0.1031, "reward": 0.8203125596046448, "reward_std": 0.36472930014133453, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7399553954601288, "step": 336 }, { "completion_length": 868.2053985595703, "epoch": 0.10066462549473527, "grad_norm": 378.9661865234375, "kl": 7.57421875, "learning_rate": 9.999990208884757e-07, "loss": 0.3031, "reward": 0.844308078289032, "reward_std": 0.3130945563316345, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7706473469734192, "step": 337 }, { "completion_length": 872.5201416015625, "epoch": 0.10096333358225675, "grad_norm": 21.67607879638672, "kl": 1.25, "learning_rate": 9.99997797000069e-07, "loss": 0.05, "reward": 0.97042416036129, "reward_std": 0.32219576090574265, "rewards/accuracy_reward": 0.17187501303851604, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7985491454601288, "step": 338 }, { "completion_length": 898.6071929931641, "epoch": 0.10126204166977822, "grad_norm": 6.92557954788208, "kl": 0.9482421875, "learning_rate": 9.999960835581636e-07, "loss": 0.0379, "reward": 0.8409598618745804, "reward_std": 0.31669773161411285, "rewards/accuracy_reward": 0.07812500302679837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7628348618745804, "step": 339 }, { "completion_length": 805.4040374755859, "epoch": 0.10156074975729967, "grad_norm": 6.456977844238281, "kl": 0.58740234375, "learning_rate": 9.999938805646239e-07, "loss": 0.0235, "reward": 0.9012277275323868, "reward_std": 0.29714617878198624, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8164062798023224, "step": 340 }, { "completion_length": 945.6495971679688, "epoch": 0.10185945784482114, "grad_norm": 5.702192306518555, "kl": 0.8427734375, "learning_rate": 9.999911880218462e-07, "loss": 0.0337, "reward": 0.9776786118745804, "reward_std": 0.3825265094637871, "rewards/accuracy_reward": 0.19642858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7812500447034836, "step": 341 }, { "completion_length": 815.7165679931641, "epoch": 0.10215816593234262, "grad_norm": 4.592239856719971, "kl": 1.0380859375, "learning_rate": 9.999880059327598e-07, "loss": 0.0415, "reward": 0.9804687947034836, "reward_std": 0.3339777886867523, "rewards/accuracy_reward": 0.16071429569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.819754496216774, "step": 342 }, { "completion_length": 827.9598693847656, "epoch": 0.10245687401986409, "grad_norm": 4.449698448181152, "kl": 1.1357421875, "learning_rate": 9.999843343008264e-07, "loss": 0.0454, "reward": 0.8984375447034836, "reward_std": 0.2954647168517113, "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7935268133878708, "step": 343 }, { "completion_length": 782.6250305175781, "epoch": 0.10275558210738556, "grad_norm": 4.136514186859131, "kl": 1.076171875, "learning_rate": 9.999801731300407e-07, "loss": 0.043, "reward": 0.9575893431901932, "reward_std": 0.35978956520557404, "rewards/accuracy_reward": 0.14285715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8147321790456772, "step": 344 }, { "completion_length": 703.0357360839844, "epoch": 0.10305429019490703, "grad_norm": 4.677806377410889, "kl": 1.029296875, "learning_rate": 9.999755224249292e-07, "loss": 0.0412, "reward": 0.92745541036129, "reward_std": 0.31162983924150467, "rewards/accuracy_reward": 0.08482143119908869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8426339775323868, "step": 345 }, { "completion_length": 708.4174499511719, "epoch": 0.1033529982824285, "grad_norm": 3.841837167739868, "kl": 1.1015625, "learning_rate": 9.999703821905516e-07, "loss": 0.0441, "reward": 0.930245578289032, "reward_std": 0.30159537494182587, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.843191996216774, "step": 346 }, { "completion_length": 982.0201416015625, "epoch": 0.10365170636994997, "grad_norm": 7.139371395111084, "kl": 1.556640625, "learning_rate": 9.999647524325e-07, "loss": 0.0624, "reward": 0.858816996216774, "reward_std": 0.3069397881627083, "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7695312798023224, "step": 347 }, { "completion_length": 786.8861923217773, "epoch": 0.10395041445747144, "grad_norm": 3.4473373889923096, "kl": 1.1748046875, "learning_rate": 9.999586331568992e-07, "loss": 0.047, "reward": 0.9358259290456772, "reward_std": 0.32131045311689377, "rewards/accuracy_reward": 0.09821429150179029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8376116454601288, "step": 348 }, { "completion_length": 830.216552734375, "epoch": 0.10424912254499291, "grad_norm": 5.231162071228027, "kl": 1.01953125, "learning_rate": 9.999520243704064e-07, "loss": 0.0408, "reward": 0.9776786118745804, "reward_std": 0.3416283652186394, "rewards/accuracy_reward": 0.165178582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8125000298023224, "step": 349 }, { "completion_length": 752.3817291259766, "epoch": 0.10454783063251437, "grad_norm": 8.525174140930176, "kl": 1.1181640625, "learning_rate": 9.999449260802107e-07, "loss": 0.0447, "reward": 1.0385045111179352, "reward_std": 0.3034193776547909, "rewards/accuracy_reward": 0.18526786752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8532366454601288, "step": 350 }, { "completion_length": 762.3795166015625, "epoch": 0.10484653872003584, "grad_norm": 14.298070907592773, "kl": 1.458984375, "learning_rate": 9.99937338294035e-07, "loss": 0.0584, "reward": 0.9776786118745804, "reward_std": 0.26264291256666183, "rewards/accuracy_reward": 0.13839286682195961, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8392857611179352, "step": 351 }, { "completion_length": 787.200927734375, "epoch": 0.10514524680755731, "grad_norm": 7.653049468994141, "kl": 1.484375, "learning_rate": 9.999292610201339e-07, "loss": 0.0594, "reward": 1.0133928954601288, "reward_std": 0.32014793902635574, "rewards/accuracy_reward": 0.16517858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.848214328289032, "step": 352 }, { "completion_length": 694.325927734375, "epoch": 0.10544395489507878, "grad_norm": 7.478911399841309, "kl": 0.9306640625, "learning_rate": 9.999206942672944e-07, "loss": 0.0373, "reward": 1.00948666036129, "reward_std": 0.29871026054024696, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8710937947034836, "step": 353 }, { "completion_length": 629.3236923217773, "epoch": 0.10574266298260025, "grad_norm": 8.495597839355469, "kl": 0.890625, "learning_rate": 9.999116380448367e-07, "loss": 0.0356, "reward": 1.1037946790456772, "reward_std": 0.2801349125802517, "rewards/accuracy_reward": 0.2232142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8805803805589676, "step": 354 }, { "completion_length": 671.5178985595703, "epoch": 0.10604137107012172, "grad_norm": 7.397022247314453, "kl": 0.859375, "learning_rate": 9.999020923626128e-07, "loss": 0.0343, "reward": 0.9285714626312256, "reward_std": 0.2577027715742588, "rewards/accuracy_reward": 0.07142857508733869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8571428954601288, "step": 355 }, { "completion_length": 717.9085006713867, "epoch": 0.1063400791576432, "grad_norm": 55.62885284423828, "kl": 3.166015625, "learning_rate": 9.998920572310075e-07, "loss": 0.1266, "reward": 0.92745541036129, "reward_std": 0.24279388412833214, "rewards/accuracy_reward": 0.07142857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.856026828289032, "step": 356 }, { "completion_length": 743.4397735595703, "epoch": 0.10663878724516467, "grad_norm": 82.17473602294922, "kl": 3.76171875, "learning_rate": 9.998815326609384e-07, "loss": 0.1503, "reward": 0.9118303954601288, "reward_std": 0.2697236016392708, "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8537946790456772, "step": 357 }, { "completion_length": 725.6116333007812, "epoch": 0.10693749533268614, "grad_norm": 4.441887855529785, "kl": 1.181640625, "learning_rate": 9.998705186638546e-07, "loss": 0.0473, "reward": 0.9402902275323868, "reward_std": 0.27364571020007133, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8376116454601288, "step": 358 }, { "completion_length": 738.2143096923828, "epoch": 0.10723620342020761, "grad_norm": 3.8747568130493164, "kl": 0.97265625, "learning_rate": 9.998590152517387e-07, "loss": 0.0389, "reward": 0.8956473618745804, "reward_std": 0.2784431055188179, "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.842075914144516, "step": 359 }, { "completion_length": 776.8147583007812, "epoch": 0.10753491150772908, "grad_norm": 4.197185516357422, "kl": 1.2529296875, "learning_rate": 9.99847022437105e-07, "loss": 0.0501, "reward": 0.9581473469734192, "reward_std": 0.3217723146080971, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8152902275323868, "step": 360 }, { "completion_length": 876.1562957763672, "epoch": 0.10783361959525053, "grad_norm": 17.90641212463379, "kl": 2.140625, "learning_rate": 9.998345402330006e-07, "loss": 0.0857, "reward": 0.8560268431901932, "reward_std": 0.27818749099969864, "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8091518133878708, "step": 361 }, { "completion_length": 668.7120819091797, "epoch": 0.108132327682772, "grad_norm": 9.282286643981934, "kl": 1.556640625, "learning_rate": 9.998215686530048e-07, "loss": 0.0623, "reward": 0.9414062947034836, "reward_std": 0.30153394490480423, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8476563096046448, "step": 362 }, { "completion_length": 582.709846496582, "epoch": 0.10843103577029348, "grad_norm": 3.0636022090911865, "kl": 0.8447265625, "learning_rate": 9.998081077112299e-07, "loss": 0.0338, "reward": 1.0792411267757416, "reward_std": 0.2565823383629322, "rewards/accuracy_reward": 0.21428572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8649553954601288, "step": 363 }, { "completion_length": 754.6830749511719, "epoch": 0.10872974385781495, "grad_norm": 4.354219436645508, "kl": 1.2841796875, "learning_rate": 9.997941574223196e-07, "loss": 0.0513, "reward": 0.992745578289032, "reward_std": 0.2648867070674896, "rewards/accuracy_reward": 0.1897321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8030134290456772, "step": 364 }, { "completion_length": 710.5536041259766, "epoch": 0.10902845194533642, "grad_norm": 4.237581729888916, "kl": 0.9248046875, "learning_rate": 9.997797178014505e-07, "loss": 0.037, "reward": 0.9715402126312256, "reward_std": 0.31267424672842026, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.819754496216774, "step": 365 }, { "completion_length": 621.6518096923828, "epoch": 0.10932716003285789, "grad_norm": 10.358372688293457, "kl": 0.9296875, "learning_rate": 9.99764788864332e-07, "loss": 0.0372, "reward": 0.9670759439468384, "reward_std": 0.3088495582342148, "rewards/accuracy_reward": 0.11160714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8554688096046448, "step": 366 }, { "completion_length": 772.2276916503906, "epoch": 0.10962586812037936, "grad_norm": 5.2954559326171875, "kl": 1.0244140625, "learning_rate": 9.997493706272045e-07, "loss": 0.041, "reward": 0.8867187947034836, "reward_std": 0.2999427318572998, "rewards/accuracy_reward": 0.08482143189758062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8018973618745804, "step": 367 }, { "completion_length": 640.0312805175781, "epoch": 0.10992457620790083, "grad_norm": 5.187159061431885, "kl": 0.921875, "learning_rate": 9.997334631068419e-07, "loss": 0.0368, "reward": 0.9330357611179352, "reward_std": 0.3141906037926674, "rewards/accuracy_reward": 0.08705357555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8459821790456772, "step": 368 }, { "completion_length": 792.0870971679688, "epoch": 0.1102232842954223, "grad_norm": 4.137279033660889, "kl": 1.3232421875, "learning_rate": 9.9971706632055e-07, "loss": 0.0529, "reward": 0.8565848618745804, "reward_std": 0.31572649627923965, "rewards/accuracy_reward": 0.06250000093132257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7940848618745804, "step": 369 }, { "completion_length": 714.8527221679688, "epoch": 0.11052199238294377, "grad_norm": 16.535587310791016, "kl": 1.5615234375, "learning_rate": 9.997001802861675e-07, "loss": 0.0624, "reward": 0.940848246216774, "reward_std": 0.28913331031799316, "rewards/accuracy_reward": 0.14955357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7912946790456772, "step": 370 }, { "completion_length": 596.600471496582, "epoch": 0.11082070047046524, "grad_norm": 5.4031081199646, "kl": 0.65283203125, "learning_rate": 9.996828050220636e-07, "loss": 0.0261, "reward": 0.9419643431901932, "reward_std": 0.31791355460882187, "rewards/accuracy_reward": 0.10714286146685481, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8348214775323868, "step": 371 }, { "completion_length": 757.1897583007812, "epoch": 0.1111194085579867, "grad_norm": 2.622593879699707, "kl": 0.3544921875, "learning_rate": 9.996649405471418e-07, "loss": 0.0142, "reward": 0.8643973618745804, "reward_std": 0.3242262154817581, "rewards/accuracy_reward": 0.07589286239817739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.788504496216774, "step": 372 }, { "completion_length": 712.3951110839844, "epoch": 0.11141811664550817, "grad_norm": 2.021888256072998, "kl": 0.28515625, "learning_rate": 9.996465868808365e-07, "loss": 0.0114, "reward": 0.9726563096046448, "reward_std": 0.2714943028986454, "rewards/accuracy_reward": 0.1696428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8030134290456772, "step": 373 }, { "completion_length": 679.8482513427734, "epoch": 0.11171682473302964, "grad_norm": 3.2115862369537354, "kl": 0.281982421875, "learning_rate": 9.996277440431148e-07, "loss": 0.0113, "reward": 0.914620578289032, "reward_std": 0.27972378954291344, "rewards/accuracy_reward": 0.10044643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8141741454601288, "step": 374 }, { "completion_length": 711.1339569091797, "epoch": 0.11201553282055111, "grad_norm": 39.69878005981445, "kl": 1.9970703125, "learning_rate": 9.996084120544758e-07, "loss": 0.08, "reward": 0.977120578289032, "reward_std": 0.2950412705540657, "rewards/accuracy_reward": 0.1941964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7829241454601288, "step": 375 }, { "completion_length": 766.0647735595703, "epoch": 0.11231424090807259, "grad_norm": 601.2239379882812, "kl": 16.458984375, "learning_rate": 9.99588590935951e-07, "loss": 0.6592, "reward": 0.95089291036129, "reward_std": 0.30778902769088745, "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7678571790456772, "step": 376 }, { "completion_length": 805.482177734375, "epoch": 0.11261294899559406, "grad_norm": 376.05023193359375, "kl": 11.9833984375, "learning_rate": 9.995682807091034e-07, "loss": 0.4809, "reward": 0.852120578289032, "reward_std": 0.26393967494368553, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7628348767757416, "step": 377 }, { "completion_length": 750.4933471679688, "epoch": 0.11291165708311553, "grad_norm": 8.203405380249023, "kl": 0.3919677734375, "learning_rate": 9.99547481396029e-07, "loss": 0.0157, "reward": 0.8309152275323868, "reward_std": 0.2914173975586891, "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7393973469734192, "step": 378 }, { "completion_length": 882.435302734375, "epoch": 0.113210365170637, "grad_norm": 1.9115374088287354, "kl": 0.1640625, "learning_rate": 9.99526193019355e-07, "loss": 0.0066, "reward": 0.827566996216774, "reward_std": 0.2774134799838066, "rewards/accuracy_reward": 0.09598214901052415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7315848469734192, "step": 379 }, { "completion_length": 739.7835083007812, "epoch": 0.11350907325815847, "grad_norm": 1.1423251628875732, "kl": 0.144287109375, "learning_rate": 9.995044156022416e-07, "loss": 0.0058, "reward": 0.906808078289032, "reward_std": 0.3055914118885994, "rewards/accuracy_reward": 0.13839286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7684152126312256, "step": 380 }, { "completion_length": 704.3303756713867, "epoch": 0.11380778134567994, "grad_norm": 0.730486273765564, "kl": 0.115966796875, "learning_rate": 9.9948214916838e-07, "loss": 0.0046, "reward": 0.9481027275323868, "reward_std": 0.24457179754972458, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7963170111179352, "step": 381 }, { "completion_length": 640.544677734375, "epoch": 0.11410648943320141, "grad_norm": 11.123881340026855, "kl": 0.385986328125, "learning_rate": 9.994593937419942e-07, "loss": 0.0154, "reward": 0.9832589626312256, "reward_std": 0.3330209478735924, "rewards/accuracy_reward": 0.17410715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.809151828289032, "step": 382 }, { "completion_length": 637.6785888671875, "epoch": 0.11440519752072287, "grad_norm": 0.8567681312561035, "kl": 0.11572265625, "learning_rate": 9.994361493478399e-07, "loss": 0.0046, "reward": 1.0708706080913544, "reward_std": 0.3237199932336807, "rewards/accuracy_reward": 0.2433035869617015, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.827566996216774, "step": 383 }, { "completion_length": 708.7187805175781, "epoch": 0.11470390560824434, "grad_norm": 1.1126370429992676, "kl": 0.118896484375, "learning_rate": 9.994124160112044e-07, "loss": 0.0048, "reward": 1.017857164144516, "reward_std": 0.273752361536026, "rewards/accuracy_reward": 0.2008928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.816964328289032, "step": 384 }, { "completion_length": 638.1428909301758, "epoch": 0.11500261369576581, "grad_norm": 975.51416015625, "kl": 15.520751953125, "learning_rate": 9.993881937579075e-07, "loss": 0.6179, "reward": 1.0094866454601288, "reward_std": 0.28896087035536766, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8487723618745804, "step": 385 }, { "completion_length": 636.0335083007812, "epoch": 0.11530132178328728, "grad_norm": 1.9929475784301758, "kl": 0.1573486328125, "learning_rate": 9.993634826143003e-07, "loss": 0.0063, "reward": 1.0797991752624512, "reward_std": 0.24084266647696495, "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8699776977300644, "step": 386 }, { "completion_length": 407.75001525878906, "epoch": 0.11560002987080875, "grad_norm": 9.93256950378418, "kl": 0.2896728515625, "learning_rate": 9.993382826072668e-07, "loss": 0.0116, "reward": 1.1311384737491608, "reward_std": 0.2778293117880821, "rewards/accuracy_reward": 0.18750000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9436384290456772, "step": 387 }, { "completion_length": 511.76119232177734, "epoch": 0.11589873795833022, "grad_norm": 1.857025384902954, "kl": 0.17138671875, "learning_rate": 9.993125937642214e-07, "loss": 0.0068, "reward": 1.073660746216774, "reward_std": 0.2735866792500019, "rewards/accuracy_reward": 0.17187500186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.901785746216774, "step": 388 }, { "completion_length": 540.8281631469727, "epoch": 0.1161974460458517, "grad_norm": 0.7709251046180725, "kl": 0.12548828125, "learning_rate": 9.992864161131115e-07, "loss": 0.005, "reward": 1.1205357611179352, "reward_std": 0.28581181168556213, "rewards/accuracy_reward": 0.2299107275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8906250447034836, "step": 389 }, { "completion_length": 540.4330596923828, "epoch": 0.11649615413337316, "grad_norm": 0.3130357265472412, "kl": 0.0955810546875, "learning_rate": 9.992597496824156e-07, "loss": 0.0038, "reward": 1.1277902126312256, "reward_std": 0.25703832879662514, "rewards/accuracy_reward": 0.2254464440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9023438096046448, "step": 390 }, { "completion_length": 495.52904510498047, "epoch": 0.11679486222089464, "grad_norm": 0.47927623987197876, "kl": 0.1146240234375, "learning_rate": 9.992325945011443e-07, "loss": 0.0046, "reward": 1.0150670111179352, "reward_std": 0.20418647304177284, "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9257813096046448, "step": 391 }, { "completion_length": 401.7165298461914, "epoch": 0.1170935703084161, "grad_norm": 0.2655290365219116, "kl": 0.1121826171875, "learning_rate": 9.992049505988397e-07, "loss": 0.0045, "reward": 1.1311384439468384, "reward_std": 0.12776708975434303, "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9481027126312256, "step": 392 }, { "completion_length": 429.12279510498047, "epoch": 0.11739227839593756, "grad_norm": 0.3222493827342987, "kl": 0.1126708984375, "learning_rate": 9.991768180055755e-07, "loss": 0.0045, "reward": 0.994977742433548, "reward_std": 0.12592878192663193, "rewards/accuracy_reward": 0.033482144586741924, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9614955931901932, "step": 393 }, { "completion_length": 537.2544784545898, "epoch": 0.11769098648345903, "grad_norm": 0.5967728495597839, "kl": 0.11083984375, "learning_rate": 9.991481967519575e-07, "loss": 0.0044, "reward": 1.1556920409202576, "reward_std": 0.23186370730400085, "rewards/accuracy_reward": 0.2232142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9324777126312256, "step": 394 }, { "completion_length": 537.1361999511719, "epoch": 0.1179896945709805, "grad_norm": 0.8660680651664734, "kl": 0.1314697265625, "learning_rate": 9.991190868691228e-07, "loss": 0.0053, "reward": 1.066964328289032, "reward_std": 0.2305702194571495, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9352678954601288, "step": 395 }, { "completion_length": 494.6205520629883, "epoch": 0.11828840265850198, "grad_norm": 0.29755041003227234, "kl": 0.087890625, "learning_rate": 9.990894883887397e-07, "loss": 0.0035, "reward": 1.1177455931901932, "reward_std": 0.24486588686704636, "rewards/accuracy_reward": 0.1785714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9391741454601288, "step": 396 }, { "completion_length": 457.2143096923828, "epoch": 0.11858711074602345, "grad_norm": 0.40724557638168335, "kl": 0.10205078125, "learning_rate": 9.99059401343009e-07, "loss": 0.0041, "reward": 1.1992188096046448, "reward_std": 0.23863843083381653, "rewards/accuracy_reward": 0.2589285857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9402902126312256, "step": 397 }, { "completion_length": 480.57814025878906, "epoch": 0.11888581883354492, "grad_norm": 0.2165663242340088, "kl": 0.078857421875, "learning_rate": 9.990288257646621e-07, "loss": 0.0032, "reward": 1.2109375298023224, "reward_std": 0.29532572627067566, "rewards/accuracy_reward": 0.2678571604192257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9430803954601288, "step": 398 }, { "completion_length": 462.5268020629883, "epoch": 0.11918452692106639, "grad_norm": 0.2247057408094406, "kl": 0.074951171875, "learning_rate": 9.989977616869623e-07, "loss": 0.003, "reward": 1.1300223767757416, "reward_std": 0.2128436379134655, "rewards/accuracy_reward": 0.17857143841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9514509290456772, "step": 399 }, { "completion_length": 447.3303756713867, "epoch": 0.11948323500858786, "grad_norm": 0.26497000455856323, "kl": 0.0816650390625, "learning_rate": 9.989662091437042e-07, "loss": 0.0033, "reward": 1.0546875596046448, "reward_std": 0.1659500151872635, "rewards/accuracy_reward": 0.10267857788130641, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9520089775323868, "step": 400 }, { "completion_length": 540.5401992797852, "epoch": 0.11978194309610933, "grad_norm": 0.19962382316589355, "kl": 0.06640625, "learning_rate": 9.989341681692143e-07, "loss": 0.0027, "reward": 1.1049107760190964, "reward_std": 0.16908875480294228, "rewards/accuracy_reward": 0.1718750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.933035746216774, "step": 401 }, { "completion_length": 471.5223388671875, "epoch": 0.1200806511836308, "grad_norm": 0.3539354205131531, "kl": 0.0770263671875, "learning_rate": 9.989016387983494e-07, "loss": 0.0031, "reward": 1.0809152126312256, "reward_std": 0.1983552873134613, "rewards/accuracy_reward": 0.13392857369035482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9469866454601288, "step": 402 }, { "completion_length": 571.6205673217773, "epoch": 0.12037935927115227, "grad_norm": 0.3033287823200226, "kl": 0.075439453125, "learning_rate": 9.988686210664985e-07, "loss": 0.003, "reward": 1.0189732611179352, "reward_std": 0.22725924849510193, "rewards/accuracy_reward": 0.09375000419095159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9252232611179352, "step": 403 }, { "completion_length": 427.78126525878906, "epoch": 0.12067806735867373, "grad_norm": 0.24465063214302063, "kl": 0.08251953125, "learning_rate": 9.98835115009582e-07, "loss": 0.0033, "reward": 1.0792411267757416, "reward_std": 0.19819923117756844, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9676339626312256, "step": 404 }, { "completion_length": 580.2366180419922, "epoch": 0.1209767754461952, "grad_norm": 1.161953091621399, "kl": 0.09130859375, "learning_rate": 9.988011206640509e-07, "loss": 0.0036, "reward": 1.0306920260190964, "reward_std": 0.20968204364180565, "rewards/accuracy_reward": 0.10937500465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.921316996216774, "step": 405 }, { "completion_length": 438.11832427978516, "epoch": 0.12127548353371667, "grad_norm": 0.4461577832698822, "kl": 0.09619140625, "learning_rate": 9.987666380668876e-07, "loss": 0.0039, "reward": 1.1199776977300644, "reward_std": 0.2154545597732067, "rewards/accuracy_reward": 0.16071429778821766, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9592634290456772, "step": 406 }, { "completion_length": 520.0937728881836, "epoch": 0.12157419162123814, "grad_norm": 0.46113523840904236, "kl": 0.07427978515625, "learning_rate": 9.98731667255606e-07, "loss": 0.003, "reward": 1.13058041036129, "reward_std": 0.18933826684951782, "rewards/accuracy_reward": 0.1785714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9520089775323868, "step": 407 }, { "completion_length": 505.4219055175781, "epoch": 0.12187289970875961, "grad_norm": 0.2774527668952942, "kl": 0.076904296875, "learning_rate": 9.98696208268251e-07, "loss": 0.0031, "reward": 1.1467634439468384, "reward_std": 0.2926201969385147, "rewards/accuracy_reward": 0.2053571566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9414063096046448, "step": 408 }, { "completion_length": 472.5982360839844, "epoch": 0.12217160779628108, "grad_norm": 0.19608289003372192, "kl": 0.074462890625, "learning_rate": 9.986602611433982e-07, "loss": 0.003, "reward": 1.0329241454601288, "reward_std": 0.16042689234018326, "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9481027126312256, "step": 409 }, { "completion_length": 543.1986694335938, "epoch": 0.12247031588380256, "grad_norm": 0.33984804153442383, "kl": 0.0819091796875, "learning_rate": 9.986238259201547e-07, "loss": 0.0033, "reward": 1.0686384588479996, "reward_std": 0.20311567559838295, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9481027275323868, "step": 410 }, { "completion_length": 546.1004638671875, "epoch": 0.12276902397132403, "grad_norm": 0.2952536642551422, "kl": 0.0726318359375, "learning_rate": 9.985869026381586e-07, "loss": 0.0029, "reward": 0.9972098618745804, "reward_std": 0.2328958511352539, "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9369420111179352, "step": 411 }, { "completion_length": 498.99110412597656, "epoch": 0.1230677320588455, "grad_norm": 0.22392858564853668, "kl": 0.088134765625, "learning_rate": 9.985494913375785e-07, "loss": 0.0035, "reward": 1.126116156578064, "reward_std": 0.21110533364117146, "rewards/accuracy_reward": 0.1808035857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9453125447034836, "step": 412 }, { "completion_length": 513.8326187133789, "epoch": 0.12336644014636697, "grad_norm": 0.19624385237693787, "kl": 0.078125, "learning_rate": 9.985115920591146e-07, "loss": 0.0031, "reward": 1.0541295409202576, "reward_std": 0.2238072231411934, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9492187947034836, "step": 413 }, { "completion_length": 440.32814025878906, "epoch": 0.12366514823388844, "grad_norm": 0.2959558367729187, "kl": 0.084716796875, "learning_rate": 9.984732048439972e-07, "loss": 0.0034, "reward": 1.1774554252624512, "reward_std": 0.2203635759651661, "rewards/accuracy_reward": 0.2142857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9631696939468384, "step": 414 }, { "completion_length": 481.61163330078125, "epoch": 0.1239638563214099, "grad_norm": 0.2822831869125366, "kl": 0.091552734375, "learning_rate": 9.984343297339883e-07, "loss": 0.0037, "reward": 1.1757813096046448, "reward_std": 0.19585711881518364, "rewards/accuracy_reward": 0.2299107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9458705633878708, "step": 415 }, { "completion_length": 586.1495971679688, "epoch": 0.12426256440893137, "grad_norm": 0.16940149664878845, "kl": 0.0704345703125, "learning_rate": 9.983949667713796e-07, "loss": 0.0028, "reward": 1.0524553954601288, "reward_std": 0.18331819027662277, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9520089626312256, "step": 416 }, { "completion_length": 480.8861770629883, "epoch": 0.12456127249645284, "grad_norm": 0.44966766238212585, "kl": 0.096435546875, "learning_rate": 9.983551159989946e-07, "loss": 0.0039, "reward": 1.141183078289032, "reward_std": 0.21786313503980637, "rewards/accuracy_reward": 0.1852678693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9559152275323868, "step": 417 }, { "completion_length": 468.0669860839844, "epoch": 0.12485998058397431, "grad_norm": 0.20547077059745789, "kl": 0.076416015625, "learning_rate": 9.98314777460187e-07, "loss": 0.0031, "reward": 1.2226563394069672, "reward_std": 0.2532882858067751, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562798023224, "step": 418 }, { "completion_length": 505.94422149658203, "epoch": 0.1251586886714958, "grad_norm": 0.3456398546695709, "kl": 0.087890625, "learning_rate": 9.98273951198841e-07, "loss": 0.0035, "reward": 1.1417411416769028, "reward_std": 0.23927699774503708, "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9609375447034836, "step": 419 }, { "completion_length": 549.1428909301758, "epoch": 0.12545739675901724, "grad_norm": 0.3700881004333496, "kl": 0.0936279296875, "learning_rate": 9.982326372593718e-07, "loss": 0.0037, "reward": 1.1406250596046448, "reward_std": 0.22192098572850227, "rewards/accuracy_reward": 0.19419643632136285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9464286118745804, "step": 420 }, { "completion_length": 505.3437728881836, "epoch": 0.1257561048465387, "grad_norm": 0.2480056881904602, "kl": 0.08203125, "learning_rate": 9.981908356867247e-07, "loss": 0.0033, "reward": 1.1830357611179352, "reward_std": 0.2581581100821495, "rewards/accuracy_reward": 0.2142857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500447034836, "step": 421 }, { "completion_length": 562.2053909301758, "epoch": 0.12605481293406018, "grad_norm": 0.20588630437850952, "kl": 0.07305908203125, "learning_rate": 9.981485465263759e-07, "loss": 0.0029, "reward": 1.1551340222358704, "reward_std": 0.22923550754785538, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9520089775323868, "step": 422 }, { "completion_length": 534.3705596923828, "epoch": 0.12635352102158165, "grad_norm": 0.25785502791404724, "kl": 0.0716552734375, "learning_rate": 9.981057698243315e-07, "loss": 0.0029, "reward": 1.0569197088479996, "reward_std": 0.18045998737215996, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9520089775323868, "step": 423 }, { "completion_length": 569.2165451049805, "epoch": 0.12665222910910312, "grad_norm": 0.2416565865278244, "kl": 0.07373046875, "learning_rate": 9.980625056271289e-07, "loss": 0.003, "reward": 1.129464328289032, "reward_std": 0.18461395613849163, "rewards/accuracy_reward": 0.1741071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9553571939468384, "step": 424 }, { "completion_length": 524.0134201049805, "epoch": 0.1269509371966246, "grad_norm": 0.18952515721321106, "kl": 0.089111328125, "learning_rate": 9.980187539818348e-07, "loss": 0.0036, "reward": 1.0279018580913544, "reward_std": 0.17535785771906376, "rewards/accuracy_reward": 0.0736607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9542411267757416, "step": 425 }, { "completion_length": 535.4888687133789, "epoch": 0.12724964528414606, "grad_norm": 0.3223864734172821, "kl": 0.071533203125, "learning_rate": 9.979745149360471e-07, "loss": 0.0029, "reward": 1.1752232611179352, "reward_std": 0.20497772097587585, "rewards/accuracy_reward": 0.21428572619333863, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9609375447034836, "step": 426 }, { "completion_length": 559.0290451049805, "epoch": 0.12754835337166753, "grad_norm": 0.2794383764266968, "kl": 0.08599853515625, "learning_rate": 9.97929788537893e-07, "loss": 0.0034, "reward": 1.1501116752624512, "reward_std": 0.25798890739679337, "rewards/accuracy_reward": 0.1941964402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9559152275323868, "step": 427 }, { "completion_length": 557.5245819091797, "epoch": 0.127847061459189, "grad_norm": 0.4616053104400635, "kl": 0.087646484375, "learning_rate": 9.978845748360312e-07, "loss": 0.0035, "reward": 1.0887277126312256, "reward_std": 0.24302243068814278, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9347098469734192, "step": 428 }, { "completion_length": 517.2477951049805, "epoch": 0.12814576954671048, "grad_norm": 0.3306460678577423, "kl": 0.099853515625, "learning_rate": 9.978388738796493e-07, "loss": 0.004, "reward": 1.0987723767757416, "reward_std": 0.20485292375087738, "rewards/accuracy_reward": 0.14285714644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9559152126312256, "step": 429 }, { "completion_length": 578.0268096923828, "epoch": 0.12844447763423195, "grad_norm": 0.4622373878955841, "kl": 0.076904296875, "learning_rate": 9.977926857184655e-07, "loss": 0.0031, "reward": 1.0608259588479996, "reward_std": 0.20990071445703506, "rewards/accuracy_reward": 0.12053572246804833, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9402902126312256, "step": 430 }, { "completion_length": 468.4018096923828, "epoch": 0.12874318572175342, "grad_norm": 0.41462934017181396, "kl": 0.081298828125, "learning_rate": 9.977460104027282e-07, "loss": 0.0033, "reward": 1.1891741752624512, "reward_std": 0.17870273813605309, "rewards/accuracy_reward": 0.2120535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9771205633878708, "step": 431 }, { "completion_length": 547.0803833007812, "epoch": 0.1290418938092749, "grad_norm": 0.6695319414138794, "kl": 0.09881591796875, "learning_rate": 9.97698847983215e-07, "loss": 0.004, "reward": 1.1417411118745804, "reward_std": 0.21126482635736465, "rewards/accuracy_reward": 0.18303572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.95870541036129, "step": 432 }, { "completion_length": 615.0357360839844, "epoch": 0.12934060189679636, "grad_norm": 0.2691216766834259, "kl": 0.0645751953125, "learning_rate": 9.976511985112348e-07, "loss": 0.0026, "reward": 1.041852742433548, "reward_std": 0.20139127597212791, "rewards/accuracy_reward": 0.09151786169968545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9503348618745804, "step": 433 }, { "completion_length": 575.7634201049805, "epoch": 0.12963930998431783, "grad_norm": 0.38623949885368347, "kl": 0.073486328125, "learning_rate": 9.97603062038625e-07, "loss": 0.0029, "reward": 1.0898438096046448, "reward_std": 0.22267015278339386, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.960379496216774, "step": 434 }, { "completion_length": 651.3348541259766, "epoch": 0.1299380180718393, "grad_norm": 0.6963973641395569, "kl": 0.06884765625, "learning_rate": 9.975544386177537e-07, "loss": 0.0028, "reward": 1.1808035969734192, "reward_std": 0.24355067312717438, "rewards/accuracy_reward": 0.2388392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.941964328289032, "step": 435 }, { "completion_length": 618.9910888671875, "epoch": 0.13023672615936077, "grad_norm": 0.9838470816612244, "kl": 0.0758056640625, "learning_rate": 9.97505328301518e-07, "loss": 0.003, "reward": 1.0195312798023224, "reward_std": 0.19645055383443832, "rewards/accuracy_reward": 0.06919643026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9503348767757416, "step": 436 }, { "completion_length": 495.95984649658203, "epoch": 0.13053543424688224, "grad_norm": 0.44010692834854126, "kl": 0.06732177734375, "learning_rate": 9.974557311433453e-07, "loss": 0.0027, "reward": 1.05636166036129, "reward_std": 0.15333734266459942, "rewards/accuracy_reward": 0.08035714761354029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045111179352, "step": 437 }, { "completion_length": 605.2187728881836, "epoch": 0.1308341423344037, "grad_norm": 0.9653443694114685, "kl": 0.069580078125, "learning_rate": 9.974056471971925e-07, "loss": 0.0028, "reward": 1.1746652126312256, "reward_std": 0.20580513216555119, "rewards/accuracy_reward": 0.2165178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9581473618745804, "step": 438 }, { "completion_length": 580.8616333007812, "epoch": 0.13113285042192518, "grad_norm": 0.8792102336883545, "kl": 0.07470703125, "learning_rate": 9.973550765175463e-07, "loss": 0.003, "reward": 1.0954241454601288, "reward_std": 0.19353876449167728, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9614955633878708, "step": 439 }, { "completion_length": 577.9330596923828, "epoch": 0.13143155850944666, "grad_norm": 1.5934703350067139, "kl": 0.09130859375, "learning_rate": 9.97304019159422e-07, "loss": 0.0037, "reward": 1.184151828289032, "reward_std": 0.2472451999783516, "rewards/accuracy_reward": 0.22991072107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9542411118745804, "step": 440 }, { "completion_length": 617.9107513427734, "epoch": 0.13173026659696813, "grad_norm": 1.385853886604309, "kl": 0.0859375, "learning_rate": 9.972524751783657e-07, "loss": 0.0034, "reward": 1.111607164144516, "reward_std": 0.15477954596281052, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464775323868, "step": 441 }, { "completion_length": 583.4196701049805, "epoch": 0.13202897468448957, "grad_norm": 1.7211228609085083, "kl": 0.191650390625, "learning_rate": 9.972004446304516e-07, "loss": 0.0077, "reward": 1.0239955931901932, "reward_std": 0.11778904683887959, "rewards/accuracy_reward": 0.07812500232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.945870578289032, "step": 442 }, { "completion_length": 590.4843978881836, "epoch": 0.13232768277201104, "grad_norm": 1.9713138341903687, "kl": 0.27392578125, "learning_rate": 9.971479275722843e-07, "loss": 0.0109, "reward": 1.1227679252624512, "reward_std": 0.15139595791697502, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9575893133878708, "step": 443 }, { "completion_length": 550.3973541259766, "epoch": 0.1326263908595325, "grad_norm": 13.88331413269043, "kl": 0.956787109375, "learning_rate": 9.97094924060997e-07, "loss": 0.0383, "reward": 1.095982164144516, "reward_std": 0.17489194124937057, "rewards/accuracy_reward": 0.133928582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9620536118745804, "step": 444 }, { "completion_length": 586.1964492797852, "epoch": 0.13292509894705398, "grad_norm": 3.871896982192993, "kl": 0.68603515625, "learning_rate": 9.970414341542522e-07, "loss": 0.0274, "reward": 1.1796875596046448, "reward_std": 0.21864816546440125, "rewards/accuracy_reward": 0.2321428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9475446939468384, "step": 445 }, { "completion_length": 546.8839569091797, "epoch": 0.13322380703457545, "grad_norm": 2.0695455074310303, "kl": 0.701171875, "learning_rate": 9.969874579102418e-07, "loss": 0.0281, "reward": 1.11104916036129, "reward_std": 0.1708942875266075, "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9525670111179352, "step": 446 }, { "completion_length": 572.2277069091797, "epoch": 0.13352251512209692, "grad_norm": 4.468729019165039, "kl": 0.6143798828125, "learning_rate": 9.969329953876866e-07, "loss": 0.0246, "reward": 1.1975446939468384, "reward_std": 0.1519378237426281, "rewards/accuracy_reward": 0.2187500111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.978794664144516, "step": 447 }, { "completion_length": 535.2232284545898, "epoch": 0.1338212232096184, "grad_norm": 2.5589234828948975, "kl": 0.2327880859375, "learning_rate": 9.968780466458367e-07, "loss": 0.0093, "reward": 1.059151828289032, "reward_std": 0.18590735271573067, "rewards/accuracy_reward": 0.09375000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9654018133878708, "step": 448 }, { "completion_length": 529.1652069091797, "epoch": 0.13411993129713987, "grad_norm": 3.261960029602051, "kl": 0.3896484375, "learning_rate": 9.968226117444707e-07, "loss": 0.0156, "reward": 1.1668527275323868, "reward_std": 0.1876882202923298, "rewards/accuracy_reward": 0.19642857951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9704241305589676, "step": 449 }, { "completion_length": 491.87725830078125, "epoch": 0.13441863938466134, "grad_norm": 0.7007730007171631, "kl": 0.43798828125, "learning_rate": 9.967666907438965e-07, "loss": 0.0175, "reward": 1.0842634290456772, "reward_std": 0.1109037920832634, "rewards/accuracy_reward": 0.09821429336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 450 }, { "completion_length": 537.4643249511719, "epoch": 0.1347173474721828, "grad_norm": 10.42814826965332, "kl": 1.4130859375, "learning_rate": 9.967102837049506e-07, "loss": 0.0567, "reward": 1.1054687798023224, "reward_std": 0.1526376809924841, "rewards/accuracy_reward": 0.12946428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045111179352, "step": 451 }, { "completion_length": 471.0513610839844, "epoch": 0.13501605555970428, "grad_norm": 1.8379894495010376, "kl": 0.573486328125, "learning_rate": 9.966533906889987e-07, "loss": 0.0229, "reward": 1.1562500298023224, "reward_std": 0.16726750694215298, "rewards/accuracy_reward": 0.17633929592557251, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107611179352, "step": 452 }, { "completion_length": 537.6384048461914, "epoch": 0.13531476364722575, "grad_norm": 3.655841827392578, "kl": 0.711669921875, "learning_rate": 9.965960117579341e-07, "loss": 0.0285, "reward": 1.1328125298023224, "reward_std": 0.17133421823382378, "rewards/accuracy_reward": 0.16071429150179029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9720982611179352, "step": 453 }, { "completion_length": 546.9643096923828, "epoch": 0.13561347173474722, "grad_norm": 1.9390339851379395, "kl": 0.6015625, "learning_rate": 9.965381469741798e-07, "loss": 0.024, "reward": 1.14620541036129, "reward_std": 0.17020147666335106, "rewards/accuracy_reward": 0.17857144260779023, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9676339775323868, "step": 454 }, { "completion_length": 543.6116409301758, "epoch": 0.1359121798222687, "grad_norm": 1.6037344932556152, "kl": 0.576171875, "learning_rate": 9.964797964006871e-07, "loss": 0.0231, "reward": 1.2550223767757416, "reward_std": 0.18281599879264832, "rewards/accuracy_reward": 0.279017873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.976004496216774, "step": 455 }, { "completion_length": 622.5290451049805, "epoch": 0.13621088790979016, "grad_norm": 6.9101104736328125, "kl": 1.47607421875, "learning_rate": 9.964209601009357e-07, "loss": 0.0592, "reward": 1.1043527275323868, "reward_std": 0.19975124672055244, "rewards/accuracy_reward": 0.14285714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9614955633878708, "step": 456 }, { "completion_length": 544.4933395385742, "epoch": 0.13650959599731163, "grad_norm": 5.5457048416137695, "kl": 1.32568359375, "learning_rate": 9.963616381389336e-07, "loss": 0.053, "reward": 1.1322545409202576, "reward_std": 0.16719898022711277, "rewards/accuracy_reward": 0.15848214644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723618745804, "step": 457 }, { "completion_length": 506.0781478881836, "epoch": 0.1368083040848331, "grad_norm": 1.488362193107605, "kl": 0.69140625, "learning_rate": 9.963018305792174e-07, "loss": 0.0276, "reward": 1.2070313096046448, "reward_std": 0.16905471310019493, "rewards/accuracy_reward": 0.2388392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.968191996216774, "step": 458 }, { "completion_length": 534.3236999511719, "epoch": 0.13710701217235458, "grad_norm": 2.323958396911621, "kl": 0.16552734375, "learning_rate": 9.962415374868516e-07, "loss": 0.0066, "reward": 1.0636161267757416, "reward_std": 0.1598990075290203, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 459 }, { "completion_length": 506.2611846923828, "epoch": 0.13740572025987605, "grad_norm": 2.7829341888427734, "kl": 0.1705322265625, "learning_rate": 9.961807589274297e-07, "loss": 0.0068, "reward": 1.1768973767757416, "reward_std": 0.17819204181432724, "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 460 }, { "completion_length": 570.8549346923828, "epoch": 0.13770442834739752, "grad_norm": 5.174618721008301, "kl": 0.2080078125, "learning_rate": 9.961194949670722e-07, "loss": 0.0083, "reward": 1.3035714626312256, "reward_std": 0.1835720967501402, "rewards/accuracy_reward": 0.3348214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500447034836, "step": 461 }, { "completion_length": 513.1317367553711, "epoch": 0.138003136434919, "grad_norm": 2.654130220413208, "kl": 0.329345703125, "learning_rate": 9.960577456724288e-07, "loss": 0.0131, "reward": 1.1718750596046448, "reward_std": 0.18154440820217133, "rewards/accuracy_reward": 0.1852678693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071939468384, "step": 462 }, { "completion_length": 501.86610412597656, "epoch": 0.13830184452244043, "grad_norm": 3.7480976581573486, "kl": 0.587890625, "learning_rate": 9.959955111106763e-07, "loss": 0.0235, "reward": 1.1222098916769028, "reward_std": 0.14336296264082193, "rewards/accuracy_reward": 0.14732143585570157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884290456772, "step": 463 }, { "completion_length": 488.56922149658203, "epoch": 0.1386005526099619, "grad_norm": 3.337512731552124, "kl": 0.8916015625, "learning_rate": 9.959327913495202e-07, "loss": 0.0357, "reward": 1.2862723767757416, "reward_std": 0.22710124030709267, "rewards/accuracy_reward": 0.3035714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 464 }, { "completion_length": 439.1406478881836, "epoch": 0.13889926069748337, "grad_norm": 1.0424065589904785, "kl": 0.468994140625, "learning_rate": 9.95869586457193e-07, "loss": 0.0188, "reward": 1.2299107611179352, "reward_std": 0.1906886138021946, "rewards/accuracy_reward": 0.2544642984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464477300644, "step": 465 }, { "completion_length": 475.57814025878906, "epoch": 0.13919796878500484, "grad_norm": 3.2375733852386475, "kl": 0.96142578125, "learning_rate": 9.958058965024558e-07, "loss": 0.0385, "reward": 1.098214328289032, "reward_std": 0.12657487578690052, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 466 }, { "completion_length": 461.1942138671875, "epoch": 0.13949667687252632, "grad_norm": 1.064997673034668, "kl": 0.6298828125, "learning_rate": 9.957417215545968e-07, "loss": 0.0252, "reward": 1.1160714626312256, "reward_std": 0.17097464576363564, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98214291036129, "step": 467 }, { "completion_length": 488.49108123779297, "epoch": 0.13979538496004779, "grad_norm": 2.571380376815796, "kl": 0.80419921875, "learning_rate": 9.95677061683432e-07, "loss": 0.0322, "reward": 1.061383992433548, "reward_std": 0.10335664078593254, "rewards/accuracy_reward": 0.07589286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 468 }, { "completion_length": 522.0178680419922, "epoch": 0.14009409304756926, "grad_norm": 1.4982008934020996, "kl": 0.83203125, "learning_rate": 9.956119169593055e-07, "loss": 0.0333, "reward": 1.0864956080913544, "reward_std": 0.20152686536312103, "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9704241305589676, "step": 469 }, { "completion_length": 463.0268020629883, "epoch": 0.14039280113509073, "grad_norm": 3.3827905654907227, "kl": 0.3846435546875, "learning_rate": 9.955462874530878e-07, "loss": 0.0154, "reward": 1.1757813096046448, "reward_std": 0.16034765541553497, "rewards/accuracy_reward": 0.1897321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 470 }, { "completion_length": 481.42859649658203, "epoch": 0.1406915092226122, "grad_norm": 4.308563232421875, "kl": 0.423828125, "learning_rate": 9.954801732361776e-07, "loss": 0.017, "reward": 1.1082589626312256, "reward_std": 0.135277496650815, "rewards/accuracy_reward": 0.12723215017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 471 }, { "completion_length": 453.8326187133789, "epoch": 0.14099021731013367, "grad_norm": 1.4146311283111572, "kl": 0.402099609375, "learning_rate": 9.954135743805007e-07, "loss": 0.0161, "reward": 1.2695313096046448, "reward_std": 0.1782943643629551, "rewards/accuracy_reward": 0.2834821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 472 }, { "completion_length": 451.1317138671875, "epoch": 0.14128892539765514, "grad_norm": 2.7925329208374023, "kl": 0.7265625, "learning_rate": 9.9534649095851e-07, "loss": 0.029, "reward": 1.1127232909202576, "reward_std": 0.18179600313305855, "rewards/accuracy_reward": 0.13392857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946790456772, "step": 473 }, { "completion_length": 465.1741256713867, "epoch": 0.1415876334851766, "grad_norm": 1.3965520858764648, "kl": 0.7001953125, "learning_rate": 9.952789230431859e-07, "loss": 0.028, "reward": 1.1501116454601288, "reward_std": 0.20379038155078888, "rewards/accuracy_reward": 0.16964286752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 474 }, { "completion_length": 447.5111770629883, "epoch": 0.14188634157269808, "grad_norm": 1.4890884160995483, "kl": 0.900390625, "learning_rate": 9.952108707080355e-07, "loss": 0.0361, "reward": 1.1417411267757416, "reward_std": 0.14212601259350777, "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 475 }, { "completion_length": 509.7477798461914, "epoch": 0.14218504966021955, "grad_norm": 4.34094762802124, "kl": 1.4931640625, "learning_rate": 9.95142334027093e-07, "loss": 0.0599, "reward": 1.0825893431901932, "reward_std": 0.16298084519803524, "rewards/accuracy_reward": 0.10937500605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.973214328289032, "step": 476 }, { "completion_length": 543.0692291259766, "epoch": 0.14248375774774102, "grad_norm": 4.2634735107421875, "kl": 1.269775390625, "learning_rate": 9.950733130749197e-07, "loss": 0.0508, "reward": 1.2036831080913544, "reward_std": 0.15944139286875725, "rewards/accuracy_reward": 0.22321429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 477 }, { "completion_length": 437.6629638671875, "epoch": 0.1427824658352625, "grad_norm": 0.5774505138397217, "kl": 0.530029296875, "learning_rate": 9.950038079266038e-07, "loss": 0.0212, "reward": 1.1584821939468384, "reward_std": 0.15452806651592255, "rewards/accuracy_reward": 0.17633929336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98214291036129, "step": 478 }, { "completion_length": 456.5491409301758, "epoch": 0.14308117392278397, "grad_norm": 0.8781670331954956, "kl": 0.2930908203125, "learning_rate": 9.949338186577601e-07, "loss": 0.0118, "reward": 1.0558036267757416, "reward_std": 0.15465228632092476, "rewards/accuracy_reward": 0.06473214481957257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 479 }, { "completion_length": 470.69869232177734, "epoch": 0.14337988201030544, "grad_norm": 3.9238171577453613, "kl": 0.338134765625, "learning_rate": 9.948633453445297e-07, "loss": 0.0136, "reward": 1.1188616454601288, "reward_std": 0.12921205535531044, "rewards/accuracy_reward": 0.14062500791624188, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.97823666036129, "step": 480 }, { "completion_length": 500.37279510498047, "epoch": 0.1436785900978269, "grad_norm": 1.0951365232467651, "kl": 0.145751953125, "learning_rate": 9.94792388063581e-07, "loss": 0.0058, "reward": 1.156808078289032, "reward_std": 0.14874013140797615, "rewards/accuracy_reward": 0.16294643562287092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 481 }, { "completion_length": 532.5134048461914, "epoch": 0.14397729818534838, "grad_norm": 5.572168350219727, "kl": 0.3111572265625, "learning_rate": 9.94720946892108e-07, "loss": 0.0125, "reward": 1.2220982611179352, "reward_std": 0.16599978134036064, "rewards/accuracy_reward": 0.2566964440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9654018431901932, "step": 482 }, { "completion_length": 458.87279510498047, "epoch": 0.14427600627286985, "grad_norm": 2.802061080932617, "kl": 0.41796875, "learning_rate": 9.946490219078326e-07, "loss": 0.0167, "reward": 1.0892857760190964, "reward_std": 0.16592061147093773, "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 483 }, { "completion_length": 494.36387634277344, "epoch": 0.14457471436039132, "grad_norm": 2.0279507637023926, "kl": 0.4227294921875, "learning_rate": 9.945766131890014e-07, "loss": 0.0169, "reward": 1.2427456080913544, "reward_std": 0.22683854214847088, "rewards/accuracy_reward": 0.2700892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562798023224, "step": 484 }, { "completion_length": 511.0178756713867, "epoch": 0.14487342244791276, "grad_norm": 0.7630558609962463, "kl": 0.7294921875, "learning_rate": 9.945037208143882e-07, "loss": 0.0292, "reward": 1.07979916036129, "reward_std": 0.14969712123274803, "rewards/accuracy_reward": 0.10267857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.977120578289032, "step": 485 }, { "completion_length": 482.19197845458984, "epoch": 0.14517213053543424, "grad_norm": 1.271724820137024, "kl": 0.2711181640625, "learning_rate": 9.94430344863293e-07, "loss": 0.0108, "reward": 1.102678656578064, "reward_std": 0.15464110858738422, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 486 }, { "completion_length": 448.99778747558594, "epoch": 0.1454708386229557, "grad_norm": 0.4339018762111664, "kl": 0.6982421875, "learning_rate": 9.943564854155412e-07, "loss": 0.0279, "reward": 1.0937500298023224, "reward_std": 0.14868748746812344, "rewards/accuracy_reward": 0.10267857741564512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 487 }, { "completion_length": 419.0268020629883, "epoch": 0.14576954671047718, "grad_norm": 1.0046104192733765, "kl": 0.220947265625, "learning_rate": 9.942821425514853e-07, "loss": 0.0088, "reward": 1.0993304252624512, "reward_std": 0.0926582042593509, "rewards/accuracy_reward": 0.10491071874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9944196790456772, "step": 488 }, { "completion_length": 468.50225830078125, "epoch": 0.14606825479799865, "grad_norm": 0.8940080404281616, "kl": 0.3131103515625, "learning_rate": 9.942073163520023e-07, "loss": 0.0125, "reward": 1.1445312798023224, "reward_std": 0.16051233559846878, "rewards/accuracy_reward": 0.1517857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 489 }, { "completion_length": 445.62056732177734, "epoch": 0.14636696288552012, "grad_norm": 1.8175476789474487, "kl": 1.07421875, "learning_rate": 9.941320068984961e-07, "loss": 0.0429, "reward": 1.0597098618745804, "reward_std": 0.1142397578805685, "rewards/accuracy_reward": 0.07812500488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 490 }, { "completion_length": 497.9777069091797, "epoch": 0.1466656709730416, "grad_norm": 1.3806238174438477, "kl": 0.9287109375, "learning_rate": 9.940562142728961e-07, "loss": 0.0371, "reward": 1.119977742433548, "reward_std": 0.13752157799899578, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 491 }, { "completion_length": 429.1607360839844, "epoch": 0.14696437906056306, "grad_norm": 0.5336852073669434, "kl": 0.633056640625, "learning_rate": 9.939799385576573e-07, "loss": 0.0254, "reward": 1.233258992433548, "reward_std": 0.14618048071861267, "rewards/accuracy_reward": 0.2410714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875149011612, "step": 492 }, { "completion_length": 594.0937652587891, "epoch": 0.14726308714808453, "grad_norm": 5.892765522003174, "kl": 1.78125, "learning_rate": 9.9390317983576e-07, "loss": 0.0713, "reward": 1.1261160969734192, "reward_std": 0.13901685737073421, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9631696939468384, "step": 493 }, { "completion_length": 427.25894927978516, "epoch": 0.147561795235606, "grad_norm": 1.4710144996643066, "kl": 0.37841796875, "learning_rate": 9.9382593819071e-07, "loss": 0.0151, "reward": 1.1662947088479996, "reward_std": 0.0945171294733882, "rewards/accuracy_reward": 0.17857143841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 494 }, { "completion_length": 501.8080596923828, "epoch": 0.14786050332312747, "grad_norm": 0.8383501768112183, "kl": 0.55078125, "learning_rate": 9.93748213706539e-07, "loss": 0.0221, "reward": 1.102120578289032, "reward_std": 0.18948381952941418, "rewards/accuracy_reward": 0.12053572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848767757416, "step": 495 }, { "completion_length": 516.3348388671875, "epoch": 0.14815921141064894, "grad_norm": 1.2299991846084595, "kl": 0.883544921875, "learning_rate": 9.936700064678033e-07, "loss": 0.0353, "reward": 1.086495578289032, "reward_std": 0.1434980034828186, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884290456772, "step": 496 }, { "completion_length": 525.6093902587891, "epoch": 0.14845791949817042, "grad_norm": 0.9366551041603088, "kl": 0.8349609375, "learning_rate": 9.93591316559585e-07, "loss": 0.0334, "reward": 1.1707589775323868, "reward_std": 0.12898553721606731, "rewards/accuracy_reward": 0.18973215110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268431901932, "step": 497 }, { "completion_length": 514.7321548461914, "epoch": 0.1487566275856919, "grad_norm": 0.758455753326416, "kl": 0.3927001953125, "learning_rate": 9.935121440674913e-07, "loss": 0.0157, "reward": 1.2433036267757416, "reward_std": 0.26713603362441063, "rewards/accuracy_reward": 0.2566964402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 498 }, { "completion_length": 440.8147506713867, "epoch": 0.14905533567321336, "grad_norm": 3.904609441757202, "kl": 0.5966796875, "learning_rate": 9.934324890776533e-07, "loss": 0.0238, "reward": 1.1562500596046448, "reward_std": 0.14246814418584108, "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464775323868, "step": 499 }, { "completion_length": 503.3303909301758, "epoch": 0.14935404376073483, "grad_norm": 0.6919684410095215, "kl": 0.52001953125, "learning_rate": 9.933523516767282e-07, "loss": 0.0208, "reward": 1.1344866454601288, "reward_std": 0.13098725792951882, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616454601288, "step": 500 }, { "completion_length": 436.06697845458984, "epoch": 0.1496527518482563, "grad_norm": 1.2278622388839722, "kl": 0.72021484375, "learning_rate": 9.932717319518979e-07, "loss": 0.0288, "reward": 1.0792411267757416, "reward_std": 0.10912673827260733, "rewards/accuracy_reward": 0.08705357811413705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 501 }, { "completion_length": 460.5178756713867, "epoch": 0.14995145993577777, "grad_norm": 1.6881215572357178, "kl": 0.719970703125, "learning_rate": 9.931906299908685e-07, "loss": 0.0288, "reward": 1.0558036267757416, "reward_std": 0.11966271884739399, "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 502 }, { "completion_length": 524.6741256713867, "epoch": 0.15025016802329924, "grad_norm": 1.153145670890808, "kl": 0.623046875, "learning_rate": 9.93109045881871e-07, "loss": 0.025, "reward": 1.1847098469734192, "reward_std": 0.16736655496060848, "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884290456772, "step": 503 }, { "completion_length": 518.7277069091797, "epoch": 0.1505488761108207, "grad_norm": 1.5287176370620728, "kl": 0.906005859375, "learning_rate": 9.930269797136608e-07, "loss": 0.0362, "reward": 1.121651828289032, "reward_std": 0.18830066174268723, "rewards/accuracy_reward": 0.14062500605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 504 }, { "completion_length": 450.8794860839844, "epoch": 0.15084758419834218, "grad_norm": 0.35877665877342224, "kl": 0.1707763671875, "learning_rate": 9.929444315755182e-07, "loss": 0.0068, "reward": 1.3203125298023224, "reward_std": 0.22315667942166328, "rewards/accuracy_reward": 0.330357164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 505 }, { "completion_length": 517.6361846923828, "epoch": 0.15114629228586363, "grad_norm": 2.405656576156616, "kl": 1.232177734375, "learning_rate": 9.92861401557247e-07, "loss": 0.0492, "reward": 1.0965402275323868, "reward_std": 0.17963510379195213, "rewards/accuracy_reward": 0.12276786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723618745804, "step": 506 }, { "completion_length": 441.35269927978516, "epoch": 0.1514450003733851, "grad_norm": 6.5084357261657715, "kl": 1.7421875, "learning_rate": 9.927778897491763e-07, "loss": 0.0699, "reward": 1.1417411416769028, "reward_std": 0.18336454033851624, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9698661118745804, "step": 507 }, { "completion_length": 459.4531478881836, "epoch": 0.15174370846090657, "grad_norm": 0.9653996229171753, "kl": 0.2847900390625, "learning_rate": 9.926938962421582e-07, "loss": 0.0114, "reward": 1.266183078289032, "reward_std": 0.156936414539814, "rewards/accuracy_reward": 0.272321441443637, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616454601288, "step": 508 }, { "completion_length": 547.4620819091797, "epoch": 0.15204241654842804, "grad_norm": 3.624725818634033, "kl": 0.246826171875, "learning_rate": 9.9260942112757e-07, "loss": 0.0099, "reward": 1.1026785969734192, "reward_std": 0.1669379472732544, "rewards/accuracy_reward": 0.1250000020954758, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 509 }, { "completion_length": 544.662956237793, "epoch": 0.1523411246359495, "grad_norm": 2.0700650215148926, "kl": 0.286376953125, "learning_rate": 9.925244644973115e-07, "loss": 0.0115, "reward": 1.0329241752624512, "reward_std": 0.12555187940597534, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 510 }, { "completion_length": 542.0602798461914, "epoch": 0.15263983272347098, "grad_norm": 1.0929808616638184, "kl": 0.82421875, "learning_rate": 9.92439026443808e-07, "loss": 0.033, "reward": 1.2020089626312256, "reward_std": 0.09506223909556866, "rewards/accuracy_reward": 0.2254464402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625298023224, "step": 511 }, { "completion_length": 561.4375305175781, "epoch": 0.15293854081099245, "grad_norm": 2.823585271835327, "kl": 1.55419921875, "learning_rate": 9.923531070600073e-07, "loss": 0.0622, "reward": 1.2204241454601288, "reward_std": 0.19808544591069221, "rewards/accuracy_reward": 0.2566964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9637277126312256, "step": 512 }, { "completion_length": 512.2790374755859, "epoch": 0.15323724889851392, "grad_norm": 0.7137159705162048, "kl": 0.212890625, "learning_rate": 9.922667064393816e-07, "loss": 0.0085, "reward": 1.0965402126312256, "reward_std": 0.09723949152976274, "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 513 }, { "completion_length": 454.38394927978516, "epoch": 0.1535359569860354, "grad_norm": 0.6505142450332642, "kl": 0.9052734375, "learning_rate": 9.921798246759258e-07, "loss": 0.0362, "reward": 1.1808036267757416, "reward_std": 0.20494533330202103, "rewards/accuracy_reward": 0.2008928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107760190964, "step": 514 }, { "completion_length": 537.5960083007812, "epoch": 0.15383466507355686, "grad_norm": 4.60072135925293, "kl": 1.541015625, "learning_rate": 9.92092461864159e-07, "loss": 0.0616, "reward": 1.1305803954601288, "reward_std": 0.18545103073120117, "rewards/accuracy_reward": 0.14955358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 515 }, { "completion_length": 472.56028747558594, "epoch": 0.15413337316107834, "grad_norm": 0.7777644991874695, "kl": 0.733642578125, "learning_rate": 9.920046180991236e-07, "loss": 0.0293, "reward": 1.0613839775323868, "reward_std": 0.11052386835217476, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 516 }, { "completion_length": 446.6585006713867, "epoch": 0.1544320812485998, "grad_norm": 0.6075223088264465, "kl": 0.8173828125, "learning_rate": 9.919162934763848e-07, "loss": 0.0327, "reward": 1.2561384439468384, "reward_std": 0.17371251061558723, "rewards/accuracy_reward": 0.2723214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 517 }, { "completion_length": 524.3415374755859, "epoch": 0.15473078933612128, "grad_norm": 0.4626213014125824, "kl": 0.524658203125, "learning_rate": 9.918274880920311e-07, "loss": 0.021, "reward": 1.1205357611179352, "reward_std": 0.13185781612992287, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 518 }, { "completion_length": 461.8928756713867, "epoch": 0.15502949742364275, "grad_norm": 1.5585558414459229, "kl": 0.343505859375, "learning_rate": 9.917382020426742e-07, "loss": 0.0137, "reward": 1.2042411267757416, "reward_std": 0.20589962974190712, "rewards/accuracy_reward": 0.2276785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625298023224, "step": 519 }, { "completion_length": 607.0111846923828, "epoch": 0.15532820551116422, "grad_norm": 0.5512415766716003, "kl": 0.579345703125, "learning_rate": 9.916484354254486e-07, "loss": 0.0232, "reward": 1.0775670111179352, "reward_std": 0.15687880665063858, "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 520 }, { "completion_length": 545.4977798461914, "epoch": 0.1556269135986857, "grad_norm": 1.3139277696609497, "kl": 0.60107421875, "learning_rate": 9.915581883380112e-07, "loss": 0.024, "reward": 1.0781250596046448, "reward_std": 0.12373684532940388, "rewards/accuracy_reward": 0.10491071688011289, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9732143133878708, "step": 521 }, { "completion_length": 455.2343978881836, "epoch": 0.15592562168620716, "grad_norm": 1.550239086151123, "kl": 0.58740234375, "learning_rate": 9.914674608785422e-07, "loss": 0.0235, "reward": 1.116071492433548, "reward_std": 0.11238089692778885, "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071939468384, "step": 522 }, { "completion_length": 473.6763610839844, "epoch": 0.15622432977372863, "grad_norm": 0.5991495847702026, "kl": 0.534912109375, "learning_rate": 9.913762531457444e-07, "loss": 0.0214, "reward": 1.184151828289032, "reward_std": 0.16634351015090942, "rewards/accuracy_reward": 0.2053571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946790456772, "step": 523 }, { "completion_length": 512.3303756713867, "epoch": 0.1565230378612501, "grad_norm": 0.8286630511283875, "kl": 0.59716796875, "learning_rate": 9.912845652388425e-07, "loss": 0.0239, "reward": 1.2047991454601288, "reward_std": 0.16683787666261196, "rewards/accuracy_reward": 0.2165178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 524 }, { "completion_length": 522.7366333007812, "epoch": 0.15682174594877157, "grad_norm": 1.8539392948150635, "kl": 0.8896484375, "learning_rate": 9.911923972575844e-07, "loss": 0.0356, "reward": 1.1489956080913544, "reward_std": 0.16585970297455788, "rewards/accuracy_reward": 0.1718750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.977120578289032, "step": 525 }, { "completion_length": 564.8036041259766, "epoch": 0.15712045403629304, "grad_norm": 2.7271671295166016, "kl": 1.33203125, "learning_rate": 9.910997493022395e-07, "loss": 0.0534, "reward": 1.127790242433548, "reward_std": 0.13940796442329884, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 526 }, { "completion_length": 541.8236846923828, "epoch": 0.15741916212381452, "grad_norm": 3.661660671234131, "kl": 1.36572265625, "learning_rate": 9.910066214735997e-07, "loss": 0.0545, "reward": 1.0920759439468384, "reward_std": 0.12679478991776705, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 527 }, { "completion_length": 491.7879638671875, "epoch": 0.15771787021133596, "grad_norm": 1.547270655632019, "kl": 1.0311279296875, "learning_rate": 9.90913013872979e-07, "loss": 0.0412, "reward": 1.2578125298023224, "reward_std": 0.19776270166039467, "rewards/accuracy_reward": 0.2700892984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 528 }, { "completion_length": 561.6339645385742, "epoch": 0.15801657829885743, "grad_norm": 2.342393398284912, "kl": 1.5771484375, "learning_rate": 9.908189266022135e-07, "loss": 0.0632, "reward": 1.1400669813156128, "reward_std": 0.1721143089234829, "rewards/accuracy_reward": 0.16294643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9771205931901932, "step": 529 }, { "completion_length": 486.80582427978516, "epoch": 0.1583152863863789, "grad_norm": 0.7340788245201111, "kl": 0.649658203125, "learning_rate": 9.907243597636606e-07, "loss": 0.026, "reward": 1.1065848767757416, "reward_std": 0.14072087779641151, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 530 }, { "completion_length": 491.8995590209961, "epoch": 0.15861399447390037, "grad_norm": 3.445952892303467, "kl": 0.607421875, "learning_rate": 9.906293134602e-07, "loss": 0.0243, "reward": 1.1891741454601288, "reward_std": 0.16777821630239487, "rewards/accuracy_reward": 0.20312500465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 531 }, { "completion_length": 552.4776992797852, "epoch": 0.15891270256142184, "grad_norm": 1.8861815929412842, "kl": 0.79931640625, "learning_rate": 9.905337877952326e-07, "loss": 0.032, "reward": 1.1071428954601288, "reward_std": 0.17125286906957626, "rewards/accuracy_reward": 0.12723214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107611179352, "step": 532 }, { "completion_length": 523.6361846923828, "epoch": 0.1592114106489433, "grad_norm": 2.086726188659668, "kl": 0.92724609375, "learning_rate": 9.90437782872681e-07, "loss": 0.0371, "reward": 1.1205357611179352, "reward_std": 0.1100863516330719, "rewards/accuracy_reward": 0.14732143771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.973214328289032, "step": 533 }, { "completion_length": 575.3393096923828, "epoch": 0.15951011873646478, "grad_norm": 2.9933319091796875, "kl": 0.6279296875, "learning_rate": 9.903412987969894e-07, "loss": 0.0251, "reward": 1.0809151828289032, "reward_std": 0.18813743442296982, "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9715402275323868, "step": 534 }, { "completion_length": 484.93082427978516, "epoch": 0.15980882682398626, "grad_norm": 0.5235523581504822, "kl": 0.32080078125, "learning_rate": 9.902443356731225e-07, "loss": 0.0128, "reward": 1.102120578289032, "reward_std": 0.13236583629623055, "rewards/accuracy_reward": 0.11160715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 535 }, { "completion_length": 506.8906555175781, "epoch": 0.16010753491150773, "grad_norm": 1.4459084272384644, "kl": 0.6011962890625, "learning_rate": 9.901468936065673e-07, "loss": 0.024, "reward": 1.2723214626312256, "reward_std": 0.16410532034933567, "rewards/accuracy_reward": 0.2991071566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.973214328289032, "step": 536 }, { "completion_length": 512.8125381469727, "epoch": 0.1604062429990292, "grad_norm": 0.9446246027946472, "kl": 0.697265625, "learning_rate": 9.900489727033305e-07, "loss": 0.0279, "reward": 1.1266741454601288, "reward_std": 0.134576715528965, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98604916036129, "step": 537 }, { "completion_length": 533.7120742797852, "epoch": 0.16070495108655067, "grad_norm": 2.1584291458129883, "kl": 0.91796875, "learning_rate": 9.899505730699412e-07, "loss": 0.0368, "reward": 1.131138414144516, "reward_std": 0.19674314372241497, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884290456772, "step": 538 }, { "completion_length": 490.9576110839844, "epoch": 0.16100365917407214, "grad_norm": 0.9661543965339661, "kl": 1.106201171875, "learning_rate": 9.89851694813448e-07, "loss": 0.0443, "reward": 1.170758992433548, "reward_std": 0.16460462659597397, "rewards/accuracy_reward": 0.18750001303851604, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589477300644, "step": 539 }, { "completion_length": 492.6897506713867, "epoch": 0.1613023672615936, "grad_norm": 1.9914056062698364, "kl": 1.0185546875, "learning_rate": 9.89752338041421e-07, "loss": 0.0407, "reward": 1.207589328289032, "reward_std": 0.15517804399132729, "rewards/accuracy_reward": 0.2187500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 540 }, { "completion_length": 456.46653747558594, "epoch": 0.16160107534911508, "grad_norm": 1.032148838043213, "kl": 1.146484375, "learning_rate": 9.896525028619504e-07, "loss": 0.0459, "reward": 1.1300223767757416, "reward_std": 0.169561130926013, "rewards/accuracy_reward": 0.14732143515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 541 }, { "completion_length": 490.89288330078125, "epoch": 0.16189978343663655, "grad_norm": 0.7512778043746948, "kl": 0.703125, "learning_rate": 9.895521893836474e-07, "loss": 0.0281, "reward": 1.1283482611179352, "reward_std": 0.1720958985388279, "rewards/accuracy_reward": 0.14508929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 542 }, { "completion_length": 599.6428680419922, "epoch": 0.16219849152415802, "grad_norm": 1.669854998588562, "kl": 1.2421875, "learning_rate": 9.89451397715643e-07, "loss": 0.0496, "reward": 1.0518973618745804, "reward_std": 0.16408654116094112, "rewards/accuracy_reward": 0.08482143352739513, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9670759290456772, "step": 543 }, { "completion_length": 475.5156478881836, "epoch": 0.1624971996116795, "grad_norm": 0.6620777249336243, "kl": 0.753662109375, "learning_rate": 9.89350127967589e-07, "loss": 0.0301, "reward": 1.2717634439468384, "reward_std": 0.25048714876174927, "rewards/accuracy_reward": 0.2879464402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 544 }, { "completion_length": 468.61163330078125, "epoch": 0.16279590769920096, "grad_norm": 1.53274405002594, "kl": 0.527587890625, "learning_rate": 9.892483802496565e-07, "loss": 0.0211, "reward": 1.1082589626312256, "reward_std": 0.13696571998298168, "rewards/accuracy_reward": 0.12276786426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 545 }, { "completion_length": 482.02010345458984, "epoch": 0.16309461578672244, "grad_norm": 0.7057362794876099, "kl": 1.224609375, "learning_rate": 9.891461546725373e-07, "loss": 0.0489, "reward": 1.0507813096046448, "reward_std": 0.14582271128892899, "rewards/accuracy_reward": 0.07589286239817739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884439468384, "step": 546 }, { "completion_length": 538.4710083007812, "epoch": 0.1633933238742439, "grad_norm": 0.8956061005592346, "kl": 0.7158203125, "learning_rate": 9.89043451347443e-07, "loss": 0.0286, "reward": 1.111607164144516, "reward_std": 0.1789443762972951, "rewards/accuracy_reward": 0.12946429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 547 }, { "completion_length": 478.79466247558594, "epoch": 0.16369203196176538, "grad_norm": 0.740838348865509, "kl": 0.88671875, "learning_rate": 9.889402703861042e-07, "loss": 0.0355, "reward": 1.0876116454601288, "reward_std": 0.1325605195015669, "rewards/accuracy_reward": 0.10267857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 548 }, { "completion_length": 413.5379638671875, "epoch": 0.16399074004928682, "grad_norm": 0.3861332833766937, "kl": 0.63623046875, "learning_rate": 9.88836611900772e-07, "loss": 0.0254, "reward": 1.1339286267757416, "reward_std": 0.1355127152055502, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 549 }, { "completion_length": 495.64957427978516, "epoch": 0.1642894481368083, "grad_norm": 1.2519989013671875, "kl": 0.9931640625, "learning_rate": 9.887324760042168e-07, "loss": 0.0397, "reward": 1.09542416036129, "reward_std": 0.12293285224586725, "rewards/accuracy_reward": 0.11607143515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 550 }, { "completion_length": 514.5044860839844, "epoch": 0.16458815622432976, "grad_norm": 3.5379178524017334, "kl": 1.25634765625, "learning_rate": 9.886278628097281e-07, "loss": 0.0503, "reward": 1.14229916036129, "reward_std": 0.09153135237284005, "rewards/accuracy_reward": 0.15401786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 551 }, { "completion_length": 495.91297149658203, "epoch": 0.16488686431185123, "grad_norm": 2.3060805797576904, "kl": 1.797607421875, "learning_rate": 9.885227724311147e-07, "loss": 0.0718, "reward": 1.1495536267757416, "reward_std": 0.13161679729819298, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071939468384, "step": 552 }, { "completion_length": 440.86609649658203, "epoch": 0.1651855723993727, "grad_norm": 0.5077342987060547, "kl": 1.0458984375, "learning_rate": 9.884172049827048e-07, "loss": 0.0418, "reward": 1.1222098469734192, "reward_std": 0.13035261910408735, "rewards/accuracy_reward": 0.1361607164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 553 }, { "completion_length": 495.7701187133789, "epoch": 0.16548428048689418, "grad_norm": 0.6793200969696045, "kl": 1.259765625, "learning_rate": 9.883111605793453e-07, "loss": 0.0504, "reward": 1.1841518580913544, "reward_std": 0.20330706052482128, "rewards/accuracy_reward": 0.2031250149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 554 }, { "completion_length": 458.0558319091797, "epoch": 0.16578298857441565, "grad_norm": 0.8589764833450317, "kl": 0.854736328125, "learning_rate": 9.882046393364024e-07, "loss": 0.0342, "reward": 1.2265625596046448, "reward_std": 0.1696150042116642, "rewards/accuracy_reward": 0.23883930034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 555 }, { "completion_length": 519.9397506713867, "epoch": 0.16608169666193712, "grad_norm": 0.5971102714538574, "kl": 1.271484375, "learning_rate": 9.880976413697603e-07, "loss": 0.0507, "reward": 1.1160714626312256, "reward_std": 0.1716254334896803, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 556 }, { "completion_length": 508.9219055175781, "epoch": 0.1663804047494586, "grad_norm": 0.5942420959472656, "kl": 0.59228515625, "learning_rate": 9.879901667958228e-07, "loss": 0.0237, "reward": 1.1473214328289032, "reward_std": 0.14420348033308983, "rewards/accuracy_reward": 0.15625001210719347, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 557 }, { "completion_length": 592.0826110839844, "epoch": 0.16667911283698006, "grad_norm": 4.108725070953369, "kl": 2.154296875, "learning_rate": 9.878822157315115e-07, "loss": 0.0861, "reward": 1.078683078289032, "reward_std": 0.2102178931236267, "rewards/accuracy_reward": 0.10491071874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723618745804, "step": 558 }, { "completion_length": 578.8772735595703, "epoch": 0.16697782092450153, "grad_norm": 5.859631061553955, "kl": 2.341796875, "learning_rate": 9.877737882942665e-07, "loss": 0.0939, "reward": 1.0876116454601288, "reward_std": 0.20238688960671425, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.969308078289032, "step": 559 }, { "completion_length": 498.09376525878906, "epoch": 0.167276529012023, "grad_norm": 0.8200750946998596, "kl": 0.8826904296875, "learning_rate": 9.876648846020464e-07, "loss": 0.0352, "reward": 1.1757813096046448, "reward_std": 0.16990238800644875, "rewards/accuracy_reward": 0.191964291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 560 }, { "completion_length": 496.80359649658203, "epoch": 0.16757523709954447, "grad_norm": 3.734675407409668, "kl": 0.3829345703125, "learning_rate": 9.875555047733273e-07, "loss": 0.0153, "reward": 1.1127232611179352, "reward_std": 0.207657590508461, "rewards/accuracy_reward": 0.1294642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 561 }, { "completion_length": 548.0402069091797, "epoch": 0.16787394518706594, "grad_norm": 1.9187591075897217, "kl": 0.4718017578125, "learning_rate": 9.874456489271043e-07, "loss": 0.0189, "reward": 1.159040242433548, "reward_std": 0.18500838801264763, "rewards/accuracy_reward": 0.1830357164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045111179352, "step": 562 }, { "completion_length": 533.0692138671875, "epoch": 0.1681726532745874, "grad_norm": 4.8922858238220215, "kl": 0.38818359375, "learning_rate": 9.873353171828894e-07, "loss": 0.0155, "reward": 1.1992188096046448, "reward_std": 0.19123363681137562, "rewards/accuracy_reward": 0.2142857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 563 }, { "completion_length": 460.29466247558594, "epoch": 0.16847136136210888, "grad_norm": 1.9324222803115845, "kl": 0.275146484375, "learning_rate": 9.87224509660713e-07, "loss": 0.011, "reward": 1.116071492433548, "reward_std": 0.14730653539299965, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 564 }, { "completion_length": 403.2946548461914, "epoch": 0.16877006944963036, "grad_norm": 0.29693281650543213, "kl": 0.2489013671875, "learning_rate": 9.871132264811227e-07, "loss": 0.01, "reward": 1.1607143580913544, "reward_std": 0.10660163406282663, "rewards/accuracy_reward": 0.16294643771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9977678805589676, "step": 565 }, { "completion_length": 437.60047149658203, "epoch": 0.16906877753715183, "grad_norm": 0.6992530226707458, "kl": 0.42919921875, "learning_rate": 9.870014677651837e-07, "loss": 0.0172, "reward": 1.125558078289032, "reward_std": 0.1767597794532776, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616454601288, "step": 566 }, { "completion_length": 523.1428680419922, "epoch": 0.1693674856246733, "grad_norm": 0.5305894017219543, "kl": 0.763916015625, "learning_rate": 9.868892336344783e-07, "loss": 0.0305, "reward": 1.1344866454601288, "reward_std": 0.13854572363197803, "rewards/accuracy_reward": 0.14955357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 567 }, { "completion_length": 515.1294937133789, "epoch": 0.16966619371219477, "grad_norm": 0.991972029209137, "kl": 0.8814697265625, "learning_rate": 9.867765242111069e-07, "loss": 0.0353, "reward": 1.1183036267757416, "reward_std": 0.13602731563150883, "rewards/accuracy_reward": 0.13392857951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 568 }, { "completion_length": 504.1518020629883, "epoch": 0.16996490179971624, "grad_norm": 0.9571132063865662, "kl": 0.572265625, "learning_rate": 9.866633396176853e-07, "loss": 0.0229, "reward": 1.1501116454601288, "reward_std": 0.13728927448391914, "rewards/accuracy_reward": 0.16071429662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 569 }, { "completion_length": 481.99556732177734, "epoch": 0.1702636098872377, "grad_norm": 0.9713841676712036, "kl": 0.6962890625, "learning_rate": 9.865496799773482e-07, "loss": 0.0278, "reward": 1.231026828289032, "reward_std": 0.1595183163881302, "rewards/accuracy_reward": 0.2477678693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 570 }, { "completion_length": 485.0848388671875, "epoch": 0.17056231797475915, "grad_norm": 1.1867055892944336, "kl": 0.850341796875, "learning_rate": 9.864355454137456e-07, "loss": 0.0341, "reward": 1.1244420111179352, "reward_std": 0.17652177438139915, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 571 }, { "completion_length": 489.19866943359375, "epoch": 0.17086102606228062, "grad_norm": 0.5559407472610474, "kl": 0.505859375, "learning_rate": 9.863209360510449e-07, "loss": 0.0202, "reward": 1.1735491454601288, "reward_std": 0.1344629619270563, "rewards/accuracy_reward": 0.1852678693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 572 }, { "completion_length": 491.5781478881836, "epoch": 0.1711597341498021, "grad_norm": 0.3290311396121979, "kl": 0.4027099609375, "learning_rate": 9.8620585201393e-07, "loss": 0.0161, "reward": 1.0870536267757416, "reward_std": 0.10389538970775902, "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 573 }, { "completion_length": 499.5067138671875, "epoch": 0.17145844223732357, "grad_norm": 0.8362019062042236, "kl": 0.8642578125, "learning_rate": 9.860902934276005e-07, "loss": 0.0346, "reward": 1.092633992433548, "reward_std": 0.14767915289849043, "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 574 }, { "completion_length": 603.9821929931641, "epoch": 0.17175715032484504, "grad_norm": 1.617186188697815, "kl": 1.4873046875, "learning_rate": 9.859742604177734e-07, "loss": 0.0595, "reward": 1.0948661267757416, "reward_std": 0.2452581450343132, "rewards/accuracy_reward": 0.12946429662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.965401828289032, "step": 575 }, { "completion_length": 428.0580520629883, "epoch": 0.1720558584123665, "grad_norm": 0.49620357155799866, "kl": 0.162841796875, "learning_rate": 9.85857753110681e-07, "loss": 0.0065, "reward": 1.1512277126312256, "reward_std": 0.14956348622217774, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949776977300644, "step": 576 }, { "completion_length": 473.0111846923828, "epoch": 0.17235456649988798, "grad_norm": 0.5955946445465088, "kl": 0.4512939453125, "learning_rate": 9.85740771633072e-07, "loss": 0.0181, "reward": 1.1216518580913544, "reward_std": 0.15761225670576096, "rewards/accuracy_reward": 0.133928582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 577 }, { "completion_length": 542.4665374755859, "epoch": 0.17265327458740945, "grad_norm": 0.49536001682281494, "kl": 0.687744140625, "learning_rate": 9.856233161122105e-07, "loss": 0.0275, "reward": 1.1395089626312256, "reward_std": 0.2069953940808773, "rewards/accuracy_reward": 0.16071429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946939468384, "step": 578 }, { "completion_length": 476.9040298461914, "epoch": 0.17295198267493092, "grad_norm": 0.5257432460784912, "kl": 0.39990234375, "learning_rate": 9.855053866758766e-07, "loss": 0.016, "reward": 1.0658482611179352, "reward_std": 0.06564219947904348, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 579 }, { "completion_length": 514.1830673217773, "epoch": 0.1732506907624524, "grad_norm": 0.37839993834495544, "kl": 0.7236328125, "learning_rate": 9.853869834523664e-07, "loss": 0.0289, "reward": 1.1791295111179352, "reward_std": 0.20041032880544662, "rewards/accuracy_reward": 0.1941964402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 580 }, { "completion_length": 504.8013610839844, "epoch": 0.17354939884997386, "grad_norm": 0.38725385069847107, "kl": 0.530517578125, "learning_rate": 9.852681065704907e-07, "loss": 0.0213, "reward": 1.0892857909202576, "reward_std": 0.19004188477993011, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071939468384, "step": 581 }, { "completion_length": 535.8795013427734, "epoch": 0.17384810693749533, "grad_norm": 0.9888827800750732, "kl": 0.557861328125, "learning_rate": 9.851487561595757e-07, "loss": 0.0223, "reward": 1.0982143431901932, "reward_std": 0.13407435175031424, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 582 }, { "completion_length": 581.7656478881836, "epoch": 0.1741468150250168, "grad_norm": 0.9480743408203125, "kl": 0.5810546875, "learning_rate": 9.850289323494634e-07, "loss": 0.0232, "reward": 1.131696492433548, "reward_std": 0.10674651153385639, "rewards/accuracy_reward": 0.14732143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 583 }, { "completion_length": 450.98216247558594, "epoch": 0.17444552311253828, "grad_norm": 0.5415212512016296, "kl": 0.475341796875, "learning_rate": 9.8490863527051e-07, "loss": 0.019, "reward": 1.137834906578064, "reward_std": 0.11892829090356827, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455484867096, "step": 584 }, { "completion_length": 479.23663330078125, "epoch": 0.17474423120005975, "grad_norm": 0.8757508993148804, "kl": 0.58935546875, "learning_rate": 9.847878650535871e-07, "loss": 0.0236, "reward": 1.0736607760190964, "reward_std": 0.09667750261723995, "rewards/accuracy_reward": 0.08705357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 585 }, { "completion_length": 520.3370742797852, "epoch": 0.17504293928758122, "grad_norm": 1.0633591413497925, "kl": 1.0699462890625, "learning_rate": 9.846666218300807e-07, "loss": 0.0428, "reward": 1.0954241752624512, "reward_std": 0.20894418470561504, "rewards/accuracy_reward": 0.11607143585570157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 586 }, { "completion_length": 473.5022506713867, "epoch": 0.1753416473751027, "grad_norm": 0.4582430124282837, "kl": 0.9990234375, "learning_rate": 9.845449057318917e-07, "loss": 0.0399, "reward": 1.104352742433548, "reward_std": 0.16844079457223415, "rewards/accuracy_reward": 0.11607143771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 587 }, { "completion_length": 501.96654510498047, "epoch": 0.17564035546262416, "grad_norm": 1.2257659435272217, "kl": 0.542236328125, "learning_rate": 9.844227168914351e-07, "loss": 0.0217, "reward": 1.2087053954601288, "reward_std": 0.16881055384874344, "rewards/accuracy_reward": 0.21651786752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 588 }, { "completion_length": 529.8727798461914, "epoch": 0.17593906355014563, "grad_norm": 1.3179659843444824, "kl": 0.753662109375, "learning_rate": 9.843000554416408e-07, "loss": 0.0302, "reward": 1.1328125596046448, "reward_std": 0.19781884644180536, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946790456772, "step": 589 }, { "completion_length": 515.1160888671875, "epoch": 0.1762377716376671, "grad_norm": 0.6451693773269653, "kl": 0.7589111328125, "learning_rate": 9.841769215159522e-07, "loss": 0.0304, "reward": 1.0463170260190964, "reward_std": 0.11020912975072861, "rewards/accuracy_reward": 0.06696428963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 590 }, { "completion_length": 465.5468978881836, "epoch": 0.17653647972518857, "grad_norm": 0.8860642313957214, "kl": 0.201416015625, "learning_rate": 9.840533152483267e-07, "loss": 0.008, "reward": 1.1026785969734192, "reward_std": 0.15491892583668232, "rewards/accuracy_reward": 0.11830358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 591 }, { "completion_length": 615.4107437133789, "epoch": 0.17683518781271004, "grad_norm": 2.3938698768615723, "kl": 1.7080078125, "learning_rate": 9.83929236773236e-07, "loss": 0.0684, "reward": 1.0189732611179352, "reward_std": 0.15873467363417149, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9609375447034836, "step": 592 }, { "completion_length": 486.1093978881836, "epoch": 0.17713389590023149, "grad_norm": 0.31437811255455017, "kl": 0.5313720703125, "learning_rate": 9.838046862256655e-07, "loss": 0.0213, "reward": 1.1132812798023224, "reward_std": 0.1670653074979782, "rewards/accuracy_reward": 0.1205357238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.992745578289032, "step": 593 }, { "completion_length": 552.381721496582, "epoch": 0.17743260398775296, "grad_norm": 0.8906832933425903, "kl": 1.1396484375, "learning_rate": 9.836796637411136e-07, "loss": 0.0456, "reward": 1.262276828289032, "reward_std": 0.19462577812373638, "rewards/accuracy_reward": 0.2901785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.972098246216774, "step": 594 }, { "completion_length": 509.99332427978516, "epoch": 0.17773131207527443, "grad_norm": 0.756534218788147, "kl": 0.7763671875, "learning_rate": 9.835541694555928e-07, "loss": 0.031, "reward": 1.057477742433548, "reward_std": 0.1198305319994688, "rewards/accuracy_reward": 0.08035714831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.977120578289032, "step": 595 }, { "completion_length": 473.26341247558594, "epoch": 0.1780300201627959, "grad_norm": 0.5157192945480347, "kl": 0.87890625, "learning_rate": 9.834282035056286e-07, "loss": 0.0352, "reward": 1.13058041036129, "reward_std": 0.1998073235154152, "rewards/accuracy_reward": 0.14955357951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 596 }, { "completion_length": 587.0401992797852, "epoch": 0.17832872825031737, "grad_norm": 0.9537596702575684, "kl": 0.84747314453125, "learning_rate": 9.833017660282596e-07, "loss": 0.0339, "reward": 1.1049107760190964, "reward_std": 0.07732915785163641, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 597 }, { "completion_length": 445.45314025878906, "epoch": 0.17862743633783884, "grad_norm": 0.3905724883079529, "kl": 0.2381591796875, "learning_rate": 9.83174857161037e-07, "loss": 0.0095, "reward": 1.1741071939468384, "reward_std": 0.11902506882324815, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9977678656578064, "step": 598 }, { "completion_length": 513.975456237793, "epoch": 0.1789261444253603, "grad_norm": 0.6687798500061035, "kl": 0.349609375, "learning_rate": 9.830474770420257e-07, "loss": 0.014, "reward": 1.2003348767757416, "reward_std": 0.15723086148500443, "rewards/accuracy_reward": 0.2142857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 599 }, { "completion_length": 476.2277069091797, "epoch": 0.17922485251288178, "grad_norm": 0.9798215627670288, "kl": 0.590576171875, "learning_rate": 9.829196258098025e-07, "loss": 0.0236, "reward": 1.119977742433548, "reward_std": 0.11785487271845341, "rewards/accuracy_reward": 0.1294642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134439468384, "step": 600 }, { "completion_length": 512.0156478881836, "epoch": 0.17952356060040325, "grad_norm": 0.7906419634819031, "kl": 0.523193359375, "learning_rate": 9.82791303603457e-07, "loss": 0.021, "reward": 1.069196492433548, "reward_std": 0.16376737505197525, "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 601 }, { "completion_length": 556.2165451049805, "epoch": 0.17982226868792472, "grad_norm": 1.080286979675293, "kl": 0.391357421875, "learning_rate": 9.826625105625915e-07, "loss": 0.0157, "reward": 1.10714291036129, "reward_std": 0.1569022834300995, "rewards/accuracy_reward": 0.12723215017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107611179352, "step": 602 }, { "completion_length": 443.4397506713867, "epoch": 0.1801209767754462, "grad_norm": 0.29876068234443665, "kl": 0.262939453125, "learning_rate": 9.8253324682732e-07, "loss": 0.0105, "reward": 1.1261160969734192, "reward_std": 0.12912756018340588, "rewards/accuracy_reward": 0.12946429220028222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966518133878708, "step": 603 }, { "completion_length": 456.18528747558594, "epoch": 0.18041968486296767, "grad_norm": 1.1415199041366577, "kl": 0.4671630859375, "learning_rate": 9.824035125382686e-07, "loss": 0.0187, "reward": 1.0881696939468384, "reward_std": 0.13202595710754395, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589477300644, "step": 604 }, { "completion_length": 493.2611846923828, "epoch": 0.18071839295048914, "grad_norm": 0.5567336678504944, "kl": 0.6591796875, "learning_rate": 9.822733078365758e-07, "loss": 0.0264, "reward": 1.198102742433548, "reward_std": 0.2035803496837616, "rewards/accuracy_reward": 0.2142857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 605 }, { "completion_length": 514.1651992797852, "epoch": 0.1810171010380106, "grad_norm": 7.047621250152588, "kl": 0.43865966796875, "learning_rate": 9.821426328638914e-07, "loss": 0.0176, "reward": 1.0429687798023224, "reward_std": 0.14158733375370502, "rewards/accuracy_reward": 0.05580357415601611, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871651977300644, "step": 606 }, { "completion_length": 438.89288330078125, "epoch": 0.18131580912553208, "grad_norm": 0.6857143044471741, "kl": 0.4632568359375, "learning_rate": 9.820114877623768e-07, "loss": 0.0185, "reward": 1.1981027126312256, "reward_std": 0.13388617825694382, "rewards/accuracy_reward": 0.2075892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 607 }, { "completion_length": 471.11385345458984, "epoch": 0.18161451721305355, "grad_norm": 0.7130495309829712, "kl": 0.591064453125, "learning_rate": 9.818798726747055e-07, "loss": 0.0237, "reward": 1.0546875298023224, "reward_std": 0.15398884564638138, "rewards/accuracy_reward": 0.0736607180442661, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 608 }, { "completion_length": 536.7723388671875, "epoch": 0.18191322530057502, "grad_norm": 1.316405177116394, "kl": 0.69482421875, "learning_rate": 9.817477877440614e-07, "loss": 0.0277, "reward": 1.0987723618745804, "reward_std": 0.14790908247232437, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 609 }, { "completion_length": 461.8973388671875, "epoch": 0.1822119333880965, "grad_norm": 0.3656269609928131, "kl": 0.437255859375, "learning_rate": 9.8161523311414e-07, "loss": 0.0175, "reward": 1.0943080484867096, "reward_std": 0.17243296280503273, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 610 }, { "completion_length": 423.4152069091797, "epoch": 0.18251064147561796, "grad_norm": 0.5425147414207458, "kl": 0.244140625, "learning_rate": 9.814822089291476e-07, "loss": 0.0098, "reward": 1.2617187798023224, "reward_std": 0.10137429274618626, "rewards/accuracy_reward": 0.2678571566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 611 }, { "completion_length": 476.1919860839844, "epoch": 0.18280934956313943, "grad_norm": 0.6473791599273682, "kl": 0.32171630859375, "learning_rate": 9.81348715333802e-07, "loss": 0.0129, "reward": 1.0351562798023224, "reward_std": 0.1180603913962841, "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 612 }, { "completion_length": 473.92635345458984, "epoch": 0.1831080576506609, "grad_norm": 0.32872024178504944, "kl": 0.369873046875, "learning_rate": 9.812147524733309e-07, "loss": 0.0148, "reward": 1.159040242433548, "reward_std": 0.14281282387673855, "rewards/accuracy_reward": 0.1718750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 613 }, { "completion_length": 510.95538330078125, "epoch": 0.18340676573818235, "grad_norm": 0.8843008875846863, "kl": 0.80303955078125, "learning_rate": 9.810803204934725e-07, "loss": 0.0321, "reward": 1.109933078289032, "reward_std": 0.15194646641612053, "rewards/accuracy_reward": 0.13169643376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 614 }, { "completion_length": 490.99779510498047, "epoch": 0.18370547382570382, "grad_norm": 0.44693607091903687, "kl": 0.31964111328125, "learning_rate": 9.809454195404757e-07, "loss": 0.0128, "reward": 1.1417411118745804, "reward_std": 0.1665857806801796, "rewards/accuracy_reward": 0.1495535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 615 }, { "completion_length": 483.20091247558594, "epoch": 0.1840041819132253, "grad_norm": 0.6193176507949829, "kl": 0.58349609375, "learning_rate": 9.808100497610999e-07, "loss": 0.0233, "reward": 1.080915242433548, "reward_std": 0.11542054079473019, "rewards/accuracy_reward": 0.09375000302679837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 616 }, { "completion_length": 450.5424346923828, "epoch": 0.18430289000074676, "grad_norm": 0.31227371096611023, "kl": 0.374267578125, "learning_rate": 9.806742113026137e-07, "loss": 0.015, "reward": 1.1344866752624512, "reward_std": 0.13403281942009926, "rewards/accuracy_reward": 0.14508929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 617 }, { "completion_length": 478.6718978881836, "epoch": 0.18460159808826823, "grad_norm": 1.1397629976272583, "kl": 0.5234375, "learning_rate": 9.805379043127962e-07, "loss": 0.021, "reward": 1.1077009588479996, "reward_std": 0.08060717582702637, "rewards/accuracy_reward": 0.12053571990691125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 618 }, { "completion_length": 462.6205596923828, "epoch": 0.1849003061757897, "grad_norm": 0.39551034569740295, "kl": 0.1619873046875, "learning_rate": 9.804011289399362e-07, "loss": 0.0065, "reward": 1.1517857611179352, "reward_std": 0.11572229582816362, "rewards/accuracy_reward": 0.15848215110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 619 }, { "completion_length": 484.47547149658203, "epoch": 0.18519901426331117, "grad_norm": 0.6877611875534058, "kl": 0.775634765625, "learning_rate": 9.802638853328316e-07, "loss": 0.031, "reward": 1.1026786267757416, "reward_std": 0.12112346291542053, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 620 }, { "completion_length": 559.700927734375, "epoch": 0.18549772235083264, "grad_norm": 1.8854550123214722, "kl": 1.359375, "learning_rate": 9.801261736407903e-07, "loss": 0.0544, "reward": 1.1690848469734192, "reward_std": 0.23427484557032585, "rewards/accuracy_reward": 0.1919642984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.977120578289032, "step": 621 }, { "completion_length": 482.57592010498047, "epoch": 0.18579643043835412, "grad_norm": 1.9412014484405518, "kl": 1.010986328125, "learning_rate": 9.79987994013629e-07, "loss": 0.0404, "reward": 1.2527902722358704, "reward_std": 0.21579580940306187, "rewards/accuracy_reward": 0.2700892984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 622 }, { "completion_length": 461.5089340209961, "epoch": 0.1860951385258756, "grad_norm": 0.7637563943862915, "kl": 0.6826171875, "learning_rate": 9.798493466016733e-07, "loss": 0.0273, "reward": 1.209821492433548, "reward_std": 0.21884611435234547, "rewards/accuracy_reward": 0.22544643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 623 }, { "completion_length": 462.40404510498047, "epoch": 0.18639384661339706, "grad_norm": 0.9162909984588623, "kl": 0.43206787109375, "learning_rate": 9.797102315557585e-07, "loss": 0.0173, "reward": 1.2282366752624512, "reward_std": 0.21267294511198997, "rewards/accuracy_reward": 0.2343750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616156578064, "step": 624 }, { "completion_length": 512.6116409301758, "epoch": 0.18669255470091853, "grad_norm": 0.5495480895042419, "kl": 0.5498046875, "learning_rate": 9.79570649027228e-07, "loss": 0.022, "reward": 1.094308078289032, "reward_std": 0.14428702555596828, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 625 }, { "completion_length": 486.0357360839844, "epoch": 0.18699126278844, "grad_norm": 0.8224413394927979, "kl": 0.1851806640625, "learning_rate": 9.794305991679336e-07, "loss": 0.0074, "reward": 1.1741071939468384, "reward_std": 0.13987762946635485, "rewards/accuracy_reward": 0.18080358393490314, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 626 }, { "completion_length": 535.4553833007812, "epoch": 0.18728997087596147, "grad_norm": 0.9639474153518677, "kl": 0.626708984375, "learning_rate": 9.79290082130236e-07, "loss": 0.0251, "reward": 1.1378348767757416, "reward_std": 0.14490332826972008, "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 627 }, { "completion_length": 566.1585006713867, "epoch": 0.18758867896348294, "grad_norm": 0.861150860786438, "kl": 0.6708984375, "learning_rate": 9.79149098067004e-07, "loss": 0.0268, "reward": 1.0909598469734192, "reward_std": 0.18495739065110683, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 628 }, { "completion_length": 572.4174423217773, "epoch": 0.1878873870510044, "grad_norm": 1.5209650993347168, "kl": 0.434326171875, "learning_rate": 9.790076471316147e-07, "loss": 0.0174, "reward": 1.1601562947034836, "reward_std": 0.15840748697519302, "rewards/accuracy_reward": 0.18526786286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884439468384, "step": 629 }, { "completion_length": 496.12056732177734, "epoch": 0.18818609513852588, "grad_norm": 0.6706302165985107, "kl": 0.698486328125, "learning_rate": 9.788657294779523e-07, "loss": 0.028, "reward": 1.0915178954601288, "reward_std": 0.19166230969130993, "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 630 }, { "completion_length": 484.8482360839844, "epoch": 0.18848480322604735, "grad_norm": 0.5271580815315247, "kl": 0.214111328125, "learning_rate": 9.787233452604096e-07, "loss": 0.0086, "reward": 1.1467634737491608, "reward_std": 0.13865200988948345, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 631 }, { "completion_length": 433.25225830078125, "epoch": 0.18878351131356882, "grad_norm": 0.8549447655677795, "kl": 0.39208984375, "learning_rate": 9.785804946338869e-07, "loss": 0.0157, "reward": 1.1662946939468384, "reward_std": 0.20806214585900307, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 632 }, { "completion_length": 481.96653747558594, "epoch": 0.1890822194010903, "grad_norm": 0.8952639102935791, "kl": 0.79150390625, "learning_rate": 9.78437177753791e-07, "loss": 0.0317, "reward": 1.109933078289032, "reward_std": 0.17169356811791658, "rewards/accuracy_reward": 0.1294642931316048, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 633 }, { "completion_length": 512.9576110839844, "epoch": 0.18938092748861177, "grad_norm": 1.7692556381225586, "kl": 1.2021484375, "learning_rate": 9.782933947760374e-07, "loss": 0.0481, "reward": 1.073102742433548, "reward_std": 0.18196702376008034, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9704241454601288, "step": 634 }, { "completion_length": 453.59600830078125, "epoch": 0.18967963557613324, "grad_norm": 0.8720694184303284, "kl": 0.58544921875, "learning_rate": 9.781491458570475e-07, "loss": 0.0234, "reward": 1.0172991752624512, "reward_std": 0.12266621738672256, "rewards/accuracy_reward": 0.03125000139698386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98604916036129, "step": 635 }, { "completion_length": 492.61608123779297, "epoch": 0.18997834366365468, "grad_norm": 0.9425884485244751, "kl": 0.420654296875, "learning_rate": 9.780044311537501e-07, "loss": 0.0168, "reward": 1.1205357611179352, "reward_std": 0.1422184370458126, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 636 }, { "completion_length": 540.9933242797852, "epoch": 0.19027705175117615, "grad_norm": 1.2788711786270142, "kl": 0.653076171875, "learning_rate": 9.778592508235805e-07, "loss": 0.0261, "reward": 1.094866156578064, "reward_std": 0.16943515092134476, "rewards/accuracy_reward": 0.10714286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 637 }, { "completion_length": 525.0736846923828, "epoch": 0.19057575983869762, "grad_norm": 1.3094171285629272, "kl": 0.3739013671875, "learning_rate": 9.777136050244806e-07, "loss": 0.0149, "reward": 1.133370578289032, "reward_std": 0.11925033293664455, "rewards/accuracy_reward": 0.14062500465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455484867096, "step": 638 }, { "completion_length": 476.6495819091797, "epoch": 0.1908744679262191, "grad_norm": 0.4519892930984497, "kl": 0.4283447265625, "learning_rate": 9.775674939148988e-07, "loss": 0.0172, "reward": 1.1679688096046448, "reward_std": 0.09244140144437551, "rewards/accuracy_reward": 0.17410715389996767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616454601288, "step": 639 }, { "completion_length": 458.25001525878906, "epoch": 0.19117317601374056, "grad_norm": 1.3447630405426025, "kl": 0.4417724609375, "learning_rate": 9.774209176537901e-07, "loss": 0.0177, "reward": 1.1651786267757416, "reward_std": 0.2005729302763939, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 640 }, { "completion_length": 479.45091247558594, "epoch": 0.19147188410126204, "grad_norm": 0.9138901233673096, "kl": 0.2529296875, "learning_rate": 9.772738764006144e-07, "loss": 0.0101, "reward": 1.0931920111179352, "reward_std": 0.1080146529711783, "rewards/accuracy_reward": 0.10044643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 641 }, { "completion_length": 473.71207427978516, "epoch": 0.1917705921887835, "grad_norm": 0.279255211353302, "kl": 0.5390625, "learning_rate": 9.771263703153388e-07, "loss": 0.0215, "reward": 1.1350446939468384, "reward_std": 0.18361820466816425, "rewards/accuracy_reward": 0.1473214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 642 }, { "completion_length": 478.1562805175781, "epoch": 0.19206930027630498, "grad_norm": 0.6441079378128052, "kl": 0.3282470703125, "learning_rate": 9.76978399558435e-07, "loss": 0.0132, "reward": 1.119977742433548, "reward_std": 0.07897382415831089, "rewards/accuracy_reward": 0.12946429499424994, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 643 }, { "completion_length": 504.41297149658203, "epoch": 0.19236800836382645, "grad_norm": 0.5242125391960144, "kl": 0.62158203125, "learning_rate": 9.768299642908808e-07, "loss": 0.0248, "reward": 1.1065848767757416, "reward_std": 0.1327293086796999, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 644 }, { "completion_length": 515.654052734375, "epoch": 0.19266671645134792, "grad_norm": 0.4616898000240326, "kl": 0.3087158203125, "learning_rate": 9.766810646741595e-07, "loss": 0.0123, "reward": 1.2059152126312256, "reward_std": 0.1690350268036127, "rewards/accuracy_reward": 0.2120535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 645 }, { "completion_length": 543.1830673217773, "epoch": 0.1929654245388694, "grad_norm": 0.5128000378608704, "kl": 0.6953125, "learning_rate": 9.76531700870259e-07, "loss": 0.0278, "reward": 1.2500000596046448, "reward_std": 0.17622294649481773, "rewards/accuracy_reward": 0.2611607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 646 }, { "completion_length": 478.7366256713867, "epoch": 0.19326413262639086, "grad_norm": 0.6272923350334167, "kl": 0.365234375, "learning_rate": 9.763818730416724e-07, "loss": 0.0146, "reward": 1.1104911267757416, "reward_std": 0.12183901481330395, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 647 }, { "completion_length": 466.8839569091797, "epoch": 0.19356284071391233, "grad_norm": 0.7747365236282349, "kl": 0.2037353515625, "learning_rate": 9.76231581351398e-07, "loss": 0.0082, "reward": 1.1540179252624512, "reward_std": 0.126930289901793, "rewards/accuracy_reward": 0.17187500861473382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428805589676, "step": 648 }, { "completion_length": 491.9084930419922, "epoch": 0.1938615488014338, "grad_norm": 0.5740340352058411, "kl": 0.33935546875, "learning_rate": 9.760808259629383e-07, "loss": 0.0136, "reward": 1.2427456080913544, "reward_std": 0.19157883897423744, "rewards/accuracy_reward": 0.25000001303851604, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 649 }, { "completion_length": 488.42860412597656, "epoch": 0.19416025688895527, "grad_norm": 0.5230385065078735, "kl": 0.25, "learning_rate": 9.759296070403001e-07, "loss": 0.01, "reward": 1.1556920111179352, "reward_std": 0.0956993605941534, "rewards/accuracy_reward": 0.15848214412108064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9972098469734192, "step": 650 }, { "completion_length": 553.8750305175781, "epoch": 0.19445896497647674, "grad_norm": 0.5529772043228149, "kl": 0.8153076171875, "learning_rate": 9.757779247479953e-07, "loss": 0.0326, "reward": 1.1696428954601288, "reward_std": 0.17050893604755402, "rewards/accuracy_reward": 0.20089286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500298023224, "step": 651 }, { "completion_length": 530.3370742797852, "epoch": 0.19475767306399822, "grad_norm": 0.45832741260528564, "kl": 0.65576171875, "learning_rate": 9.75625779251039e-07, "loss": 0.0262, "reward": 1.0993303954601288, "reward_std": 0.12875141948461533, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 652 }, { "completion_length": 520.1317291259766, "epoch": 0.1950563811515197, "grad_norm": 1.1616557836532593, "kl": 0.603759765625, "learning_rate": 9.754731707149508e-07, "loss": 0.0241, "reward": 1.1400670111179352, "reward_std": 0.131966651417315, "rewards/accuracy_reward": 0.15401786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 653 }, { "completion_length": 441.99554443359375, "epoch": 0.19535508923904116, "grad_norm": 0.637003481388092, "kl": 0.3272705078125, "learning_rate": 9.753200993057534e-07, "loss": 0.0131, "reward": 1.0864956080913544, "reward_std": 0.13327866420149803, "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 654 }, { "completion_length": 555.0178909301758, "epoch": 0.19565379732656263, "grad_norm": 0.7092158794403076, "kl": 0.518310546875, "learning_rate": 9.751665651899742e-07, "loss": 0.0207, "reward": 1.031808078289032, "reward_std": 0.13264071056619287, "rewards/accuracy_reward": 0.04687500116415322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 655 }, { "completion_length": 546.7812805175781, "epoch": 0.1959525054140841, "grad_norm": 0.6416203379631042, "kl": 0.57080078125, "learning_rate": 9.750125685346426e-07, "loss": 0.0228, "reward": 1.1311384439468384, "reward_std": 0.1257972614839673, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 656 }, { "completion_length": 526.319221496582, "epoch": 0.19625121350160554, "grad_norm": 0.3702664375305176, "kl": 0.7841796875, "learning_rate": 9.748581095072922e-07, "loss": 0.0314, "reward": 1.1339286267757416, "reward_std": 0.14120574295520782, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428805589676, "step": 657 }, { "completion_length": 563.459846496582, "epoch": 0.196549921589127, "grad_norm": 0.7520486116409302, "kl": 0.5927734375, "learning_rate": 9.747031882759594e-07, "loss": 0.0237, "reward": 1.102120578289032, "reward_std": 0.08807096816599369, "rewards/accuracy_reward": 0.11383928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 658 }, { "completion_length": 506.4085006713867, "epoch": 0.19684862967664848, "grad_norm": 0.7290525436401367, "kl": 0.701416015625, "learning_rate": 9.74547805009183e-07, "loss": 0.0281, "reward": 1.1183035969734192, "reward_std": 0.1037074476480484, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 659 }, { "completion_length": 486.3705596923828, "epoch": 0.19714733776416996, "grad_norm": 1.172566533088684, "kl": 0.47967529296875, "learning_rate": 9.74391959876005e-07, "loss": 0.0192, "reward": 1.1389509439468384, "reward_std": 0.13016869686543941, "rewards/accuracy_reward": 0.14285714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937798023224, "step": 660 }, { "completion_length": 465.87278747558594, "epoch": 0.19744604585169143, "grad_norm": 1.0802216529846191, "kl": 0.4415283203125, "learning_rate": 9.742356530459693e-07, "loss": 0.0177, "reward": 1.3409598767757416, "reward_std": 0.13420666381716728, "rewards/accuracy_reward": 0.3459821566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949777126312256, "step": 661 }, { "completion_length": 523.4330673217773, "epoch": 0.1977447539392129, "grad_norm": 0.7925987243652344, "kl": 0.48193359375, "learning_rate": 9.74078884689123e-07, "loss": 0.0192, "reward": 1.1171875596046448, "reward_std": 0.13363372161984444, "rewards/accuracy_reward": 0.12500000605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 662 }, { "completion_length": 509.08931732177734, "epoch": 0.19804346202673437, "grad_norm": 0.5259466171264648, "kl": 0.401123046875, "learning_rate": 9.73921654976014e-07, "loss": 0.0161, "reward": 1.1071428954601288, "reward_std": 0.15577078238129616, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 663 }, { "completion_length": 485.1205596923828, "epoch": 0.19834217011425584, "grad_norm": 0.6413285732269287, "kl": 0.439453125, "learning_rate": 9.737639640776933e-07, "loss": 0.0176, "reward": 1.0809152126312256, "reward_std": 0.11533269472420216, "rewards/accuracy_reward": 0.09821429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.982700914144516, "step": 664 }, { "completion_length": 481.71654510498047, "epoch": 0.1986408782017773, "grad_norm": 0.3569239377975464, "kl": 0.4345703125, "learning_rate": 9.73605812165713e-07, "loss": 0.0174, "reward": 1.174107164144516, "reward_std": 0.14892829954624176, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933036118745804, "step": 665 }, { "completion_length": 417.3571548461914, "epoch": 0.19893958628929878, "grad_norm": 0.6382654905319214, "kl": 0.3311767578125, "learning_rate": 9.73447199412127e-07, "loss": 0.0133, "reward": 1.2299107611179352, "reward_std": 0.18275857903063297, "rewards/accuracy_reward": 0.2366071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933036118745804, "step": 666 }, { "completion_length": 504.96654510498047, "epoch": 0.19923829437682025, "grad_norm": 1.0673881769180298, "kl": 0.5859375, "learning_rate": 9.732881259894902e-07, "loss": 0.0234, "reward": 1.0290179252624512, "reward_std": 0.10172250494360924, "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393431901932, "step": 667 }, { "completion_length": 441.2143096923828, "epoch": 0.19953700246434172, "grad_norm": 0.44690513610839844, "kl": 0.22021484375, "learning_rate": 9.73128592070859e-07, "loss": 0.0088, "reward": 1.2600446939468384, "reward_std": 0.10272586392238736, "rewards/accuracy_reward": 0.2656250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 668 }, { "completion_length": 485.8348388671875, "epoch": 0.1998357105518632, "grad_norm": 0.5233400464057922, "kl": 0.26715087890625, "learning_rate": 9.729685978297907e-07, "loss": 0.0107, "reward": 1.1936384439468384, "reward_std": 0.1454283483326435, "rewards/accuracy_reward": 0.20312500605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134439468384, "step": 669 }, { "completion_length": 480.70091247558594, "epoch": 0.20013441863938466, "grad_norm": 0.5614091157913208, "kl": 0.6317138671875, "learning_rate": 9.728081434403437e-07, "loss": 0.0253, "reward": 1.1132813096046448, "reward_std": 0.15901632234454155, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 670 }, { "completion_length": 476.06922149658203, "epoch": 0.20043312672690614, "grad_norm": 0.544139564037323, "kl": 0.4591064453125, "learning_rate": 9.726472290770759e-07, "loss": 0.0184, "reward": 1.117745578289032, "reward_std": 0.1060850229114294, "rewards/accuracy_reward": 0.1272321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134439468384, "step": 671 }, { "completion_length": 433.8928680419922, "epoch": 0.2007318348144276, "grad_norm": 0.393295556306839, "kl": 0.42010498046875, "learning_rate": 9.724858549150467e-07, "loss": 0.0168, "reward": 1.1729911267757416, "reward_std": 0.16706965863704681, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 672 }, { "completion_length": 493.04913330078125, "epoch": 0.20103054290194908, "grad_norm": 0.5490773916244507, "kl": 0.620849609375, "learning_rate": 9.723240211298156e-07, "loss": 0.0249, "reward": 1.1908482611179352, "reward_std": 0.1683233119547367, "rewards/accuracy_reward": 0.2031250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232313156128, "step": 673 }, { "completion_length": 443.8571548461914, "epoch": 0.20132925098947055, "grad_norm": 0.7766083478927612, "kl": 0.4105224609375, "learning_rate": 9.721617278974417e-07, "loss": 0.0164, "reward": 1.137276828289032, "reward_std": 0.13907591998577118, "rewards/accuracy_reward": 0.1473214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98995541036129, "step": 674 }, { "completion_length": 427.16072845458984, "epoch": 0.20162795907699202, "grad_norm": 0.5506747364997864, "kl": 0.380615234375, "learning_rate": 9.719989753944842e-07, "loss": 0.0153, "reward": 1.0792411267757416, "reward_std": 0.1125008724629879, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 675 }, { "completion_length": 455.4241256713867, "epoch": 0.2019266671645135, "grad_norm": 0.5947301387786865, "kl": 0.199462890625, "learning_rate": 9.718357637980016e-07, "loss": 0.008, "reward": 1.1953125298023224, "reward_std": 0.1513738613575697, "rewards/accuracy_reward": 0.1986607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966518133878708, "step": 676 }, { "completion_length": 498.21654510498047, "epoch": 0.20222537525203496, "grad_norm": 1.0777292251586914, "kl": 0.71466064453125, "learning_rate": 9.716720932855526e-07, "loss": 0.0286, "reward": 1.102678656578064, "reward_std": 0.12415685504674911, "rewards/accuracy_reward": 0.11830357741564512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 677 }, { "completion_length": 433.28126525878906, "epoch": 0.20252408333955643, "grad_norm": 0.5113633871078491, "kl": 0.55859375, "learning_rate": 9.715079640351942e-07, "loss": 0.0224, "reward": 1.1545759290456772, "reward_std": 0.09122903132811189, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 678 }, { "completion_length": 470.0893020629883, "epoch": 0.20282279142707788, "grad_norm": 0.7459912300109863, "kl": 1.087890625, "learning_rate": 9.713433762254833e-07, "loss": 0.0435, "reward": 1.1925223767757416, "reward_std": 0.20874515548348427, "rewards/accuracy_reward": 0.2075892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 679 }, { "completion_length": 514.4486923217773, "epoch": 0.20312149951459935, "grad_norm": 0.5556454062461853, "kl": 0.685791015625, "learning_rate": 9.711783300354749e-07, "loss": 0.0274, "reward": 1.209821492433548, "reward_std": 0.1953204795718193, "rewards/accuracy_reward": 0.22767858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98214291036129, "step": 680 }, { "completion_length": 500.7009048461914, "epoch": 0.20342020760212082, "grad_norm": 0.5278182625770569, "kl": 0.6005859375, "learning_rate": 9.710128256447235e-07, "loss": 0.0241, "reward": 1.2031250298023224, "reward_std": 0.1156912837177515, "rewards/accuracy_reward": 0.212053582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 681 }, { "completion_length": 496.4018020629883, "epoch": 0.2037189156896423, "grad_norm": 0.682120144367218, "kl": 0.560546875, "learning_rate": 9.708468632332817e-07, "loss": 0.0224, "reward": 1.1858259439468384, "reward_std": 0.1306252833455801, "rewards/accuracy_reward": 0.191964291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 682 }, { "completion_length": 462.29689025878906, "epoch": 0.20401762377716376, "grad_norm": 0.4647371768951416, "kl": 0.5355224609375, "learning_rate": 9.706804429816998e-07, "loss": 0.0214, "reward": 1.1534598767757416, "reward_std": 0.09655911289155483, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949776977300644, "step": 683 }, { "completion_length": 442.88172912597656, "epoch": 0.20431633186468523, "grad_norm": 0.3012891709804535, "kl": 0.1060791015625, "learning_rate": 9.705135650710271e-07, "loss": 0.0042, "reward": 1.142857164144516, "reward_std": 0.08174209576100111, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 684 }, { "completion_length": 438.9419860839844, "epoch": 0.2046150399522067, "grad_norm": 0.5509459972381592, "kl": 0.376220703125, "learning_rate": 9.703462296828106e-07, "loss": 0.015, "reward": 1.1166295409202576, "reward_std": 0.12034999951720238, "rewards/accuracy_reward": 0.12500000605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 685 }, { "completion_length": 482.43975830078125, "epoch": 0.20491374803972817, "grad_norm": 0.5423620343208313, "kl": 0.5928955078125, "learning_rate": 9.701784369990944e-07, "loss": 0.0237, "reward": 1.1908482313156128, "reward_std": 0.21324692480266094, "rewards/accuracy_reward": 0.2053571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 686 }, { "completion_length": 488.33038330078125, "epoch": 0.20521245612724964, "grad_norm": 0.9988133311271667, "kl": 0.669189453125, "learning_rate": 9.700101872024206e-07, "loss": 0.0267, "reward": 1.0965402126312256, "reward_std": 0.11594839137978852, "rewards/accuracy_reward": 0.10714286100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 687 }, { "completion_length": 447.96876525878906, "epoch": 0.2055111642147711, "grad_norm": 1.475473165512085, "kl": 0.55810546875, "learning_rate": 9.698414804758287e-07, "loss": 0.0223, "reward": 1.1662946939468384, "reward_std": 0.10871441569179296, "rewards/accuracy_reward": 0.17633929336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 688 }, { "completion_length": 514.1540451049805, "epoch": 0.20580987230229258, "grad_norm": 0.633601188659668, "kl": 0.55078125, "learning_rate": 9.69672317002855e-07, "loss": 0.022, "reward": 1.1690848469734192, "reward_std": 0.1844466645270586, "rewards/accuracy_reward": 0.1830357275903225, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.983816996216774, "step": 689 }, { "completion_length": 527.8705596923828, "epoch": 0.20610858038981406, "grad_norm": 0.5426949858665466, "kl": 0.806396484375, "learning_rate": 9.695026969675323e-07, "loss": 0.0323, "reward": 1.1272321939468384, "reward_std": 0.13666174188256264, "rewards/accuracy_reward": 0.14285715040750802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 690 }, { "completion_length": 617.466552734375, "epoch": 0.20640728847733553, "grad_norm": 0.6465925574302673, "kl": 0.65185546875, "learning_rate": 9.693326205543913e-07, "loss": 0.0261, "reward": 1.1791295111179352, "reward_std": 0.1373054590076208, "rewards/accuracy_reward": 0.19866072246804833, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 691 }, { "completion_length": 457.9442138671875, "epoch": 0.206705996564857, "grad_norm": 0.34519603848457336, "kl": 0.16766357421875, "learning_rate": 9.691620879484581e-07, "loss": 0.0067, "reward": 1.0920759439468384, "reward_std": 0.12864240258932114, "rewards/accuracy_reward": 0.09821428591385484, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 692 }, { "completion_length": 532.8236923217773, "epoch": 0.20700470465237847, "grad_norm": 0.9325131773948669, "kl": 0.25830078125, "learning_rate": 9.689910993352554e-07, "loss": 0.0103, "reward": 1.1741071939468384, "reward_std": 0.1233384720981121, "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 693 }, { "completion_length": 554.8169937133789, "epoch": 0.20730341273989994, "grad_norm": 0.7304608225822449, "kl": 0.427978515625, "learning_rate": 9.688196549008023e-07, "loss": 0.0171, "reward": 1.1445313394069672, "reward_std": 0.16148574091494083, "rewards/accuracy_reward": 0.15848214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 694 }, { "completion_length": 505.7232360839844, "epoch": 0.2076021208274214, "grad_norm": 0.29398098587989807, "kl": 0.207275390625, "learning_rate": 9.686477548316135e-07, "loss": 0.0083, "reward": 1.1774553954601288, "reward_std": 0.04344875318929553, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966517984867096, "step": 695 }, { "completion_length": 504.6986846923828, "epoch": 0.20790082891494288, "grad_norm": 0.46870157122612, "kl": 0.5513916015625, "learning_rate": 9.684753993146991e-07, "loss": 0.022, "reward": 1.0998884439468384, "reward_std": 0.21844489872455597, "rewards/accuracy_reward": 0.12723214644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562947034836, "step": 696 }, { "completion_length": 549.6808319091797, "epoch": 0.20819953700246435, "grad_norm": 1.0854008197784424, "kl": 0.327880859375, "learning_rate": 9.683025885375654e-07, "loss": 0.0131, "reward": 1.1863839626312256, "reward_std": 0.15954533405601978, "rewards/accuracy_reward": 0.19866072572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 697 }, { "completion_length": 526.4977874755859, "epoch": 0.20849824508998582, "grad_norm": 0.5828836560249329, "kl": 0.42144775390625, "learning_rate": 9.681293226882134e-07, "loss": 0.0169, "reward": 1.0987723767757416, "reward_std": 0.08236542530357838, "rewards/accuracy_reward": 0.10491071920841932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 698 }, { "completion_length": 522.2053756713867, "epoch": 0.2087969531775073, "grad_norm": 0.360264390707016, "kl": 0.259521484375, "learning_rate": 9.679556019551392e-07, "loss": 0.0104, "reward": 1.1344866454601288, "reward_std": 0.13367217406630516, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 699 }, { "completion_length": 584.8616409301758, "epoch": 0.20909566126502874, "grad_norm": 0.26686954498291016, "kl": 0.33453369140625, "learning_rate": 9.677814265273344e-07, "loss": 0.0134, "reward": 1.0859375596046448, "reward_std": 0.12017492484301329, "rewards/accuracy_reward": 0.09375000582076609, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 700 }, { "completion_length": 487.80806732177734, "epoch": 0.2093943693525502, "grad_norm": 0.34027180075645447, "kl": 0.3529052734375, "learning_rate": 9.676067965942844e-07, "loss": 0.0141, "reward": 1.2148437798023224, "reward_std": 0.14020542055368423, "rewards/accuracy_reward": 0.2209821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 701 }, { "completion_length": 530.4687805175781, "epoch": 0.20969307744007168, "grad_norm": 1.1944791078567505, "kl": 0.5240478515625, "learning_rate": 9.674317123459696e-07, "loss": 0.021, "reward": 1.0000000447034836, "reward_std": 0.10248917527496815, "rewards/accuracy_reward": 0.0267857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.973214328289032, "step": 702 }, { "completion_length": 508.2522506713867, "epoch": 0.20999178552759315, "grad_norm": 0.5666194558143616, "kl": 0.3785400390625, "learning_rate": 9.672561739728643e-07, "loss": 0.0152, "reward": 1.1953125596046448, "reward_std": 0.17594011314213276, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 703 }, { "completion_length": 498.8705520629883, "epoch": 0.21029049361511462, "grad_norm": 0.46182939410209656, "kl": 0.514892578125, "learning_rate": 9.670801816659375e-07, "loss": 0.0207, "reward": 1.1110491156578064, "reward_std": 0.1415058933198452, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 704 }, { "completion_length": 574.0535888671875, "epoch": 0.2105892017026361, "grad_norm": 0.7337431311607361, "kl": 0.6490478515625, "learning_rate": 9.669037356166511e-07, "loss": 0.026, "reward": 1.0719866454601288, "reward_std": 0.19927315413951874, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 705 }, { "completion_length": 539.4710159301758, "epoch": 0.21088790979015756, "grad_norm": 0.5099868178367615, "kl": 0.47900390625, "learning_rate": 9.667268360169616e-07, "loss": 0.0191, "reward": 1.2209821939468384, "reward_std": 0.15614140778779984, "rewards/accuracy_reward": 0.2343750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 706 }, { "completion_length": 534.9018020629883, "epoch": 0.21118661787767903, "grad_norm": 0.9677974581718445, "kl": 0.408203125, "learning_rate": 9.665494830593177e-07, "loss": 0.0163, "reward": 0.9966518133878708, "reward_std": 0.06590837799012661, "rewards/accuracy_reward": 0.008928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 707 }, { "completion_length": 516.0201187133789, "epoch": 0.2114853259652005, "grad_norm": 1.0230144262313843, "kl": 0.7681884765625, "learning_rate": 9.663716769366627e-07, "loss": 0.0307, "reward": 1.1618304252624512, "reward_std": 0.19439531117677689, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 708 }, { "completion_length": 492.63841247558594, "epoch": 0.21178403405272198, "grad_norm": 0.505307137966156, "kl": 0.250732421875, "learning_rate": 9.66193417842432e-07, "loss": 0.0101, "reward": 1.16964291036129, "reward_std": 0.15724136121571064, "rewards/accuracy_reward": 0.1785714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 709 }, { "completion_length": 565.3995742797852, "epoch": 0.21208274214024345, "grad_norm": 0.44145962595939636, "kl": 0.7001953125, "learning_rate": 9.66014705970554e-07, "loss": 0.028, "reward": 1.1300223767757416, "reward_std": 0.13378440961241722, "rewards/accuracy_reward": 0.14285715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 710 }, { "completion_length": 475.62501525878906, "epoch": 0.21238145022776492, "grad_norm": 1.2608178853988647, "kl": 0.732421875, "learning_rate": 9.658355415154498e-07, "loss": 0.0293, "reward": 1.1428571939468384, "reward_std": 0.12757073808461428, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 711 }, { "completion_length": 471.0982360839844, "epoch": 0.2126801583152864, "grad_norm": 0.7963230013847351, "kl": 0.77880859375, "learning_rate": 9.656559246720327e-07, "loss": 0.0312, "reward": 1.1819196939468384, "reward_std": 0.14792668633162975, "rewards/accuracy_reward": 0.1897321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875149011612, "step": 712 }, { "completion_length": 483.89288330078125, "epoch": 0.21297886640280786, "grad_norm": 0.984546422958374, "kl": 0.35693359375, "learning_rate": 9.65475855635708e-07, "loss": 0.0143, "reward": 1.1579241752624512, "reward_std": 0.11917468439787626, "rewards/accuracy_reward": 0.17633929196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 713 }, { "completion_length": 526.6562728881836, "epoch": 0.21327757449032933, "grad_norm": 0.8326878547668457, "kl": 0.5595703125, "learning_rate": 9.652953346023737e-07, "loss": 0.0224, "reward": 1.1300223767757416, "reward_std": 0.14925954677164555, "rewards/accuracy_reward": 0.14062500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 714 }, { "completion_length": 489.4888610839844, "epoch": 0.2135762825778508, "grad_norm": 0.3877517879009247, "kl": 0.12200927734375, "learning_rate": 9.651143617684185e-07, "loss": 0.0049, "reward": 1.1132812798023224, "reward_std": 0.0942178126424551, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.997209832072258, "step": 715 }, { "completion_length": 556.6808166503906, "epoch": 0.21387499066537227, "grad_norm": 0.8919534683227539, "kl": 0.46923828125, "learning_rate": 9.649329373307232e-07, "loss": 0.0188, "reward": 1.1635045409202576, "reward_std": 0.12650224938988686, "rewards/accuracy_reward": 0.176339291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871651977300644, "step": 716 }, { "completion_length": 509.98217010498047, "epoch": 0.21417369875289374, "grad_norm": 0.3708331286907196, "kl": 0.376220703125, "learning_rate": 9.6475106148666e-07, "loss": 0.0151, "reward": 1.106026828289032, "reward_std": 0.15810667350888252, "rewards/accuracy_reward": 0.11830357951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232313156128, "step": 717 }, { "completion_length": 512.2277145385742, "epoch": 0.21447240684041521, "grad_norm": 0.8834773898124695, "kl": 0.4117431640625, "learning_rate": 9.645687344340918e-07, "loss": 0.0165, "reward": 1.104910746216774, "reward_std": 0.17525168042629957, "rewards/accuracy_reward": 0.12053571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 718 }, { "completion_length": 489.0692138671875, "epoch": 0.21477111492793668, "grad_norm": 0.6538218259811401, "kl": 0.49951171875, "learning_rate": 9.643859563713726e-07, "loss": 0.02, "reward": 1.127790242433548, "reward_std": 0.17597240582108498, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 719 }, { "completion_length": 533.9308319091797, "epoch": 0.21506982301545816, "grad_norm": 0.6355004906654358, "kl": 0.4755859375, "learning_rate": 9.64202727497347e-07, "loss": 0.019, "reward": 1.0859375596046448, "reward_std": 0.13856618478894234, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 720 }, { "completion_length": 504.10716247558594, "epoch": 0.21536853110297963, "grad_norm": 0.7570773363113403, "kl": 0.39593505859375, "learning_rate": 9.640190480113503e-07, "loss": 0.0159, "reward": 1.11886166036129, "reward_std": 0.10864936746656895, "rewards/accuracy_reward": 0.12946429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 721 }, { "completion_length": 544.7902069091797, "epoch": 0.21566723919050107, "grad_norm": 0.22913581132888794, "kl": 0.2122802734375, "learning_rate": 9.638349181132077e-07, "loss": 0.0085, "reward": 1.147321492433548, "reward_std": 0.13895958103239536, "rewards/accuracy_reward": 0.15178572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9955357313156128, "step": 722 }, { "completion_length": 507.45984649658203, "epoch": 0.21596594727802254, "grad_norm": 0.45322951674461365, "kl": 0.32293701171875, "learning_rate": 9.63650338003234e-07, "loss": 0.0129, "reward": 1.2360491752624512, "reward_std": 0.14585206843912601, "rewards/accuracy_reward": 0.2410714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949776977300644, "step": 723 }, { "completion_length": 585.1295013427734, "epoch": 0.216264655365544, "grad_norm": 0.46814486384391785, "kl": 0.507080078125, "learning_rate": 9.634653078822348e-07, "loss": 0.0203, "reward": 1.1914063096046448, "reward_std": 0.19463014230132103, "rewards/accuracy_reward": 0.2053571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 724 }, { "completion_length": 517.9843902587891, "epoch": 0.21656336345306548, "grad_norm": 0.5025748014450073, "kl": 0.39013671875, "learning_rate": 9.632798279515047e-07, "loss": 0.0156, "reward": 1.1138393580913544, "reward_std": 0.09581215679645538, "rewards/accuracy_reward": 0.12053571688011289, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.993303582072258, "step": 725 }, { "completion_length": 517.9286041259766, "epoch": 0.21686207154058695, "grad_norm": 1.1531426906585693, "kl": 0.5, "learning_rate": 9.630938984128276e-07, "loss": 0.02, "reward": 1.1316964626312256, "reward_std": 0.10139047540724277, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 726 }, { "completion_length": 564.0178985595703, "epoch": 0.21716077962810842, "grad_norm": 0.3559262752532959, "kl": 0.689453125, "learning_rate": 9.629075194684763e-07, "loss": 0.0275, "reward": 1.1238839328289032, "reward_std": 0.1268080472946167, "rewards/accuracy_reward": 0.13392857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 727 }, { "completion_length": 558.8192291259766, "epoch": 0.2174594877156299, "grad_norm": 0.5778434872627258, "kl": 0.69287109375, "learning_rate": 9.627206913212134e-07, "loss": 0.0277, "reward": 1.1227679252624512, "reward_std": 0.1349331084638834, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 728 }, { "completion_length": 504.1205520629883, "epoch": 0.21775819580315137, "grad_norm": 0.901152491569519, "kl": 0.3905029296875, "learning_rate": 9.62533414174289e-07, "loss": 0.0156, "reward": 1.240513414144516, "reward_std": 0.1567386705428362, "rewards/accuracy_reward": 0.2455357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949776977300644, "step": 729 }, { "completion_length": 556.5714569091797, "epoch": 0.21805690389067284, "grad_norm": 1.0231423377990723, "kl": 1.021728515625, "learning_rate": 9.62345688231443e-07, "loss": 0.0409, "reward": 1.125558078289032, "reward_std": 0.12907184660434723, "rewards/accuracy_reward": 0.14508929452858865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 730 }, { "completion_length": 498.10269927978516, "epoch": 0.2183556119781943, "grad_norm": 0.6305994987487793, "kl": 0.236083984375, "learning_rate": 9.621575136969023e-07, "loss": 0.0094, "reward": 1.104352742433548, "reward_std": 0.08718860894441605, "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9972098618745804, "step": 731 }, { "completion_length": 497.51341247558594, "epoch": 0.21865432006571578, "grad_norm": 0.5556716918945312, "kl": 0.23876953125, "learning_rate": 9.619688907753823e-07, "loss": 0.0095, "reward": 1.166294664144516, "reward_std": 0.1463577151298523, "rewards/accuracy_reward": 0.1741071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 732 }, { "completion_length": 495.79467010498047, "epoch": 0.21895302815323725, "grad_norm": 1.2330949306488037, "kl": 0.28277587890625, "learning_rate": 9.617798196720866e-07, "loss": 0.0113, "reward": 1.190290242433548, "reward_std": 0.14214007183909416, "rewards/accuracy_reward": 0.19866072619333863, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 733 }, { "completion_length": 484.83707427978516, "epoch": 0.21925173624075872, "grad_norm": 0.49374058842658997, "kl": 0.19964599609375, "learning_rate": 9.615903005927056e-07, "loss": 0.008, "reward": 1.1116071939468384, "reward_std": 0.13209731318056583, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 734 }, { "completion_length": 564.037956237793, "epoch": 0.2195504443282802, "grad_norm": 0.462494820356369, "kl": 0.2884521484375, "learning_rate": 9.61400333743418e-07, "loss": 0.0116, "reward": 1.1612723469734192, "reward_std": 0.13284956850111485, "rewards/accuracy_reward": 0.1718750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 735 }, { "completion_length": 508.99778747558594, "epoch": 0.21984915241580166, "grad_norm": 0.33701732754707336, "kl": 0.13824462890625, "learning_rate": 9.612099193308889e-07, "loss": 0.0055, "reward": 1.137834906578064, "reward_std": 0.14256838336586952, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.992745578289032, "step": 736 }, { "completion_length": 533.7366409301758, "epoch": 0.22014786050332313, "grad_norm": 1.178019404411316, "kl": 0.563232421875, "learning_rate": 9.610190575622702e-07, "loss": 0.0226, "reward": 1.1523437798023224, "reward_std": 0.14986357279121876, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 737 }, { "completion_length": 492.12279510498047, "epoch": 0.2204465685908446, "grad_norm": 0.6145401000976562, "kl": 0.1524658203125, "learning_rate": 9.608277486452011e-07, "loss": 0.0061, "reward": 1.0156250596046448, "reward_std": 0.07281352020800114, "rewards/accuracy_reward": 0.020089287078008056, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9955357313156128, "step": 738 }, { "completion_length": 555.4286041259766, "epoch": 0.22074527667836608, "grad_norm": 1.8243494033813477, "kl": 0.98681640625, "learning_rate": 9.606359927878072e-07, "loss": 0.0395, "reward": 1.1356027126312256, "reward_std": 0.15265526995062828, "rewards/accuracy_reward": 0.14732143748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882813096046448, "step": 739 }, { "completion_length": 566.8259124755859, "epoch": 0.22104398476588755, "grad_norm": 1.132338523864746, "kl": 0.51953125, "learning_rate": 9.604437901986995e-07, "loss": 0.0208, "reward": 1.1389509737491608, "reward_std": 0.11075364425778389, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 740 }, { "completion_length": 472.3750228881836, "epoch": 0.22134269285340902, "grad_norm": 0.37086498737335205, "kl": 0.3154296875, "learning_rate": 9.60251141086976e-07, "loss": 0.0126, "reward": 1.1635045111179352, "reward_std": 0.16093514673411846, "rewards/accuracy_reward": 0.1696428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 741 }, { "completion_length": 517.0580520629883, "epoch": 0.2216414009409305, "grad_norm": 0.3174981474876404, "kl": 0.7060546875, "learning_rate": 9.6005804566222e-07, "loss": 0.0283, "reward": 1.1104910969734192, "reward_std": 0.15387093648314476, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 742 }, { "completion_length": 585.8259124755859, "epoch": 0.22194010902845193, "grad_norm": 0.7329994440078735, "kl": 0.813720703125, "learning_rate": 9.598645041345005e-07, "loss": 0.0325, "reward": 1.061383992433548, "reward_std": 0.16255704802460968, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 743 }, { "completion_length": 546.9509201049805, "epoch": 0.2222388171159734, "grad_norm": 0.4350084066390991, "kl": 0.557373046875, "learning_rate": 9.596705167143712e-07, "loss": 0.0223, "reward": 1.2299107611179352, "reward_std": 0.17922761663794518, "rewards/accuracy_reward": 0.2410714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 744 }, { "completion_length": 514.5602874755859, "epoch": 0.22253752520349487, "grad_norm": 0.9249079823493958, "kl": 0.81005859375, "learning_rate": 9.594760836128718e-07, "loss": 0.0324, "reward": 1.1143973469734192, "reward_std": 0.185337757691741, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 745 }, { "completion_length": 576.4352951049805, "epoch": 0.22283623329101634, "grad_norm": 1.001179814338684, "kl": 0.74267578125, "learning_rate": 9.592812050415264e-07, "loss": 0.0296, "reward": 1.088169664144516, "reward_std": 0.15661122649908066, "rewards/accuracy_reward": 0.10267857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 746 }, { "completion_length": 591.2567138671875, "epoch": 0.22313494137853782, "grad_norm": 1.4617908000946045, "kl": 0.9234619140625, "learning_rate": 9.590858812123437e-07, "loss": 0.0369, "reward": 1.1618303954601288, "reward_std": 0.10852047987282276, "rewards/accuracy_reward": 0.1741071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 747 }, { "completion_length": 554.7611694335938, "epoch": 0.2234336494660593, "grad_norm": 0.3936440050601959, "kl": 0.42236328125, "learning_rate": 9.588901123378172e-07, "loss": 0.0169, "reward": 1.127232164144516, "reward_std": 0.08940745517611504, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 748 }, { "completion_length": 511.7053756713867, "epoch": 0.22373235755358076, "grad_norm": 0.9255217909812927, "kl": 0.671875, "learning_rate": 9.58693898630924e-07, "loss": 0.0269, "reward": 1.1021205633878708, "reward_std": 0.17416214011609554, "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 749 }, { "completion_length": 496.33707427978516, "epoch": 0.22403106564110223, "grad_norm": 0.636833131313324, "kl": 0.379638671875, "learning_rate": 9.584972403051252e-07, "loss": 0.0152, "reward": 1.1796875596046448, "reward_std": 0.14490778744220734, "rewards/accuracy_reward": 0.18526786798611283, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9944196790456772, "step": 750 }, { "completion_length": 535.131721496582, "epoch": 0.2243297737286237, "grad_norm": 0.5701225399971008, "kl": 0.817626953125, "learning_rate": 9.58300137574366e-07, "loss": 0.0328, "reward": 1.1936384439468384, "reward_std": 0.20658490434288979, "rewards/accuracy_reward": 0.2098214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 751 }, { "completion_length": 541.5067291259766, "epoch": 0.22462848181614517, "grad_norm": 0.36892202496528625, "kl": 0.5145263671875, "learning_rate": 9.581025906530752e-07, "loss": 0.0206, "reward": 1.1188616752624512, "reward_std": 0.18303344957530499, "rewards/accuracy_reward": 0.12723215040750802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916295111179352, "step": 752 }, { "completion_length": 638.770133972168, "epoch": 0.22492718990366664, "grad_norm": 1.0392556190490723, "kl": 1.076171875, "learning_rate": 9.579045997561637e-07, "loss": 0.043, "reward": 1.1383928954601288, "reward_std": 0.19326023198664188, "rewards/accuracy_reward": 0.1718750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9665178954601288, "step": 753 }, { "completion_length": 539.169677734375, "epoch": 0.2252258979911881, "grad_norm": 1.093238115310669, "kl": 0.2265625, "learning_rate": 9.577061650990266e-07, "loss": 0.0091, "reward": 1.2142857611179352, "reward_std": 0.15842561423778534, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 754 }, { "completion_length": 600.7901916503906, "epoch": 0.22552460607870958, "grad_norm": 0.4369701147079468, "kl": 0.4527587890625, "learning_rate": 9.575072868975416e-07, "loss": 0.0181, "reward": 1.0652902126312256, "reward_std": 0.12102726195007563, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 755 }, { "completion_length": 559.6986999511719, "epoch": 0.22582331416623105, "grad_norm": 0.9266340732574463, "kl": 0.492919921875, "learning_rate": 9.573079653680685e-07, "loss": 0.0197, "reward": 1.2349330484867096, "reward_std": 0.2059948742389679, "rewards/accuracy_reward": 0.2477678693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 756 }, { "completion_length": 639.716552734375, "epoch": 0.22612202225375252, "grad_norm": 0.6921896934509277, "kl": 0.575927734375, "learning_rate": 9.571082007274493e-07, "loss": 0.023, "reward": 1.1869420409202576, "reward_std": 0.173069154843688, "rewards/accuracy_reward": 0.20089286845177412, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98604916036129, "step": 757 }, { "completion_length": 580.5223541259766, "epoch": 0.226420730341274, "grad_norm": 0.2854311764240265, "kl": 0.3355712890625, "learning_rate": 9.56907993193009e-07, "loss": 0.0134, "reward": 1.1562500596046448, "reward_std": 0.14615782164037228, "rewards/accuracy_reward": 0.16741072246804833, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888392984867096, "step": 758 }, { "completion_length": 604.006721496582, "epoch": 0.22671943842879547, "grad_norm": 0.3883119821548462, "kl": 0.5374755859375, "learning_rate": 9.56707342982553e-07, "loss": 0.0215, "reward": 1.0848214775323868, "reward_std": 0.1901334598660469, "rewards/accuracy_reward": 0.10044643585570157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 759 }, { "completion_length": 582.5625305175781, "epoch": 0.22701814651631694, "grad_norm": 0.6960889101028442, "kl": 0.748291015625, "learning_rate": 9.565062503143696e-07, "loss": 0.03, "reward": 1.1601562798023224, "reward_std": 0.1264827623963356, "rewards/accuracy_reward": 0.1785714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 760 }, { "completion_length": 546.6295013427734, "epoch": 0.2273168546038384, "grad_norm": 0.9367658495903015, "kl": 0.424072265625, "learning_rate": 9.563047154072275e-07, "loss": 0.017, "reward": 1.0530134588479996, "reward_std": 0.1265815906226635, "rewards/accuracy_reward": 0.06250000325962901, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 761 }, { "completion_length": 590.5893173217773, "epoch": 0.22761556269135988, "grad_norm": 0.7512945532798767, "kl": 0.9326171875, "learning_rate": 9.561027384803776e-07, "loss": 0.0372, "reward": 1.123883992433548, "reward_std": 0.11555911973118782, "rewards/accuracy_reward": 0.14062500465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 762 }, { "completion_length": 530.0580444335938, "epoch": 0.22791427077888135, "grad_norm": 0.9969374537467957, "kl": 0.82666015625, "learning_rate": 9.559003197535502e-07, "loss": 0.033, "reward": 1.2081473469734192, "reward_std": 0.1651926450431347, "rewards/accuracy_reward": 0.2254464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 763 }, { "completion_length": 519.9576187133789, "epoch": 0.22821297886640282, "grad_norm": 0.8202568888664246, "kl": 0.95849609375, "learning_rate": 9.556974594469577e-07, "loss": 0.0383, "reward": 1.2064732611179352, "reward_std": 0.21140655502676964, "rewards/accuracy_reward": 0.2299107313156128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625447034836, "step": 764 }, { "completion_length": 539.1361923217773, "epoch": 0.22851168695392426, "grad_norm": 1.1046142578125, "kl": 0.6812744140625, "learning_rate": 9.554941577812918e-07, "loss": 0.0272, "reward": 1.1478795111179352, "reward_std": 0.12372193485498428, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916295111179352, "step": 765 }, { "completion_length": 573.2411041259766, "epoch": 0.22881039504144574, "grad_norm": 2.3363564014434814, "kl": 1.7607421875, "learning_rate": 9.55290414977725e-07, "loss": 0.0704, "reward": 1.262276828289032, "reward_std": 0.21345970407128334, "rewards/accuracy_reward": 0.2991071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.963169664144516, "step": 766 }, { "completion_length": 458.9799346923828, "epoch": 0.2291091031289672, "grad_norm": 0.9479872584342957, "kl": 0.16583251953125, "learning_rate": 9.550862312579094e-07, "loss": 0.0066, "reward": 1.2126116454601288, "reward_std": 0.14686607057228684, "rewards/accuracy_reward": 0.216517873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937649011612, "step": 767 }, { "completion_length": 510.7210159301758, "epoch": 0.22940781121648868, "grad_norm": 0.5196967124938965, "kl": 0.21539306640625, "learning_rate": 9.548816068439768e-07, "loss": 0.0086, "reward": 1.1037946939468384, "reward_std": 0.10983996279537678, "rewards/accuracy_reward": 0.11607143748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 768 }, { "completion_length": 525.8192138671875, "epoch": 0.22970651930401015, "grad_norm": 1.5064340829849243, "kl": 0.30322265625, "learning_rate": 9.546765419585388e-07, "loss": 0.0122, "reward": 1.1768973767757416, "reward_std": 0.1412452608346939, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 769 }, { "completion_length": 516.7544937133789, "epoch": 0.23000522739153162, "grad_norm": 0.5921388864517212, "kl": 0.18560791015625, "learning_rate": 9.544710368246856e-07, "loss": 0.0074, "reward": 1.1746652126312256, "reward_std": 0.13677240163087845, "rewards/accuracy_reward": 0.1875000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 770 }, { "completion_length": 604.8437652587891, "epoch": 0.2303039354790531, "grad_norm": 0.7049351334571838, "kl": 0.562255859375, "learning_rate": 9.542650916659869e-07, "loss": 0.0226, "reward": 1.135602742433548, "reward_std": 0.14310345612466335, "rewards/accuracy_reward": 0.15848214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9771205633878708, "step": 771 }, { "completion_length": 645.0535888671875, "epoch": 0.23060264356657456, "grad_norm": 1.0093408823013306, "kl": 0.43017578125, "learning_rate": 9.540587067064905e-07, "loss": 0.0172, "reward": 1.1640625596046448, "reward_std": 0.15528906136751175, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 772 }, { "completion_length": 573.3303833007812, "epoch": 0.23090135165409603, "grad_norm": 0.4158021807670593, "kl": 0.257568359375, "learning_rate": 9.538518821707231e-07, "loss": 0.0103, "reward": 1.1517857611179352, "reward_std": 0.18125181272625923, "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933036118745804, "step": 773 }, { "completion_length": 580.7075958251953, "epoch": 0.2312000597416175, "grad_norm": 0.37861958146095276, "kl": 0.6378173828125, "learning_rate": 9.5364461828369e-07, "loss": 0.0255, "reward": 1.07589291036129, "reward_std": 0.0705916490405798, "rewards/accuracy_reward": 0.09151786123402417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 774 }, { "completion_length": 547.9955596923828, "epoch": 0.23149876782913897, "grad_norm": 0.6685316562652588, "kl": 0.34185791015625, "learning_rate": 9.534369152708735e-07, "loss": 0.0137, "reward": 1.2377232611179352, "reward_std": 0.1447900179773569, "rewards/accuracy_reward": 0.2522321566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 775 }, { "completion_length": 516.5022506713867, "epoch": 0.23179747591666044, "grad_norm": 0.6068019270896912, "kl": 0.182373046875, "learning_rate": 9.532287733582343e-07, "loss": 0.0073, "reward": 1.1551339626312256, "reward_std": 0.15122156590223312, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9944196790456772, "step": 776 }, { "completion_length": 491.98217010498047, "epoch": 0.23209618400418192, "grad_norm": 0.5211226344108582, "kl": 0.306640625, "learning_rate": 9.530201927722103e-07, "loss": 0.0123, "reward": 1.1729911267757416, "reward_std": 0.11257885489612818, "rewards/accuracy_reward": 0.17857143143191934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 777 }, { "completion_length": 513.9241256713867, "epoch": 0.2323948920917034, "grad_norm": 0.7997031211853027, "kl": 0.3798828125, "learning_rate": 9.528111737397167e-07, "loss": 0.0152, "reward": 1.209263414144516, "reward_std": 0.16442982852458954, "rewards/accuracy_reward": 0.2187500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 778 }, { "completion_length": 511.58484649658203, "epoch": 0.23269360017922486, "grad_norm": 0.34610220789909363, "kl": 0.41448974609375, "learning_rate": 9.526017164881459e-07, "loss": 0.0166, "reward": 1.172433078289032, "reward_std": 0.1684640347957611, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616454601288, "step": 779 }, { "completion_length": 596.6183319091797, "epoch": 0.23299230826674633, "grad_norm": 0.5165655612945557, "kl": 0.5673828125, "learning_rate": 9.523918212453668e-07, "loss": 0.0227, "reward": 1.1043527126312256, "reward_std": 0.17303659208118916, "rewards/accuracy_reward": 0.12500000838190317, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 780 }, { "completion_length": 512.6897506713867, "epoch": 0.2332910163542678, "grad_norm": 0.60798180103302, "kl": 0.394775390625, "learning_rate": 9.521814882397247e-07, "loss": 0.0158, "reward": 1.141183078289032, "reward_std": 0.12417612783610821, "rewards/accuracy_reward": 0.14955358253791928, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 781 }, { "completion_length": 570.2009124755859, "epoch": 0.23358972444178927, "grad_norm": 0.6957711577415466, "kl": 1.10546875, "learning_rate": 9.519707177000414e-07, "loss": 0.0441, "reward": 1.2092634439468384, "reward_std": 0.2184239998459816, "rewards/accuracy_reward": 0.2343750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884439468384, "step": 782 }, { "completion_length": 537.7991333007812, "epoch": 0.23388843252931074, "grad_norm": 0.5685148239135742, "kl": 0.4544677734375, "learning_rate": 9.517595098556148e-07, "loss": 0.0182, "reward": 1.1969866454601288, "reward_std": 0.1993623822927475, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616454601288, "step": 783 }, { "completion_length": 528.4844055175781, "epoch": 0.2341871406168322, "grad_norm": 0.8246506452560425, "kl": 0.50048828125, "learning_rate": 9.51547864936218e-07, "loss": 0.02, "reward": 1.1015625596046448, "reward_std": 0.16390693932771683, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 784 }, { "completion_length": 482.7455596923828, "epoch": 0.23448584870435368, "grad_norm": 0.44621944427490234, "kl": 0.2412109375, "learning_rate": 9.513357831721002e-07, "loss": 0.0096, "reward": 1.1462053954601288, "reward_std": 0.13078167289495468, "rewards/accuracy_reward": 0.14955357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966517984867096, "step": 785 }, { "completion_length": 606.8951110839844, "epoch": 0.23478455679187513, "grad_norm": 0.9336292743682861, "kl": 1.359375, "learning_rate": 9.511232647939852e-07, "loss": 0.0542, "reward": 1.0797991454601288, "reward_std": 0.14376436918973923, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562947034836, "step": 786 }, { "completion_length": 613.2611846923828, "epoch": 0.2350832648793966, "grad_norm": 1.325247049331665, "kl": 1.2021484375, "learning_rate": 9.509103100330727e-07, "loss": 0.0481, "reward": 1.1093750596046448, "reward_std": 0.20374146476387978, "rewards/accuracy_reward": 0.12946428963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107611179352, "step": 787 }, { "completion_length": 490.4531478881836, "epoch": 0.23538197296691807, "grad_norm": 0.3559531271457672, "kl": 0.58447265625, "learning_rate": 9.506969191210362e-07, "loss": 0.0234, "reward": 1.2734375596046448, "reward_std": 0.1403703335672617, "rewards/accuracy_reward": 0.2834821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 788 }, { "completion_length": 564.5312805175781, "epoch": 0.23568068105443954, "grad_norm": 0.5530391335487366, "kl": 0.74560546875, "learning_rate": 9.504830922900241e-07, "loss": 0.0298, "reward": 1.129464328289032, "reward_std": 0.17871279641985893, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98214291036129, "step": 789 }, { "completion_length": 513.3080596923828, "epoch": 0.235979389141961, "grad_norm": 0.6904689073562622, "kl": 1.07080078125, "learning_rate": 9.502688297726594e-07, "loss": 0.0429, "reward": 1.145089328289032, "reward_std": 0.17897039838135242, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 790 }, { "completion_length": 535.4576034545898, "epoch": 0.23627809722948248, "grad_norm": 0.7014322280883789, "kl": 0.69482421875, "learning_rate": 9.500541318020382e-07, "loss": 0.0277, "reward": 1.2131697237491608, "reward_std": 0.12851724959909916, "rewards/accuracy_reward": 0.2254464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 791 }, { "completion_length": 626.1763763427734, "epoch": 0.23657680531700395, "grad_norm": 0.8134711980819702, "kl": 0.361328125, "learning_rate": 9.498389986117312e-07, "loss": 0.0145, "reward": 1.2131696939468384, "reward_std": 0.18671451602131128, "rewards/accuracy_reward": 0.2187500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 792 }, { "completion_length": 635.482177734375, "epoch": 0.23687551340452542, "grad_norm": 0.9965940117835999, "kl": 0.626953125, "learning_rate": 9.496234304357822e-07, "loss": 0.0251, "reward": 1.1093750447034836, "reward_std": 0.13091303035616875, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 793 }, { "completion_length": 481.31028747558594, "epoch": 0.2371742214920469, "grad_norm": 1.8337078094482422, "kl": 0.3118896484375, "learning_rate": 9.494074275087081e-07, "loss": 0.0125, "reward": 1.2890625596046448, "reward_std": 0.15605540201067924, "rewards/accuracy_reward": 0.296875023515895, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875447034836, "step": 794 }, { "completion_length": 525.0781555175781, "epoch": 0.23747292957956836, "grad_norm": 0.4117840528488159, "kl": 0.60986328125, "learning_rate": 9.49190990065499e-07, "loss": 0.0244, "reward": 1.1969866454601288, "reward_std": 0.15993358474224806, "rewards/accuracy_reward": 0.2098214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 795 }, { "completion_length": 542.9977798461914, "epoch": 0.23777163766708984, "grad_norm": 0.7983061075210571, "kl": 0.936279296875, "learning_rate": 9.489741183416178e-07, "loss": 0.0374, "reward": 1.1484375149011612, "reward_std": 0.235993392765522, "rewards/accuracy_reward": 0.17187500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625149011612, "step": 796 }, { "completion_length": 530.1250152587891, "epoch": 0.2380703457546113, "grad_norm": 0.5188504457473755, "kl": 0.623046875, "learning_rate": 9.487568125729994e-07, "loss": 0.0249, "reward": 1.1160714626312256, "reward_std": 0.12560179084539413, "rewards/accuracy_reward": 0.1272321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 797 }, { "completion_length": 663.8393096923828, "epoch": 0.23836905384213278, "grad_norm": 0.5393808484077454, "kl": 0.732421875, "learning_rate": 9.485390729960514e-07, "loss": 0.0293, "reward": 1.0770089626312256, "reward_std": 0.18508495762944221, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946790456772, "step": 798 }, { "completion_length": 560.7388458251953, "epoch": 0.23866776192965425, "grad_norm": 0.5781280994415283, "kl": 0.87548828125, "learning_rate": 9.483208998476529e-07, "loss": 0.035, "reward": 1.1919643580913544, "reward_std": 0.17319215275347233, "rewards/accuracy_reward": 0.2075892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 799 }, { "completion_length": 465.19422149658203, "epoch": 0.23896647001717572, "grad_norm": 0.4934673309326172, "kl": 0.2994384765625, "learning_rate": 9.481022933651549e-07, "loss": 0.012, "reward": 1.2745536267757416, "reward_std": 0.1833759816363454, "rewards/accuracy_reward": 0.2767857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9977678656578064, "step": 800 }, { "completion_length": 577.7009124755859, "epoch": 0.2392651781046972, "grad_norm": 0.5216473340988159, "kl": 0.5771484375, "learning_rate": 9.478832537863801e-07, "loss": 0.0231, "reward": 1.1847098767757416, "reward_std": 0.2090733591467142, "rewards/accuracy_reward": 0.19642858393490314, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 801 }, { "completion_length": 493.4643096923828, "epoch": 0.23956388619221866, "grad_norm": 0.2920762002468109, "kl": 0.4857177734375, "learning_rate": 9.476637813496219e-07, "loss": 0.0195, "reward": 1.1049107611179352, "reward_std": 0.14644568227231503, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 802 }, { "completion_length": 553.4196701049805, "epoch": 0.23986259427974013, "grad_norm": 0.4505070447921753, "kl": 0.3431396484375, "learning_rate": 9.474438762936449e-07, "loss": 0.0137, "reward": 1.1305803954601288, "reward_std": 0.12599851563572884, "rewards/accuracy_reward": 0.14062500488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 803 }, { "completion_length": 543.1607437133789, "epoch": 0.2401613023672616, "grad_norm": 0.7030993700027466, "kl": 0.25, "learning_rate": 9.47223538857684e-07, "loss": 0.01, "reward": 1.104352742433548, "reward_std": 0.14498456940054893, "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949777126312256, "step": 804 }, { "completion_length": 560.9843902587891, "epoch": 0.24046001045478307, "grad_norm": 0.6558537483215332, "kl": 0.730712890625, "learning_rate": 9.470027692814451e-07, "loss": 0.0292, "reward": 1.2488839626312256, "reward_std": 0.21744874119758606, "rewards/accuracy_reward": 0.263392873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 805 }, { "completion_length": 614.7545013427734, "epoch": 0.24075871854230455, "grad_norm": 0.7064681053161621, "kl": 0.58935546875, "learning_rate": 9.467815678051036e-07, "loss": 0.0236, "reward": 1.2360491454601288, "reward_std": 0.1861652433872223, "rewards/accuracy_reward": 0.26116072107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884290456772, "step": 806 }, { "completion_length": 500.0937728881836, "epoch": 0.24105742662982602, "grad_norm": 0.35571005940437317, "kl": 0.486083984375, "learning_rate": 9.46559934669305e-07, "loss": 0.0195, "reward": 1.219308078289032, "reward_std": 0.16037281462922692, "rewards/accuracy_reward": 0.2299107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 807 }, { "completion_length": 582.2812881469727, "epoch": 0.24135613471734746, "grad_norm": 0.9047046303749084, "kl": 0.4757080078125, "learning_rate": 9.463378701151646e-07, "loss": 0.0191, "reward": 1.0848214626312256, "reward_std": 0.1279861144721508, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 808 }, { "completion_length": 568.2433242797852, "epoch": 0.24165484280486893, "grad_norm": 0.4924595057964325, "kl": 0.77294921875, "learning_rate": 9.461153743842668e-07, "loss": 0.0309, "reward": 1.0172991901636124, "reward_std": 0.1267413068562746, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 809 }, { "completion_length": 585.0223541259766, "epoch": 0.2419535508923904, "grad_norm": 0.5563468933105469, "kl": 0.792236328125, "learning_rate": 9.458924477186651e-07, "loss": 0.0317, "reward": 1.1406250298023224, "reward_std": 0.14911876246333122, "rewards/accuracy_reward": 0.15401786682195961, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 810 }, { "completion_length": 537.5826187133789, "epoch": 0.24225225897991187, "grad_norm": 0.9024861454963684, "kl": 0.6513671875, "learning_rate": 9.456690903608822e-07, "loss": 0.0261, "reward": 1.1724331080913544, "reward_std": 0.137374484911561, "rewards/accuracy_reward": 0.1875000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 811 }, { "completion_length": 598.7968902587891, "epoch": 0.24255096706743334, "grad_norm": 0.4529784619808197, "kl": 0.5615234375, "learning_rate": 9.454453025539084e-07, "loss": 0.0224, "reward": 1.1808035969734192, "reward_std": 0.19557678699493408, "rewards/accuracy_reward": 0.196428582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 812 }, { "completion_length": 651.2277069091797, "epoch": 0.2428496751549548, "grad_norm": 0.7640814185142517, "kl": 1.080078125, "learning_rate": 9.452210845412032e-07, "loss": 0.0432, "reward": 1.1328125298023224, "reward_std": 0.1809886209666729, "rewards/accuracy_reward": 0.15848214970901608, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9743303954601288, "step": 813 }, { "completion_length": 529.5357437133789, "epoch": 0.24314838324247628, "grad_norm": 0.34580254554748535, "kl": 0.50146484375, "learning_rate": 9.449964365666941e-07, "loss": 0.0201, "reward": 1.1205357611179352, "reward_std": 0.15829378925263882, "rewards/accuracy_reward": 0.12946428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 814 }, { "completion_length": 585.9776916503906, "epoch": 0.24344709132999776, "grad_norm": 0.6586741209030151, "kl": 0.67431640625, "learning_rate": 9.447713588747756e-07, "loss": 0.027, "reward": 1.1512277126312256, "reward_std": 0.20074051059782505, "rewards/accuracy_reward": 0.17633929662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884439468384, "step": 815 }, { "completion_length": 590.5580596923828, "epoch": 0.24374579941751923, "grad_norm": 0.7344064712524414, "kl": 0.52099609375, "learning_rate": 9.445458517103105e-07, "loss": 0.0208, "reward": 1.1992188096046448, "reward_std": 0.1526091480627656, "rewards/accuracy_reward": 0.2075892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 816 }, { "completion_length": 571.4777069091797, "epoch": 0.2440445075050407, "grad_norm": 0.6873796582221985, "kl": 0.47021484375, "learning_rate": 9.443199153186284e-07, "loss": 0.0188, "reward": 1.133370578289032, "reward_std": 0.14040983095765114, "rewards/accuracy_reward": 0.14508929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 817 }, { "completion_length": 653.0424194335938, "epoch": 0.24434321559256217, "grad_norm": 0.5322118997573853, "kl": 0.9075927734375, "learning_rate": 9.440935499455259e-07, "loss": 0.0363, "reward": 1.0418527126312256, "reward_std": 0.15017078816890717, "rewards/accuracy_reward": 0.07142857392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9704241454601288, "step": 818 }, { "completion_length": 571.982177734375, "epoch": 0.24464192368008364, "grad_norm": 0.37171444296836853, "kl": 0.3338623046875, "learning_rate": 9.438667558372665e-07, "loss": 0.0133, "reward": 1.2187500596046448, "reward_std": 0.10077786631882191, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9955357313156128, "step": 819 }, { "completion_length": 594.4888763427734, "epoch": 0.2449406317676051, "grad_norm": 1.0792231559753418, "kl": 0.53662109375, "learning_rate": 9.436395332405798e-07, "loss": 0.0214, "reward": 1.1166295409202576, "reward_std": 0.21413876861333847, "rewards/accuracy_reward": 0.13839286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.97823666036129, "step": 820 }, { "completion_length": 653.107177734375, "epoch": 0.24523933985512658, "grad_norm": 0.6972253918647766, "kl": 1.009765625, "learning_rate": 9.434118824026616e-07, "loss": 0.0404, "reward": 1.1210937798023224, "reward_std": 0.22297341749072075, "rewards/accuracy_reward": 0.14732143515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723618745804, "step": 821 }, { "completion_length": 585.9955596923828, "epoch": 0.24553804794264805, "grad_norm": 0.9743232131004333, "kl": 0.63916015625, "learning_rate": 9.43183803571174e-07, "loss": 0.0256, "reward": 1.1143973767757416, "reward_std": 0.14502291567623615, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.97823666036129, "step": 822 }, { "completion_length": 524.0401992797852, "epoch": 0.24583675603016952, "grad_norm": 0.46702414751052856, "kl": 0.4439697265625, "learning_rate": 9.429552969942443e-07, "loss": 0.0178, "reward": 1.1992188096046448, "reward_std": 0.18108893185853958, "rewards/accuracy_reward": 0.2075892984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 823 }, { "completion_length": 594.8683471679688, "epoch": 0.246135464117691, "grad_norm": 0.4772663414478302, "kl": 0.541015625, "learning_rate": 9.427263629204651e-07, "loss": 0.0216, "reward": 1.184151828289032, "reward_std": 0.20117777958512306, "rewards/accuracy_reward": 0.196428582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 824 }, { "completion_length": 583.5826187133789, "epoch": 0.24643417220521247, "grad_norm": 1.29152250289917, "kl": 0.662109375, "learning_rate": 9.424970015988943e-07, "loss": 0.0264, "reward": 1.0948661118745804, "reward_std": 0.10456458199769258, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 825 }, { "completion_length": 554.2857284545898, "epoch": 0.24673288029273394, "grad_norm": 0.4443506896495819, "kl": 0.42181396484375, "learning_rate": 9.422672132790549e-07, "loss": 0.0169, "reward": 1.2226563096046448, "reward_std": 0.11866672523319721, "rewards/accuracy_reward": 0.2299107275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.992745578289032, "step": 826 }, { "completion_length": 648.8214416503906, "epoch": 0.2470315883802554, "grad_norm": 0.41243091225624084, "kl": 0.709716796875, "learning_rate": 9.420369982109335e-07, "loss": 0.0284, "reward": 1.1277902126312256, "reward_std": 0.18657805770635605, "rewards/accuracy_reward": 0.14285715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 827 }, { "completion_length": 594.4419860839844, "epoch": 0.24733029646777688, "grad_norm": 0.3320216238498688, "kl": 0.3505859375, "learning_rate": 9.41806356644982e-07, "loss": 0.014, "reward": 1.1266741454601288, "reward_std": 0.1657080128788948, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 828 }, { "completion_length": 611.4933395385742, "epoch": 0.24762900455529832, "grad_norm": 0.395601361989975, "kl": 0.490234375, "learning_rate": 9.415752888321154e-07, "loss": 0.0196, "reward": 1.1584821939468384, "reward_std": 0.16444772109389305, "rewards/accuracy_reward": 0.176339291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 829 }, { "completion_length": 528.0424346923828, "epoch": 0.2479277126428198, "grad_norm": 0.4735947549343109, "kl": 0.4576416015625, "learning_rate": 9.413437950237131e-07, "loss": 0.0183, "reward": 1.0864956080913544, "reward_std": 0.1369300801306963, "rewards/accuracy_reward": 0.10267857764847577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 830 }, { "completion_length": 549.3884201049805, "epoch": 0.24822642073034126, "grad_norm": 0.4527842700481415, "kl": 0.171630859375, "learning_rate": 9.411118754716177e-07, "loss": 0.0069, "reward": 1.1813616454601288, "reward_std": 0.14864954352378845, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9983258992433548, "step": 831 }, { "completion_length": 566.7120895385742, "epoch": 0.24852512881786273, "grad_norm": 2.5112602710723877, "kl": 0.408203125, "learning_rate": 9.40879530428135e-07, "loss": 0.0163, "reward": 1.152343824505806, "reward_std": 0.160810224711895, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 832 }, { "completion_length": 590.3192291259766, "epoch": 0.2488238369053842, "grad_norm": 0.3538128435611725, "kl": 0.08514404296875, "learning_rate": 9.406467601460333e-07, "loss": 0.0034, "reward": 1.1367188096046448, "reward_std": 0.13829970732331276, "rewards/accuracy_reward": 0.14062500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937649011612, "step": 833 }, { "completion_length": 607.6361846923828, "epoch": 0.24912254499290568, "grad_norm": 1.199746012687683, "kl": 0.373779296875, "learning_rate": 9.404135648785441e-07, "loss": 0.015, "reward": 1.1088170111179352, "reward_std": 0.13336562924087048, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 834 }, { "completion_length": 591.9821701049805, "epoch": 0.24942125308042715, "grad_norm": 0.5152125358581543, "kl": 0.57275390625, "learning_rate": 9.40179944879361e-07, "loss": 0.0229, "reward": 1.1322545111179352, "reward_std": 0.15407893992960453, "rewards/accuracy_reward": 0.15625000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.976004496216774, "step": 835 }, { "completion_length": 574.6250305175781, "epoch": 0.24971996116794862, "grad_norm": 0.5949001908302307, "kl": 0.498046875, "learning_rate": 9.399459004026396e-07, "loss": 0.0199, "reward": 1.1763393580913544, "reward_std": 0.16634082235395908, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 836 }, { "completion_length": 577.5379867553711, "epoch": 0.2500186692554701, "grad_norm": 0.8412688970565796, "kl": 0.2581787109375, "learning_rate": 9.397114317029974e-07, "loss": 0.0103, "reward": 1.1428571939468384, "reward_std": 0.16445696726441383, "rewards/accuracy_reward": 0.15178572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 837 }, { "completion_length": 599.8616180419922, "epoch": 0.2503173773429916, "grad_norm": 0.5147051215171814, "kl": 0.4501953125, "learning_rate": 9.394765390355133e-07, "loss": 0.018, "reward": 1.1813616454601288, "reward_std": 0.19456632435321808, "rewards/accuracy_reward": 0.20089286752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 838 }, { "completion_length": 611.8393096923828, "epoch": 0.25061608543051306, "grad_norm": 0.3369448781013489, "kl": 0.4560546875, "learning_rate": 9.392412226557275e-07, "loss": 0.0183, "reward": 1.0686384588479996, "reward_std": 0.10218443162739277, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 839 }, { "completion_length": 541.7835083007812, "epoch": 0.2509147935180345, "grad_norm": 1.4830896854400635, "kl": 0.7451171875, "learning_rate": 9.390054828196412e-07, "loss": 0.0298, "reward": 1.2354911267757416, "reward_std": 0.13625861518085003, "rewards/accuracy_reward": 0.25223215389996767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 840 }, { "completion_length": 603.982177734375, "epoch": 0.25121350160555594, "grad_norm": 1.1612263917922974, "kl": 0.37255859375, "learning_rate": 9.387693197837162e-07, "loss": 0.0149, "reward": 1.160714328289032, "reward_std": 0.19436660036444664, "rewards/accuracy_reward": 0.17857143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 841 }, { "completion_length": 520.8527069091797, "epoch": 0.2515122096930774, "grad_norm": 0.8383960723876953, "kl": 0.3436279296875, "learning_rate": 9.385327338048749e-07, "loss": 0.0137, "reward": 1.1428571939468384, "reward_std": 0.11377295851707458, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 842 }, { "completion_length": 542.8571624755859, "epoch": 0.2518109177805989, "grad_norm": 0.6731235980987549, "kl": 0.35498046875, "learning_rate": 9.382957251404995e-07, "loss": 0.0142, "reward": 1.1406250596046448, "reward_std": 0.14324615336954594, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 843 }, { "completion_length": 595.3616256713867, "epoch": 0.25210962586812036, "grad_norm": 1.0994726419448853, "kl": 0.8148193359375, "learning_rate": 9.38058294048432e-07, "loss": 0.0327, "reward": 1.1584821939468384, "reward_std": 0.16578829661011696, "rewards/accuracy_reward": 0.17410715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 844 }, { "completion_length": 537.9352951049805, "epoch": 0.25240833395564183, "grad_norm": 0.3866572976112366, "kl": 0.41058349609375, "learning_rate": 9.378204407869747e-07, "loss": 0.0164, "reward": 1.225446492433548, "reward_std": 0.12481418624520302, "rewards/accuracy_reward": 0.2321428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 845 }, { "completion_length": 569.2968902587891, "epoch": 0.2527070420431633, "grad_norm": 0.7516779899597168, "kl": 0.44970703125, "learning_rate": 9.37582165614888e-07, "loss": 0.018, "reward": 1.2036830484867096, "reward_std": 0.19335604459047318, "rewards/accuracy_reward": 0.2142857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 846 }, { "completion_length": 548.9687728881836, "epoch": 0.25300575013068477, "grad_norm": 0.3459845185279846, "kl": 0.533447265625, "learning_rate": 9.373434687913924e-07, "loss": 0.0213, "reward": 1.0881697237491608, "reward_std": 0.0808285465463996, "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 847 }, { "completion_length": 623.1428833007812, "epoch": 0.25330445821820624, "grad_norm": 1.5115634202957153, "kl": 0.7431640625, "learning_rate": 9.371043505761664e-07, "loss": 0.0298, "reward": 1.1328125298023224, "reward_std": 0.08527540601789951, "rewards/accuracy_reward": 0.14508928824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 848 }, { "completion_length": 561.162971496582, "epoch": 0.2536031663057277, "grad_norm": 1.263567566871643, "kl": 0.479736328125, "learning_rate": 9.368648112293474e-07, "loss": 0.0192, "reward": 1.1344866454601288, "reward_std": 0.18926744908094406, "rewards/accuracy_reward": 0.14732143841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 849 }, { "completion_length": 571.1674346923828, "epoch": 0.2539018743932492, "grad_norm": 0.3689493238925934, "kl": 0.234619140625, "learning_rate": 9.366248510115307e-07, "loss": 0.0094, "reward": 1.1478795111179352, "reward_std": 0.1350373812019825, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937649011612, "step": 850 }, { "completion_length": 598.9799346923828, "epoch": 0.25420058248077065, "grad_norm": 0.4493383765220642, "kl": 0.367431640625, "learning_rate": 9.363844701837692e-07, "loss": 0.0147, "reward": 1.2232143580913544, "reward_std": 0.18081500567495823, "rewards/accuracy_reward": 0.2366071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 851 }, { "completion_length": 489.9062728881836, "epoch": 0.2544992905682921, "grad_norm": 0.2528066337108612, "kl": 0.12005615234375, "learning_rate": 9.361436690075739e-07, "loss": 0.0048, "reward": 1.1674107909202576, "reward_std": 0.09870505146682262, "rewards/accuracy_reward": 0.16964286752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9977678805589676, "step": 852 }, { "completion_length": 507.7968978881836, "epoch": 0.2547979986558136, "grad_norm": 0.7789764404296875, "kl": 0.27581787109375, "learning_rate": 9.359024477449128e-07, "loss": 0.011, "reward": 1.258370578289032, "reward_std": 0.12830033339560032, "rewards/accuracy_reward": 0.2678571566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 853 }, { "completion_length": 613.8772583007812, "epoch": 0.25509670674333507, "grad_norm": 0.6732026934623718, "kl": 0.492431640625, "learning_rate": 9.356608066582113e-07, "loss": 0.0197, "reward": 1.1322545111179352, "reward_std": 0.17199627310037613, "rewards/accuracy_reward": 0.14955357927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 854 }, { "completion_length": 610.6294860839844, "epoch": 0.25539541483085654, "grad_norm": 0.4921303391456604, "kl": 0.69287109375, "learning_rate": 9.354187460103508e-07, "loss": 0.0278, "reward": 1.1227678954601288, "reward_std": 0.1288370881229639, "rewards/accuracy_reward": 0.14062500977888703, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98214291036129, "step": 855 }, { "completion_length": 640.5312957763672, "epoch": 0.255694122918378, "grad_norm": 2.0342624187469482, "kl": 0.71484375, "learning_rate": 9.351762660646698e-07, "loss": 0.0285, "reward": 1.0217634588479996, "reward_std": 0.16880080476403236, "rewards/accuracy_reward": 0.055803572526201606, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9659598767757416, "step": 856 }, { "completion_length": 460.45984649658203, "epoch": 0.2559928310058995, "grad_norm": 0.585967481136322, "kl": 0.20123291015625, "learning_rate": 9.349333670849628e-07, "loss": 0.0081, "reward": 1.1595982611179352, "reward_std": 0.12537458539009094, "rewards/accuracy_reward": 0.165178582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 857 }, { "completion_length": 570.8526992797852, "epoch": 0.25629153909342095, "grad_norm": 0.5823943018913269, "kl": 0.65966796875, "learning_rate": 9.346900493354798e-07, "loss": 0.0264, "reward": 1.106026828289032, "reward_std": 0.11412902269512415, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946790456772, "step": 858 }, { "completion_length": 559.1094055175781, "epoch": 0.2565902471809424, "grad_norm": 0.4130266308784485, "kl": 0.7021484375, "learning_rate": 9.344463130809267e-07, "loss": 0.0281, "reward": 1.1104911267757416, "reward_std": 0.16115250065922737, "rewards/accuracy_reward": 0.12053571734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 859 }, { "completion_length": 545.9643096923828, "epoch": 0.2568889552684639, "grad_norm": 0.6934730410575867, "kl": 0.835693359375, "learning_rate": 9.342021585864649e-07, "loss": 0.0334, "reward": 1.1674107611179352, "reward_std": 0.152545265853405, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428805589676, "step": 860 }, { "completion_length": 662.3482360839844, "epoch": 0.25718766335598536, "grad_norm": 0.8332504630088806, "kl": 0.904296875, "learning_rate": 9.339575861177103e-07, "loss": 0.0362, "reward": 1.1406250596046448, "reward_std": 0.17979956604540348, "rewards/accuracy_reward": 0.16294643771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 861 }, { "completion_length": 521.1897430419922, "epoch": 0.25748637144350683, "grad_norm": 0.7529537677764893, "kl": 0.282470703125, "learning_rate": 9.337125959407341e-07, "loss": 0.0113, "reward": 1.2008929252624512, "reward_std": 0.16208594292402267, "rewards/accuracy_reward": 0.212053582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 862 }, { "completion_length": 525.8214416503906, "epoch": 0.2577850795310283, "grad_norm": 0.5077219009399414, "kl": 0.5224609375, "learning_rate": 9.33467188322061e-07, "loss": 0.0209, "reward": 1.1350446939468384, "reward_std": 0.1411692462861538, "rewards/accuracy_reward": 0.14508928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 863 }, { "completion_length": 536.0893096923828, "epoch": 0.2580837876185498, "grad_norm": 0.4176841676235199, "kl": 0.24560546875, "learning_rate": 9.332213635286713e-07, "loss": 0.0098, "reward": 1.0920759737491608, "reward_std": 0.10931755043566227, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 864 }, { "completion_length": 595.6741333007812, "epoch": 0.25838249570607125, "grad_norm": 0.5937929153442383, "kl": 0.382568359375, "learning_rate": 9.329751218279975e-07, "loss": 0.0153, "reward": 1.1311384439468384, "reward_std": 0.16943835839629173, "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 865 }, { "completion_length": 646.2165374755859, "epoch": 0.2586812037935927, "grad_norm": 0.362122118473053, "kl": 0.67138671875, "learning_rate": 9.327284634879269e-07, "loss": 0.0268, "reward": 1.166852742433548, "reward_std": 0.2553482837975025, "rewards/accuracy_reward": 0.1897321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9771205633878708, "step": 866 }, { "completion_length": 513.7656326293945, "epoch": 0.2589799118811142, "grad_norm": 0.4364132583141327, "kl": 0.33837890625, "learning_rate": 9.324813887767993e-07, "loss": 0.0135, "reward": 1.2220982611179352, "reward_std": 0.1461089923977852, "rewards/accuracy_reward": 0.2276785857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 867 }, { "completion_length": 600.185302734375, "epoch": 0.25927861996863566, "grad_norm": 0.4120602309703827, "kl": 0.78271484375, "learning_rate": 9.322338979634082e-07, "loss": 0.0313, "reward": 1.1635045409202576, "reward_std": 0.22782481461763382, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 868 }, { "completion_length": 573.4442291259766, "epoch": 0.25957732805615713, "grad_norm": 4.449687480926514, "kl": 0.7333984375, "learning_rate": 9.319859913169987e-07, "loss": 0.0294, "reward": 1.2260045260190964, "reward_std": 0.1259610913693905, "rewards/accuracy_reward": 0.23883929708972573, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 869 }, { "completion_length": 552.3594055175781, "epoch": 0.2598760361436786, "grad_norm": 0.350967675447464, "kl": 0.400390625, "learning_rate": 9.317376691072694e-07, "loss": 0.016, "reward": 1.0998884439468384, "reward_std": 0.16713416203856468, "rewards/accuracy_reward": 0.10714286239817739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 870 }, { "completion_length": 516.2522583007812, "epoch": 0.2601747442312001, "grad_norm": 0.7615167498588562, "kl": 0.4326171875, "learning_rate": 9.314889316043706e-07, "loss": 0.0173, "reward": 1.217633992433548, "reward_std": 0.134350024163723, "rewards/accuracy_reward": 0.2276785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 871 }, { "completion_length": 507.49109649658203, "epoch": 0.26047345231872154, "grad_norm": 0.314506858587265, "kl": 0.5196533203125, "learning_rate": 9.312397790789039e-07, "loss": 0.0209, "reward": 1.1149554252624512, "reward_std": 0.20135390013456345, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 872 }, { "completion_length": 520.1763763427734, "epoch": 0.260772160406243, "grad_norm": 0.7186776399612427, "kl": 0.49951171875, "learning_rate": 9.309902118019233e-07, "loss": 0.02, "reward": 1.1138393133878708, "reward_std": 0.10645312676206231, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 873 }, { "completion_length": 663.9174346923828, "epoch": 0.2610708684937645, "grad_norm": 0.6773512959480286, "kl": 0.95654296875, "learning_rate": 9.307402300449332e-07, "loss": 0.0383, "reward": 1.0524554252624512, "reward_std": 0.14536426588892937, "rewards/accuracy_reward": 0.07142857415601611, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 874 }, { "completion_length": 560.3906402587891, "epoch": 0.26136957658128596, "grad_norm": 0.46104252338409424, "kl": 0.8619384765625, "learning_rate": 9.304898340798894e-07, "loss": 0.0344, "reward": 1.2477679401636124, "reward_std": 0.21645521745085716, "rewards/accuracy_reward": 0.2633928726427257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 875 }, { "completion_length": 580.4732360839844, "epoch": 0.2616682846688074, "grad_norm": 1.2469940185546875, "kl": 0.68994140625, "learning_rate": 9.302390241791981e-07, "loss": 0.0276, "reward": 1.1640625596046448, "reward_std": 0.15164926648139954, "rewards/accuracy_reward": 0.1741071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 876 }, { "completion_length": 631.2187652587891, "epoch": 0.2619669927563289, "grad_norm": 1.365164041519165, "kl": 1.4267578125, "learning_rate": 9.299878006157159e-07, "loss": 0.0572, "reward": 1.1757812798023224, "reward_std": 0.14457961171865463, "rewards/accuracy_reward": 0.1986607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.977120578289032, "step": 877 }, { "completion_length": 569.7410888671875, "epoch": 0.26226570084385037, "grad_norm": 0.6709088683128357, "kl": 0.7421875, "learning_rate": 9.297361636627496e-07, "loss": 0.0297, "reward": 1.1847098767757416, "reward_std": 0.18815820664167404, "rewards/accuracy_reward": 0.19866071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 878 }, { "completion_length": 540.0268096923828, "epoch": 0.26256440893137184, "grad_norm": 0.568515419960022, "kl": 0.562744140625, "learning_rate": 9.294841135940553e-07, "loss": 0.0225, "reward": 1.1802456080913544, "reward_std": 0.09282771311700344, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 879 }, { "completion_length": 628.5736770629883, "epoch": 0.2628631170188933, "grad_norm": 0.412227064371109, "kl": 0.726318359375, "learning_rate": 9.292316506838387e-07, "loss": 0.0291, "reward": 1.1523438096046448, "reward_std": 0.17808657884597778, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 880 }, { "completion_length": 599.7589492797852, "epoch": 0.2631618251064148, "grad_norm": 0.5343768000602722, "kl": 0.8203125, "learning_rate": 9.289787752067549e-07, "loss": 0.0328, "reward": 1.1077009439468384, "reward_std": 0.13183025387115777, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 881 }, { "completion_length": 613.779052734375, "epoch": 0.26346053319393625, "grad_norm": 1.198655128479004, "kl": 0.8173828125, "learning_rate": 9.287254874379077e-07, "loss": 0.0327, "reward": 1.1835938096046448, "reward_std": 0.19375195167958736, "rewards/accuracy_reward": 0.20089286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.982700914144516, "step": 882 }, { "completion_length": 541.0759124755859, "epoch": 0.26375924128145767, "grad_norm": 0.6406006813049316, "kl": 0.574951171875, "learning_rate": 9.284717876528492e-07, "loss": 0.023, "reward": 1.1049107611179352, "reward_std": 0.16176280844956636, "rewards/accuracy_reward": 0.12053572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750596046448, "step": 883 }, { "completion_length": 644.8951416015625, "epoch": 0.26405794936897914, "grad_norm": 1.3040071725845337, "kl": 0.912109375, "learning_rate": 9.282176761275799e-07, "loss": 0.0365, "reward": 1.0468750596046448, "reward_std": 0.1803664918988943, "rewards/accuracy_reward": 0.07142857369035482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464775323868, "step": 884 }, { "completion_length": 520.6227951049805, "epoch": 0.2643566574565006, "grad_norm": 0.3576851487159729, "kl": 0.5628662109375, "learning_rate": 9.279631531385483e-07, "loss": 0.0225, "reward": 1.2672991454601288, "reward_std": 0.1295923851430416, "rewards/accuracy_reward": 0.274553582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455484867096, "step": 885 }, { "completion_length": 549.2009353637695, "epoch": 0.2646553655440221, "grad_norm": 0.49231430888175964, "kl": 0.7762451171875, "learning_rate": 9.277082189626506e-07, "loss": 0.031, "reward": 1.1852678954601288, "reward_std": 0.14668495394289494, "rewards/accuracy_reward": 0.19866072502918541, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 886 }, { "completion_length": 562.7076187133789, "epoch": 0.26495407363154355, "grad_norm": 0.555492639541626, "kl": 0.50927734375, "learning_rate": 9.274528738772299e-07, "loss": 0.0204, "reward": 1.113839328289032, "reward_std": 0.12142760679125786, "rewards/accuracy_reward": 0.12723215040750802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 887 }, { "completion_length": 563.7567291259766, "epoch": 0.265252781719065, "grad_norm": 0.46732810139656067, "kl": 0.419677734375, "learning_rate": 9.27197118160077e-07, "loss": 0.0168, "reward": 1.1350446939468384, "reward_std": 0.14343344885855913, "rewards/accuracy_reward": 0.14062500488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9944196492433548, "step": 888 }, { "completion_length": 615.0312805175781, "epoch": 0.2655514898065865, "grad_norm": 1.1374934911727905, "kl": 0.4091796875, "learning_rate": 9.269409520894285e-07, "loss": 0.0164, "reward": 1.078683078289032, "reward_std": 0.15369689650833607, "rewards/accuracy_reward": 0.08928571594879031, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 889 }, { "completion_length": 538.7210006713867, "epoch": 0.26585019789410796, "grad_norm": 0.376068651676178, "kl": 0.8095703125, "learning_rate": 9.266843759439685e-07, "loss": 0.0324, "reward": 1.2823661267757416, "reward_std": 0.254325695335865, "rewards/accuracy_reward": 0.2991071566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 890 }, { "completion_length": 551.1495742797852, "epoch": 0.26614890598162944, "grad_norm": 0.4729823172092438, "kl": 0.8115234375, "learning_rate": 9.264273900028265e-07, "loss": 0.0325, "reward": 1.20870541036129, "reward_std": 0.17729134857654572, "rewards/accuracy_reward": 0.2276785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 891 }, { "completion_length": 555.6071701049805, "epoch": 0.2664476140691509, "grad_norm": 0.4330810010433197, "kl": 0.26959228515625, "learning_rate": 9.26169994545578e-07, "loss": 0.0108, "reward": 1.1947545409202576, "reward_std": 0.14379460364580154, "rewards/accuracy_reward": 0.2053571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 892 }, { "completion_length": 642.3236846923828, "epoch": 0.2667463221566724, "grad_norm": 0.6869919300079346, "kl": 0.97607421875, "learning_rate": 9.259121898522442e-07, "loss": 0.0391, "reward": 1.1768973767757416, "reward_std": 0.18049206770956516, "rewards/accuracy_reward": 0.20312500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723618745804, "step": 893 }, { "completion_length": 569.810302734375, "epoch": 0.26704503024419385, "grad_norm": 0.2670084834098816, "kl": 0.41357421875, "learning_rate": 9.256539762032909e-07, "loss": 0.0166, "reward": 1.1300223767757416, "reward_std": 0.10964337084442377, "rewards/accuracy_reward": 0.14062500488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 894 }, { "completion_length": 585.8236846923828, "epoch": 0.2673437383317153, "grad_norm": 0.2895140051841736, "kl": 0.292236328125, "learning_rate": 9.253953538796299e-07, "loss": 0.0117, "reward": 1.1077009439468384, "reward_std": 0.16938695311546326, "rewards/accuracy_reward": 0.11830357764847577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 895 }, { "completion_length": 585.1295013427734, "epoch": 0.2676424464192368, "grad_norm": 0.5548635125160217, "kl": 0.533203125, "learning_rate": 9.251363231626161e-07, "loss": 0.0213, "reward": 1.1819196939468384, "reward_std": 0.21459459140896797, "rewards/accuracy_reward": 0.1986607275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 896 }, { "completion_length": 623.5803680419922, "epoch": 0.26794115450675826, "grad_norm": 0.6818780303001404, "kl": 0.73095703125, "learning_rate": 9.248768843340505e-07, "loss": 0.0292, "reward": 1.1171875447034836, "reward_std": 0.11780876107513905, "rewards/accuracy_reward": 0.13839286123402417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946939468384, "step": 897 }, { "completion_length": 629.1786041259766, "epoch": 0.26823986259427973, "grad_norm": 0.6790116429328918, "kl": 0.699462890625, "learning_rate": 9.246170376761763e-07, "loss": 0.028, "reward": 1.1104911267757416, "reward_std": 0.13147182390093803, "rewards/accuracy_reward": 0.12946429336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 898 }, { "completion_length": 518.1317138671875, "epoch": 0.2685385706818012, "grad_norm": 0.6455548405647278, "kl": 0.5068359375, "learning_rate": 9.243567834716818e-07, "loss": 0.0203, "reward": 1.237165242433548, "reward_std": 0.19271822459995747, "rewards/accuracy_reward": 0.2522321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 899 }, { "completion_length": 576.2411041259766, "epoch": 0.2688372787693227, "grad_norm": 0.7558750510215759, "kl": 0.50830078125, "learning_rate": 9.240961220036976e-07, "loss": 0.0203, "reward": 1.1316964626312256, "reward_std": 0.2299208790063858, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 900 }, { "completion_length": 568.0134048461914, "epoch": 0.26913598685684414, "grad_norm": 0.9008967280387878, "kl": 0.571044921875, "learning_rate": 9.23835053555798e-07, "loss": 0.0229, "reward": 1.1434152126312256, "reward_std": 0.11657683737576008, "rewards/accuracy_reward": 0.14955358067527413, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616156578064, "step": 901 }, { "completion_length": 516.7745742797852, "epoch": 0.2694346949443656, "grad_norm": 0.43788689374923706, "kl": 0.4189453125, "learning_rate": 9.23573578412e-07, "loss": 0.0167, "reward": 1.090401828289032, "reward_std": 0.12232772447168827, "rewards/accuracy_reward": 0.10044643585570157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98995541036129, "step": 902 }, { "completion_length": 657.5670013427734, "epoch": 0.2697334030318871, "grad_norm": 0.40157440304756165, "kl": 0.928466796875, "learning_rate": 9.233116968567627e-07, "loss": 0.0371, "reward": 1.117745578289032, "reward_std": 0.17824473977088928, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793526977300644, "step": 903 }, { "completion_length": 518.2902069091797, "epoch": 0.27003211111940856, "grad_norm": 1.359671950340271, "kl": 0.44287109375, "learning_rate": 9.230494091749879e-07, "loss": 0.0177, "reward": 1.1406250596046448, "reward_std": 0.1261586770415306, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 904 }, { "completion_length": 550.9531555175781, "epoch": 0.27033081920693003, "grad_norm": 0.9059129357337952, "kl": 0.539306640625, "learning_rate": 9.227867156520186e-07, "loss": 0.0216, "reward": 1.0424107313156128, "reward_std": 0.1210304256528616, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 905 }, { "completion_length": 623.935302734375, "epoch": 0.2706295272944515, "grad_norm": 0.5520076155662537, "kl": 0.57470703125, "learning_rate": 9.225236165736395e-07, "loss": 0.023, "reward": 1.1847098767757416, "reward_std": 0.16431425511837006, "rewards/accuracy_reward": 0.20089287031441927, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 906 }, { "completion_length": 605.9085083007812, "epoch": 0.27092823538197297, "grad_norm": 1.3464765548706055, "kl": 0.662109375, "learning_rate": 9.222601122260771e-07, "loss": 0.0265, "reward": 1.0954241454601288, "reward_std": 0.11895322613418102, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 907 }, { "completion_length": 583.0893096923828, "epoch": 0.27122694346949444, "grad_norm": 0.4892924726009369, "kl": 0.5477294921875, "learning_rate": 9.219962028959978e-07, "loss": 0.0219, "reward": 1.2109375596046448, "reward_std": 0.1870943084359169, "rewards/accuracy_reward": 0.22544644121080637, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 908 }, { "completion_length": 571.0000305175781, "epoch": 0.2715256515570159, "grad_norm": 0.42245879769325256, "kl": 0.5506591796875, "learning_rate": 9.217318888705094e-07, "loss": 0.0221, "reward": 1.209821492433548, "reward_std": 0.19262821972370148, "rewards/accuracy_reward": 0.2187500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 909 }, { "completion_length": 625.6473541259766, "epoch": 0.2718243596445374, "grad_norm": 0.3228980600833893, "kl": 0.5888671875, "learning_rate": 9.214671704371596e-07, "loss": 0.0236, "reward": 1.1618303954601288, "reward_std": 0.1861545518040657, "rewards/accuracy_reward": 0.1718750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 910 }, { "completion_length": 500.0558319091797, "epoch": 0.27212306773205885, "grad_norm": 0.6225403547286987, "kl": 0.22509765625, "learning_rate": 9.212020478839359e-07, "loss": 0.009, "reward": 1.2170759737491608, "reward_std": 0.18436596542596817, "rewards/accuracy_reward": 0.2209821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937798023224, "step": 911 }, { "completion_length": 626.607177734375, "epoch": 0.2724217758195803, "grad_norm": 0.3938045799732208, "kl": 0.87841796875, "learning_rate": 9.20936521499266e-07, "loss": 0.0351, "reward": 1.1205357611179352, "reward_std": 0.18636099249124527, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 912 }, { "completion_length": 527.3259201049805, "epoch": 0.2727204839071018, "grad_norm": 0.22259804606437683, "kl": 0.4913330078125, "learning_rate": 9.206705915720162e-07, "loss": 0.0197, "reward": 1.1456473767757416, "reward_std": 0.13029003888368607, "rewards/accuracy_reward": 0.15848215389996767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 913 }, { "completion_length": 557.5089569091797, "epoch": 0.27301919199462327, "grad_norm": 1.5124542713165283, "kl": 0.829345703125, "learning_rate": 9.204042583914925e-07, "loss": 0.0332, "reward": 1.116071492433548, "reward_std": 0.15215561725199223, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 914 }, { "completion_length": 598.3861846923828, "epoch": 0.27331790008214474, "grad_norm": 0.9961355328559875, "kl": 0.7880859375, "learning_rate": 9.201375222474392e-07, "loss": 0.0315, "reward": 1.137276828289032, "reward_std": 0.11574885249137878, "rewards/accuracy_reward": 0.14732143771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98995541036129, "step": 915 }, { "completion_length": 574.9442367553711, "epoch": 0.2736166081696662, "grad_norm": 0.885500967502594, "kl": 0.5621337890625, "learning_rate": 9.198703834300391e-07, "loss": 0.0225, "reward": 1.1690848469734192, "reward_std": 0.12254779040813446, "rewards/accuracy_reward": 0.1763392873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 916 }, { "completion_length": 553.7299346923828, "epoch": 0.2739153162571877, "grad_norm": 0.3820025622844696, "kl": 0.299072265625, "learning_rate": 9.19602842229913e-07, "loss": 0.012, "reward": 1.1328125298023224, "reward_std": 0.1388651393353939, "rewards/accuracy_reward": 0.14062500302679837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 917 }, { "completion_length": 586.1227874755859, "epoch": 0.27421402434470915, "grad_norm": 0.5103215575218201, "kl": 0.735107421875, "learning_rate": 9.193348989381196e-07, "loss": 0.0294, "reward": 1.0401786118745804, "reward_std": 0.10367594100534916, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 918 }, { "completion_length": 515.7589569091797, "epoch": 0.2745127324322306, "grad_norm": 0.5536638498306274, "kl": 0.767578125, "learning_rate": 9.190665538461546e-07, "loss": 0.0307, "reward": 1.2053572237491608, "reward_std": 0.16453335154801607, "rewards/accuracy_reward": 0.22321429708972573, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428805589676, "step": 919 }, { "completion_length": 651.2723541259766, "epoch": 0.2748114405197521, "grad_norm": 0.6049442291259766, "kl": 0.7158203125, "learning_rate": 9.187978072459517e-07, "loss": 0.0286, "reward": 1.248883992433548, "reward_std": 0.18450932949781418, "rewards/accuracy_reward": 0.26785715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 920 }, { "completion_length": 571.9196624755859, "epoch": 0.27511014860727356, "grad_norm": 0.26313653588294983, "kl": 0.513671875, "learning_rate": 9.185286594298804e-07, "loss": 0.0205, "reward": 1.1233259737491608, "reward_std": 0.13865310233086348, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916294813156128, "step": 921 }, { "completion_length": 597.8839416503906, "epoch": 0.27540885669479503, "grad_norm": 0.24607816338539124, "kl": 0.468017578125, "learning_rate": 9.182591106907474e-07, "loss": 0.0187, "reward": 1.0831473767757416, "reward_std": 0.1538062710314989, "rewards/accuracy_reward": 0.1004464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 922 }, { "completion_length": 470.4152069091797, "epoch": 0.2757075647823165, "grad_norm": 0.5443041324615479, "kl": 0.28131103515625, "learning_rate": 9.179891613217953e-07, "loss": 0.0113, "reward": 1.174665242433548, "reward_std": 0.10641314834356308, "rewards/accuracy_reward": 0.18303572502918541, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916295111179352, "step": 923 }, { "completion_length": 608.0000152587891, "epoch": 0.276006272869838, "grad_norm": 1.0227383375167847, "kl": 0.395751953125, "learning_rate": 9.177188116167025e-07, "loss": 0.0158, "reward": 1.1551339626312256, "reward_std": 0.14814446866512299, "rewards/accuracy_reward": 0.16741072479635477, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 924 }, { "completion_length": 575.4219055175781, "epoch": 0.27630498095735945, "grad_norm": 0.6639360189437866, "kl": 0.53662109375, "learning_rate": 9.174480618695829e-07, "loss": 0.0215, "reward": 1.0546875596046448, "reward_std": 0.1480580810457468, "rewards/accuracy_reward": 0.07142857322469354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 925 }, { "completion_length": 566.5111923217773, "epoch": 0.27660368904488086, "grad_norm": 0.5454622507095337, "kl": 0.3536376953125, "learning_rate": 9.171769123749857e-07, "loss": 0.0142, "reward": 1.0831473767757416, "reward_std": 0.1216332409530878, "rewards/accuracy_reward": 0.08928571920841932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 926 }, { "completion_length": 568.7589569091797, "epoch": 0.27690239713240233, "grad_norm": 0.20396657288074493, "kl": 0.42822265625, "learning_rate": 9.169053634278952e-07, "loss": 0.0172, "reward": 1.139508992433548, "reward_std": 0.18499912042170763, "rewards/accuracy_reward": 0.149553582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 927 }, { "completion_length": 468.1116256713867, "epoch": 0.2772011052199238, "grad_norm": 0.8305671215057373, "kl": 0.50146484375, "learning_rate": 9.166334153237298e-07, "loss": 0.0201, "reward": 1.2639509439468384, "reward_std": 0.21122267469763756, "rewards/accuracy_reward": 0.2767857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 928 }, { "completion_length": 569.2924346923828, "epoch": 0.2774998133074453, "grad_norm": 0.696872889995575, "kl": 0.60205078125, "learning_rate": 9.163610683583426e-07, "loss": 0.024, "reward": 1.0563616156578064, "reward_std": 0.12082820385694504, "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 929 }, { "completion_length": 534.669677734375, "epoch": 0.27779852139496675, "grad_norm": 0.3313974440097809, "kl": 0.63427734375, "learning_rate": 9.1608832282802e-07, "loss": 0.0254, "reward": 1.1858259439468384, "reward_std": 0.17009830474853516, "rewards/accuracy_reward": 0.2008928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 930 }, { "completion_length": 549.8705520629883, "epoch": 0.2780972294824882, "grad_norm": 0.5589247941970825, "kl": 0.701171875, "learning_rate": 9.158151790294828e-07, "loss": 0.028, "reward": 1.1914063096046448, "reward_std": 0.17256056144833565, "rewards/accuracy_reward": 0.20312500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 931 }, { "completion_length": 608.4576034545898, "epoch": 0.2783959375700097, "grad_norm": 0.4177086055278778, "kl": 0.509765625, "learning_rate": 9.155416372598847e-07, "loss": 0.0204, "reward": 1.0597098469734192, "reward_std": 0.12460575858131051, "rewards/accuracy_reward": 0.07142857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 932 }, { "completion_length": 537.6852874755859, "epoch": 0.27869464565753116, "grad_norm": 0.8698262572288513, "kl": 0.587890625, "learning_rate": 9.152676978168121e-07, "loss": 0.0235, "reward": 1.1891741454601288, "reward_std": 0.14359183236956596, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 933 }, { "completion_length": 519.0580596923828, "epoch": 0.27899335374505263, "grad_norm": 0.5529195666313171, "kl": 0.77880859375, "learning_rate": 9.149933609982843e-07, "loss": 0.0311, "reward": 1.156808078289032, "reward_std": 0.0786971002817154, "rewards/accuracy_reward": 0.16964286868460476, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 934 }, { "completion_length": 505.98663330078125, "epoch": 0.2792920618325741, "grad_norm": 0.8839489221572876, "kl": 0.654541015625, "learning_rate": 9.14718627102753e-07, "loss": 0.0262, "reward": 1.174665242433548, "reward_std": 0.16947070136666298, "rewards/accuracy_reward": 0.1897321566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 935 }, { "completion_length": 604.8013610839844, "epoch": 0.27959076992009557, "grad_norm": 1.5212138891220093, "kl": 0.720703125, "learning_rate": 9.144434964291017e-07, "loss": 0.0288, "reward": 1.1969866752624512, "reward_std": 0.1987488493323326, "rewards/accuracy_reward": 0.2075892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 936 }, { "completion_length": 552.9977951049805, "epoch": 0.27988947800761704, "grad_norm": 0.9869104027748108, "kl": 0.943359375, "learning_rate": 9.141679692766453e-07, "loss": 0.0377, "reward": 1.1383928954601288, "reward_std": 0.18967360630631447, "rewards/accuracy_reward": 0.16294643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464775323868, "step": 937 }, { "completion_length": 559.8482437133789, "epoch": 0.2801881860951385, "grad_norm": 1.0046206712722778, "kl": 0.7066650390625, "learning_rate": 9.138920459451309e-07, "loss": 0.0283, "reward": 1.1484375596046448, "reward_std": 0.15522260777652264, "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 938 }, { "completion_length": 573.9888610839844, "epoch": 0.28048689418266, "grad_norm": 0.3951917588710785, "kl": 0.71337890625, "learning_rate": 9.136157267347358e-07, "loss": 0.0285, "reward": 1.061941996216774, "reward_std": 0.11855940823443234, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 939 }, { "completion_length": 517.9710159301758, "epoch": 0.28078560227018146, "grad_norm": 0.5482922792434692, "kl": 0.24444580078125, "learning_rate": 9.133390119460681e-07, "loss": 0.0098, "reward": 1.1506696939468384, "reward_std": 0.14120024256408215, "rewards/accuracy_reward": 0.15848215157166123, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875149011612, "step": 940 }, { "completion_length": 525.3571624755859, "epoch": 0.2810843103577029, "grad_norm": 0.5415010452270508, "kl": 0.46142578125, "learning_rate": 9.130619018801664e-07, "loss": 0.0185, "reward": 1.2008928805589676, "reward_std": 0.10121889039874077, "rewards/accuracy_reward": 0.21205357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 941 }, { "completion_length": 598.8236999511719, "epoch": 0.2813830184452244, "grad_norm": 0.30023208260536194, "kl": 0.632568359375, "learning_rate": 9.127843968384994e-07, "loss": 0.0253, "reward": 1.1277902126312256, "reward_std": 0.1554063092917204, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 942 }, { "completion_length": 585.9553756713867, "epoch": 0.28168172653274587, "grad_norm": 0.5237906575202942, "kl": 0.921875, "learning_rate": 9.125064971229654e-07, "loss": 0.0369, "reward": 1.1188616454601288, "reward_std": 0.13341565802693367, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045111179352, "step": 943 }, { "completion_length": 583.5513610839844, "epoch": 0.28198043462026734, "grad_norm": 0.5912911295890808, "kl": 0.283935546875, "learning_rate": 9.122282030358918e-07, "loss": 0.0114, "reward": 1.0820312947034836, "reward_std": 0.1401391364634037, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 944 }, { "completion_length": 549.4420013427734, "epoch": 0.2822791427077888, "grad_norm": 3.7443184852600098, "kl": 0.4052734375, "learning_rate": 9.119495148800357e-07, "loss": 0.0162, "reward": 1.145089328289032, "reward_std": 0.09839454479515553, "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 945 }, { "completion_length": 550.0848541259766, "epoch": 0.2825778507953103, "grad_norm": 0.42795446515083313, "kl": 0.42364501953125, "learning_rate": 9.116704329585822e-07, "loss": 0.017, "reward": 1.135602742433548, "reward_std": 0.14934809692203999, "rewards/accuracy_reward": 0.149553582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 946 }, { "completion_length": 572.6116333007812, "epoch": 0.28287655888283175, "grad_norm": 0.8728727698326111, "kl": 0.491455078125, "learning_rate": 9.11390957575145e-07, "loss": 0.0197, "reward": 1.1445313096046448, "reward_std": 0.14169807732105255, "rewards/accuracy_reward": 0.15625000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 947 }, { "completion_length": 600.7768173217773, "epoch": 0.2831752669703532, "grad_norm": 0.5905166268348694, "kl": 0.62939453125, "learning_rate": 9.111110890337661e-07, "loss": 0.0252, "reward": 1.1378348767757416, "reward_std": 0.13157309778034687, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 948 }, { "completion_length": 625.1920013427734, "epoch": 0.2834739750578747, "grad_norm": 0.5948166847229004, "kl": 0.4765625, "learning_rate": 9.108308276389152e-07, "loss": 0.0191, "reward": 1.0228794813156128, "reward_std": 0.12683340720832348, "rewards/accuracy_reward": 0.03348214481957257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973767757416, "step": 949 }, { "completion_length": 616.5312652587891, "epoch": 0.28377268314539617, "grad_norm": 0.7655850052833557, "kl": 0.5244140625, "learning_rate": 9.105501736954889e-07, "loss": 0.0211, "reward": 1.12276791036129, "reward_std": 0.10887712426483631, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 950 }, { "completion_length": 554.1651916503906, "epoch": 0.28407139123291764, "grad_norm": 0.26324549317359924, "kl": 0.41888427734375, "learning_rate": 9.102691275088115e-07, "loss": 0.0168, "reward": 1.2494420111179352, "reward_std": 0.18988919258117676, "rewards/accuracy_reward": 0.2611607275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 951 }, { "completion_length": 620.4955749511719, "epoch": 0.2843700993204391, "grad_norm": 1.4637079238891602, "kl": 0.762451171875, "learning_rate": 9.099876893846333e-07, "loss": 0.0305, "reward": 1.0898437947034836, "reward_std": 0.10441599227488041, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 952 }, { "completion_length": 502.96431732177734, "epoch": 0.2846688074079606, "grad_norm": 0.5394575595855713, "kl": 0.21044921875, "learning_rate": 9.097058596291319e-07, "loss": 0.0084, "reward": 1.188616082072258, "reward_std": 0.1743544302880764, "rewards/accuracy_reward": 0.1941964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9944196790456772, "step": 953 }, { "completion_length": 576.6562805175781, "epoch": 0.28496751549548205, "grad_norm": 0.4836927354335785, "kl": 0.151611328125, "learning_rate": 9.0942363854891e-07, "loss": 0.0061, "reward": 1.2382813096046448, "reward_std": 0.19142217561602592, "rewards/accuracy_reward": 0.2410714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.997209832072258, "step": 954 }, { "completion_length": 545.5245666503906, "epoch": 0.2852662235830035, "grad_norm": 0.5363098382949829, "kl": 0.51904296875, "learning_rate": 9.091410264509968e-07, "loss": 0.0207, "reward": 1.174665242433548, "reward_std": 0.1930762343108654, "rewards/accuracy_reward": 0.18750000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 955 }, { "completion_length": 576.4152069091797, "epoch": 0.285564931670525, "grad_norm": 0.8340988159179688, "kl": 0.394775390625, "learning_rate": 9.088580236428463e-07, "loss": 0.0158, "reward": 1.195870578289032, "reward_std": 0.12990024965256453, "rewards/accuracy_reward": 0.20758929662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 956 }, { "completion_length": 608.2656555175781, "epoch": 0.28586363975804646, "grad_norm": 0.518818199634552, "kl": 0.8096923828125, "learning_rate": 9.085746304323381e-07, "loss": 0.0324, "reward": 1.1378348767757416, "reward_std": 0.20827723667025566, "rewards/accuracy_reward": 0.15848215157166123, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 957 }, { "completion_length": 557.9352874755859, "epoch": 0.28616234784556793, "grad_norm": 0.4352915585041046, "kl": 0.466796875, "learning_rate": 9.082908471277761e-07, "loss": 0.0187, "reward": 1.166294664144516, "reward_std": 0.11970551800914109, "rewards/accuracy_reward": 0.176339291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 958 }, { "completion_length": 581.544677734375, "epoch": 0.2864610559330894, "grad_norm": 0.29951152205467224, "kl": 0.2337646484375, "learning_rate": 9.080066740378884e-07, "loss": 0.0093, "reward": 1.1724330484867096, "reward_std": 0.12425542995333672, "rewards/accuracy_reward": 0.18080357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916294813156128, "step": 959 }, { "completion_length": 535.4665374755859, "epoch": 0.2867597640206109, "grad_norm": 0.5877836346626282, "kl": 0.820556640625, "learning_rate": 9.077221114718279e-07, "loss": 0.0328, "reward": 1.095982164144516, "reward_std": 0.12071910314261913, "rewards/accuracy_reward": 0.11830357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 960 }, { "completion_length": 546.9620666503906, "epoch": 0.28705847210813235, "grad_norm": 0.8635010719299316, "kl": 0.5880126953125, "learning_rate": 9.074371597391708e-07, "loss": 0.0235, "reward": 1.1065848618745804, "reward_std": 0.16419083066284657, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 961 }, { "completion_length": 536.5736923217773, "epoch": 0.2873571801956538, "grad_norm": 0.5132439732551575, "kl": 0.55029296875, "learning_rate": 9.071518191499164e-07, "loss": 0.022, "reward": 1.2338170409202576, "reward_std": 0.18175739608705044, "rewards/accuracy_reward": 0.2410714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.992745578289032, "step": 962 }, { "completion_length": 553.4821701049805, "epoch": 0.2876558882831753, "grad_norm": 0.6164263486862183, "kl": 0.5677490234375, "learning_rate": 9.068660900144874e-07, "loss": 0.0227, "reward": 1.2198661267757416, "reward_std": 0.12985885236412287, "rewards/accuracy_reward": 0.2321428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 963 }, { "completion_length": 558.6093978881836, "epoch": 0.28795459637069676, "grad_norm": 0.8556486368179321, "kl": 0.5322265625, "learning_rate": 9.065799726437291e-07, "loss": 0.0213, "reward": 1.1188616752624512, "reward_std": 0.16926207393407822, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330931901932, "step": 964 }, { "completion_length": 527.1116333007812, "epoch": 0.28825330445821823, "grad_norm": 0.6971271634101868, "kl": 0.48388671875, "learning_rate": 9.062934673489091e-07, "loss": 0.0194, "reward": 1.109933078289032, "reward_std": 0.09146163892000914, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 965 }, { "completion_length": 534.272346496582, "epoch": 0.2885520125457397, "grad_norm": 0.8200138211250305, "kl": 0.8199462890625, "learning_rate": 9.060065744417172e-07, "loss": 0.0328, "reward": 1.1969866752624512, "reward_std": 0.17313473485410213, "rewards/accuracy_reward": 0.21875000977888703, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 966 }, { "completion_length": 500.6272659301758, "epoch": 0.28885072063326117, "grad_norm": 0.23145951330661774, "kl": 0.1480712890625, "learning_rate": 9.057192942342647e-07, "loss": 0.0059, "reward": 1.1422991752624512, "reward_std": 0.1881372556090355, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9972098469734192, "step": 967 }, { "completion_length": 608.7455596923828, "epoch": 0.28914942872078264, "grad_norm": 1.0173225402832031, "kl": 0.70068359375, "learning_rate": 9.054316270390844e-07, "loss": 0.0281, "reward": 1.1166295111179352, "reward_std": 0.1959120649844408, "rewards/accuracy_reward": 0.12946429383009672, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 968 }, { "completion_length": 573.1964569091797, "epoch": 0.28944813680830406, "grad_norm": 0.36160045862197876, "kl": 0.2926025390625, "learning_rate": 9.051435731691299e-07, "loss": 0.0117, "reward": 1.1322545111179352, "reward_std": 0.15792123973369598, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937649011612, "step": 969 }, { "completion_length": 542.6986618041992, "epoch": 0.28974684489582553, "grad_norm": 0.9291518330574036, "kl": 0.4638671875, "learning_rate": 9.048551329377755e-07, "loss": 0.0185, "reward": 1.1763393580913544, "reward_std": 0.16941198520362377, "rewards/accuracy_reward": 0.18526786798611283, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 970 }, { "completion_length": 503.0357437133789, "epoch": 0.290045552983347, "grad_norm": 0.3731624484062195, "kl": 0.1729736328125, "learning_rate": 9.04566306658816e-07, "loss": 0.0069, "reward": 1.2801339626312256, "reward_std": 0.17197789624333382, "rewards/accuracy_reward": 0.2834821604192257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966517984867096, "step": 971 }, { "completion_length": 554.8616256713867, "epoch": 0.29034426107086847, "grad_norm": 0.45624029636383057, "kl": 0.3260498046875, "learning_rate": 9.042770946464662e-07, "loss": 0.013, "reward": 1.0731027126312256, "reward_std": 0.11988046765327454, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 972 }, { "completion_length": 500.94869232177734, "epoch": 0.29064296915838994, "grad_norm": 0.8759233951568604, "kl": 0.50396728515625, "learning_rate": 9.039874972153604e-07, "loss": 0.0201, "reward": 1.146763414144516, "reward_std": 0.125909642316401, "rewards/accuracy_reward": 0.1607142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 973 }, { "completion_length": 585.9442138671875, "epoch": 0.2909416772459114, "grad_norm": 0.350046843290329, "kl": 0.277587890625, "learning_rate": 9.036975146805519e-07, "loss": 0.0111, "reward": 1.1891741752624512, "reward_std": 0.13748360611498356, "rewards/accuracy_reward": 0.19642858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.992745578289032, "step": 974 }, { "completion_length": 528.6406555175781, "epoch": 0.2912403853334329, "grad_norm": 0.676105260848999, "kl": 0.51470947265625, "learning_rate": 9.034071473575136e-07, "loss": 0.0206, "reward": 1.209821492433548, "reward_std": 0.15108664147555828, "rewards/accuracy_reward": 0.22544643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 975 }, { "completion_length": 499.6026916503906, "epoch": 0.29153909342095435, "grad_norm": 0.6738004088401794, "kl": 0.37646484375, "learning_rate": 9.031163955621365e-07, "loss": 0.0151, "reward": 1.1121652126312256, "reward_std": 0.15273745357990265, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937649011612, "step": 976 }, { "completion_length": 544.8705749511719, "epoch": 0.2918378015084758, "grad_norm": 0.3936157822608948, "kl": 0.474609375, "learning_rate": 9.028252596107303e-07, "loss": 0.019, "reward": 1.1088170111179352, "reward_std": 0.11432023718953133, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 977 }, { "completion_length": 617.1094055175781, "epoch": 0.2921365095959973, "grad_norm": 0.6906495094299316, "kl": 0.494140625, "learning_rate": 9.025337398200223e-07, "loss": 0.0198, "reward": 1.1707589626312256, "reward_std": 0.09915336966514587, "rewards/accuracy_reward": 0.18526786309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 978 }, { "completion_length": 610.7678909301758, "epoch": 0.29243521768351877, "grad_norm": 0.6656478643417358, "kl": 0.250732421875, "learning_rate": 9.022418365071572e-07, "loss": 0.01, "reward": 1.1902902126312256, "reward_std": 0.1259167455136776, "rewards/accuracy_reward": 0.1941964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937649011612, "step": 979 }, { "completion_length": 582.0781555175781, "epoch": 0.29273392577104024, "grad_norm": 0.3080589175224304, "kl": 0.500244140625, "learning_rate": 9.019495499896975e-07, "loss": 0.02, "reward": 1.1233259290456772, "reward_std": 0.1363639384508133, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 980 }, { "completion_length": 496.70984649658203, "epoch": 0.2930326338585617, "grad_norm": 0.3209301829338074, "kl": 0.3553466796875, "learning_rate": 9.016568805856222e-07, "loss": 0.0142, "reward": 1.2031250596046448, "reward_std": 0.1640148162841797, "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 981 }, { "completion_length": 591.3459930419922, "epoch": 0.2933313419460832, "grad_norm": 1.037645697593689, "kl": 0.70751953125, "learning_rate": 9.013638286133269e-07, "loss": 0.0283, "reward": 1.0781250596046448, "reward_std": 0.13508241064846516, "rewards/accuracy_reward": 0.08705357508733869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714775323868, "step": 982 }, { "completion_length": 518.6763687133789, "epoch": 0.29363005003360465, "grad_norm": 0.4137549102306366, "kl": 0.3533935546875, "learning_rate": 9.010703943916233e-07, "loss": 0.0141, "reward": 1.2008928954601288, "reward_std": 0.09096727706491947, "rewards/accuracy_reward": 0.2053571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9955357313156128, "step": 983 }, { "completion_length": 523.5692291259766, "epoch": 0.2939287581211261, "grad_norm": 2.005258560180664, "kl": 0.9237060546875, "learning_rate": 9.007765782397393e-07, "loss": 0.0369, "reward": 1.1858259439468384, "reward_std": 0.16875174641609192, "rewards/accuracy_reward": 0.19419643748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 984 }, { "completion_length": 582.2723388671875, "epoch": 0.2942274662086476, "grad_norm": 0.3479679822921753, "kl": 0.547119140625, "learning_rate": 9.004823804773179e-07, "loss": 0.022, "reward": 1.1830357611179352, "reward_std": 0.16440210491418839, "rewards/accuracy_reward": 0.1941964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 985 }, { "completion_length": 529.8102951049805, "epoch": 0.29452617429616906, "grad_norm": 0.44665059447288513, "kl": 0.41015625, "learning_rate": 9.001878014244175e-07, "loss": 0.0164, "reward": 1.2198661267757416, "reward_std": 0.09694511070847511, "rewards/accuracy_reward": 0.2254464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 986 }, { "completion_length": 548.9263687133789, "epoch": 0.29482488238369053, "grad_norm": 0.506196141242981, "kl": 0.55712890625, "learning_rate": 8.998928414015113e-07, "loss": 0.0223, "reward": 1.2109375596046448, "reward_std": 0.19942471385002136, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 987 }, { "completion_length": 573.3795013427734, "epoch": 0.295123590471212, "grad_norm": 0.6266669631004333, "kl": 0.251220703125, "learning_rate": 8.99597500729487e-07, "loss": 0.0101, "reward": 1.1166295111179352, "reward_std": 0.1385568082332611, "rewards/accuracy_reward": 0.12053572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937798023224, "step": 988 }, { "completion_length": 570.2545013427734, "epoch": 0.2954222985587335, "grad_norm": 0.2048204392194748, "kl": 0.1043701171875, "learning_rate": 8.993017797296458e-07, "loss": 0.0042, "reward": 1.1690848469734192, "reward_std": 0.08554022200405598, "rewards/accuracy_reward": 0.1741071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949776977300644, "step": 989 }, { "completion_length": 530.0268096923828, "epoch": 0.29572100664625495, "grad_norm": 0.33968475461006165, "kl": 0.2640380859375, "learning_rate": 8.990056787237038e-07, "loss": 0.0106, "reward": 1.077008992433548, "reward_std": 0.1445784643292427, "rewards/accuracy_reward": 0.08705357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 990 }, { "completion_length": 530.4710006713867, "epoch": 0.2960197147337764, "grad_norm": 0.3676024079322815, "kl": 0.262939453125, "learning_rate": 8.987091980337894e-07, "loss": 0.0105, "reward": 1.1657366752624512, "reward_std": 0.10067684762179852, "rewards/accuracy_reward": 0.16964286682195961, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937649011612, "step": 991 }, { "completion_length": 588.3281555175781, "epoch": 0.2963184228212979, "grad_norm": 0.5356528759002686, "kl": 0.430908203125, "learning_rate": 8.984123379824448e-07, "loss": 0.0172, "reward": 1.1143973767757416, "reward_std": 0.16125753335654736, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 992 }, { "completion_length": 566.0468902587891, "epoch": 0.29661713090881936, "grad_norm": 0.8930162787437439, "kl": 0.1507568359375, "learning_rate": 8.981150988926246e-07, "loss": 0.006, "reward": 1.096540242433548, "reward_std": 0.1464176131412387, "rewards/accuracy_reward": 0.10714286030270159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 993 }, { "completion_length": 565.5580749511719, "epoch": 0.29691583899634083, "grad_norm": 0.5559843182563782, "kl": 0.25555419921875, "learning_rate": 8.978174810876958e-07, "loss": 0.0103, "reward": 1.2472098469734192, "reward_std": 0.08972246665507555, "rewards/accuracy_reward": 0.2544642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455484867096, "step": 994 }, { "completion_length": 547.5513534545898, "epoch": 0.2972145470838623, "grad_norm": 1.3963654041290283, "kl": 0.3701171875, "learning_rate": 8.975194848914371e-07, "loss": 0.0148, "reward": 1.1439732611179352, "reward_std": 0.1476033292710781, "rewards/accuracy_reward": 0.15178572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 995 }, { "completion_length": 609.2545013427734, "epoch": 0.2975132551713838, "grad_norm": 0.7391143441200256, "kl": 0.375244140625, "learning_rate": 8.972211106280397e-07, "loss": 0.015, "reward": 1.1830357313156128, "reward_std": 0.11885813623666763, "rewards/accuracy_reward": 0.19642857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 996 }, { "completion_length": 534.7120819091797, "epoch": 0.29781196325890524, "grad_norm": 0.6340608596801758, "kl": 0.29833984375, "learning_rate": 8.96922358622105e-07, "loss": 0.0119, "reward": 1.2042411267757416, "reward_std": 0.13459870219230652, "rewards/accuracy_reward": 0.2120535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 997 }, { "completion_length": 580.9218978881836, "epoch": 0.2981106713464267, "grad_norm": 0.4047738313674927, "kl": 0.283203125, "learning_rate": 8.966232291986462e-07, "loss": 0.0113, "reward": 1.1992188096046448, "reward_std": 0.13496027700603008, "rewards/accuracy_reward": 0.2053571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616454601288, "step": 998 }, { "completion_length": 532.0781402587891, "epoch": 0.2984093794339482, "grad_norm": 1.3344427347183228, "kl": 0.418212890625, "learning_rate": 8.963237226830869e-07, "loss": 0.0167, "reward": 1.1802455484867096, "reward_std": 0.126281363889575, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.992745578289032, "step": 999 }, { "completion_length": 593.3013610839844, "epoch": 0.29870808752146966, "grad_norm": 0.5666632652282715, "kl": 0.6761474609375, "learning_rate": 8.960238394012607e-07, "loss": 0.0271, "reward": 1.148995578289032, "reward_std": 0.1426946995779872, "rewards/accuracy_reward": 0.1674107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 1000 }, { "completion_length": 539.1897583007812, "epoch": 0.2990067956089911, "grad_norm": 0.6678951978683472, "kl": 0.481689453125, "learning_rate": 8.957235796794111e-07, "loss": 0.0193, "reward": 1.1869420260190964, "reward_std": 0.12203333154320717, "rewards/accuracy_reward": 0.19642857951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 1001 }, { "completion_length": 539.584831237793, "epoch": 0.2993055036965126, "grad_norm": 2.327133893966675, "kl": 0.87255859375, "learning_rate": 8.954229438441915e-07, "loss": 0.0349, "reward": 1.1389509439468384, "reward_std": 0.15575110353529453, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1002 }, { "completion_length": 542.6138534545898, "epoch": 0.29960421178403407, "grad_norm": 1.3490790128707886, "kl": 0.5301513671875, "learning_rate": 8.951219322226638e-07, "loss": 0.0213, "reward": 1.1930804252624512, "reward_std": 0.10638864990323782, "rewards/accuracy_reward": 0.19866072130389512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 1003 }, { "completion_length": 547.3348388671875, "epoch": 0.29990291987155554, "grad_norm": 1.2849737405776978, "kl": 1.0673828125, "learning_rate": 8.948205451422996e-07, "loss": 0.0428, "reward": 1.1210938096046448, "reward_std": 0.11961755342781544, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987165242433548, "step": 1004 }, { "completion_length": 568.4531402587891, "epoch": 0.300201627959077, "grad_norm": 0.75725257396698, "kl": 0.51025390625, "learning_rate": 8.945187829309784e-07, "loss": 0.0205, "reward": 1.1679687798023224, "reward_std": 0.10561918467283249, "rewards/accuracy_reward": 0.17410714784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 1005 }, { "completion_length": 543.0513687133789, "epoch": 0.3005003360465985, "grad_norm": 1.3493951559066772, "kl": 0.75146484375, "learning_rate": 8.942166459169879e-07, "loss": 0.03, "reward": 1.0619420409202576, "reward_std": 0.16540997475385666, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98604916036129, "step": 1006 }, { "completion_length": 502.3415298461914, "epoch": 0.30079904413411995, "grad_norm": 0.9776729941368103, "kl": 0.41943359375, "learning_rate": 8.939141344290233e-07, "loss": 0.0167, "reward": 1.1813616454601288, "reward_std": 0.16517659649252892, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 1007 }, { "completion_length": 587.6718902587891, "epoch": 0.3010977522216414, "grad_norm": 0.3025454878807068, "kl": 0.666015625, "learning_rate": 8.936112487961877e-07, "loss": 0.0266, "reward": 1.192522406578064, "reward_std": 0.14270322630181909, "rewards/accuracy_reward": 0.2075892984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 1008 }, { "completion_length": 604.6339569091797, "epoch": 0.3013964603091629, "grad_norm": 0.41898101568222046, "kl": 0.3077392578125, "learning_rate": 8.933079893479911e-07, "loss": 0.0123, "reward": 1.1517857909202576, "reward_std": 0.18274055793881416, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1009 }, { "completion_length": 559.9107513427734, "epoch": 0.30169516839668437, "grad_norm": 0.9681908488273621, "kl": 0.28167724609375, "learning_rate": 8.930043564143497e-07, "loss": 0.0113, "reward": 1.1819197237491608, "reward_std": 0.16604626178741455, "rewards/accuracy_reward": 0.18973215483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 1010 }, { "completion_length": 573.647346496582, "epoch": 0.30199387648420584, "grad_norm": 0.2758485972881317, "kl": 0.2352294921875, "learning_rate": 8.927003503255866e-07, "loss": 0.0094, "reward": 1.1523438096046448, "reward_std": 0.15012029185891151, "rewards/accuracy_reward": 0.15848214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616454601288, "step": 1011 }, { "completion_length": 520.2723541259766, "epoch": 0.30229258457172725, "grad_norm": 0.14928746223449707, "kl": 0.1168212890625, "learning_rate": 8.923959714124306e-07, "loss": 0.0047, "reward": 1.1992188096046448, "reward_std": 0.13626900874078274, "rewards/accuracy_reward": 0.2008928693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9983258992433548, "step": 1012 }, { "completion_length": 530.8593978881836, "epoch": 0.3025912926592487, "grad_norm": 0.7706580758094788, "kl": 0.24481201171875, "learning_rate": 8.920912200060161e-07, "loss": 0.0098, "reward": 1.149553656578064, "reward_std": 0.17276743426918983, "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 1013 }, { "completion_length": 531.3080520629883, "epoch": 0.3028900007467702, "grad_norm": 0.4516606628894806, "kl": 0.163818359375, "learning_rate": 8.917860964378829e-07, "loss": 0.0065, "reward": 1.1752232313156128, "reward_std": 0.09919771924614906, "rewards/accuracy_reward": 0.1852678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553656578064, "step": 1014 }, { "completion_length": 555.8259048461914, "epoch": 0.30318870883429166, "grad_norm": 0.7621433734893799, "kl": 0.25091552734375, "learning_rate": 8.914806010399753e-07, "loss": 0.01, "reward": 1.1813616454601288, "reward_std": 0.2097017578780651, "rewards/accuracy_reward": 0.1941964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1015 }, { "completion_length": 519.959831237793, "epoch": 0.30348741692181314, "grad_norm": 0.4264616370201111, "kl": 0.24774169921875, "learning_rate": 8.911747341446425e-07, "loss": 0.0099, "reward": 1.1657366454601288, "reward_std": 0.14757242146879435, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616156578064, "step": 1016 }, { "completion_length": 595.3660736083984, "epoch": 0.3037861250093346, "grad_norm": 0.7612556219100952, "kl": 0.594482421875, "learning_rate": 8.908684960846376e-07, "loss": 0.0238, "reward": 1.1450893133878708, "reward_std": 0.15371761098504066, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 1017 }, { "completion_length": 563.0647583007812, "epoch": 0.3040848330968561, "grad_norm": 0.3627396523952484, "kl": 0.4456787109375, "learning_rate": 8.905618871931177e-07, "loss": 0.0179, "reward": 1.2756697237491608, "reward_std": 0.16960839927196503, "rewards/accuracy_reward": 0.2901785857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 1018 }, { "completion_length": 530.5022735595703, "epoch": 0.30438354118437755, "grad_norm": 0.6870532631874084, "kl": 0.1846923828125, "learning_rate": 8.902549078036433e-07, "loss": 0.0074, "reward": 1.1975446939468384, "reward_std": 0.09651085920631886, "rewards/accuracy_reward": 0.2008928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966517984867096, "step": 1019 }, { "completion_length": 608.9219055175781, "epoch": 0.304682249271899, "grad_norm": 0.6067362427711487, "kl": 0.686767578125, "learning_rate": 8.899475582501775e-07, "loss": 0.0275, "reward": 1.1495536267757416, "reward_std": 0.1548860464245081, "rewards/accuracy_reward": 0.1718750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 1020 }, { "completion_length": 563.6607513427734, "epoch": 0.3049809573594205, "grad_norm": 0.5086573362350464, "kl": 0.59423828125, "learning_rate": 8.89639838867087e-07, "loss": 0.0238, "reward": 1.1902902126312256, "reward_std": 0.21590495109558105, "rewards/accuracy_reward": 0.2031250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1021 }, { "completion_length": 534.0424423217773, "epoch": 0.30527966544694196, "grad_norm": 0.7294259071350098, "kl": 0.25213623046875, "learning_rate": 8.893317499891401e-07, "loss": 0.0101, "reward": 1.1462053954601288, "reward_std": 0.17763005197048187, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9988839328289032, "step": 1022 }, { "completion_length": 581.194221496582, "epoch": 0.30557837353446343, "grad_norm": 0.8397225141525269, "kl": 0.67510986328125, "learning_rate": 8.890232919515071e-07, "loss": 0.027, "reward": 1.2739956080913544, "reward_std": 0.15907155722379684, "rewards/accuracy_reward": 0.2834821566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 1023 }, { "completion_length": 609.7031478881836, "epoch": 0.3058770816219849, "grad_norm": 1.3947949409484863, "kl": 1.11669921875, "learning_rate": 8.887144650897606e-07, "loss": 0.0447, "reward": 1.1227679252624512, "reward_std": 0.20128770545125008, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 1024 }, { "completion_length": 643.5960083007812, "epoch": 0.3061757897095064, "grad_norm": 0.6740752458572388, "kl": 0.921875, "learning_rate": 8.884052697398735e-07, "loss": 0.0369, "reward": 1.138950914144516, "reward_std": 0.18274320662021637, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 1025 }, { "completion_length": 531.6964492797852, "epoch": 0.30647449779702785, "grad_norm": 0.49010589718818665, "kl": 0.57928466796875, "learning_rate": 8.880957062382202e-07, "loss": 0.0232, "reward": 1.2070313394069672, "reward_std": 0.10831687413156033, "rewards/accuracy_reward": 0.21875000861473382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 1026 }, { "completion_length": 594.9286041259766, "epoch": 0.3067732058845493, "grad_norm": 0.8860977292060852, "kl": 0.49658203125, "learning_rate": 8.877857749215755e-07, "loss": 0.0199, "reward": 1.0613839626312256, "reward_std": 0.12967952247709036, "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 1027 }, { "completion_length": 530.2968902587891, "epoch": 0.3070719139720708, "grad_norm": 1.1298305988311768, "kl": 0.47314453125, "learning_rate": 8.874754761271142e-07, "loss": 0.0189, "reward": 1.1255581080913544, "reward_std": 0.15838410146534443, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 1028 }, { "completion_length": 573.5870666503906, "epoch": 0.30737062205959226, "grad_norm": 0.3807738125324249, "kl": 0.260986328125, "learning_rate": 8.871648101924109e-07, "loss": 0.0104, "reward": 1.1026786267757416, "reward_std": 0.143080348148942, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9955357313156128, "step": 1029 }, { "completion_length": 592.8214492797852, "epoch": 0.30766933014711373, "grad_norm": 0.5649203062057495, "kl": 0.445068359375, "learning_rate": 8.8685377745544e-07, "loss": 0.0178, "reward": 1.041294664144516, "reward_std": 0.10299256816506386, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 1030 }, { "completion_length": 596.1964569091797, "epoch": 0.3079680382346352, "grad_norm": 0.33630338311195374, "kl": 0.2957763671875, "learning_rate": 8.865423782545745e-07, "loss": 0.0118, "reward": 1.1400670111179352, "reward_std": 0.19953331723809242, "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98604916036129, "step": 1031 }, { "completion_length": 527.7991333007812, "epoch": 0.30826674632215667, "grad_norm": 0.40133586525917053, "kl": 0.3160400390625, "learning_rate": 8.86230612928586e-07, "loss": 0.0127, "reward": 1.2382812947034836, "reward_std": 0.15001874789595604, "rewards/accuracy_reward": 0.24776787171140313, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134439468384, "step": 1032 }, { "completion_length": 580.0401916503906, "epoch": 0.30856545440967814, "grad_norm": 1.0630742311477661, "kl": 0.419921875, "learning_rate": 8.859184818166449e-07, "loss": 0.0168, "reward": 1.113839328289032, "reward_std": 0.15274640545248985, "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 1033 }, { "completion_length": 573.7254638671875, "epoch": 0.3088641624971996, "grad_norm": 0.5498565435409546, "kl": 0.1392822265625, "learning_rate": 8.85605985258319e-07, "loss": 0.0056, "reward": 1.2209821939468384, "reward_std": 0.09762001223862171, "rewards/accuracy_reward": 0.2276785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.993303582072258, "step": 1034 }, { "completion_length": 591.8125305175781, "epoch": 0.3091628705847211, "grad_norm": 0.7518613338470459, "kl": 0.46734619140625, "learning_rate": 8.852931235935741e-07, "loss": 0.0187, "reward": 1.0998884439468384, "reward_std": 0.14952272735536098, "rewards/accuracy_reward": 0.11830357927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 1035 }, { "completion_length": 626.3928833007812, "epoch": 0.30946157867224255, "grad_norm": 0.4691880941390991, "kl": 0.419921875, "learning_rate": 8.849798971627731e-07, "loss": 0.0168, "reward": 1.1517857611179352, "reward_std": 0.13957566022872925, "rewards/accuracy_reward": 0.1696428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428805589676, "step": 1036 }, { "completion_length": 560.8370819091797, "epoch": 0.309760286759764, "grad_norm": 0.7199621796607971, "kl": 0.294677734375, "learning_rate": 8.846663063066754e-07, "loss": 0.0118, "reward": 1.2014509439468384, "reward_std": 0.18592659384012222, "rewards/accuracy_reward": 0.2165178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 1037 }, { "completion_length": 522.084846496582, "epoch": 0.3100589948472855, "grad_norm": 0.23855365812778473, "kl": 0.3326416015625, "learning_rate": 8.843523513664373e-07, "loss": 0.0133, "reward": 1.1830357611179352, "reward_std": 0.17296337336301804, "rewards/accuracy_reward": 0.1919642984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 1038 }, { "completion_length": 525.8236923217773, "epoch": 0.31035770293480697, "grad_norm": 0.8133963346481323, "kl": 0.44189453125, "learning_rate": 8.840380326836111e-07, "loss": 0.0177, "reward": 1.151227742433548, "reward_std": 0.1600213348865509, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 1039 }, { "completion_length": 572.5134124755859, "epoch": 0.31065641102232844, "grad_norm": 1.3736003637313843, "kl": 1.03759765625, "learning_rate": 8.837233506001443e-07, "loss": 0.0416, "reward": 1.1981027126312256, "reward_std": 0.18637044727802277, "rewards/accuracy_reward": 0.2254464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562798023224, "step": 1040 }, { "completion_length": 597.0022506713867, "epoch": 0.3109551191098499, "grad_norm": 0.8710724115371704, "kl": 0.2950439453125, "learning_rate": 8.834083054583807e-07, "loss": 0.0118, "reward": 1.0312500447034836, "reward_std": 0.10922972857952118, "rewards/accuracy_reward": 0.046875002793967724, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 1041 }, { "completion_length": 509.5312805175781, "epoch": 0.3112538271973714, "grad_norm": 1.1852549314498901, "kl": 0.44189453125, "learning_rate": 8.830928976010581e-07, "loss": 0.0177, "reward": 1.1696428954601288, "reward_std": 0.1320768054574728, "rewards/accuracy_reward": 0.17857143515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1042 }, { "completion_length": 569.9687652587891, "epoch": 0.31155253528489285, "grad_norm": 0.4044540822505951, "kl": 0.534423828125, "learning_rate": 8.827771273713097e-07, "loss": 0.0214, "reward": 1.1456473767757416, "reward_std": 0.14527087844908237, "rewards/accuracy_reward": 0.15848214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871651977300644, "step": 1043 }, { "completion_length": 525.8393096923828, "epoch": 0.3118512433724143, "grad_norm": 0.23724547028541565, "kl": 0.2332763671875, "learning_rate": 8.824609951126624e-07, "loss": 0.0093, "reward": 1.111607164144516, "reward_std": 0.10855910740792751, "rewards/accuracy_reward": 0.12053572130389512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 1044 }, { "completion_length": 587.5201263427734, "epoch": 0.3121499514599358, "grad_norm": 0.775738000869751, "kl": 0.29248046875, "learning_rate": 8.821445011690369e-07, "loss": 0.0117, "reward": 1.07979916036129, "reward_std": 0.14186640828847885, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 1045 }, { "completion_length": 587.8727874755859, "epoch": 0.31244865954745726, "grad_norm": 0.5745044946670532, "kl": 0.425048828125, "learning_rate": 8.81827645884748e-07, "loss": 0.017, "reward": 1.1361607611179352, "reward_std": 0.15937701240181923, "rewards/accuracy_reward": 0.14732143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1046 }, { "completion_length": 615.2812805175781, "epoch": 0.31274736763497873, "grad_norm": 0.4209800958633423, "kl": 0.649169921875, "learning_rate": 8.815104296045028e-07, "loss": 0.026, "reward": 1.1735491752624512, "reward_std": 0.15527445822954178, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 1047 }, { "completion_length": 548.8839645385742, "epoch": 0.3130460757225002, "grad_norm": 0.5552963614463806, "kl": 0.4527587890625, "learning_rate": 8.811928526734019e-07, "loss": 0.0181, "reward": 1.1707589626312256, "reward_std": 0.11726140789687634, "rewards/accuracy_reward": 0.18303572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232313156128, "step": 1048 }, { "completion_length": 551.2411041259766, "epoch": 0.3133447838100217, "grad_norm": 0.3398085832595825, "kl": 0.384765625, "learning_rate": 8.808749154369376e-07, "loss": 0.0154, "reward": 1.2137277722358704, "reward_std": 0.1911889612674713, "rewards/accuracy_reward": 0.2165178693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.997209832072258, "step": 1049 }, { "completion_length": 602.2812805175781, "epoch": 0.31364349189754315, "grad_norm": 0.24985334277153015, "kl": 0.397705078125, "learning_rate": 8.805566182409945e-07, "loss": 0.0159, "reward": 1.1250000596046448, "reward_std": 0.1331378084141761, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9955357611179352, "step": 1050 }, { "completion_length": 584.3102951049805, "epoch": 0.3139421999850646, "grad_norm": 0.3621298372745514, "kl": 0.50274658203125, "learning_rate": 8.802379614318486e-07, "loss": 0.0201, "reward": 1.2042411267757416, "reward_std": 0.1510007530450821, "rewards/accuracy_reward": 0.21205358393490314, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 1051 }, { "completion_length": 672.6852874755859, "epoch": 0.3142409080725861, "grad_norm": 0.9525297284126282, "kl": 0.788818359375, "learning_rate": 8.799189453561679e-07, "loss": 0.0315, "reward": 1.1450892984867096, "reward_std": 0.17271506786346436, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 1052 }, { "completion_length": 579.0602874755859, "epoch": 0.31453961616010756, "grad_norm": 0.318022221326828, "kl": 0.466064453125, "learning_rate": 8.795995703610097e-07, "loss": 0.0186, "reward": 1.319196492433548, "reward_std": 0.14612252358347178, "rewards/accuracy_reward": 0.3325893059372902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1053 }, { "completion_length": 640.9754943847656, "epoch": 0.31483832424762903, "grad_norm": 0.46860378980636597, "kl": 0.380859375, "learning_rate": 8.792798367938234e-07, "loss": 0.0153, "reward": 1.1015625298023224, "reward_std": 0.1498423982411623, "rewards/accuracy_reward": 0.11160714831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1054 }, { "completion_length": 547.4866333007812, "epoch": 0.31513703233515045, "grad_norm": 0.5228725671768188, "kl": 0.2874755859375, "learning_rate": 8.789597450024478e-07, "loss": 0.0115, "reward": 1.0837053954601288, "reward_std": 0.1256950218230486, "rewards/accuracy_reward": 0.10267857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 1055 }, { "completion_length": 646.0669860839844, "epoch": 0.3154357404226719, "grad_norm": 0.5298771262168884, "kl": 0.5673828125, "learning_rate": 8.786392953351109e-07, "loss": 0.0226, "reward": 1.1523438096046448, "reward_std": 0.2183481939136982, "rewards/accuracy_reward": 0.17410714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 1056 }, { "completion_length": 646.3549346923828, "epoch": 0.3157344485101934, "grad_norm": 0.4125872850418091, "kl": 0.493408203125, "learning_rate": 8.783184881404307e-07, "loss": 0.0197, "reward": 1.0920759439468384, "reward_std": 0.17414774373173714, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 1057 }, { "completion_length": 675.8125305175781, "epoch": 0.31603315659771486, "grad_norm": 0.3862840235233307, "kl": 0.27716064453125, "learning_rate": 8.779973237674141e-07, "loss": 0.0111, "reward": 1.0943080484867096, "reward_std": 0.15957996621727943, "rewards/accuracy_reward": 0.10267857508733869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 1058 }, { "completion_length": 568.3281555175781, "epoch": 0.31633186468523633, "grad_norm": 0.5091144442558289, "kl": 0.28515625, "learning_rate": 8.776758025654566e-07, "loss": 0.0114, "reward": 1.1495536267757416, "reward_std": 0.12649997137486935, "rewards/accuracy_reward": 0.16294643515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1059 }, { "completion_length": 578.3616180419922, "epoch": 0.3166305727727578, "grad_norm": 0.4267047643661499, "kl": 0.27899169921875, "learning_rate": 8.773539248843416e-07, "loss": 0.0112, "reward": 1.1344866454601288, "reward_std": 0.15166506450623274, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1060 }, { "completion_length": 564.9665451049805, "epoch": 0.31692928086027927, "grad_norm": 0.2969653606414795, "kl": 0.28826904296875, "learning_rate": 8.770316910742403e-07, "loss": 0.0116, "reward": 1.1992187798023224, "reward_std": 0.16936093010008335, "rewards/accuracy_reward": 0.2075892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916294813156128, "step": 1061 }, { "completion_length": 530.1741256713867, "epoch": 0.31722798894780074, "grad_norm": 0.7283502817153931, "kl": 0.1707763671875, "learning_rate": 8.767091014857118e-07, "loss": 0.0068, "reward": 1.1757813096046448, "reward_std": 0.13435570895671844, "rewards/accuracy_reward": 0.18080358440056443, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949776977300644, "step": 1062 }, { "completion_length": 577.5178833007812, "epoch": 0.3175266970353222, "grad_norm": 0.20892684161663055, "kl": 0.289794921875, "learning_rate": 8.763861564697017e-07, "loss": 0.0116, "reward": 1.1316964775323868, "reward_std": 0.08015796635299921, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 1063 }, { "completion_length": 569.4107437133789, "epoch": 0.3178254051228437, "grad_norm": 0.5866082310676575, "kl": 0.31201171875, "learning_rate": 8.760628563775426e-07, "loss": 0.0125, "reward": 1.2075893580913544, "reward_std": 0.21425072848796844, "rewards/accuracy_reward": 0.21651787124574184, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1064 }, { "completion_length": 663.9844055175781, "epoch": 0.31812411321036516, "grad_norm": 0.6562417149543762, "kl": 0.71484375, "learning_rate": 8.757392015609536e-07, "loss": 0.0286, "reward": 1.2031250298023224, "reward_std": 0.17872757092118263, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 1065 }, { "completion_length": 546.3504638671875, "epoch": 0.3184228212978866, "grad_norm": 0.47865399718284607, "kl": 0.5185546875, "learning_rate": 8.754151923720389e-07, "loss": 0.0208, "reward": 1.1925223767757416, "reward_std": 0.22350966185331345, "rewards/accuracy_reward": 0.2053571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 1066 }, { "completion_length": 579.8460006713867, "epoch": 0.3187215293854081, "grad_norm": 0.4059242010116577, "kl": 0.59033203125, "learning_rate": 8.750908291632893e-07, "loss": 0.0236, "reward": 1.1489956080913544, "reward_std": 0.13101091608405113, "rewards/accuracy_reward": 0.16294643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 1067 }, { "completion_length": 598.3772583007812, "epoch": 0.31902023747292957, "grad_norm": 1.03456711769104, "kl": 0.52587890625, "learning_rate": 8.747661122875796e-07, "loss": 0.021, "reward": 1.1953125596046448, "reward_std": 0.11430246941745281, "rewards/accuracy_reward": 0.20312501303851604, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 1068 }, { "completion_length": 638.5558471679688, "epoch": 0.31931894556045104, "grad_norm": 1.1584662199020386, "kl": 0.82470703125, "learning_rate": 8.744410420981703e-07, "loss": 0.033, "reward": 1.2717634439468384, "reward_std": 0.23665804415941238, "rewards/accuracy_reward": 0.2857142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 1069 }, { "completion_length": 601.7299423217773, "epoch": 0.3196176536479725, "grad_norm": 0.9846159219741821, "kl": 0.631103515625, "learning_rate": 8.741156189487058e-07, "loss": 0.0253, "reward": 1.1863839775323868, "reward_std": 0.15775888413190842, "rewards/accuracy_reward": 0.20535715157166123, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 1070 }, { "completion_length": 592.4152069091797, "epoch": 0.319916361735494, "grad_norm": 0.5177878737449646, "kl": 0.363525390625, "learning_rate": 8.737898431932149e-07, "loss": 0.0146, "reward": 1.1735491454601288, "reward_std": 0.14478073455393314, "rewards/accuracy_reward": 0.1897321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 1071 }, { "completion_length": 573.7545013427734, "epoch": 0.32021506982301545, "grad_norm": 0.2991879880428314, "kl": 0.42333984375, "learning_rate": 8.734637151861093e-07, "loss": 0.0169, "reward": 1.1272321939468384, "reward_std": 0.18154943734407425, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933036267757416, "step": 1072 }, { "completion_length": 628.5156555175781, "epoch": 0.3205137779105369, "grad_norm": 0.4156336784362793, "kl": 0.408447265625, "learning_rate": 8.731372352821843e-07, "loss": 0.0164, "reward": 1.1155134737491608, "reward_std": 0.14107432588934898, "rewards/accuracy_reward": 0.13169643771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 1073 }, { "completion_length": 556.7299346923828, "epoch": 0.3208124859980584, "grad_norm": 1.2403337955474854, "kl": 0.216796875, "learning_rate": 8.728104038366182e-07, "loss": 0.0087, "reward": 1.181919664144516, "reward_std": 0.1307121142745018, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9944196790456772, "step": 1074 }, { "completion_length": 609.1652069091797, "epoch": 0.32111119408557987, "grad_norm": 0.5564844608306885, "kl": 0.57080078125, "learning_rate": 8.724832212049716e-07, "loss": 0.0228, "reward": 1.1685268580913544, "reward_std": 0.167974341660738, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 1075 }, { "completion_length": 651.7701263427734, "epoch": 0.32140990217310134, "grad_norm": 0.4864408075809479, "kl": 0.660400390625, "learning_rate": 8.721556877431871e-07, "loss": 0.0264, "reward": 1.1311384439468384, "reward_std": 0.17838720232248306, "rewards/accuracy_reward": 0.14955358067527413, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848767757416, "step": 1076 }, { "completion_length": 668.6272735595703, "epoch": 0.3217086102606228, "grad_norm": 0.2329472154378891, "kl": 0.55126953125, "learning_rate": 8.718278038075891e-07, "loss": 0.0221, "reward": 1.0770090073347092, "reward_std": 0.1200780589133501, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 1077 }, { "completion_length": 625.9062957763672, "epoch": 0.3220073183481443, "grad_norm": 0.4683944880962372, "kl": 0.432373046875, "learning_rate": 8.714995697548828e-07, "loss": 0.0173, "reward": 1.2260045111179352, "reward_std": 0.15191962104290724, "rewards/accuracy_reward": 0.238839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1078 }, { "completion_length": 626.5893096923828, "epoch": 0.32230602643566575, "grad_norm": 0.39311519265174866, "kl": 0.47491455078125, "learning_rate": 8.711709859421551e-07, "loss": 0.019, "reward": 1.2773438096046448, "reward_std": 0.13003418780863285, "rewards/accuracy_reward": 0.2879464440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 1079 }, { "completion_length": 551.1183166503906, "epoch": 0.3226047345231872, "grad_norm": 1.1191459894180298, "kl": 0.4085693359375, "learning_rate": 8.708420527268728e-07, "loss": 0.0163, "reward": 1.2399554252624512, "reward_std": 0.15707029402256012, "rewards/accuracy_reward": 0.2522321566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 1080 }, { "completion_length": 574.6138763427734, "epoch": 0.3229034426107087, "grad_norm": 0.3384957015514374, "kl": 0.41259765625, "learning_rate": 8.705127704668831e-07, "loss": 0.0165, "reward": 1.1333705931901932, "reward_std": 0.14053201116621494, "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 1081 }, { "completion_length": 581.575927734375, "epoch": 0.32320215069823016, "grad_norm": 1.2691224813461304, "kl": 0.79638671875, "learning_rate": 8.701831395204127e-07, "loss": 0.0318, "reward": 1.2617187798023224, "reward_std": 0.16382508166134357, "rewards/accuracy_reward": 0.2790178693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 1082 }, { "completion_length": 628.2968978881836, "epoch": 0.32350085878575163, "grad_norm": 0.4709782898426056, "kl": 0.6776123046875, "learning_rate": 8.698531602460679e-07, "loss": 0.0271, "reward": 1.165178656578064, "reward_std": 0.15811746194958687, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 1083 }, { "completion_length": 626.841552734375, "epoch": 0.3237995668732731, "grad_norm": 0.7409815192222595, "kl": 0.5960693359375, "learning_rate": 8.695228330028336e-07, "loss": 0.0238, "reward": 1.13058041036129, "reward_std": 0.1639225585386157, "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 1084 }, { "completion_length": 599.8527069091797, "epoch": 0.3240982749607946, "grad_norm": 0.6626558303833008, "kl": 0.6368408203125, "learning_rate": 8.691921581500735e-07, "loss": 0.0254, "reward": 1.160714328289032, "reward_std": 0.21054261177778244, "rewards/accuracy_reward": 0.18080358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107611179352, "step": 1085 }, { "completion_length": 567.9843902587891, "epoch": 0.32439698304831605, "grad_norm": 0.734987199306488, "kl": 0.35498046875, "learning_rate": 8.688611360475298e-07, "loss": 0.0142, "reward": 1.1969866454601288, "reward_std": 0.21803150326013565, "rewards/accuracy_reward": 0.20535715855658054, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 1086 }, { "completion_length": 609.6428833007812, "epoch": 0.3246956911358375, "grad_norm": 0.3272683322429657, "kl": 0.5574951171875, "learning_rate": 8.685297670553217e-07, "loss": 0.0223, "reward": 1.1367188096046448, "reward_std": 0.15750161185860634, "rewards/accuracy_reward": 0.15401786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 1087 }, { "completion_length": 552.5781402587891, "epoch": 0.324994399223359, "grad_norm": 0.4196349084377289, "kl": 0.24468994140625, "learning_rate": 8.681980515339463e-07, "loss": 0.0098, "reward": 1.1914063096046448, "reward_std": 0.15247402153909206, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949777126312256, "step": 1088 }, { "completion_length": 562.0714492797852, "epoch": 0.32529310731088046, "grad_norm": 0.6979930996894836, "kl": 0.3599853515625, "learning_rate": 8.678659898442776e-07, "loss": 0.0144, "reward": 1.2321429252624512, "reward_std": 0.1620564078912139, "rewards/accuracy_reward": 0.2455357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1089 }, { "completion_length": 630.1361999511719, "epoch": 0.32559181539840193, "grad_norm": 0.8696116805076599, "kl": 1.1396484375, "learning_rate": 8.675335823475662e-07, "loss": 0.0456, "reward": 1.1289063096046448, "reward_std": 0.20921489596366882, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9771205931901932, "step": 1090 }, { "completion_length": 695.950927734375, "epoch": 0.3258905234859234, "grad_norm": 0.49417468905448914, "kl": 0.986328125, "learning_rate": 8.67200829405439e-07, "loss": 0.0394, "reward": 1.1110491454601288, "reward_std": 0.22135711461305618, "rewards/accuracy_reward": 0.14062500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.97042416036129, "step": 1091 }, { "completion_length": 562.100471496582, "epoch": 0.32618923157344487, "grad_norm": 1.4711312055587769, "kl": 1.3701171875, "learning_rate": 8.668677313798981e-07, "loss": 0.0548, "reward": 1.1891741752624512, "reward_std": 0.18443894013762474, "rewards/accuracy_reward": 0.2053571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 1092 }, { "completion_length": 619.3995666503906, "epoch": 0.32648793966096634, "grad_norm": 0.9271537065505981, "kl": 0.3468017578125, "learning_rate": 8.66534288633322e-07, "loss": 0.0139, "reward": 1.1077009439468384, "reward_std": 0.14674895349889994, "rewards/accuracy_reward": 0.11383928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 1093 }, { "completion_length": 604.3259124755859, "epoch": 0.3267866477484878, "grad_norm": 1.3072930574417114, "kl": 0.93310546875, "learning_rate": 8.662005015284637e-07, "loss": 0.0373, "reward": 1.1858259439468384, "reward_std": 0.10812697233632207, "rewards/accuracy_reward": 0.2053571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 1094 }, { "completion_length": 567.0960083007812, "epoch": 0.3270853558360093, "grad_norm": 0.4825209975242615, "kl": 0.77685546875, "learning_rate": 8.658663704284505e-07, "loss": 0.031, "reward": 1.1813616752624512, "reward_std": 0.12141603697091341, "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 1095 }, { "completion_length": 563.444221496582, "epoch": 0.32738406392353075, "grad_norm": 0.31931304931640625, "kl": 0.657958984375, "learning_rate": 8.655318956967845e-07, "loss": 0.0264, "reward": 1.2751116454601288, "reward_std": 0.15893417596817017, "rewards/accuracy_reward": 0.2857143022119999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 1096 }, { "completion_length": 548.2120819091797, "epoch": 0.3276827720110522, "grad_norm": 0.38104650378227234, "kl": 0.155517578125, "learning_rate": 8.651970776973417e-07, "loss": 0.0062, "reward": 1.135602742433548, "reward_std": 0.0883105993270874, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.997209832072258, "step": 1097 }, { "completion_length": 557.7969055175781, "epoch": 0.32798148009857364, "grad_norm": 0.4077795743942261, "kl": 0.399169921875, "learning_rate": 8.648619167943706e-07, "loss": 0.0159, "reward": 1.2399554550647736, "reward_std": 0.12815086729824543, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 1098 }, { "completion_length": 612.9085083007812, "epoch": 0.3282801881860951, "grad_norm": 0.731198787689209, "kl": 0.293212890625, "learning_rate": 8.645264133524942e-07, "loss": 0.0118, "reward": 1.1205357611179352, "reward_std": 0.17101881094276905, "rewards/accuracy_reward": 0.12946429220028222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1099 }, { "completion_length": 636.5111846923828, "epoch": 0.3285788962736166, "grad_norm": 0.2343093752861023, "kl": 0.2340087890625, "learning_rate": 8.641905677367066e-07, "loss": 0.0094, "reward": 1.1757813096046448, "reward_std": 0.11777756176888943, "rewards/accuracy_reward": 0.1852678693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 1100 }, { "completion_length": 605.6451110839844, "epoch": 0.32887760436113805, "grad_norm": 1.0257505178451538, "kl": 0.3095703125, "learning_rate": 8.638543803123756e-07, "loss": 0.0124, "reward": 1.1077009439468384, "reward_std": 0.11713830498047173, "rewards/accuracy_reward": 0.12053571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1101 }, { "completion_length": 568.2232513427734, "epoch": 0.3291763124486595, "grad_norm": 0.3034556806087494, "kl": 0.2086181640625, "learning_rate": 8.635178514452397e-07, "loss": 0.0083, "reward": 1.1227678954601288, "reward_std": 0.13150811195373535, "rewards/accuracy_reward": 0.13169643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 1102 }, { "completion_length": 633.5067291259766, "epoch": 0.329475020536181, "grad_norm": 1.0146255493164062, "kl": 0.52197265625, "learning_rate": 8.631809815014095e-07, "loss": 0.0209, "reward": 1.0647321939468384, "reward_std": 0.13638061471283436, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464775323868, "step": 1103 }, { "completion_length": 556.3058471679688, "epoch": 0.32977372862370247, "grad_norm": 1.0429496765136719, "kl": 0.380615234375, "learning_rate": 8.628437708473664e-07, "loss": 0.0152, "reward": 1.082589328289032, "reward_std": 0.12929679825901985, "rewards/accuracy_reward": 0.09375000605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1104 }, { "completion_length": 501.6027069091797, "epoch": 0.33007243671122394, "grad_norm": 0.6214088797569275, "kl": 0.332763671875, "learning_rate": 8.625062198499627e-07, "loss": 0.0133, "reward": 1.1713169813156128, "reward_std": 0.12570840492844582, "rewards/accuracy_reward": 0.1763392984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949777126312256, "step": 1105 }, { "completion_length": 592.8593902587891, "epoch": 0.3303711447987454, "grad_norm": 0.3859996199607849, "kl": 0.47802734375, "learning_rate": 8.621683288764207e-07, "loss": 0.0191, "reward": 1.172991156578064, "reward_std": 0.11862727743573487, "rewards/accuracy_reward": 0.17857143841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9944196790456772, "step": 1106 }, { "completion_length": 543.5982360839844, "epoch": 0.3306698528862669, "grad_norm": 0.47383204102516174, "kl": 0.34619140625, "learning_rate": 8.618300982943327e-07, "loss": 0.0138, "reward": 1.2382812798023224, "reward_std": 0.1349673792719841, "rewards/accuracy_reward": 0.2410714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.997209832072258, "step": 1107 }, { "completion_length": 557.1071701049805, "epoch": 0.33096856097378835, "grad_norm": 0.3236454725265503, "kl": 0.67431640625, "learning_rate": 8.614915284716603e-07, "loss": 0.0268, "reward": 1.1796875298023224, "reward_std": 0.18493080511689186, "rewards/accuracy_reward": 0.19419643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 1108 }, { "completion_length": 611.4286041259766, "epoch": 0.3312672690613098, "grad_norm": 0.7656009793281555, "kl": 0.569091796875, "learning_rate": 8.611526197767346e-07, "loss": 0.0228, "reward": 1.2025670111179352, "reward_std": 0.1404099699575454, "rewards/accuracy_reward": 0.2142857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882813096046448, "step": 1109 }, { "completion_length": 648.6428833007812, "epoch": 0.3315659771488313, "grad_norm": 0.9819284081459045, "kl": 1.0771484375, "learning_rate": 8.608133725782545e-07, "loss": 0.0432, "reward": 1.1356027126312256, "reward_std": 0.23813914507627487, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884290456772, "step": 1110 }, { "completion_length": 625.9620819091797, "epoch": 0.33186468523635276, "grad_norm": 0.35753753781318665, "kl": 0.7890625, "learning_rate": 8.604737872452881e-07, "loss": 0.0316, "reward": 1.1601562947034836, "reward_std": 0.1082913251593709, "rewards/accuracy_reward": 0.17187501094304025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 1111 }, { "completion_length": 632.6964569091797, "epoch": 0.33216339332387423, "grad_norm": 1.9958688020706177, "kl": 1.08984375, "learning_rate": 8.601338641472709e-07, "loss": 0.0436, "reward": 1.2087053954601288, "reward_std": 0.22942791879177094, "rewards/accuracy_reward": 0.227678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 1112 }, { "completion_length": 614.2277145385742, "epoch": 0.3324621014113957, "grad_norm": 0.4982960820198059, "kl": 0.7093505859375, "learning_rate": 8.597936036540061e-07, "loss": 0.0283, "reward": 1.0965402573347092, "reward_std": 0.1360120503231883, "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 1113 }, { "completion_length": 533.4397506713867, "epoch": 0.3327608094989172, "grad_norm": 0.39469942450523376, "kl": 0.43017578125, "learning_rate": 8.594530061356633e-07, "loss": 0.0172, "reward": 1.1026786267757416, "reward_std": 0.16075987182557583, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 1114 }, { "completion_length": 640.5893096923828, "epoch": 0.33305951758643865, "grad_norm": 0.7358635663986206, "kl": 0.578369140625, "learning_rate": 8.591120719627796e-07, "loss": 0.0232, "reward": 1.2120536267757416, "reward_std": 0.15849604830145836, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1115 }, { "completion_length": 637.7187881469727, "epoch": 0.3333582256739601, "grad_norm": 0.4436490535736084, "kl": 0.86181640625, "learning_rate": 8.587708015062578e-07, "loss": 0.0345, "reward": 1.3030134439468384, "reward_std": 0.18187083303928375, "rewards/accuracy_reward": 0.3325892984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9704241454601288, "step": 1116 }, { "completion_length": 611.3370819091797, "epoch": 0.3336569337614816, "grad_norm": 0.47812649607658386, "kl": 0.410888671875, "learning_rate": 8.584291951373668e-07, "loss": 0.0164, "reward": 1.0987723767757416, "reward_std": 0.18599152565002441, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1117 }, { "completion_length": 565.950927734375, "epoch": 0.33395564184900306, "grad_norm": 0.6650499701499939, "kl": 0.45458984375, "learning_rate": 8.580872532277407e-07, "loss": 0.0182, "reward": 1.1796875596046448, "reward_std": 0.18022987991571426, "rewards/accuracy_reward": 0.18973215110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1118 }, { "completion_length": 578.4553756713867, "epoch": 0.33425434993652453, "grad_norm": 0.26858848333358765, "kl": 0.486083984375, "learning_rate": 8.57744976149379e-07, "loss": 0.0194, "reward": 1.0675223767757416, "reward_std": 0.06766040623188019, "rewards/accuracy_reward": 0.07812500605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 1119 }, { "completion_length": 610.7879638671875, "epoch": 0.334553058024046, "grad_norm": 0.491128146648407, "kl": 0.4619140625, "learning_rate": 8.574023642746455e-07, "loss": 0.0185, "reward": 1.1160714626312256, "reward_std": 0.12044249288737774, "rewards/accuracy_reward": 0.13169643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 1120 }, { "completion_length": 576.5602874755859, "epoch": 0.3348517661115675, "grad_norm": 0.4871073067188263, "kl": 0.4068603515625, "learning_rate": 8.570594179762681e-07, "loss": 0.0163, "reward": 1.0954241454601288, "reward_std": 0.12945087626576424, "rewards/accuracy_reward": 0.10491071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 1121 }, { "completion_length": 621.6361923217773, "epoch": 0.33515047419908894, "grad_norm": 0.7727295160293579, "kl": 0.5157470703125, "learning_rate": 8.567161376273393e-07, "loss": 0.0206, "reward": 1.1517857611179352, "reward_std": 0.19760944321751595, "rewards/accuracy_reward": 0.165178582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1122 }, { "completion_length": 553.834846496582, "epoch": 0.3354491822866104, "grad_norm": 0.311857134103775, "kl": 0.1943359375, "learning_rate": 8.563725236013139e-07, "loss": 0.0078, "reward": 1.1573660969734192, "reward_std": 0.15259517543017864, "rewards/accuracy_reward": 0.1607142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966517984867096, "step": 1123 }, { "completion_length": 572.3281402587891, "epoch": 0.3357478903741319, "grad_norm": 0.4830499291419983, "kl": 0.511962890625, "learning_rate": 8.560285762720109e-07, "loss": 0.0205, "reward": 1.1372767984867096, "reward_std": 0.12821389362215996, "rewards/accuracy_reward": 0.1495535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1124 }, { "completion_length": 593.1317367553711, "epoch": 0.33604659846165336, "grad_norm": 0.4874721169471741, "kl": 0.7633056640625, "learning_rate": 8.556842960136107e-07, "loss": 0.0305, "reward": 1.0876116454601288, "reward_std": 0.10479661263525486, "rewards/accuracy_reward": 0.10044643515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1125 }, { "completion_length": 575.9219055175781, "epoch": 0.3363453065491748, "grad_norm": 0.6842877864837646, "kl": 0.19952392578125, "learning_rate": 8.553396832006568e-07, "loss": 0.008, "reward": 1.2678571939468384, "reward_std": 0.17150209099054337, "rewards/accuracy_reward": 0.2700892984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9977678805589676, "step": 1126 }, { "completion_length": 664.8236846923828, "epoch": 0.3366440146366963, "grad_norm": 0.8217383027076721, "kl": 0.5777587890625, "learning_rate": 8.54994738208054e-07, "loss": 0.0231, "reward": 1.0585937798023224, "reward_std": 0.18420810252428055, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 1127 }, { "completion_length": 556.6763610839844, "epoch": 0.33694272272421777, "grad_norm": 0.5455264449119568, "kl": 0.33544921875, "learning_rate": 8.546494614110688e-07, "loss": 0.0135, "reward": 1.0937500596046448, "reward_std": 0.09090579906478524, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1128 }, { "completion_length": 652.0937805175781, "epoch": 0.33724143081173924, "grad_norm": 0.8619608879089355, "kl": 0.55126953125, "learning_rate": 8.543038531853285e-07, "loss": 0.022, "reward": 1.1328125596046448, "reward_std": 0.12395171821117401, "rewards/accuracy_reward": 0.14955357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 1129 }, { "completion_length": 685.1741333007812, "epoch": 0.3375401388992607, "grad_norm": 0.4934943914413452, "kl": 0.75830078125, "learning_rate": 8.539579139068207e-07, "loss": 0.0303, "reward": 1.2472098767757416, "reward_std": 0.22441011294722557, "rewards/accuracy_reward": 0.26785715855658054, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 1130 }, { "completion_length": 668.5625457763672, "epoch": 0.3378388469867822, "grad_norm": 0.635417103767395, "kl": 0.720703125, "learning_rate": 8.536116439518938e-07, "loss": 0.0288, "reward": 1.1049107909202576, "reward_std": 0.21156042255461216, "rewards/accuracy_reward": 0.12276786100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 1131 }, { "completion_length": 576.3772735595703, "epoch": 0.33813755507430365, "grad_norm": 0.8172542452812195, "kl": 0.40673828125, "learning_rate": 8.532650436972555e-07, "loss": 0.0163, "reward": 1.1205357611179352, "reward_std": 0.18105152808129787, "rewards/accuracy_reward": 0.13839286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 1132 }, { "completion_length": 601.2232208251953, "epoch": 0.3384362631618251, "grad_norm": 0.4475604295730591, "kl": 0.418701171875, "learning_rate": 8.529181135199726e-07, "loss": 0.0168, "reward": 1.2248884737491608, "reward_std": 0.15185171365737915, "rewards/accuracy_reward": 0.2321428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 1133 }, { "completion_length": 650.2768249511719, "epoch": 0.3387349712493466, "grad_norm": 0.4324881434440613, "kl": 0.65234375, "learning_rate": 8.525708537974715e-07, "loss": 0.0261, "reward": 1.116071492433548, "reward_std": 0.1490493081510067, "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428805589676, "step": 1134 }, { "completion_length": 603.9866485595703, "epoch": 0.33903367933686807, "grad_norm": 0.490304559469223, "kl": 0.53369140625, "learning_rate": 8.522232649075366e-07, "loss": 0.0214, "reward": 1.1445313096046448, "reward_std": 0.1540440432727337, "rewards/accuracy_reward": 0.1562500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 1135 }, { "completion_length": 580.5647583007812, "epoch": 0.33933238742438954, "grad_norm": 0.6548090577125549, "kl": 0.64794921875, "learning_rate": 8.518753472283105e-07, "loss": 0.0259, "reward": 1.1428572088479996, "reward_std": 0.16177895292639732, "rewards/accuracy_reward": 0.15625000302679837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 1136 }, { "completion_length": 623.966552734375, "epoch": 0.339631095511911, "grad_norm": 0.5280685424804688, "kl": 0.620849609375, "learning_rate": 8.515271011382937e-07, "loss": 0.0249, "reward": 1.1356027126312256, "reward_std": 0.14928824454545975, "rewards/accuracy_reward": 0.1495535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 1137 }, { "completion_length": 647.2678985595703, "epoch": 0.3399298035994325, "grad_norm": 0.27230721712112427, "kl": 0.3885498046875, "learning_rate": 8.511785270163436e-07, "loss": 0.0155, "reward": 1.0753348767757416, "reward_std": 0.14043200761079788, "rewards/accuracy_reward": 0.08705357392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812649011612, "step": 1138 }, { "completion_length": 578.5982360839844, "epoch": 0.34022851168695395, "grad_norm": 0.5188155770301819, "kl": 0.334228515625, "learning_rate": 8.508296252416748e-07, "loss": 0.0134, "reward": 1.1534598469734192, "reward_std": 0.15555134881287813, "rewards/accuracy_reward": 0.1674107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 1139 }, { "completion_length": 600.4642944335938, "epoch": 0.3405272197744754, "grad_norm": 0.4698018729686737, "kl": 0.54833984375, "learning_rate": 8.504803961938582e-07, "loss": 0.0219, "reward": 1.1478795111179352, "reward_std": 0.1271045170724392, "rewards/accuracy_reward": 0.16071429336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1140 }, { "completion_length": 617.9911041259766, "epoch": 0.3408259278619969, "grad_norm": 1.162142038345337, "kl": 0.4111328125, "learning_rate": 8.501308402528207e-07, "loss": 0.0164, "reward": 1.273995578289032, "reward_std": 0.2245153896510601, "rewards/accuracy_reward": 0.2924107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 1141 }, { "completion_length": 607.9754638671875, "epoch": 0.3411246359495183, "grad_norm": 0.8042078018188477, "kl": 0.2882080078125, "learning_rate": 8.497809577988451e-07, "loss": 0.0115, "reward": 1.037388414144516, "reward_std": 0.09305419120937586, "rewards/accuracy_reward": 0.046875000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 1142 }, { "completion_length": 588.2009048461914, "epoch": 0.3414233440370398, "grad_norm": 0.6097347736358643, "kl": 0.686767578125, "learning_rate": 8.494307492125691e-07, "loss": 0.0275, "reward": 1.1199777126312256, "reward_std": 0.13156504416838288, "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884290456772, "step": 1143 }, { "completion_length": 605.0826110839844, "epoch": 0.34172205212456125, "grad_norm": 0.6929336786270142, "kl": 0.784912109375, "learning_rate": 8.490802148749853e-07, "loss": 0.0315, "reward": 1.2126116752624512, "reward_std": 0.2260439470410347, "rewards/accuracy_reward": 0.2254464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1144 }, { "completion_length": 662.7768096923828, "epoch": 0.3420207602120827, "grad_norm": 0.5171191692352295, "kl": 0.68359375, "learning_rate": 8.487293551674406e-07, "loss": 0.0274, "reward": 1.1735491752624512, "reward_std": 0.13936144206672907, "rewards/accuracy_reward": 0.19196429336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 1145 }, { "completion_length": 686.7031555175781, "epoch": 0.3423194682996042, "grad_norm": 0.36153921484947205, "kl": 0.55908203125, "learning_rate": 8.483781704716363e-07, "loss": 0.0223, "reward": 1.040178656578064, "reward_std": 0.07422812143340707, "rewards/accuracy_reward": 0.05133928800933063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1146 }, { "completion_length": 634.4352874755859, "epoch": 0.34261817638712566, "grad_norm": 0.8436704277992249, "kl": 0.59716796875, "learning_rate": 8.480266611696266e-07, "loss": 0.0239, "reward": 1.2366071939468384, "reward_std": 0.14716866984963417, "rewards/accuracy_reward": 0.2477678693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 1147 }, { "completion_length": 610.8281555175781, "epoch": 0.34291688447464713, "grad_norm": 0.39500388503074646, "kl": 0.256591796875, "learning_rate": 8.476748276438194e-07, "loss": 0.0103, "reward": 1.2734375596046448, "reward_std": 0.16732657700777054, "rewards/accuracy_reward": 0.2812500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 1148 }, { "completion_length": 639.5870666503906, "epoch": 0.3432155925621686, "grad_norm": 0.5973897576332092, "kl": 0.45458984375, "learning_rate": 8.473226702769749e-07, "loss": 0.0182, "reward": 1.2248884737491608, "reward_std": 0.20935048535466194, "rewards/accuracy_reward": 0.2410714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 1149 }, { "completion_length": 564.4576110839844, "epoch": 0.3435143006496901, "grad_norm": 0.3811553120613098, "kl": 0.38037109375, "learning_rate": 8.46970189452206e-07, "loss": 0.0152, "reward": 1.1930803954601288, "reward_std": 0.2012801319360733, "rewards/accuracy_reward": 0.1986607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 1150 }, { "completion_length": 651.8973541259766, "epoch": 0.34381300873721155, "grad_norm": 0.2730368971824646, "kl": 0.6103515625, "learning_rate": 8.46617385552977e-07, "loss": 0.0245, "reward": 1.092075914144516, "reward_std": 0.1273864544928074, "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 1151 }, { "completion_length": 594.841552734375, "epoch": 0.344111716824733, "grad_norm": 0.8663352131843567, "kl": 0.658203125, "learning_rate": 8.462642589631044e-07, "loss": 0.0264, "reward": 1.0669643580913544, "reward_std": 0.13857540115714073, "rewards/accuracy_reward": 0.08258928777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 1152 }, { "completion_length": 702.7232360839844, "epoch": 0.3444104249122545, "grad_norm": 0.4558485746383667, "kl": 0.88232421875, "learning_rate": 8.459108100667548e-07, "loss": 0.0353, "reward": 1.195870578289032, "reward_std": 0.18120446801185608, "rewards/accuracy_reward": 0.2209821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884439468384, "step": 1153 }, { "completion_length": 570.8303833007812, "epoch": 0.34470913299977596, "grad_norm": 0.28791072964668274, "kl": 0.10009765625, "learning_rate": 8.455570392484464e-07, "loss": 0.004, "reward": 1.2399553954601288, "reward_std": 0.10438933223485947, "rewards/accuracy_reward": 0.2433035895228386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966517984867096, "step": 1154 }, { "completion_length": 550.1250305175781, "epoch": 0.34500784108729743, "grad_norm": 0.6498427391052246, "kl": 0.81732177734375, "learning_rate": 8.45202946893047e-07, "loss": 0.0327, "reward": 1.207589328289032, "reward_std": 0.22183405235409737, "rewards/accuracy_reward": 0.2209821566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1155 }, { "completion_length": 663.3460083007812, "epoch": 0.3453065491748189, "grad_norm": 0.35071882605552673, "kl": 0.826416015625, "learning_rate": 8.448485333857742e-07, "loss": 0.0331, "reward": 1.148995578289032, "reward_std": 0.1126326103694737, "rewards/accuracy_reward": 0.1674107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 1156 }, { "completion_length": 582.4643096923828, "epoch": 0.34560525726234037, "grad_norm": 0.2685929834842682, "kl": 0.2633056640625, "learning_rate": 8.444937991121956e-07, "loss": 0.0105, "reward": 1.1289063096046448, "reward_std": 0.17297921143472195, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 1157 }, { "completion_length": 615.5044860839844, "epoch": 0.34590396534986184, "grad_norm": 0.3725630044937134, "kl": 0.7001953125, "learning_rate": 8.44138744458227e-07, "loss": 0.0279, "reward": 1.2237723767757416, "reward_std": 0.18418920040130615, "rewards/accuracy_reward": 0.243303582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 1158 }, { "completion_length": 586.138427734375, "epoch": 0.3462026734373833, "grad_norm": 0.3636914789676666, "kl": 0.54296875, "learning_rate": 8.437833698101331e-07, "loss": 0.0217, "reward": 1.121651828289032, "reward_std": 0.13372789323329926, "rewards/accuracy_reward": 0.13839286798611283, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 1159 }, { "completion_length": 577.779052734375, "epoch": 0.3465013815249048, "grad_norm": 0.4086763560771942, "kl": 0.450439453125, "learning_rate": 8.434276755545265e-07, "loss": 0.0181, "reward": 1.1289062947034836, "reward_std": 0.057159208226948977, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 1160 }, { "completion_length": 615.5469055175781, "epoch": 0.34680008961242625, "grad_norm": 0.42559802532196045, "kl": 0.7008056640625, "learning_rate": 8.430716620783683e-07, "loss": 0.0281, "reward": 1.1869420111179352, "reward_std": 0.15772286150604486, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 1161 }, { "completion_length": 566.5044860839844, "epoch": 0.3470987976999477, "grad_norm": 0.7405133247375488, "kl": 0.4700927734375, "learning_rate": 8.427153297689654e-07, "loss": 0.0188, "reward": 1.2165179252624512, "reward_std": 0.1039355993270874, "rewards/accuracy_reward": 0.2209821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9955357313156128, "step": 1162 }, { "completion_length": 589.1540451049805, "epoch": 0.3473975057874692, "grad_norm": 0.5706230998039246, "kl": 0.513671875, "learning_rate": 8.423586790139733e-07, "loss": 0.0205, "reward": 1.1322545260190964, "reward_std": 0.09371870197355747, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 1163 }, { "completion_length": 565.1138610839844, "epoch": 0.34769621387499067, "grad_norm": 0.41835445165634155, "kl": 0.59912109375, "learning_rate": 8.420017102013924e-07, "loss": 0.024, "reward": 1.0781250596046448, "reward_std": 0.12931328266859055, "rewards/accuracy_reward": 0.09598214458674192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 1164 }, { "completion_length": 583.4218978881836, "epoch": 0.34799492196251214, "grad_norm": 0.3149718642234802, "kl": 0.388916015625, "learning_rate": 8.416444237195701e-07, "loss": 0.0156, "reward": 1.1439732611179352, "reward_std": 0.16597646288573742, "rewards/accuracy_reward": 0.15625000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232313156128, "step": 1165 }, { "completion_length": 520.7544784545898, "epoch": 0.3482936300500336, "grad_norm": 0.5975250601768494, "kl": 0.54443359375, "learning_rate": 8.41286819957199e-07, "loss": 0.0217, "reward": 1.3035714626312256, "reward_std": 0.16962525062263012, "rewards/accuracy_reward": 0.31696430407464504, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 1166 }, { "completion_length": 614.0826263427734, "epoch": 0.3485923381375551, "grad_norm": 0.7669104337692261, "kl": 0.602294921875, "learning_rate": 8.409288993033171e-07, "loss": 0.0241, "reward": 1.1383928954601288, "reward_std": 0.1555468700826168, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1167 }, { "completion_length": 576.2187652587891, "epoch": 0.34889104622507655, "grad_norm": 0.35730722546577454, "kl": 0.302978515625, "learning_rate": 8.405706621473069e-07, "loss": 0.0121, "reward": 1.1026785969734192, "reward_std": 0.15513833612203598, "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933036267757416, "step": 1168 }, { "completion_length": 583.8036117553711, "epoch": 0.349189754312598, "grad_norm": 0.4342097043991089, "kl": 0.55810546875, "learning_rate": 8.40212108878895e-07, "loss": 0.0223, "reward": 1.1104911416769028, "reward_std": 0.16423183120787144, "rewards/accuracy_reward": 0.1294642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 1169 }, { "completion_length": 586.4196701049805, "epoch": 0.3494884624001195, "grad_norm": 0.37174397706985474, "kl": 0.7158203125, "learning_rate": 8.398532398881527e-07, "loss": 0.0287, "reward": 1.2047991752624512, "reward_std": 0.17085686326026917, "rewards/accuracy_reward": 0.223214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 1170 }, { "completion_length": 592.4598388671875, "epoch": 0.34978717048764096, "grad_norm": 0.5404679179191589, "kl": 0.611328125, "learning_rate": 8.39494055565494e-07, "loss": 0.0245, "reward": 1.1774553954601288, "reward_std": 0.15205827914178371, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1171 }, { "completion_length": 606.2991180419922, "epoch": 0.35008587857516243, "grad_norm": 0.7040765881538391, "kl": 1.16796875, "learning_rate": 8.391345563016763e-07, "loss": 0.0467, "reward": 1.2455357313156128, "reward_std": 0.20978058129549026, "rewards/accuracy_reward": 0.26562501303851604, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 1172 }, { "completion_length": 643.6763610839844, "epoch": 0.3503845866626839, "grad_norm": 0.4812161922454834, "kl": 0.56396484375, "learning_rate": 8.387747424877996e-07, "loss": 0.0225, "reward": 1.1489956080913544, "reward_std": 0.12525295745581388, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 1173 }, { "completion_length": 665.9687805175781, "epoch": 0.3506832947502054, "grad_norm": 0.7215442061424255, "kl": 1.0654296875, "learning_rate": 8.384146145153059e-07, "loss": 0.0426, "reward": 1.2053571939468384, "reward_std": 0.2019849270582199, "rewards/accuracy_reward": 0.2254464402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107611179352, "step": 1174 }, { "completion_length": 653.2455673217773, "epoch": 0.35098200283772685, "grad_norm": 0.4770544469356537, "kl": 0.66064453125, "learning_rate": 8.380541727759794e-07, "loss": 0.0264, "reward": 1.2611607611179352, "reward_std": 0.1772213177755475, "rewards/accuracy_reward": 0.2767857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 1175 }, { "completion_length": 625.2254791259766, "epoch": 0.3512807109252483, "grad_norm": 0.4910041391849518, "kl": 0.732421875, "learning_rate": 8.376934176619454e-07, "loss": 0.0292, "reward": 1.1121652573347092, "reward_std": 0.1524258777499199, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 1176 }, { "completion_length": 611.0670013427734, "epoch": 0.3515794190127698, "grad_norm": 0.9161887764930725, "kl": 0.4453125, "learning_rate": 8.373323495656699e-07, "loss": 0.0178, "reward": 1.1467634439468384, "reward_std": 0.18345859367400408, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 1177 }, { "completion_length": 591.5893173217773, "epoch": 0.35187812710029126, "grad_norm": 0.9192818999290466, "kl": 0.486572265625, "learning_rate": 8.369709688799596e-07, "loss": 0.0195, "reward": 1.2031250298023224, "reward_std": 0.17341100797057152, "rewards/accuracy_reward": 0.21875000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 1178 }, { "completion_length": 687.2589721679688, "epoch": 0.35217683518781273, "grad_norm": 0.8695412278175354, "kl": 0.56298828125, "learning_rate": 8.366092759979612e-07, "loss": 0.0225, "reward": 1.1015625596046448, "reward_std": 0.1532701440155506, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 1179 }, { "completion_length": 569.6272583007812, "epoch": 0.3524755432753342, "grad_norm": 0.6638289093971252, "kl": 0.2867431640625, "learning_rate": 8.362472713131614e-07, "loss": 0.0115, "reward": 1.1969866454601288, "reward_std": 0.1037113864440471, "rewards/accuracy_reward": 0.2053571578580886, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916295111179352, "step": 1180 }, { "completion_length": 596.8393096923828, "epoch": 0.3527742513628557, "grad_norm": 0.6894627213478088, "kl": 0.6337890625, "learning_rate": 8.358849552193857e-07, "loss": 0.0254, "reward": 1.2014509737491608, "reward_std": 0.20966104418039322, "rewards/accuracy_reward": 0.2142857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 1181 }, { "completion_length": 609.6205673217773, "epoch": 0.35307295945037714, "grad_norm": 0.9037026166915894, "kl": 0.671875, "learning_rate": 8.355223281107985e-07, "loss": 0.0269, "reward": 1.2265625596046448, "reward_std": 0.1322999820113182, "rewards/accuracy_reward": 0.2544643022119999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.972098246216774, "step": 1182 }, { "completion_length": 583.1875305175781, "epoch": 0.3533716675378986, "grad_norm": 0.25965416431427, "kl": 0.28466796875, "learning_rate": 8.351593903819022e-07, "loss": 0.0114, "reward": 1.1210938096046448, "reward_std": 0.15477375127375126, "rewards/accuracy_reward": 0.12723215017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616156578064, "step": 1183 }, { "completion_length": 621.5692138671875, "epoch": 0.3536703756254201, "grad_norm": 0.7182223796844482, "kl": 0.330322265625, "learning_rate": 8.34796142427538e-07, "loss": 0.0132, "reward": 1.143415242433548, "reward_std": 0.17053956538438797, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 1184 }, { "completion_length": 720.2143096923828, "epoch": 0.3539690837129415, "grad_norm": 0.986210823059082, "kl": 0.69921875, "learning_rate": 8.344325846428839e-07, "loss": 0.028, "reward": 1.0150669813156128, "reward_std": 0.1714458353817463, "rewards/accuracy_reward": 0.05133928777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9637277275323868, "step": 1185 }, { "completion_length": 678.0848541259766, "epoch": 0.35426779180046297, "grad_norm": 0.8578677177429199, "kl": 0.58544921875, "learning_rate": 8.340687174234551e-07, "loss": 0.0234, "reward": 1.055245578289032, "reward_std": 0.14821401238441467, "rewards/accuracy_reward": 0.07142857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 1186 }, { "completion_length": 666.3995971679688, "epoch": 0.35456649988798444, "grad_norm": 0.2608591318130493, "kl": 0.36865234375, "learning_rate": 8.337045411651034e-07, "loss": 0.0148, "reward": 1.1344866454601288, "reward_std": 0.20429446920752525, "rewards/accuracy_reward": 0.14732143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1187 }, { "completion_length": 613.1317138671875, "epoch": 0.3548652079755059, "grad_norm": 0.46586668491363525, "kl": 0.181884765625, "learning_rate": 8.333400562640172e-07, "loss": 0.0073, "reward": 1.0954241454601288, "reward_std": 0.13025295455008745, "rewards/accuracy_reward": 0.09821429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.997209832072258, "step": 1188 }, { "completion_length": 590.6763458251953, "epoch": 0.3551639160630274, "grad_norm": 0.6069368720054626, "kl": 0.357177734375, "learning_rate": 8.329752631167197e-07, "loss": 0.0143, "reward": 1.1830357909202576, "reward_std": 0.1739514358341694, "rewards/accuracy_reward": 0.2075892984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464626312256, "step": 1189 }, { "completion_length": 528.053596496582, "epoch": 0.35546262415054886, "grad_norm": 0.18784385919570923, "kl": 0.139892578125, "learning_rate": 8.326101621200706e-07, "loss": 0.0056, "reward": 1.19698666036129, "reward_std": 0.07668075803667307, "rewards/accuracy_reward": 0.1986607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9983258992433548, "step": 1190 }, { "completion_length": 619.9286117553711, "epoch": 0.3557613322380703, "grad_norm": 1.5438019037246704, "kl": 0.364990234375, "learning_rate": 8.322447536712642e-07, "loss": 0.0146, "reward": 1.2494420111179352, "reward_std": 0.1961849294602871, "rewards/accuracy_reward": 0.2656250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838169813156128, "step": 1191 }, { "completion_length": 728.5491333007812, "epoch": 0.3560600403255918, "grad_norm": 1.3341543674468994, "kl": 0.737548828125, "learning_rate": 8.318790381678283e-07, "loss": 0.0296, "reward": 1.1188616454601288, "reward_std": 0.17918331548571587, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9648437798023224, "step": 1192 }, { "completion_length": 655.5960235595703, "epoch": 0.35635874841311327, "grad_norm": 0.876957356929779, "kl": 0.54248046875, "learning_rate": 8.315130160076263e-07, "loss": 0.0217, "reward": 1.1981027126312256, "reward_std": 0.15166340209543705, "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 1193 }, { "completion_length": 586.5736846923828, "epoch": 0.35665745650063474, "grad_norm": 0.4625166654586792, "kl": 0.4561767578125, "learning_rate": 8.311466875888539e-07, "loss": 0.0182, "reward": 1.2343750596046448, "reward_std": 0.1218356229364872, "rewards/accuracy_reward": 0.2455357313156128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1194 }, { "completion_length": 552.2009124755859, "epoch": 0.3569561645881562, "grad_norm": 0.5275676250457764, "kl": 0.4169921875, "learning_rate": 8.307800533100409e-07, "loss": 0.0167, "reward": 1.2260045409202576, "reward_std": 0.09873227216303349, "rewards/accuracy_reward": 0.23437501303851604, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 1195 }, { "completion_length": 690.8616485595703, "epoch": 0.3572548726756777, "grad_norm": 0.8875755071640015, "kl": 1.007080078125, "learning_rate": 8.304131135700493e-07, "loss": 0.0402, "reward": 1.121651828289032, "reward_std": 0.18411646038293839, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 1196 }, { "completion_length": 545.5759124755859, "epoch": 0.35755358076319915, "grad_norm": 0.8381032347679138, "kl": 0.7578125, "learning_rate": 8.300458687680736e-07, "loss": 0.0303, "reward": 1.2243303954601288, "reward_std": 0.13996527902781963, "rewards/accuracy_reward": 0.2343750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1197 }, { "completion_length": 529.100471496582, "epoch": 0.3578522888507206, "grad_norm": 1.183221697807312, "kl": 0.3814697265625, "learning_rate": 8.296783193036399e-07, "loss": 0.0153, "reward": 1.2873884737491608, "reward_std": 0.13966247905045748, "rewards/accuracy_reward": 0.2901785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9972098618745804, "step": 1198 }, { "completion_length": 546.8393173217773, "epoch": 0.3581509969382421, "grad_norm": 0.41067343950271606, "kl": 0.25146484375, "learning_rate": 8.293104655766066e-07, "loss": 0.0101, "reward": 1.1897321939468384, "reward_std": 0.12995117343962193, "rewards/accuracy_reward": 0.1986607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1199 }, { "completion_length": 641.7656326293945, "epoch": 0.35844970502576357, "grad_norm": 0.8249923586845398, "kl": 0.65673828125, "learning_rate": 8.289423079871618e-07, "loss": 0.0263, "reward": 1.1300223767757416, "reward_std": 0.12452179193496704, "rewards/accuracy_reward": 0.14285714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 1200 }, { "completion_length": 604.8236846923828, "epoch": 0.35874841311328504, "grad_norm": 0.41685187816619873, "kl": 0.349853515625, "learning_rate": 8.285738469358253e-07, "loss": 0.014, "reward": 1.2505581080913544, "reward_std": 0.15053939074277878, "rewards/accuracy_reward": 0.2566964402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 1201 }, { "completion_length": 568.8303985595703, "epoch": 0.3590471212008065, "grad_norm": 0.3856782615184784, "kl": 0.43408203125, "learning_rate": 8.282050828234464e-07, "loss": 0.0173, "reward": 1.0920759290456772, "reward_std": 0.10348978638648987, "rewards/accuracy_reward": 0.1026785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 1202 }, { "completion_length": 553.1473541259766, "epoch": 0.359345829288328, "grad_norm": 0.6017321348190308, "kl": 0.31201171875, "learning_rate": 8.278360160512046e-07, "loss": 0.0124, "reward": 1.0993303954601288, "reward_std": 0.16195609234273434, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1203 }, { "completion_length": 626.9977874755859, "epoch": 0.35964453737584945, "grad_norm": 0.6073652505874634, "kl": 0.27734375, "learning_rate": 8.27466647020608e-07, "loss": 0.0111, "reward": 1.05245541036129, "reward_std": 0.13582559116184711, "rewards/accuracy_reward": 0.06250000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1204 }, { "completion_length": 640.4241409301758, "epoch": 0.3599432454633709, "grad_norm": 0.7101991772651672, "kl": 0.5927734375, "learning_rate": 8.270969761334944e-07, "loss": 0.0237, "reward": 1.1093750298023224, "reward_std": 0.17780992574989796, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 1205 }, { "completion_length": 581.4486770629883, "epoch": 0.3602419535508924, "grad_norm": 0.4328235983848572, "kl": 0.2193603515625, "learning_rate": 8.267270037920288e-07, "loss": 0.0088, "reward": 1.1093750596046448, "reward_std": 0.109775316901505, "rewards/accuracy_reward": 0.11607143748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 1206 }, { "completion_length": 608.7210083007812, "epoch": 0.36054066163841386, "grad_norm": 0.27594196796417236, "kl": 0.4676513671875, "learning_rate": 8.263567303987056e-07, "loss": 0.0187, "reward": 1.1037946939468384, "reward_std": 0.1895589791238308, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 1207 }, { "completion_length": 593.7835006713867, "epoch": 0.36083936972593533, "grad_norm": 0.44137877225875854, "kl": 0.4765625, "learning_rate": 8.259861563563453e-07, "loss": 0.0191, "reward": 1.0931920111179352, "reward_std": 0.12001731991767883, "rewards/accuracy_reward": 0.1071428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 1208 }, { "completion_length": 621.216552734375, "epoch": 0.3611380778134568, "grad_norm": 0.6886295080184937, "kl": 0.423095703125, "learning_rate": 8.256152820680967e-07, "loss": 0.0169, "reward": 1.1367187798023224, "reward_std": 0.11855984851717949, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 1209 }, { "completion_length": 609.9598541259766, "epoch": 0.3614367859009783, "grad_norm": 0.3569580316543579, "kl": 0.2908935546875, "learning_rate": 8.252441079374342e-07, "loss": 0.0116, "reward": 1.1657366156578064, "reward_std": 0.1823197938501835, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937798023224, "step": 1210 }, { "completion_length": 622.419677734375, "epoch": 0.36173549398849975, "grad_norm": 0.6852257251739502, "kl": 0.580322265625, "learning_rate": 8.248726343681591e-07, "loss": 0.0232, "reward": 1.1556920111179352, "reward_std": 0.14932671375572681, "rewards/accuracy_reward": 0.1674107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 1211 }, { "completion_length": 555.2009124755859, "epoch": 0.3620342020760212, "grad_norm": 0.5203437209129333, "kl": 0.4700927734375, "learning_rate": 8.245008617643984e-07, "loss": 0.0188, "reward": 1.1897321939468384, "reward_std": 0.16079882625490427, "rewards/accuracy_reward": 0.1964285857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 1212 }, { "completion_length": 592.2678756713867, "epoch": 0.3623329101635427, "grad_norm": 0.3875824511051178, "kl": 0.37274169921875, "learning_rate": 8.241287905306038e-07, "loss": 0.0149, "reward": 1.293526828289032, "reward_std": 0.14586079586297274, "rewards/accuracy_reward": 0.3013392984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875149011612, "step": 1213 }, { "completion_length": 581.1406555175781, "epoch": 0.36263161825106416, "grad_norm": 0.27638113498687744, "kl": 0.440185546875, "learning_rate": 8.237564210715528e-07, "loss": 0.0176, "reward": 1.0998884439468384, "reward_std": 0.12191870529204607, "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812649011612, "step": 1214 }, { "completion_length": 559.9263687133789, "epoch": 0.36293032633858563, "grad_norm": 0.5464193224906921, "kl": 0.52508544921875, "learning_rate": 8.233837537923467e-07, "loss": 0.021, "reward": 1.2656250596046448, "reward_std": 0.20826926920562983, "rewards/accuracy_reward": 0.2834821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428656578064, "step": 1215 }, { "completion_length": 650.3147506713867, "epoch": 0.3632290344261071, "grad_norm": 0.4507879316806793, "kl": 0.360595703125, "learning_rate": 8.230107890984109e-07, "loss": 0.0144, "reward": 1.1289063096046448, "reward_std": 0.1320003615692258, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 1216 }, { "completion_length": 632.9486999511719, "epoch": 0.36352774251362857, "grad_norm": 0.5855457782745361, "kl": 0.34228515625, "learning_rate": 8.226375273954945e-07, "loss": 0.0137, "reward": 1.0998884439468384, "reward_std": 0.17220113053917885, "rewards/accuracy_reward": 0.11383929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 1217 }, { "completion_length": 557.4888610839844, "epoch": 0.36382645060115004, "grad_norm": 0.803177535533905, "kl": 0.27703857421875, "learning_rate": 8.222639690896698e-07, "loss": 0.0111, "reward": 1.2204241454601288, "reward_std": 0.18075227364897728, "rewards/accuracy_reward": 0.2299107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 1218 }, { "completion_length": 632.8683166503906, "epoch": 0.3641251586886715, "grad_norm": 0.8300203680992126, "kl": 0.40478515625, "learning_rate": 8.218901145873312e-07, "loss": 0.0162, "reward": 1.1121652275323868, "reward_std": 0.13379839807748795, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871651977300644, "step": 1219 }, { "completion_length": 605.5379638671875, "epoch": 0.364423866776193, "grad_norm": 0.6606036424636841, "kl": 0.407470703125, "learning_rate": 8.215159642951962e-07, "loss": 0.0163, "reward": 1.2042411267757416, "reward_std": 0.22454018518328667, "rewards/accuracy_reward": 0.2165178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1220 }, { "completion_length": 630.4799423217773, "epoch": 0.36472257486371445, "grad_norm": 0.5308197736740112, "kl": 0.308837890625, "learning_rate": 8.211415186203033e-07, "loss": 0.0124, "reward": 1.1635045409202576, "reward_std": 0.1978573575615883, "rewards/accuracy_reward": 0.1741071566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 1221 }, { "completion_length": 618.4397506713867, "epoch": 0.3650212829512359, "grad_norm": 0.2563576400279999, "kl": 0.16986083984375, "learning_rate": 8.207667779700131e-07, "loss": 0.0068, "reward": 1.1573661267757416, "reward_std": 0.18257353454828262, "rewards/accuracy_reward": 0.1629464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9944196790456772, "step": 1222 }, { "completion_length": 585.3326187133789, "epoch": 0.3653199910387574, "grad_norm": 0.9815369248390198, "kl": 0.329833984375, "learning_rate": 8.203917427520064e-07, "loss": 0.0132, "reward": 1.1071429252624512, "reward_std": 0.11692679719999433, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1223 }, { "completion_length": 630.4977874755859, "epoch": 0.36561869912627887, "grad_norm": 0.41942650079727173, "kl": 0.51123046875, "learning_rate": 8.200164133742847e-07, "loss": 0.0204, "reward": 1.2120535969734192, "reward_std": 0.18361322209239006, "rewards/accuracy_reward": 0.22321429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1224 }, { "completion_length": 617.5893096923828, "epoch": 0.36591740721380034, "grad_norm": 0.565599799156189, "kl": 0.560302734375, "learning_rate": 8.196407902451699e-07, "loss": 0.0224, "reward": 1.0842634439468384, "reward_std": 0.10868707112967968, "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 1225 }, { "completion_length": 586.2388763427734, "epoch": 0.3662161153013218, "grad_norm": 0.27697888016700745, "kl": 0.28094482421875, "learning_rate": 8.192648737733026e-07, "loss": 0.0112, "reward": 1.148995578289032, "reward_std": 0.1362458188086748, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9972098469734192, "step": 1226 }, { "completion_length": 560.1317291259766, "epoch": 0.3665148233888433, "grad_norm": 0.45983487367630005, "kl": 0.260498046875, "learning_rate": 8.188886643676438e-07, "loss": 0.0104, "reward": 1.2572545409202576, "reward_std": 0.13744275458157063, "rewards/accuracy_reward": 0.263392873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 1227 }, { "completion_length": 575.3928833007812, "epoch": 0.3668135314763647, "grad_norm": 0.25072601437568665, "kl": 0.3187255859375, "learning_rate": 8.185121624374719e-07, "loss": 0.0127, "reward": 1.227120578289032, "reward_std": 0.14186138845980167, "rewards/accuracy_reward": 0.2343750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 1228 }, { "completion_length": 623.6763610839844, "epoch": 0.36711223956388617, "grad_norm": 0.7511720061302185, "kl": 0.6199951171875, "learning_rate": 8.181353683923844e-07, "loss": 0.0248, "reward": 1.2449777126312256, "reward_std": 0.24729911610484123, "rewards/accuracy_reward": 0.26116072945296764, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838169813156128, "step": 1229 }, { "completion_length": 563.4375152587891, "epoch": 0.36741094765140764, "grad_norm": 0.591836154460907, "kl": 0.49359130859375, "learning_rate": 8.177582826422961e-07, "loss": 0.0198, "reward": 1.2282366752624512, "reward_std": 0.1372864842414856, "rewards/accuracy_reward": 0.2366071566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 1230 }, { "completion_length": 571.4888687133789, "epoch": 0.3677096557389291, "grad_norm": 0.9428937435150146, "kl": 0.739013671875, "learning_rate": 8.173809055974394e-07, "loss": 0.0295, "reward": 1.2014509737491608, "reward_std": 0.11295094341039658, "rewards/accuracy_reward": 0.20758929662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 1231 }, { "completion_length": 649.4799346923828, "epoch": 0.3680083638264506, "grad_norm": 0.8856450915336609, "kl": 0.576416015625, "learning_rate": 8.170032376683637e-07, "loss": 0.0231, "reward": 1.0976563096046448, "reward_std": 0.13956713490188122, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 1232 }, { "completion_length": 575.3192138671875, "epoch": 0.36830707191397205, "grad_norm": 0.7136337161064148, "kl": 0.71484375, "learning_rate": 8.166252792659344e-07, "loss": 0.0286, "reward": 1.233258992433548, "reward_std": 0.17260544747114182, "rewards/accuracy_reward": 0.2522321566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 1233 }, { "completion_length": 583.9174346923828, "epoch": 0.3686057800014935, "grad_norm": 0.44775626063346863, "kl": 0.241455078125, "learning_rate": 8.162470308013332e-07, "loss": 0.0097, "reward": 1.2170759439468384, "reward_std": 0.170747397467494, "rewards/accuracy_reward": 0.227678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 1234 }, { "completion_length": 608.5714569091797, "epoch": 0.368904488089015, "grad_norm": 0.4520457088947296, "kl": 0.4580078125, "learning_rate": 8.158684926860579e-07, "loss": 0.0183, "reward": 1.1233259290456772, "reward_std": 0.11932547949254513, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 1235 }, { "completion_length": 636.1808319091797, "epoch": 0.36920319617653646, "grad_norm": 0.28096914291381836, "kl": 0.282470703125, "learning_rate": 8.154896653319202e-07, "loss": 0.0113, "reward": 1.0820313096046448, "reward_std": 0.1000449163839221, "rewards/accuracy_reward": 0.08705357764847577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949776977300644, "step": 1236 }, { "completion_length": 650.9598388671875, "epoch": 0.36950190426405793, "grad_norm": 0.32381007075309753, "kl": 0.2908935546875, "learning_rate": 8.151105491510473e-07, "loss": 0.0117, "reward": 1.1612723767757416, "reward_std": 0.12435029074549675, "rewards/accuracy_reward": 0.1674107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 1237 }, { "completion_length": 608.2388687133789, "epoch": 0.3698006123515794, "grad_norm": 0.3865061402320862, "kl": 0.18798828125, "learning_rate": 8.147311445558807e-07, "loss": 0.0075, "reward": 1.1858259439468384, "reward_std": 0.128113211132586, "rewards/accuracy_reward": 0.1941964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916294813156128, "step": 1238 }, { "completion_length": 549.3482437133789, "epoch": 0.3700993204391009, "grad_norm": 0.7588030099868774, "kl": 0.524658203125, "learning_rate": 8.143514519591754e-07, "loss": 0.021, "reward": 1.1947545111179352, "reward_std": 0.1459543751552701, "rewards/accuracy_reward": 0.2142857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 1239 }, { "completion_length": 613.654052734375, "epoch": 0.37039802852662235, "grad_norm": 0.48634281754493713, "kl": 0.51904296875, "learning_rate": 8.139714717739993e-07, "loss": 0.0208, "reward": 1.170758992433548, "reward_std": 0.12232479639351368, "rewards/accuracy_reward": 0.18080358440056443, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1240 }, { "completion_length": 629.8259124755859, "epoch": 0.3706967366141438, "grad_norm": 0.5753811001777649, "kl": 0.574951171875, "learning_rate": 8.135912044137342e-07, "loss": 0.023, "reward": 1.0457589626312256, "reward_std": 0.06795338913798332, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1241 }, { "completion_length": 653.3527069091797, "epoch": 0.3709954447016653, "grad_norm": 0.4164588749408722, "kl": 0.593994140625, "learning_rate": 8.132106502920733e-07, "loss": 0.0237, "reward": 1.106026828289032, "reward_std": 0.1259214375168085, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 1242 }, { "completion_length": 620.1361846923828, "epoch": 0.37129415278918676, "grad_norm": 0.6009331941604614, "kl": 0.4296875, "learning_rate": 8.128298098230222e-07, "loss": 0.0172, "reward": 1.1422991752624512, "reward_std": 0.14448361471295357, "rewards/accuracy_reward": 0.15178572619333863, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 1243 }, { "completion_length": 620.2857284545898, "epoch": 0.37159286087670823, "grad_norm": 0.7874107360839844, "kl": 0.7607421875, "learning_rate": 8.124486834208981e-07, "loss": 0.0304, "reward": 1.1116072237491608, "reward_std": 0.16489230282604694, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1244 }, { "completion_length": 645.1071624755859, "epoch": 0.3718915689642297, "grad_norm": 0.22442983090877533, "kl": 0.294921875, "learning_rate": 8.120672715003294e-07, "loss": 0.0118, "reward": 1.1791295111179352, "reward_std": 0.12290902249515057, "rewards/accuracy_reward": 0.18750001350417733, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 1245 }, { "completion_length": 640.7455596923828, "epoch": 0.3721902770517512, "grad_norm": 0.4430517256259918, "kl": 0.9404296875, "learning_rate": 8.116855744762544e-07, "loss": 0.0376, "reward": 1.1947545260190964, "reward_std": 0.12087605893611908, "rewards/accuracy_reward": 0.20535715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 1246 }, { "completion_length": 650.9062805175781, "epoch": 0.37248898513927264, "grad_norm": 1.3904207944869995, "kl": 0.5015869140625, "learning_rate": 8.113035927639226e-07, "loss": 0.0201, "reward": 1.117745578289032, "reward_std": 0.20677881687879562, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 1247 }, { "completion_length": 559.0022430419922, "epoch": 0.3727876932267941, "grad_norm": 0.2965732216835022, "kl": 0.539794921875, "learning_rate": 8.109213267788921e-07, "loss": 0.0216, "reward": 1.184151828289032, "reward_std": 0.14900873601436615, "rewards/accuracy_reward": 0.19866072619333863, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911267757416, "step": 1248 }, { "completion_length": 659.2411041259766, "epoch": 0.3730864013143156, "grad_norm": 1.4942799806594849, "kl": 0.697265625, "learning_rate": 8.105387769370312e-07, "loss": 0.0279, "reward": 1.1462053954601288, "reward_std": 0.16937654092907906, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9743303954601288, "step": 1249 }, { "completion_length": 559.9018096923828, "epoch": 0.37338510940183706, "grad_norm": 0.34102192521095276, "kl": 0.5452880859375, "learning_rate": 8.101559436545165e-07, "loss": 0.0218, "reward": 1.1696428954601288, "reward_std": 0.11673358455300331, "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1250 }, { "completion_length": 675.1719055175781, "epoch": 0.3736838174893585, "grad_norm": 0.42847388982772827, "kl": 0.60009765625, "learning_rate": 8.097728273478332e-07, "loss": 0.024, "reward": 1.1099330633878708, "reward_std": 0.1493628080934286, "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1251 }, { "completion_length": 628.1205749511719, "epoch": 0.37398252557688, "grad_norm": 0.4676903784275055, "kl": 0.71923828125, "learning_rate": 8.093894284337742e-07, "loss": 0.0288, "reward": 1.1311384439468384, "reward_std": 0.19415288046002388, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 1252 }, { "completion_length": 628.544677734375, "epoch": 0.37428123366440147, "grad_norm": 1.2375503778457642, "kl": 0.468994140625, "learning_rate": 8.090057473294398e-07, "loss": 0.0188, "reward": 1.1216518580913544, "reward_std": 0.12584522180259228, "rewards/accuracy_reward": 0.13392857927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 1253 }, { "completion_length": 577.6250152587891, "epoch": 0.37457994175192294, "grad_norm": 0.7011925578117371, "kl": 0.2872314453125, "learning_rate": 8.086217844522377e-07, "loss": 0.0115, "reward": 1.1830357611179352, "reward_std": 0.10501632373780012, "rewards/accuracy_reward": 0.18526786426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9977678656578064, "step": 1254 }, { "completion_length": 703.5491333007812, "epoch": 0.3748786498394444, "grad_norm": 1.0041956901550293, "kl": 0.966796875, "learning_rate": 8.082375402198819e-07, "loss": 0.0387, "reward": 1.0574777126312256, "reward_std": 0.1590760350227356, "rewards/accuracy_reward": 0.07812500302679837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 1255 }, { "completion_length": 655.5803680419922, "epoch": 0.3751773579269659, "grad_norm": 0.39797985553741455, "kl": 1.203125, "learning_rate": 8.078530150503923e-07, "loss": 0.0481, "reward": 1.147321492433548, "reward_std": 0.16559918969869614, "rewards/accuracy_reward": 0.17187500838190317, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464775323868, "step": 1256 }, { "completion_length": 646.2946624755859, "epoch": 0.37547606601448735, "grad_norm": 0.4083019196987152, "kl": 0.958984375, "learning_rate": 8.074682093620946e-07, "loss": 0.0384, "reward": 1.0652902275323868, "reward_std": 0.18440337479114532, "rewards/accuracy_reward": 0.09151786053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723618745804, "step": 1257 }, { "completion_length": 635.1718902587891, "epoch": 0.3757747741020088, "grad_norm": 0.6485071778297424, "kl": 0.7236328125, "learning_rate": 8.070831235736197e-07, "loss": 0.029, "reward": 1.1657366454601288, "reward_std": 0.12682905048131943, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 1258 }, { "completion_length": 636.5268249511719, "epoch": 0.3760734821895303, "grad_norm": 0.7079984545707703, "kl": 0.8291015625, "learning_rate": 8.066977581039033e-07, "loss": 0.0332, "reward": 1.1088170409202576, "reward_std": 0.1464228630065918, "rewards/accuracy_reward": 0.12723215017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 1259 }, { "completion_length": 656.5223541259766, "epoch": 0.37637219027705177, "grad_norm": 0.375508576631546, "kl": 0.58154296875, "learning_rate": 8.063121133721849e-07, "loss": 0.0233, "reward": 1.196428656578064, "reward_std": 0.14454157650470734, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 1260 }, { "completion_length": 568.1607513427734, "epoch": 0.37667089836457324, "grad_norm": 0.5743986964225769, "kl": 0.568359375, "learning_rate": 8.059261897980086e-07, "loss": 0.0228, "reward": 1.1579241454601288, "reward_std": 0.17525088600814342, "rewards/accuracy_reward": 0.16964286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 1261 }, { "completion_length": 549.6049346923828, "epoch": 0.3769696064520947, "grad_norm": 0.3159407377243042, "kl": 0.3621826171875, "learning_rate": 8.055399878012214e-07, "loss": 0.0145, "reward": 1.1657366752624512, "reward_std": 0.09602024592459202, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871651977300644, "step": 1262 }, { "completion_length": 647.8192291259766, "epoch": 0.3772683145396162, "grad_norm": 0.4762651324272156, "kl": 0.5234375, "learning_rate": 8.051535078019729e-07, "loss": 0.0209, "reward": 1.2633928954601288, "reward_std": 0.2362644374370575, "rewards/accuracy_reward": 0.27008930034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 1263 }, { "completion_length": 532.1763687133789, "epoch": 0.37756702262713765, "grad_norm": 0.35489600896835327, "kl": 0.444091796875, "learning_rate": 8.047667502207157e-07, "loss": 0.0178, "reward": 1.2271206080913544, "reward_std": 0.2162898238748312, "rewards/accuracy_reward": 0.2343750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 1264 }, { "completion_length": 760.6339721679688, "epoch": 0.3778657307146591, "grad_norm": 0.38307440280914307, "kl": 0.77197265625, "learning_rate": 8.043797154782041e-07, "loss": 0.0309, "reward": 1.0697545111179352, "reward_std": 0.12820773292332888, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9648437798023224, "step": 1265 }, { "completion_length": 649.0022583007812, "epoch": 0.3781644388021806, "grad_norm": 0.3074822723865509, "kl": 0.4794921875, "learning_rate": 8.039924039954939e-07, "loss": 0.0192, "reward": 1.2494420409202576, "reward_std": 0.20991818979382515, "rewards/accuracy_reward": 0.2633928693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98604916036129, "step": 1266 }, { "completion_length": 637.2165374755859, "epoch": 0.37846314688970206, "grad_norm": 0.7780102491378784, "kl": 0.8037109375, "learning_rate": 8.036048161939422e-07, "loss": 0.0322, "reward": 1.1356027126312256, "reward_std": 0.13627473823726177, "rewards/accuracy_reward": 0.15401786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 1267 }, { "completion_length": 538.7567138671875, "epoch": 0.37876185497722353, "grad_norm": 0.4940779209136963, "kl": 0.430908203125, "learning_rate": 8.032169524952062e-07, "loss": 0.0172, "reward": 1.2315848767757416, "reward_std": 0.14555366523563862, "rewards/accuracy_reward": 0.23660715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949777126312256, "step": 1268 }, { "completion_length": 654.5536041259766, "epoch": 0.379060563064745, "grad_norm": 0.4111938178539276, "kl": 1.0947265625, "learning_rate": 8.028288133212441e-07, "loss": 0.0437, "reward": 1.1389509439468384, "reward_std": 0.19634631648659706, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 1269 }, { "completion_length": 691.1451263427734, "epoch": 0.3793592711522665, "grad_norm": 0.5466085076332092, "kl": 0.62109375, "learning_rate": 8.024403990943128e-07, "loss": 0.0249, "reward": 1.237165242433548, "reward_std": 0.20879048481583595, "rewards/accuracy_reward": 0.25669644214212894, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 1270 }, { "completion_length": 566.3393173217773, "epoch": 0.3796579792397879, "grad_norm": 0.2905943691730499, "kl": 0.3760986328125, "learning_rate": 8.020517102369692e-07, "loss": 0.015, "reward": 1.1674107611179352, "reward_std": 0.13467265851795673, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1271 }, { "completion_length": 630.9062805175781, "epoch": 0.37995668732730936, "grad_norm": 0.3093145787715912, "kl": 0.38330078125, "learning_rate": 8.016627471720684e-07, "loss": 0.0153, "reward": 1.178571492433548, "reward_std": 0.19607875123620033, "rewards/accuracy_reward": 0.18750000838190317, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 1272 }, { "completion_length": 604.3951110839844, "epoch": 0.38025539541483083, "grad_norm": 0.358951598405838, "kl": 0.329345703125, "learning_rate": 8.012735103227644e-07, "loss": 0.0132, "reward": 1.1953125298023224, "reward_std": 0.11402589920908213, "rewards/accuracy_reward": 0.2008928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9944196790456772, "step": 1273 }, { "completion_length": 647.319221496582, "epoch": 0.3805541035023523, "grad_norm": 0.8449181318283081, "kl": 0.780517578125, "learning_rate": 8.008840001125088e-07, "loss": 0.0312, "reward": 1.1975447237491608, "reward_std": 0.23275436833500862, "rewards/accuracy_reward": 0.2209821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625298023224, "step": 1274 }, { "completion_length": 611.4888534545898, "epoch": 0.3808528115898738, "grad_norm": 0.32872986793518066, "kl": 0.38232421875, "learning_rate": 8.004942169650501e-07, "loss": 0.0153, "reward": 1.1545759439468384, "reward_std": 0.12451130524277687, "rewards/accuracy_reward": 0.16517857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 1275 }, { "completion_length": 614.1830596923828, "epoch": 0.38115151967739525, "grad_norm": 0.7250230312347412, "kl": 0.564453125, "learning_rate": 8.001041613044346e-07, "loss": 0.0226, "reward": 1.1143973469734192, "reward_std": 0.17086196690797806, "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 1276 }, { "completion_length": 615.7969055175781, "epoch": 0.3814502277649167, "grad_norm": 0.4436688721179962, "kl": 0.444091796875, "learning_rate": 7.997138335550043e-07, "loss": 0.0177, "reward": 1.148995578289032, "reward_std": 0.07063603587448597, "rewards/accuracy_reward": 0.1674107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 1277 }, { "completion_length": 582.7545013427734, "epoch": 0.3817489358524382, "grad_norm": 0.2710120677947998, "kl": 0.3363037109375, "learning_rate": 7.993232341413977e-07, "loss": 0.0134, "reward": 1.2455357611179352, "reward_std": 0.14616254717111588, "rewards/accuracy_reward": 0.25446429941803217, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1278 }, { "completion_length": 603.8303985595703, "epoch": 0.38204764393995966, "grad_norm": 0.32490259408950806, "kl": 0.21331787109375, "learning_rate": 7.989323634885488e-07, "loss": 0.0085, "reward": 1.2064732611179352, "reward_std": 0.13732203282415867, "rewards/accuracy_reward": 0.2120535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9944196492433548, "step": 1279 }, { "completion_length": 521.0647583007812, "epoch": 0.38234635202748113, "grad_norm": 0.9075049161911011, "kl": 0.510498046875, "learning_rate": 7.985412220216861e-07, "loss": 0.0205, "reward": 1.1623884290456772, "reward_std": 0.17436019890010357, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 1280 }, { "completion_length": 668.7277069091797, "epoch": 0.3826450601150026, "grad_norm": 0.8088012337684631, "kl": 0.41943359375, "learning_rate": 7.981498101663337e-07, "loss": 0.0168, "reward": 1.1300223767757416, "reward_std": 0.1721312254667282, "rewards/accuracy_reward": 0.14508928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 1281 }, { "completion_length": 622.6071624755859, "epoch": 0.38294376820252407, "grad_norm": 0.9257245063781738, "kl": 0.3369140625, "learning_rate": 7.977581283483091e-07, "loss": 0.0135, "reward": 1.1824777126312256, "reward_std": 0.10295762121677399, "rewards/accuracy_reward": 0.1919642984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 1282 }, { "completion_length": 674.6964569091797, "epoch": 0.38324247629004554, "grad_norm": 0.6556270718574524, "kl": 0.7327880859375, "learning_rate": 7.973661769937239e-07, "loss": 0.0293, "reward": 1.1367188096046448, "reward_std": 0.1539031770080328, "rewards/accuracy_reward": 0.16071429336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045111179352, "step": 1283 }, { "completion_length": 674.7411117553711, "epoch": 0.383541184377567, "grad_norm": 0.19718168675899506, "kl": 0.31573486328125, "learning_rate": 7.969739565289826e-07, "loss": 0.0126, "reward": 1.1573660969734192, "reward_std": 0.10971677117049694, "rewards/accuracy_reward": 0.1718750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 1284 }, { "completion_length": 620.8549194335938, "epoch": 0.3838398924650885, "grad_norm": 0.6689097285270691, "kl": 0.5577392578125, "learning_rate": 7.965814673807825e-07, "loss": 0.0223, "reward": 1.1517857611179352, "reward_std": 0.1590730957686901, "rewards/accuracy_reward": 0.16071428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1285 }, { "completion_length": 566.8013610839844, "epoch": 0.38413860055260995, "grad_norm": 0.26985153555870056, "kl": 0.29296875, "learning_rate": 7.961887099761136e-07, "loss": 0.0117, "reward": 1.1004465073347092, "reward_std": 0.11065321788191795, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 1286 }, { "completion_length": 597.4844055175781, "epoch": 0.3844373086401314, "grad_norm": 0.6557042002677917, "kl": 0.439697265625, "learning_rate": 7.957956847422572e-07, "loss": 0.0176, "reward": 1.1183036416769028, "reward_std": 0.14363372698426247, "rewards/accuracy_reward": 0.13169643888249993, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 1287 }, { "completion_length": 625.8281402587891, "epoch": 0.3847360167276529, "grad_norm": 0.5386815071105957, "kl": 0.63134765625, "learning_rate": 7.954023921067865e-07, "loss": 0.0253, "reward": 1.3147321939468384, "reward_std": 0.17061302065849304, "rewards/accuracy_reward": 0.3303571566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 1288 }, { "completion_length": 604.8236846923828, "epoch": 0.38503472481517437, "grad_norm": 0.41745424270629883, "kl": 0.395751953125, "learning_rate": 7.95008832497565e-07, "loss": 0.0159, "reward": 1.1785714626312256, "reward_std": 0.19870617240667343, "rewards/accuracy_reward": 0.1897321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1289 }, { "completion_length": 601.7053756713867, "epoch": 0.38533343290269584, "grad_norm": 0.4460349977016449, "kl": 0.4603271484375, "learning_rate": 7.946150063427473e-07, "loss": 0.0184, "reward": 1.1015625596046448, "reward_std": 0.09337223321199417, "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 1290 }, { "completion_length": 625.2120819091797, "epoch": 0.3856321409902173, "grad_norm": 0.47511449456214905, "kl": 0.679931640625, "learning_rate": 7.942209140707777e-07, "loss": 0.0271, "reward": 1.0948661416769028, "reward_std": 0.1659368760883808, "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1291 }, { "completion_length": 603.7857360839844, "epoch": 0.3859308490777388, "grad_norm": 1.2771066427230835, "kl": 1.0478515625, "learning_rate": 7.938265561103897e-07, "loss": 0.042, "reward": 1.2126116752624512, "reward_std": 0.1759602390229702, "rewards/accuracy_reward": 0.2388392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723767757416, "step": 1292 }, { "completion_length": 588.0647506713867, "epoch": 0.38622955716526025, "grad_norm": 0.3782700002193451, "kl": 0.50732421875, "learning_rate": 7.934319328906061e-07, "loss": 0.0203, "reward": 1.0680803954601288, "reward_std": 0.1388622634112835, "rewards/accuracy_reward": 0.08258929033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 1293 }, { "completion_length": 627.2701263427734, "epoch": 0.3865282652527817, "grad_norm": 0.41159695386886597, "kl": 0.7197265625, "learning_rate": 7.930370448407386e-07, "loss": 0.0288, "reward": 1.1651785969734192, "reward_std": 0.21295686811208725, "rewards/accuracy_reward": 0.1830357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 1294 }, { "completion_length": 632.122802734375, "epoch": 0.3868269733403032, "grad_norm": 0.7364798188209534, "kl": 0.6181640625, "learning_rate": 7.926418923903863e-07, "loss": 0.0247, "reward": 1.2483259290456772, "reward_std": 0.15529091283679008, "rewards/accuracy_reward": 0.25892857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 1295 }, { "completion_length": 636.709846496582, "epoch": 0.38712568142782466, "grad_norm": 0.6990874409675598, "kl": 0.78662109375, "learning_rate": 7.922464759694369e-07, "loss": 0.0314, "reward": 1.1975446939468384, "reward_std": 0.185587290674448, "rewards/accuracy_reward": 0.2120535857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 1296 }, { "completion_length": 587.3906555175781, "epoch": 0.38742438951534613, "grad_norm": 0.6041667461395264, "kl": 0.78564453125, "learning_rate": 7.918507960080641e-07, "loss": 0.0315, "reward": 1.1674107909202576, "reward_std": 0.19773035869002342, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1297 }, { "completion_length": 632.4710159301758, "epoch": 0.3877230976028676, "grad_norm": 0.9526256918907166, "kl": 0.442138671875, "learning_rate": 7.914548529367291e-07, "loss": 0.0177, "reward": 1.1746652126312256, "reward_std": 0.2130836360156536, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 1298 }, { "completion_length": 628.6763763427734, "epoch": 0.3880218056903891, "grad_norm": 0.8584056496620178, "kl": 0.35107421875, "learning_rate": 7.910586471861793e-07, "loss": 0.0141, "reward": 1.152901828289032, "reward_std": 0.1569897048175335, "rewards/accuracy_reward": 0.16071429220028222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 1299 }, { "completion_length": 623.4955596923828, "epoch": 0.38832051377791055, "grad_norm": 0.5083205103874207, "kl": 0.547119140625, "learning_rate": 7.906621791874477e-07, "loss": 0.0219, "reward": 1.1668527126312256, "reward_std": 0.1807012725621462, "rewards/accuracy_reward": 0.18080358067527413, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 1300 }, { "completion_length": 605.2076034545898, "epoch": 0.388619221865432, "grad_norm": 1.006015658378601, "kl": 0.385009765625, "learning_rate": 7.902654493718525e-07, "loss": 0.0154, "reward": 1.2455357909202576, "reward_std": 0.20167416334152222, "rewards/accuracy_reward": 0.2566964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 1301 }, { "completion_length": 610.1071624755859, "epoch": 0.3889179299529535, "grad_norm": 0.7098153829574585, "kl": 1.0301513671875, "learning_rate": 7.898684581709969e-07, "loss": 0.0412, "reward": 1.1177456080913544, "reward_std": 0.1696338076144457, "rewards/accuracy_reward": 0.13616072479635477, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 1302 }, { "completion_length": 573.779052734375, "epoch": 0.38921663804047496, "grad_norm": 1.2501518726348877, "kl": 0.318115234375, "learning_rate": 7.894712060167686e-07, "loss": 0.0127, "reward": 1.0892857611179352, "reward_std": 0.18014884926378727, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 1303 }, { "completion_length": 586.0982360839844, "epoch": 0.38951534612799643, "grad_norm": 0.41886618733406067, "kl": 0.357666015625, "learning_rate": 7.890736933413388e-07, "loss": 0.0143, "reward": 1.090401828289032, "reward_std": 0.09899749979376793, "rewards/accuracy_reward": 0.09598214598372579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 1304 }, { "completion_length": 609.1674270629883, "epoch": 0.3898140542155179, "grad_norm": 0.6576173901557922, "kl": 0.42626953125, "learning_rate": 7.886759205771624e-07, "loss": 0.0171, "reward": 1.1088170260190964, "reward_std": 0.14779921434819698, "rewards/accuracy_reward": 0.1272321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 1305 }, { "completion_length": 642.5915374755859, "epoch": 0.3901127623030394, "grad_norm": 0.5875458121299744, "kl": 0.5078125, "learning_rate": 7.882778881569769e-07, "loss": 0.0203, "reward": 1.1830357611179352, "reward_std": 0.17492645420134068, "rewards/accuracy_reward": 0.2008928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 1306 }, { "completion_length": 600.5268249511719, "epoch": 0.39041147039056084, "grad_norm": 0.6734119057655334, "kl": 0.64190673828125, "learning_rate": 7.878795965138032e-07, "loss": 0.0257, "reward": 1.1875000298023224, "reward_std": 0.1323699075728655, "rewards/accuracy_reward": 0.2075892984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 1307 }, { "completion_length": 661.6719207763672, "epoch": 0.3907101784780823, "grad_norm": 0.8493114113807678, "kl": 0.470703125, "learning_rate": 7.874810460809429e-07, "loss": 0.0188, "reward": 1.1255581080913544, "reward_std": 0.15465542301535606, "rewards/accuracy_reward": 0.13839286752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1308 }, { "completion_length": 593.1138610839844, "epoch": 0.3910088865656038, "grad_norm": 0.3490576148033142, "kl": 0.24169921875, "learning_rate": 7.870822372919802e-07, "loss": 0.0097, "reward": 1.162946492433548, "reward_std": 0.15295577980577946, "rewards/accuracy_reward": 0.17633929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1309 }, { "completion_length": 674.1986846923828, "epoch": 0.39130759465312526, "grad_norm": 0.8326044678688049, "kl": 0.940185546875, "learning_rate": 7.866831705807801e-07, "loss": 0.0376, "reward": 1.106026828289032, "reward_std": 0.14542033709585667, "rewards/accuracy_reward": 0.12946428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625447034836, "step": 1310 }, { "completion_length": 549.7678833007812, "epoch": 0.39160630274064673, "grad_norm": 0.9121289253234863, "kl": 0.5087890625, "learning_rate": 7.862838463814876e-07, "loss": 0.0203, "reward": 1.1010045111179352, "reward_std": 0.13927717506885529, "rewards/accuracy_reward": 0.11383928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1311 }, { "completion_length": 602.2388763427734, "epoch": 0.3919050108281682, "grad_norm": 0.6211568713188171, "kl": 0.84375, "learning_rate": 7.858842651285286e-07, "loss": 0.0337, "reward": 1.147321492433548, "reward_std": 0.14174135774374008, "rewards/accuracy_reward": 0.16294643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 1312 }, { "completion_length": 640.4375305175781, "epoch": 0.39220371891568967, "grad_norm": 0.7491273283958435, "kl": 0.72216796875, "learning_rate": 7.854844272566082e-07, "loss": 0.0289, "reward": 1.273995578289032, "reward_std": 0.13640299625694752, "rewards/accuracy_reward": 0.2924107275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 1313 }, { "completion_length": 475.07144927978516, "epoch": 0.3925024270032111, "grad_norm": 0.7187085151672363, "kl": 0.5, "learning_rate": 7.850843332007111e-07, "loss": 0.02, "reward": 1.1992187798023224, "reward_std": 0.14372826553881168, "rewards/accuracy_reward": 0.2075892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 1314 }, { "completion_length": 660.5044860839844, "epoch": 0.39280113509073256, "grad_norm": 0.6730219125747681, "kl": 0.52392578125, "learning_rate": 7.846839833961002e-07, "loss": 0.021, "reward": 1.2321428954601288, "reward_std": 0.15208894573152065, "rewards/accuracy_reward": 0.2410714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1315 }, { "completion_length": 647.9955749511719, "epoch": 0.393099843178254, "grad_norm": 1.1370707750320435, "kl": 0.9736328125, "learning_rate": 7.842833782783167e-07, "loss": 0.039, "reward": 1.241629496216774, "reward_std": 0.1605849675834179, "rewards/accuracy_reward": 0.2611607313156128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 1316 }, { "completion_length": 671.1808319091797, "epoch": 0.3933985512657755, "grad_norm": 1.3468713760375977, "kl": 1.005859375, "learning_rate": 7.8388251828318e-07, "loss": 0.0403, "reward": 1.059709906578064, "reward_std": 0.1534763164818287, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 1317 }, { "completion_length": 648.9375305175781, "epoch": 0.39369725935329697, "grad_norm": 1.3116270303726196, "kl": 0.7802734375, "learning_rate": 7.834814038467864e-07, "loss": 0.0312, "reward": 1.1679688096046448, "reward_std": 0.17867061868309975, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1318 }, { "completion_length": 646.7745819091797, "epoch": 0.39399596744081844, "grad_norm": 0.884302020072937, "kl": 0.810546875, "learning_rate": 7.830800354055088e-07, "loss": 0.0325, "reward": 1.1199777275323868, "reward_std": 0.08146596141159534, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 1319 }, { "completion_length": 551.5245819091797, "epoch": 0.3942946755283399, "grad_norm": 0.28516024351119995, "kl": 0.2509765625, "learning_rate": 7.826784133959972e-07, "loss": 0.01, "reward": 1.1768973767757416, "reward_std": 0.1289837323129177, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937798023224, "step": 1320 }, { "completion_length": 627.2611846923828, "epoch": 0.3945933836158614, "grad_norm": 1.2436332702636719, "kl": 0.39892578125, "learning_rate": 7.822765382551768e-07, "loss": 0.0159, "reward": 1.1322545111179352, "reward_std": 0.17353492230176926, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1321 }, { "completion_length": 613.9888763427734, "epoch": 0.39489209170338285, "grad_norm": 0.4290448725223541, "kl": 0.546875, "learning_rate": 7.818744104202483e-07, "loss": 0.0218, "reward": 1.109933078289032, "reward_std": 0.17974120192229748, "rewards/accuracy_reward": 0.12946429662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 1322 }, { "completion_length": 584.9196624755859, "epoch": 0.3951907997909043, "grad_norm": 0.2804012894630432, "kl": 0.2933349609375, "learning_rate": 7.814720303286871e-07, "loss": 0.0117, "reward": 1.1132813096046448, "reward_std": 0.11455171927809715, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 1323 }, { "completion_length": 530.0781555175781, "epoch": 0.3954895078784258, "grad_norm": 0.3662077486515045, "kl": 0.344970703125, "learning_rate": 7.810693984182439e-07, "loss": 0.0138, "reward": 1.184151828289032, "reward_std": 0.12688929215073586, "rewards/accuracy_reward": 0.194196441443637, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1324 }, { "completion_length": 627.4821624755859, "epoch": 0.39578821596594727, "grad_norm": 0.8877968192100525, "kl": 0.348388671875, "learning_rate": 7.806665151269424e-07, "loss": 0.014, "reward": 1.217633992433548, "reward_std": 0.14805839024484158, "rewards/accuracy_reward": 0.2254464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 1325 }, { "completion_length": 590.7611999511719, "epoch": 0.39608692405346874, "grad_norm": 0.6005164980888367, "kl": 0.34912109375, "learning_rate": 7.802633808930802e-07, "loss": 0.014, "reward": 1.191964328289032, "reward_std": 0.14460213109850883, "rewards/accuracy_reward": 0.2053571566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 1326 }, { "completion_length": 634.685302734375, "epoch": 0.3963856321409902, "grad_norm": 0.4693801701068878, "kl": 0.32421875, "learning_rate": 7.798599961552277e-07, "loss": 0.013, "reward": 1.166852742433548, "reward_std": 0.1772840293124318, "rewards/accuracy_reward": 0.17857143585570157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 1327 }, { "completion_length": 632.7254791259766, "epoch": 0.3966843402285117, "grad_norm": 0.5603277087211609, "kl": 0.48583984375, "learning_rate": 7.794563613522284e-07, "loss": 0.0194, "reward": 1.2025670111179352, "reward_std": 0.12904871813952923, "rewards/accuracy_reward": 0.22321430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 1328 }, { "completion_length": 619.513427734375, "epoch": 0.39698304831603315, "grad_norm": 0.4866662323474884, "kl": 0.5244140625, "learning_rate": 7.790524769231968e-07, "loss": 0.021, "reward": 1.2896206080913544, "reward_std": 0.21686404198408127, "rewards/accuracy_reward": 0.3013393022119999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 1329 }, { "completion_length": 630.0960083007812, "epoch": 0.3972817564035546, "grad_norm": 0.42232632637023926, "kl": 0.4227294921875, "learning_rate": 7.786483433075199e-07, "loss": 0.0169, "reward": 1.1093750298023224, "reward_std": 0.19707242026925087, "rewards/accuracy_reward": 0.1272321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 1330 }, { "completion_length": 627.7924346923828, "epoch": 0.3975804644910761, "grad_norm": 0.569861114025116, "kl": 0.66259765625, "learning_rate": 7.782439609448555e-07, "loss": 0.0265, "reward": 1.2299107611179352, "reward_std": 0.1263651866465807, "rewards/accuracy_reward": 0.2455357238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 1331 }, { "completion_length": 564.4643173217773, "epoch": 0.39787917257859756, "grad_norm": 0.5815261602401733, "kl": 0.77783203125, "learning_rate": 7.778393302751318e-07, "loss": 0.0311, "reward": 1.166852742433548, "reward_std": 0.16743363440036774, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 1332 }, { "completion_length": 670.3460083007812, "epoch": 0.39817788066611903, "grad_norm": 0.5100416541099548, "kl": 0.907958984375, "learning_rate": 7.774344517385476e-07, "loss": 0.0363, "reward": 1.1093750298023224, "reward_std": 0.22792423889040947, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9732142984867096, "step": 1333 }, { "completion_length": 630.5067291259766, "epoch": 0.3984765887536405, "grad_norm": 0.3775476813316345, "kl": 0.646484375, "learning_rate": 7.770293257755707e-07, "loss": 0.0259, "reward": 1.2265625596046448, "reward_std": 0.1487268265336752, "rewards/accuracy_reward": 0.24107143841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 1334 }, { "completion_length": 607.4620819091797, "epoch": 0.398775296841162, "grad_norm": 0.8896520137786865, "kl": 0.454345703125, "learning_rate": 7.766239528269387e-07, "loss": 0.0182, "reward": 1.2131696939468384, "reward_std": 0.16906048730015755, "rewards/accuracy_reward": 0.2232142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1335 }, { "completion_length": 681.0982513427734, "epoch": 0.39907400492868345, "grad_norm": 1.25383722782135, "kl": 0.773681640625, "learning_rate": 7.762183333336576e-07, "loss": 0.031, "reward": 1.2555803954601288, "reward_std": 0.18379441648721695, "rewards/accuracy_reward": 0.2745535895228386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 1336 }, { "completion_length": 663.2366485595703, "epoch": 0.3993727130162049, "grad_norm": 0.5522739887237549, "kl": 0.927734375, "learning_rate": 7.758124677370014e-07, "loss": 0.0371, "reward": 1.2070313096046448, "reward_std": 0.17462411895394325, "rewards/accuracy_reward": 0.2254464440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 1337 }, { "completion_length": 521.9241409301758, "epoch": 0.3996714211037264, "grad_norm": 0.31343916058540344, "kl": 0.3973388671875, "learning_rate": 7.754063564785125e-07, "loss": 0.0159, "reward": 1.2338170111179352, "reward_std": 0.18125825189054012, "rewards/accuracy_reward": 0.24553572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 1338 }, { "completion_length": 623.7076263427734, "epoch": 0.39997012919124786, "grad_norm": 0.6596025824546814, "kl": 0.5625, "learning_rate": 7.75e-07, "loss": 0.0225, "reward": 1.2873884439468384, "reward_std": 0.19536777772009373, "rewards/accuracy_reward": 0.299107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812649011612, "step": 1339 }, { "completion_length": 640.4286041259766, "epoch": 0.40026883727876933, "grad_norm": 0.767911970615387, "kl": 0.6630859375, "learning_rate": 7.745933987435398e-07, "loss": 0.0265, "reward": 1.174665242433548, "reward_std": 0.16060064174234867, "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 1340 }, { "completion_length": 680.2344055175781, "epoch": 0.4005675453662908, "grad_norm": 0.5409372448921204, "kl": 0.82421875, "learning_rate": 7.741865531514743e-07, "loss": 0.0329, "reward": 1.1796875298023224, "reward_std": 0.24622860550880432, "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9698661118745804, "step": 1341 }, { "completion_length": 616.7812652587891, "epoch": 0.40086625345381227, "grad_norm": 0.2625238299369812, "kl": 0.5478515625, "learning_rate": 7.737794636664116e-07, "loss": 0.0219, "reward": 1.1289062798023224, "reward_std": 0.15170356445014477, "rewards/accuracy_reward": 0.14285715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 1342 }, { "completion_length": 611.9777069091797, "epoch": 0.40116496154133374, "grad_norm": 0.6017227172851562, "kl": 0.53515625, "learning_rate": 7.733721307312251e-07, "loss": 0.0214, "reward": 1.075334906578064, "reward_std": 0.16120092663913965, "rewards/accuracy_reward": 0.08705357392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 1343 }, { "completion_length": 754.2500305175781, "epoch": 0.4014636696288552, "grad_norm": 0.8467931747436523, "kl": 1.172607421875, "learning_rate": 7.729645547890533e-07, "loss": 0.0469, "reward": 1.0608259439468384, "reward_std": 0.1634054258465767, "rewards/accuracy_reward": 0.0959821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9648437947034836, "step": 1344 }, { "completion_length": 673.4308471679688, "epoch": 0.4017623777163767, "grad_norm": 0.33862999081611633, "kl": 0.572998046875, "learning_rate": 7.725567362832986e-07, "loss": 0.0229, "reward": 1.1713170260190964, "reward_std": 0.20247994177043438, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 1345 }, { "completion_length": 569.7545013427734, "epoch": 0.40206108580389815, "grad_norm": 0.8622769117355347, "kl": 0.3040771484375, "learning_rate": 7.721486756576279e-07, "loss": 0.0122, "reward": 1.1032366752624512, "reward_std": 0.13735291920602322, "rewards/accuracy_reward": 0.11607143515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1346 }, { "completion_length": 735.7366333007812, "epoch": 0.4023597938914196, "grad_norm": 0.8811339735984802, "kl": 0.59716796875, "learning_rate": 7.71740373355971e-07, "loss": 0.0239, "reward": 1.0608259439468384, "reward_std": 0.1531983967870474, "rewards/accuracy_reward": 0.08035714784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 1347 }, { "completion_length": 695.2031707763672, "epoch": 0.4026585019789411, "grad_norm": 1.3595460653305054, "kl": 0.8818359375, "learning_rate": 7.713318298225206e-07, "loss": 0.0352, "reward": 1.089843824505806, "reward_std": 0.13518392853438854, "rewards/accuracy_reward": 0.11383929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.976004496216774, "step": 1348 }, { "completion_length": 626.0937805175781, "epoch": 0.40295721006646257, "grad_norm": 0.859271228313446, "kl": 0.5172119140625, "learning_rate": 7.709230455017323e-07, "loss": 0.0207, "reward": 1.2890625596046448, "reward_std": 0.24343406409025192, "rewards/accuracy_reward": 0.3035714477300644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.985491082072258, "step": 1349 }, { "completion_length": 595.0692138671875, "epoch": 0.40325591815398404, "grad_norm": 0.8536102175712585, "kl": 0.330078125, "learning_rate": 7.705140208383234e-07, "loss": 0.0132, "reward": 1.1774554252624512, "reward_std": 0.08378885313868523, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875447034836, "step": 1350 }, { "completion_length": 658.3214569091797, "epoch": 0.4035546262415055, "grad_norm": 1.3726831674575806, "kl": 0.635009765625, "learning_rate": 7.701047562772725e-07, "loss": 0.0254, "reward": 1.168526828289032, "reward_std": 0.16750404238700867, "rewards/accuracy_reward": 0.17633929941803217, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 1351 }, { "completion_length": 642.0134124755859, "epoch": 0.403853334329027, "grad_norm": 0.7350481748580933, "kl": 0.493896484375, "learning_rate": 7.696952522638192e-07, "loss": 0.0198, "reward": 1.0507813096046448, "reward_std": 0.12782700918614864, "rewards/accuracy_reward": 0.06473214644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 1352 }, { "completion_length": 603.4241333007812, "epoch": 0.40415204241654845, "grad_norm": 0.2900804579257965, "kl": 0.53070068359375, "learning_rate": 7.692855092434639e-07, "loss": 0.0212, "reward": 1.1406250596046448, "reward_std": 0.18553192541003227, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 1353 }, { "completion_length": 650.6406555175781, "epoch": 0.4044507505040699, "grad_norm": 0.7190585732460022, "kl": 0.451171875, "learning_rate": 7.68875527661967e-07, "loss": 0.0181, "reward": 1.2483259737491608, "reward_std": 0.14335498213768005, "rewards/accuracy_reward": 0.26562500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 1354 }, { "completion_length": 537.2120819091797, "epoch": 0.4047494585915914, "grad_norm": 0.5145024061203003, "kl": 0.44464111328125, "learning_rate": 7.684653079653479e-07, "loss": 0.0178, "reward": 1.1250000298023224, "reward_std": 0.10257072374224663, "rewards/accuracy_reward": 0.13392857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1355 }, { "completion_length": 615.9397583007812, "epoch": 0.40504816667911286, "grad_norm": 0.4496782720088959, "kl": 0.517578125, "learning_rate": 7.680548505998857e-07, "loss": 0.0207, "reward": 1.0892857611179352, "reward_std": 0.14105987641960382, "rewards/accuracy_reward": 0.10044643143191934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1356 }, { "completion_length": 561.1406478881836, "epoch": 0.4053468747666343, "grad_norm": 0.25497081875801086, "kl": 0.370361328125, "learning_rate": 7.676441560121178e-07, "loss": 0.0149, "reward": 1.161272406578064, "reward_std": 0.17870360054075718, "rewards/accuracy_reward": 0.1674107275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 1357 }, { "completion_length": 524.8884124755859, "epoch": 0.40564558285415575, "grad_norm": 0.3096652030944824, "kl": 0.28662109375, "learning_rate": 7.672332246488396e-07, "loss": 0.0114, "reward": 1.203683078289032, "reward_std": 0.14316585287451744, "rewards/accuracy_reward": 0.2053571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9983258992433548, "step": 1358 }, { "completion_length": 600.3951187133789, "epoch": 0.4059442909416772, "grad_norm": 0.6691049933433533, "kl": 0.75146484375, "learning_rate": 7.66822056957104e-07, "loss": 0.0301, "reward": 1.2488839626312256, "reward_std": 0.13587015680968761, "rewards/accuracy_reward": 0.2589285895228386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 1359 }, { "completion_length": 639.1763610839844, "epoch": 0.4062429990291987, "grad_norm": 0.30231350660324097, "kl": 0.4869384765625, "learning_rate": 7.664106533842214e-07, "loss": 0.0195, "reward": 1.1780134439468384, "reward_std": 0.17997842282056808, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 1360 }, { "completion_length": 635.0736846923828, "epoch": 0.40654170711672016, "grad_norm": 1.1398173570632935, "kl": 0.78857421875, "learning_rate": 7.659990143777584e-07, "loss": 0.0316, "reward": 1.2421875298023224, "reward_std": 0.15046405419707298, "rewards/accuracy_reward": 0.25446429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1361 }, { "completion_length": 617.2723541259766, "epoch": 0.40684041520424163, "grad_norm": 0.44310206174850464, "kl": 0.52978515625, "learning_rate": 7.655871403855378e-07, "loss": 0.0212, "reward": 1.1813616305589676, "reward_std": 0.16594385914504528, "rewards/accuracy_reward": 0.19196429336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 1362 }, { "completion_length": 667.1562728881836, "epoch": 0.4071391232917631, "grad_norm": 0.4519258141517639, "kl": 0.71044921875, "learning_rate": 7.651750318556384e-07, "loss": 0.0285, "reward": 1.1718750596046448, "reward_std": 0.19780221953988075, "rewards/accuracy_reward": 0.191964291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107611179352, "step": 1363 }, { "completion_length": 640.3058395385742, "epoch": 0.4074378313792846, "grad_norm": 0.8512429594993591, "kl": 0.85205078125, "learning_rate": 7.647626892363938e-07, "loss": 0.0341, "reward": 1.0931920260190964, "reward_std": 0.1324351653456688, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9771205633878708, "step": 1364 }, { "completion_length": 605.1607360839844, "epoch": 0.40773653946680605, "grad_norm": 0.6412508487701416, "kl": 0.4022216796875, "learning_rate": 7.643501129763923e-07, "loss": 0.0161, "reward": 1.1233259439468384, "reward_std": 0.1272702096030116, "rewards/accuracy_reward": 0.13169643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916295111179352, "step": 1365 }, { "completion_length": 658.3794860839844, "epoch": 0.4080352475543275, "grad_norm": 0.847835123538971, "kl": 0.337158203125, "learning_rate": 7.639373035244763e-07, "loss": 0.0135, "reward": 1.2712054252624512, "reward_std": 0.20274684205651283, "rewards/accuracy_reward": 0.2834821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1366 }, { "completion_length": 618.8147659301758, "epoch": 0.408333955641849, "grad_norm": 0.6701129674911499, "kl": 0.6552734375, "learning_rate": 7.635242613297423e-07, "loss": 0.0262, "reward": 1.2209822237491608, "reward_std": 0.16616434790194035, "rewards/accuracy_reward": 0.2343750037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1367 }, { "completion_length": 671.4799499511719, "epoch": 0.40863266372937046, "grad_norm": 0.31540119647979736, "kl": 0.573486328125, "learning_rate": 7.631109868415397e-07, "loss": 0.023, "reward": 1.1088170111179352, "reward_std": 0.15341992862522602, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838169813156128, "step": 1368 }, { "completion_length": 693.803596496582, "epoch": 0.40893137181689193, "grad_norm": 0.69855135679245, "kl": 0.417724609375, "learning_rate": 7.626974805094704e-07, "loss": 0.0167, "reward": 1.1897321939468384, "reward_std": 0.16981075704097748, "rewards/accuracy_reward": 0.20312500838190317, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1369 }, { "completion_length": 606.1875152587891, "epoch": 0.4092300799044134, "grad_norm": 0.6349068880081177, "kl": 0.37255859375, "learning_rate": 7.62283742783389e-07, "loss": 0.0149, "reward": 1.1992188096046448, "reward_std": 0.22106780111789703, "rewards/accuracy_reward": 0.20982144260779023, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 1370 }, { "completion_length": 716.2411041259766, "epoch": 0.4095287879919349, "grad_norm": 0.490157812833786, "kl": 0.505859375, "learning_rate": 7.618697741134012e-07, "loss": 0.0202, "reward": 1.0652902126312256, "reward_std": 0.13129147794097662, "rewards/accuracy_reward": 0.08705357392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366305589676, "step": 1371 }, { "completion_length": 603.6629791259766, "epoch": 0.40982749607945634, "grad_norm": 0.9113552570343018, "kl": 0.280517578125, "learning_rate": 7.61455574949865e-07, "loss": 0.0112, "reward": 1.1445313096046448, "reward_std": 0.15434524789452553, "rewards/accuracy_reward": 0.1562500111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 1372 }, { "completion_length": 654.2411041259766, "epoch": 0.4101262041669778, "grad_norm": 0.4304272532463074, "kl": 0.55322265625, "learning_rate": 7.610411457433878e-07, "loss": 0.0221, "reward": 1.1261161416769028, "reward_std": 0.12776850163936615, "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 1373 }, { "completion_length": 618.919677734375, "epoch": 0.4104249122544993, "grad_norm": 0.41115549206733704, "kl": 0.4554443359375, "learning_rate": 7.606264869448285e-07, "loss": 0.0182, "reward": 1.1668527126312256, "reward_std": 0.18420500308275223, "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 1374 }, { "completion_length": 673.1071929931641, "epoch": 0.41072362034202076, "grad_norm": 0.6518970131874084, "kl": 0.33837890625, "learning_rate": 7.602115990052952e-07, "loss": 0.0135, "reward": 1.0658482760190964, "reward_std": 0.13611850887537003, "rewards/accuracy_reward": 0.08258929033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 1375 }, { "completion_length": 619.1875228881836, "epoch": 0.4110223284295422, "grad_norm": 0.5061976909637451, "kl": 0.5389404296875, "learning_rate": 7.59796482376145e-07, "loss": 0.0215, "reward": 1.2494420111179352, "reward_std": 0.1539950706064701, "rewards/accuracy_reward": 0.2611607313156128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812649011612, "step": 1376 }, { "completion_length": 584.6317291259766, "epoch": 0.4113210365170637, "grad_norm": 0.2622819244861603, "kl": 0.31787109375, "learning_rate": 7.59381137508984e-07, "loss": 0.0127, "reward": 1.215401828289032, "reward_std": 0.15066053718328476, "rewards/accuracy_reward": 0.22544644214212894, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1377 }, { "completion_length": 614.1183395385742, "epoch": 0.41161974460458517, "grad_norm": 1.1103510856628418, "kl": 0.43701171875, "learning_rate": 7.58965564855667e-07, "loss": 0.0174, "reward": 1.2020089626312256, "reward_std": 0.11052538268268108, "rewards/accuracy_reward": 0.2142857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1378 }, { "completion_length": 656.4888458251953, "epoch": 0.41191845269210664, "grad_norm": 0.554939329624176, "kl": 0.1844482421875, "learning_rate": 7.585497648682965e-07, "loss": 0.0074, "reward": 1.1891741752624512, "reward_std": 0.11766618117690086, "rewards/accuracy_reward": 0.19196429708972573, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9972098469734192, "step": 1379 }, { "completion_length": 633.2768096923828, "epoch": 0.4122171607796281, "grad_norm": 0.20970772206783295, "kl": 0.23455810546875, "learning_rate": 7.581337379992218e-07, "loss": 0.0094, "reward": 1.0770089626312256, "reward_std": 0.06360011314973235, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 1380 }, { "completion_length": 563.3951110839844, "epoch": 0.4125158688671496, "grad_norm": 0.3798684775829315, "kl": 0.452392578125, "learning_rate": 7.577174847010394e-07, "loss": 0.0181, "reward": 1.2260045111179352, "reward_std": 0.15665122587233782, "rewards/accuracy_reward": 0.238839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1381 }, { "completion_length": 713.4241333007812, "epoch": 0.41281457695467105, "grad_norm": 0.7783136963844299, "kl": 0.706787109375, "learning_rate": 7.573010054265921e-07, "loss": 0.0283, "reward": 1.1439732611179352, "reward_std": 0.1449077557772398, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 1382 }, { "completion_length": 716.6138916015625, "epoch": 0.4131132850421925, "grad_norm": 0.8494437336921692, "kl": 0.86083984375, "learning_rate": 7.568843006289687e-07, "loss": 0.0345, "reward": 1.1679687798023224, "reward_std": 0.20959337055683136, "rewards/accuracy_reward": 0.19196429336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045111179352, "step": 1383 }, { "completion_length": 563.5826110839844, "epoch": 0.413411993129714, "grad_norm": 0.8652703762054443, "kl": 0.2525634765625, "learning_rate": 7.564673707615029e-07, "loss": 0.0101, "reward": 1.1685268580913544, "reward_std": 0.13038633950054646, "rewards/accuracy_reward": 0.17633929662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 1384 }, { "completion_length": 607.0781402587891, "epoch": 0.41371070121723547, "grad_norm": 0.6293861865997314, "kl": 0.72900390625, "learning_rate": 7.560502162777739e-07, "loss": 0.0292, "reward": 1.1093750298023224, "reward_std": 0.1599922478199005, "rewards/accuracy_reward": 0.12276786053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1385 }, { "completion_length": 615.6763610839844, "epoch": 0.41400940930475694, "grad_norm": 0.7291809320449829, "kl": 0.4676513671875, "learning_rate": 7.556328376316046e-07, "loss": 0.0187, "reward": 1.176339328289032, "reward_std": 0.1228245198726654, "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 1386 }, { "completion_length": 612.5201110839844, "epoch": 0.4143081173922784, "grad_norm": 0.5574337244033813, "kl": 0.595458984375, "learning_rate": 7.552152352770622e-07, "loss": 0.0238, "reward": 1.0636161118745804, "reward_std": 0.11220115143805742, "rewards/accuracy_reward": 0.07812500488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 1387 }, { "completion_length": 639.2567138671875, "epoch": 0.4146068254797999, "grad_norm": 0.2929748594760895, "kl": 0.50018310546875, "learning_rate": 7.547974096684569e-07, "loss": 0.02, "reward": 1.1411831080913544, "reward_std": 0.1462094932794571, "rewards/accuracy_reward": 0.1540178693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 1388 }, { "completion_length": 601.8170013427734, "epoch": 0.41490553356732135, "grad_norm": 0.3993217945098877, "kl": 0.60400390625, "learning_rate": 7.543793612603423e-07, "loss": 0.0242, "reward": 1.135044664144516, "reward_std": 0.19981817714869976, "rewards/accuracy_reward": 0.1495535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 1389 }, { "completion_length": 606.9129638671875, "epoch": 0.4152042416548428, "grad_norm": 0.4589727222919464, "kl": 0.6318359375, "learning_rate": 7.53961090507514e-07, "loss": 0.0253, "reward": 1.1724331080913544, "reward_std": 0.10057485103607178, "rewards/accuracy_reward": 0.1852678693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1390 }, { "completion_length": 596.1830444335938, "epoch": 0.4155029497423643, "grad_norm": 0.34336572885513306, "kl": 0.54638671875, "learning_rate": 7.535425978650095e-07, "loss": 0.0219, "reward": 1.1183036267757416, "reward_std": 0.09236582228913903, "rewards/accuracy_reward": 0.1250000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 1391 }, { "completion_length": 610.3125228881836, "epoch": 0.41580165782988576, "grad_norm": 0.6828690767288208, "kl": 0.79296875, "learning_rate": 7.531238837881079e-07, "loss": 0.0317, "reward": 1.1389509439468384, "reward_std": 0.2043272852897644, "rewards/accuracy_reward": 0.16517857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723618745804, "step": 1392 }, { "completion_length": 648.3973617553711, "epoch": 0.41610036591740723, "grad_norm": 0.7716690897941589, "kl": 0.5947265625, "learning_rate": 7.52704948732329e-07, "loss": 0.0238, "reward": 1.1573661267757416, "reward_std": 0.11577848345041275, "rewards/accuracy_reward": 0.1741071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 1393 }, { "completion_length": 672.2723541259766, "epoch": 0.4163990740049287, "grad_norm": 0.34380167722702026, "kl": 0.5250244140625, "learning_rate": 7.522857931534331e-07, "loss": 0.0209, "reward": 1.13448666036129, "reward_std": 0.1429257858544588, "rewards/accuracy_reward": 0.149553582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 1394 }, { "completion_length": 654.169677734375, "epoch": 0.4166977820924502, "grad_norm": 0.45996421575546265, "kl": 0.775390625, "learning_rate": 7.518664175074202e-07, "loss": 0.031, "reward": 1.0931920111179352, "reward_std": 0.1354754976928234, "rewards/accuracy_reward": 0.11383928707800806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 1395 }, { "completion_length": 769.8170013427734, "epoch": 0.41699649017997165, "grad_norm": 0.5196108818054199, "kl": 0.9296875, "learning_rate": 7.514468222505303e-07, "loss": 0.0372, "reward": 1.123883992433548, "reward_std": 0.20479588955640793, "rewards/accuracy_reward": 0.15178572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9720982760190964, "step": 1396 }, { "completion_length": 653.2924652099609, "epoch": 0.4172951982674931, "grad_norm": 0.7734765410423279, "kl": 0.79638671875, "learning_rate": 7.510270078392417e-07, "loss": 0.0319, "reward": 1.1160714626312256, "reward_std": 0.179915864020586, "rewards/accuracy_reward": 0.13839286658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776785969734192, "step": 1397 }, { "completion_length": 658.8192291259766, "epoch": 0.4175939063550146, "grad_norm": 0.9815713763237, "kl": 0.802490234375, "learning_rate": 7.506069747302712e-07, "loss": 0.0321, "reward": 1.017857164144516, "reward_std": 0.14211370050907135, "rewards/accuracy_reward": 0.04017857415601611, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776785969734192, "step": 1398 }, { "completion_length": 616.6294708251953, "epoch": 0.41789261444253606, "grad_norm": 0.39281025528907776, "kl": 0.48974609375, "learning_rate": 7.501867233805739e-07, "loss": 0.0196, "reward": 1.1595982611179352, "reward_std": 0.16079415380954742, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 1399 }, { "completion_length": 573.6094055175781, "epoch": 0.4181913225300575, "grad_norm": 0.9433607459068298, "kl": 0.64111328125, "learning_rate": 7.49766254247342e-07, "loss": 0.0256, "reward": 1.1668527722358704, "reward_std": 0.1896820403635502, "rewards/accuracy_reward": 0.18750000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 1400 }, { "completion_length": 599.7232513427734, "epoch": 0.41849003061757895, "grad_norm": 0.5905833840370178, "kl": 0.346435546875, "learning_rate": 7.493455677880046e-07, "loss": 0.0139, "reward": 1.0965401828289032, "reward_std": 0.12660153210163116, "rewards/accuracy_reward": 0.10491072130389512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 1401 }, { "completion_length": 626.6116333007812, "epoch": 0.4187887387051004, "grad_norm": 0.44144177436828613, "kl": 0.46923828125, "learning_rate": 7.489246644602273e-07, "loss": 0.0188, "reward": 1.1992188096046448, "reward_std": 0.16194962710142136, "rewards/accuracy_reward": 0.20982143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 1402 }, { "completion_length": 605.7433166503906, "epoch": 0.4190874467926219, "grad_norm": 0.3174358606338501, "kl": 0.20849609375, "learning_rate": 7.485035447219122e-07, "loss": 0.0083, "reward": 1.1489956080913544, "reward_std": 0.15658069029450417, "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 1403 }, { "completion_length": 733.4107360839844, "epoch": 0.41938615488014336, "grad_norm": 0.4814624786376953, "kl": 1.0606689453125, "learning_rate": 7.480822090311955e-07, "loss": 0.0424, "reward": 1.2237723469734192, "reward_std": 0.2857779487967491, "rewards/accuracy_reward": 0.2522321566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9715402126312256, "step": 1404 }, { "completion_length": 599.5714721679688, "epoch": 0.41968486296766483, "grad_norm": 0.271753191947937, "kl": 0.4586181640625, "learning_rate": 7.476606578464496e-07, "loss": 0.0183, "reward": 1.2734375298023224, "reward_std": 0.22241934016346931, "rewards/accuracy_reward": 0.2857143022119999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1405 }, { "completion_length": 647.919677734375, "epoch": 0.4199835710551863, "grad_norm": 0.6001323461532593, "kl": 0.650390625, "learning_rate": 7.472388916262809e-07, "loss": 0.026, "reward": 1.215401828289032, "reward_std": 0.1582790780812502, "rewards/accuracy_reward": 0.2276785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 1406 }, { "completion_length": 615.6227874755859, "epoch": 0.42028227914270777, "grad_norm": 0.3221498429775238, "kl": 0.319091796875, "learning_rate": 7.468169108295295e-07, "loss": 0.0127, "reward": 1.1785714626312256, "reward_std": 0.11063082050532103, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714775323868, "step": 1407 }, { "completion_length": 637.9576110839844, "epoch": 0.42058098723022924, "grad_norm": 0.998139500617981, "kl": 0.42431640625, "learning_rate": 7.463947159152692e-07, "loss": 0.017, "reward": 1.1283482760190964, "reward_std": 0.1111024022102356, "rewards/accuracy_reward": 0.13616072107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 1408 }, { "completion_length": 638.1629791259766, "epoch": 0.4208796953177507, "grad_norm": 0.24097898602485657, "kl": 0.32958984375, "learning_rate": 7.459723073428067e-07, "loss": 0.0132, "reward": 1.212053656578064, "reward_std": 0.17519251815974712, "rewards/accuracy_reward": 0.21875000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.993303582072258, "step": 1409 }, { "completion_length": 681.9308471679688, "epoch": 0.4211784034052722, "grad_norm": 0.8188039660453796, "kl": 0.6605224609375, "learning_rate": 7.455496855716813e-07, "loss": 0.0264, "reward": 1.1216517984867096, "reward_std": 0.12907522916793823, "rewards/accuracy_reward": 0.1406250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 1410 }, { "completion_length": 659.2991333007812, "epoch": 0.42147711149279365, "grad_norm": 0.595941960811615, "kl": 0.7021484375, "learning_rate": 7.45126851061664e-07, "loss": 0.0281, "reward": 1.1272321790456772, "reward_std": 0.148629579693079, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 1411 }, { "completion_length": 649.2477874755859, "epoch": 0.4217758195803151, "grad_norm": 0.7873960137367249, "kl": 0.708984375, "learning_rate": 7.447038042727571e-07, "loss": 0.0283, "reward": 1.2137277126312256, "reward_std": 0.2443506456911564, "rewards/accuracy_reward": 0.2410714477300644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562947034836, "step": 1412 }, { "completion_length": 644.6741180419922, "epoch": 0.4220745276678366, "grad_norm": 0.5430670976638794, "kl": 0.58935546875, "learning_rate": 7.442805456651941e-07, "loss": 0.0236, "reward": 1.2232143580913544, "reward_std": 0.20552482269704342, "rewards/accuracy_reward": 0.23660715855658054, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1413 }, { "completion_length": 570.8303833007812, "epoch": 0.42237323575535807, "grad_norm": 0.5342994928359985, "kl": 0.353759765625, "learning_rate": 7.438570756994391e-07, "loss": 0.0142, "reward": 1.0686384290456772, "reward_std": 0.13622495159506798, "rewards/accuracy_reward": 0.07812500232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 1414 }, { "completion_length": 574.5558319091797, "epoch": 0.42267194384287954, "grad_norm": 0.3476768434047699, "kl": 0.282958984375, "learning_rate": 7.434333948361857e-07, "loss": 0.0113, "reward": 1.1171875596046448, "reward_std": 0.14759570732712746, "rewards/accuracy_reward": 0.12723215017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 1415 }, { "completion_length": 627.4821624755859, "epoch": 0.422970651930401, "grad_norm": 0.9635498523712158, "kl": 0.409423828125, "learning_rate": 7.430095035363572e-07, "loss": 0.0164, "reward": 1.1590402126312256, "reward_std": 0.11677963845431805, "rewards/accuracy_reward": 0.17187500488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1416 }, { "completion_length": 680.8527069091797, "epoch": 0.4232693600179225, "grad_norm": 0.5199729800224304, "kl": 0.56884765625, "learning_rate": 7.425854022611059e-07, "loss": 0.0228, "reward": 1.2639509737491608, "reward_std": 0.1923672929406166, "rewards/accuracy_reward": 0.28571430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 1417 }, { "completion_length": 624.1607513427734, "epoch": 0.42356806810544395, "grad_norm": 0.5456810593605042, "kl": 0.62353515625, "learning_rate": 7.421610914718122e-07, "loss": 0.0249, "reward": 1.0675223767757416, "reward_std": 0.12033034861087799, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 1418 }, { "completion_length": 652.9397430419922, "epoch": 0.4238667761929654, "grad_norm": 2.6958444118499756, "kl": 1.26605224609375, "learning_rate": 7.41736571630085e-07, "loss": 0.0508, "reward": 1.1819197237491608, "reward_std": 0.22622967883944511, "rewards/accuracy_reward": 0.20312501210719347, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946790456772, "step": 1419 }, { "completion_length": 609.2254638671875, "epoch": 0.4241654842804869, "grad_norm": 0.5149848461151123, "kl": 0.43701171875, "learning_rate": 7.413118431977598e-07, "loss": 0.0175, "reward": 1.1556920111179352, "reward_std": 0.13862841576337814, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455484867096, "step": 1420 }, { "completion_length": 609.1719055175781, "epoch": 0.42446419236800836, "grad_norm": 0.38870400190353394, "kl": 0.370849609375, "learning_rate": 7.408869066369e-07, "loss": 0.0148, "reward": 1.1791295409202576, "reward_std": 0.16421685554087162, "rewards/accuracy_reward": 0.18526786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 1421 }, { "completion_length": 622.4442138671875, "epoch": 0.42476290045552983, "grad_norm": 1.1079500913619995, "kl": 0.72509765625, "learning_rate": 7.404617624097948e-07, "loss": 0.029, "reward": 1.2566964626312256, "reward_std": 0.20330602303147316, "rewards/accuracy_reward": 0.2700893022119999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 1422 }, { "completion_length": 634.7232360839844, "epoch": 0.4250616085430513, "grad_norm": 0.46368634700775146, "kl": 0.5390625, "learning_rate": 7.400364109789591e-07, "loss": 0.0216, "reward": 1.126116156578064, "reward_std": 0.1542229955084622, "rewards/accuracy_reward": 0.14062500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 1423 }, { "completion_length": 634.4286041259766, "epoch": 0.4253603166305728, "grad_norm": 0.9726168513298035, "kl": 0.74169921875, "learning_rate": 7.396108528071339e-07, "loss": 0.0297, "reward": 1.18136166036129, "reward_std": 0.10328194918110967, "rewards/accuracy_reward": 0.1964285857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 1424 }, { "completion_length": 635.4955596923828, "epoch": 0.42565902471809425, "grad_norm": 0.8008522391319275, "kl": 0.6207275390625, "learning_rate": 7.391850883572849e-07, "loss": 0.0248, "reward": 1.1981026828289032, "reward_std": 0.13868976384401321, "rewards/accuracy_reward": 0.2165178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 1425 }, { "completion_length": 627.7500152587891, "epoch": 0.4259577328056157, "grad_norm": 0.30999717116355896, "kl": 0.425048828125, "learning_rate": 7.387591180926015e-07, "loss": 0.017, "reward": 1.164620578289032, "reward_std": 0.16565853916108608, "rewards/accuracy_reward": 0.1741071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 1426 }, { "completion_length": 649.7076263427734, "epoch": 0.4262564408931372, "grad_norm": 0.6452813148498535, "kl": 0.5743408203125, "learning_rate": 7.383329424764982e-07, "loss": 0.0229, "reward": 1.2377232611179352, "reward_std": 0.19883515127003193, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232313156128, "step": 1427 }, { "completion_length": 675.8013763427734, "epoch": 0.42655514898065866, "grad_norm": 0.4055112898349762, "kl": 0.55126953125, "learning_rate": 7.379065619726123e-07, "loss": 0.022, "reward": 1.2516741752624512, "reward_std": 0.2544848546385765, "rewards/accuracy_reward": 0.2678571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838169813156128, "step": 1428 }, { "completion_length": 640.0826263427734, "epoch": 0.42685385706818013, "grad_norm": 0.3300393521785736, "kl": 0.3406982421875, "learning_rate": 7.374799770448036e-07, "loss": 0.0136, "reward": 1.2561384439468384, "reward_std": 0.16955218464136124, "rewards/accuracy_reward": 0.2656250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 1429 }, { "completion_length": 671.6920013427734, "epoch": 0.4271525651557016, "grad_norm": 0.8646880984306335, "kl": 0.3232421875, "learning_rate": 7.370531881571548e-07, "loss": 0.0129, "reward": 1.1506697237491608, "reward_std": 0.19939374551177025, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 1430 }, { "completion_length": 692.6518402099609, "epoch": 0.4274512732432231, "grad_norm": 0.964487612247467, "kl": 0.4052734375, "learning_rate": 7.366261957739705e-07, "loss": 0.0162, "reward": 1.1060268580913544, "reward_std": 0.12857976276427507, "rewards/accuracy_reward": 0.1227678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 1431 }, { "completion_length": 620.4598541259766, "epoch": 0.42774998133074454, "grad_norm": 0.595452070236206, "kl": 0.28912353515625, "learning_rate": 7.361990003597767e-07, "loss": 0.0115, "reward": 1.180245578289032, "reward_std": 0.1875687725841999, "rewards/accuracy_reward": 0.18750000977888703, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 1432 }, { "completion_length": 633.2143096923828, "epoch": 0.428048689418266, "grad_norm": 0.5602209568023682, "kl": 0.34521484375, "learning_rate": 7.357716023793199e-07, "loss": 0.0138, "reward": 1.1780134737491608, "reward_std": 0.1678912676870823, "rewards/accuracy_reward": 0.1897321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 1433 }, { "completion_length": 732.5625305175781, "epoch": 0.4283473975057875, "grad_norm": 0.3048519790172577, "kl": 0.6707763671875, "learning_rate": 7.353440022975674e-07, "loss": 0.0269, "reward": 1.1227678954601288, "reward_std": 0.17857176065444946, "rewards/accuracy_reward": 0.14732143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464477300644, "step": 1434 }, { "completion_length": 601.9620819091797, "epoch": 0.42864610559330896, "grad_norm": 0.9732804894447327, "kl": 0.5135498046875, "learning_rate": 7.349162005797058e-07, "loss": 0.0205, "reward": 1.1573661267757416, "reward_std": 0.11808374337852001, "rewards/accuracy_reward": 0.1718750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911267757416, "step": 1435 }, { "completion_length": 595.0960159301758, "epoch": 0.42894481368083043, "grad_norm": 0.6426535844802856, "kl": 0.3509521484375, "learning_rate": 7.344881976911419e-07, "loss": 0.0141, "reward": 1.1484375596046448, "reward_std": 0.12253744341433048, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 1436 }, { "completion_length": 611.5558319091797, "epoch": 0.4292435217683519, "grad_norm": 0.9203409552574158, "kl": 0.452880859375, "learning_rate": 7.340599940975005e-07, "loss": 0.0181, "reward": 1.1523438096046448, "reward_std": 0.16203291341662407, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916295111179352, "step": 1437 }, { "completion_length": 603.1049346923828, "epoch": 0.42954222985587337, "grad_norm": 0.314910888671875, "kl": 0.29345703125, "learning_rate": 7.336315902646255e-07, "loss": 0.0118, "reward": 1.184151828289032, "reward_std": 0.12110715825110674, "rewards/accuracy_reward": 0.1897321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9944196790456772, "step": 1438 }, { "completion_length": 588.9665451049805, "epoch": 0.42984093794339484, "grad_norm": 0.23464490473270416, "kl": 0.518798828125, "learning_rate": 7.332029866585781e-07, "loss": 0.0207, "reward": 1.191964328289032, "reward_std": 0.11930775828659534, "rewards/accuracy_reward": 0.20089286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1439 }, { "completion_length": 619.5960083007812, "epoch": 0.4301396460309163, "grad_norm": 0.46707895398139954, "kl": 0.64697265625, "learning_rate": 7.32774183745637e-07, "loss": 0.0259, "reward": 1.1316964626312256, "reward_std": 0.16883916221559048, "rewards/accuracy_reward": 0.1495535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98214291036129, "step": 1440 }, { "completion_length": 639.1160888671875, "epoch": 0.4304383541184378, "grad_norm": 0.9037429690361023, "kl": 0.56298828125, "learning_rate": 7.323451819922979e-07, "loss": 0.0225, "reward": 1.1233259439468384, "reward_std": 0.11952542327344418, "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 1441 }, { "completion_length": 604.7277145385742, "epoch": 0.43073706220595925, "grad_norm": 0.4933215081691742, "kl": 0.58935546875, "learning_rate": 7.319159818652725e-07, "loss": 0.0236, "reward": 1.1657366752624512, "reward_std": 0.16735896095633507, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 1442 }, { "completion_length": 661.7232513427734, "epoch": 0.43103577029348067, "grad_norm": 0.9153129458427429, "kl": 0.82177734375, "learning_rate": 7.314865838314885e-07, "loss": 0.0329, "reward": 1.031808078289032, "reward_std": 0.1246337927877903, "rewards/accuracy_reward": 0.04464286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1443 }, { "completion_length": 607.6049346923828, "epoch": 0.43133447838100214, "grad_norm": 0.3566618859767914, "kl": 0.874267578125, "learning_rate": 7.310569883580887e-07, "loss": 0.035, "reward": 1.0446428954601288, "reward_std": 0.16938776522874832, "rewards/accuracy_reward": 0.06919643026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464775323868, "step": 1444 }, { "completion_length": 624.966552734375, "epoch": 0.4316331864685236, "grad_norm": 0.8042998909950256, "kl": 0.83642578125, "learning_rate": 7.306271959124313e-07, "loss": 0.0335, "reward": 1.1149554252624512, "reward_std": 0.15912922844290733, "rewards/accuracy_reward": 0.13169643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 1445 }, { "completion_length": 545.2835083007812, "epoch": 0.4319318945560451, "grad_norm": 0.5201475024223328, "kl": 0.528564453125, "learning_rate": 7.301972069620881e-07, "loss": 0.0212, "reward": 1.2087054252624512, "reward_std": 0.1794067732989788, "rewards/accuracy_reward": 0.2187500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1446 }, { "completion_length": 619.700927734375, "epoch": 0.43223060264356655, "grad_norm": 0.34313225746154785, "kl": 0.6689453125, "learning_rate": 7.297670219748447e-07, "loss": 0.0268, "reward": 1.1824777126312256, "reward_std": 0.17796414345502853, "rewards/accuracy_reward": 0.1964285857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 1447 }, { "completion_length": 592.9620666503906, "epoch": 0.432529310731088, "grad_norm": 0.5402969717979431, "kl": 0.476318359375, "learning_rate": 7.293366414187008e-07, "loss": 0.0191, "reward": 1.1551339626312256, "reward_std": 0.15784652903676033, "rewards/accuracy_reward": 0.1674107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232313156128, "step": 1448 }, { "completion_length": 598.0848388671875, "epoch": 0.4328280188186095, "grad_norm": 0.47246402502059937, "kl": 0.921875, "learning_rate": 7.289060657618677e-07, "loss": 0.0368, "reward": 1.2103795409202576, "reward_std": 0.1373517895117402, "rewards/accuracy_reward": 0.2321428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.97823666036129, "step": 1449 }, { "completion_length": 659.0156555175781, "epoch": 0.43312672690613097, "grad_norm": 0.2965523302555084, "kl": 0.8095703125, "learning_rate": 7.284752954727698e-07, "loss": 0.0324, "reward": 1.1623884290456772, "reward_std": 0.1268753595650196, "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 1450 }, { "completion_length": 640.5446624755859, "epoch": 0.43342543499365244, "grad_norm": 0.3215709328651428, "kl": 0.49169921875, "learning_rate": 7.280443310200429e-07, "loss": 0.0196, "reward": 1.2193081080913544, "reward_std": 0.1696546133607626, "rewards/accuracy_reward": 0.2299107275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 1451 }, { "completion_length": 557.2254791259766, "epoch": 0.4337241430811739, "grad_norm": 0.42646485567092896, "kl": 0.572265625, "learning_rate": 7.27613172872534e-07, "loss": 0.0229, "reward": 1.3141741752624512, "reward_std": 0.13293790258467197, "rewards/accuracy_reward": 0.32366072852164507, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134439468384, "step": 1452 }, { "completion_length": 640.2500381469727, "epoch": 0.4340228511686954, "grad_norm": 0.2602379620075226, "kl": 0.355712890625, "learning_rate": 7.271818214993011e-07, "loss": 0.0142, "reward": 1.0585937798023224, "reward_std": 0.1124067921191454, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 1453 }, { "completion_length": 594.1138610839844, "epoch": 0.43432155925621685, "grad_norm": 0.5524561405181885, "kl": 0.3729248046875, "learning_rate": 7.267502773696118e-07, "loss": 0.0149, "reward": 1.1579241752624512, "reward_std": 0.13547004107385874, "rewards/accuracy_reward": 0.16517857927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 1454 }, { "completion_length": 614.0915603637695, "epoch": 0.4346202673437383, "grad_norm": 0.7436174154281616, "kl": 0.68408203125, "learning_rate": 7.263185409529444e-07, "loss": 0.0274, "reward": 1.1166295260190964, "reward_std": 0.15733742155134678, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 1455 }, { "completion_length": 637.3080596923828, "epoch": 0.4349189754312598, "grad_norm": 0.41981759667396545, "kl": 0.740234375, "learning_rate": 7.258866127189854e-07, "loss": 0.0296, "reward": 1.1417411267757416, "reward_std": 0.23119713366031647, "rewards/accuracy_reward": 0.1651785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625298023224, "step": 1456 }, { "completion_length": 632.2344055175781, "epoch": 0.43521768351878126, "grad_norm": 0.3536185920238495, "kl": 0.804931640625, "learning_rate": 7.254544931376305e-07, "loss": 0.0322, "reward": 1.160714328289032, "reward_std": 0.15602970495820045, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 1457 }, { "completion_length": 630.6719131469727, "epoch": 0.43551639160630273, "grad_norm": 0.5691048502922058, "kl": 0.57177734375, "learning_rate": 7.250221826789836e-07, "loss": 0.0229, "reward": 1.2734375894069672, "reward_std": 0.2226005122065544, "rewards/accuracy_reward": 0.2901785857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 1458 }, { "completion_length": 578.7299423217773, "epoch": 0.4358150996938242, "grad_norm": 0.7984748482704163, "kl": 0.4925537109375, "learning_rate": 7.245896818133558e-07, "loss": 0.0197, "reward": 1.2745536267757416, "reward_std": 0.20543773099780083, "rewards/accuracy_reward": 0.2901785857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 1459 }, { "completion_length": 612.8661041259766, "epoch": 0.4361138077813457, "grad_norm": 0.5901951193809509, "kl": 0.49896240234375, "learning_rate": 7.241569910112658e-07, "loss": 0.02, "reward": 1.1071428954601288, "reward_std": 0.16247492097318172, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1460 }, { "completion_length": 622.7835083007812, "epoch": 0.43641251586886715, "grad_norm": 0.67143714427948, "kl": 0.49365234375, "learning_rate": 7.237241107434389e-07, "loss": 0.0197, "reward": 1.058035746216774, "reward_std": 0.09677966684103012, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071939468384, "step": 1461 }, { "completion_length": 708.6406402587891, "epoch": 0.4367112239563886, "grad_norm": 0.8985415697097778, "kl": 0.9609375, "learning_rate": 7.232910414808063e-07, "loss": 0.0385, "reward": 1.0959821939468384, "reward_std": 0.17816635221242905, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 1462 }, { "completion_length": 652.4576110839844, "epoch": 0.4370099320439101, "grad_norm": 0.3379538357257843, "kl": 0.513671875, "learning_rate": 7.228577836945049e-07, "loss": 0.0205, "reward": 1.1132812798023224, "reward_std": 0.1653684787452221, "rewards/accuracy_reward": 0.1272321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 1463 }, { "completion_length": 638.4174499511719, "epoch": 0.43730864013143156, "grad_norm": 0.6636784076690674, "kl": 0.372802734375, "learning_rate": 7.224243378558768e-07, "loss": 0.0149, "reward": 1.0625000447034836, "reward_std": 0.05357143096625805, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 1464 }, { "completion_length": 568.3058319091797, "epoch": 0.43760734821895303, "grad_norm": 0.5368974804878235, "kl": 0.48974609375, "learning_rate": 7.219907044364682e-07, "loss": 0.0196, "reward": 1.1266741454601288, "reward_std": 0.15534160658717155, "rewards/accuracy_reward": 0.14062500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 1465 }, { "completion_length": 637.9754791259766, "epoch": 0.4379060563064745, "grad_norm": 0.3782538175582886, "kl": 0.62646484375, "learning_rate": 7.215568839080304e-07, "loss": 0.025, "reward": 1.037946492433548, "reward_std": 0.13284175377339125, "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776785969734192, "step": 1466 }, { "completion_length": 634.0736999511719, "epoch": 0.43820476439399597, "grad_norm": 0.9461608529090881, "kl": 0.577392578125, "learning_rate": 7.211228767425172e-07, "loss": 0.0231, "reward": 1.0931920409202576, "reward_std": 0.15213202685117722, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9771205633878708, "step": 1467 }, { "completion_length": 592.303596496582, "epoch": 0.43850347248151744, "grad_norm": 0.8749924302101135, "kl": 0.3363037109375, "learning_rate": 7.20688683412086e-07, "loss": 0.0135, "reward": 1.2477679252624512, "reward_std": 0.22815340384840965, "rewards/accuracy_reward": 0.2522321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9955357313156128, "step": 1468 }, { "completion_length": 656.7277221679688, "epoch": 0.4388021805690389, "grad_norm": 0.2975999116897583, "kl": 0.525390625, "learning_rate": 7.202543043890964e-07, "loss": 0.021, "reward": 1.051339328289032, "reward_std": 0.1459441650658846, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1469 }, { "completion_length": 627.209846496582, "epoch": 0.4391008886565604, "grad_norm": 1.1062157154083252, "kl": 0.78173828125, "learning_rate": 7.198197401461103e-07, "loss": 0.0313, "reward": 1.1015625298023224, "reward_std": 0.161621680483222, "rewards/accuracy_reward": 0.1138392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1470 }, { "completion_length": 626.263427734375, "epoch": 0.43939959674408186, "grad_norm": 0.7480711340904236, "kl": 0.365966796875, "learning_rate": 7.193849911558913e-07, "loss": 0.0146, "reward": 1.182477742433548, "reward_std": 0.10056817112490535, "rewards/accuracy_reward": 0.1852678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.997209832072258, "step": 1471 }, { "completion_length": 526.4464569091797, "epoch": 0.4396983048316033, "grad_norm": 0.5628089308738708, "kl": 0.415771484375, "learning_rate": 7.189500578914033e-07, "loss": 0.0166, "reward": 1.1361607611179352, "reward_std": 0.12846367806196213, "rewards/accuracy_reward": 0.14732143888249993, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393431901932, "step": 1472 }, { "completion_length": 560.4553833007812, "epoch": 0.4399970129191248, "grad_norm": 0.37836897373199463, "kl": 0.2410888671875, "learning_rate": 7.185149408258112e-07, "loss": 0.0096, "reward": 1.117745578289032, "reward_std": 0.09176876954734325, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9994419813156128, "step": 1473 }, { "completion_length": 657.4442138671875, "epoch": 0.44029572100664627, "grad_norm": 0.7696290612220764, "kl": 0.54638671875, "learning_rate": 7.180796404324797e-07, "loss": 0.0219, "reward": 1.1813616752624512, "reward_std": 0.20396704226732254, "rewards/accuracy_reward": 0.1986607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 1474 }, { "completion_length": 550.2946701049805, "epoch": 0.44059442909416774, "grad_norm": 0.28905171155929565, "kl": 0.1846923828125, "learning_rate": 7.17644157184973e-07, "loss": 0.0074, "reward": 1.215401828289032, "reward_std": 0.09000019449740648, "rewards/accuracy_reward": 0.2165178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9988839626312256, "step": 1475 }, { "completion_length": 597.5982513427734, "epoch": 0.4408931371816892, "grad_norm": 0.5674852728843689, "kl": 0.28369140625, "learning_rate": 7.172084915570541e-07, "loss": 0.0114, "reward": 1.2712053954601288, "reward_std": 0.19436673633754253, "rewards/accuracy_reward": 0.279017873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875447034836, "step": 1476 }, { "completion_length": 692.2522888183594, "epoch": 0.4411918452692107, "grad_norm": 0.8987843990325928, "kl": 0.541015625, "learning_rate": 7.167726440226846e-07, "loss": 0.0217, "reward": 1.1127232611179352, "reward_std": 0.1541577596217394, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268431901932, "step": 1477 }, { "completion_length": 607.5335235595703, "epoch": 0.44149055335673215, "grad_norm": 0.3793784976005554, "kl": 0.479248046875, "learning_rate": 7.16336615056024e-07, "loss": 0.0192, "reward": 1.1462053656578064, "reward_std": 0.16379479225724936, "rewards/accuracy_reward": 0.15848214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 1478 }, { "completion_length": 568.7678833007812, "epoch": 0.4417892614442536, "grad_norm": 0.7372945547103882, "kl": 0.2723388671875, "learning_rate": 7.159004051314289e-07, "loss": 0.0109, "reward": 1.0993303954601288, "reward_std": 0.08855590410530567, "rewards/accuracy_reward": 0.10714286495931447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875149011612, "step": 1479 }, { "completion_length": 670.466552734375, "epoch": 0.4420879695317751, "grad_norm": 0.5743534564971924, "kl": 0.48486328125, "learning_rate": 7.154640147234529e-07, "loss": 0.0194, "reward": 1.2639509439468384, "reward_std": 0.19688913598656654, "rewards/accuracy_reward": 0.279017873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 1480 }, { "completion_length": 581.7768096923828, "epoch": 0.44238667761929656, "grad_norm": 0.5945284366607666, "kl": 0.2171630859375, "learning_rate": 7.150274443068463e-07, "loss": 0.0087, "reward": 1.1484375298023224, "reward_std": 0.11463516391813755, "rewards/accuracy_reward": 0.1495535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9988839328289032, "step": 1481 }, { "completion_length": 543.9732437133789, "epoch": 0.44268538570681804, "grad_norm": 0.25574949383735657, "kl": 0.4146728515625, "learning_rate": 7.145906943565546e-07, "loss": 0.0166, "reward": 1.148995578289032, "reward_std": 0.05655038543045521, "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949777126312256, "step": 1482 }, { "completion_length": 607.1942291259766, "epoch": 0.4429840937943395, "grad_norm": 0.469987154006958, "kl": 0.54949951171875, "learning_rate": 7.14153765347719e-07, "loss": 0.022, "reward": 1.207589328289032, "reward_std": 0.2316565252840519, "rewards/accuracy_reward": 0.2187500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888392984867096, "step": 1483 }, { "completion_length": 749.450927734375, "epoch": 0.443282801881861, "grad_norm": 1.506432056427002, "kl": 1.181640625, "learning_rate": 7.137166577556757e-07, "loss": 0.0472, "reward": 1.1590402126312256, "reward_std": 0.1667316071689129, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 1484 }, { "completion_length": 541.5826263427734, "epoch": 0.44358150996938245, "grad_norm": 0.5415467023849487, "kl": 0.284912109375, "learning_rate": 7.132793720559547e-07, "loss": 0.0114, "reward": 1.1824777722358704, "reward_std": 0.17657750099897385, "rewards/accuracy_reward": 0.1897321566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.992745578289032, "step": 1485 }, { "completion_length": 575.7567138671875, "epoch": 0.44388021805690386, "grad_norm": 0.4046364426612854, "kl": 0.212646484375, "learning_rate": 7.128419087242797e-07, "loss": 0.0085, "reward": 1.2806920111179352, "reward_std": 0.20714757218956947, "rewards/accuracy_reward": 0.2924107275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812649011612, "step": 1486 }, { "completion_length": 654.9330596923828, "epoch": 0.44417892614442533, "grad_norm": 0.6669549942016602, "kl": 0.49072265625, "learning_rate": 7.124042682365685e-07, "loss": 0.0196, "reward": 1.1462054252624512, "reward_std": 0.16008400544524193, "rewards/accuracy_reward": 0.1562500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1487 }, { "completion_length": 597.9107437133789, "epoch": 0.4444776342319468, "grad_norm": 0.6554607152938843, "kl": 0.390869140625, "learning_rate": 7.119664510689307e-07, "loss": 0.0156, "reward": 1.1651786267757416, "reward_std": 0.17177035845816135, "rewards/accuracy_reward": 0.1763392984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1488 }, { "completion_length": 594.8683166503906, "epoch": 0.4447763423194683, "grad_norm": 0.3084026277065277, "kl": 0.485107421875, "learning_rate": 7.115284576976685e-07, "loss": 0.0194, "reward": 1.0585937947034836, "reward_std": 0.10308996308594942, "rewards/accuracy_reward": 0.066964291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 1489 }, { "completion_length": 630.5491333007812, "epoch": 0.44507505040698975, "grad_norm": 0.4564475119113922, "kl": 0.925537109375, "learning_rate": 7.110902885992759e-07, "loss": 0.037, "reward": 1.104352742433548, "reward_std": 0.12481730058789253, "rewards/accuracy_reward": 0.12276786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 1490 }, { "completion_length": 560.0759124755859, "epoch": 0.4453737584945112, "grad_norm": 0.2839459776878357, "kl": 0.5152587890625, "learning_rate": 7.10651944250438e-07, "loss": 0.0206, "reward": 1.1662946939468384, "reward_std": 0.14757511485368013, "rewards/accuracy_reward": 0.17633929708972573, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1491 }, { "completion_length": 589.2031555175781, "epoch": 0.4456724665820327, "grad_norm": 0.5765348672866821, "kl": 0.4549560546875, "learning_rate": 7.102134251280302e-07, "loss": 0.0182, "reward": 1.1339285969734192, "reward_std": 0.15500257723033428, "rewards/accuracy_reward": 0.14285715157166123, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1492 }, { "completion_length": 601.0937652587891, "epoch": 0.44597117466955416, "grad_norm": 0.8038132786750793, "kl": 0.7139892578125, "learning_rate": 7.097747317091183e-07, "loss": 0.0286, "reward": 1.121651828289032, "reward_std": 0.14052782766520977, "rewards/accuracy_reward": 0.13616072572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 1493 }, { "completion_length": 629.9196624755859, "epoch": 0.44626988275707563, "grad_norm": 0.645817756652832, "kl": 0.76513671875, "learning_rate": 7.09335864470958e-07, "loss": 0.0306, "reward": 1.1077009439468384, "reward_std": 0.18199672549962997, "rewards/accuracy_reward": 0.1272321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 1494 }, { "completion_length": 660.9286041259766, "epoch": 0.4465685908445971, "grad_norm": 0.42223384976387024, "kl": 1.03564453125, "learning_rate": 7.08896823890994e-07, "loss": 0.0414, "reward": 1.1194196939468384, "reward_std": 0.11550209112465382, "rewards/accuracy_reward": 0.13392858114093542, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 1495 }, { "completion_length": 587.482177734375, "epoch": 0.4468672989321186, "grad_norm": 0.8786957859992981, "kl": 0.8592529296875, "learning_rate": 7.084576104468588e-07, "loss": 0.0344, "reward": 1.1378348469734192, "reward_std": 0.13846388086676598, "rewards/accuracy_reward": 0.14955357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 1496 }, { "completion_length": 580.6495819091797, "epoch": 0.44716600701964004, "grad_norm": 0.3811487853527069, "kl": 0.437744140625, "learning_rate": 7.080182246163741e-07, "loss": 0.0175, "reward": 1.0753348618745804, "reward_std": 0.10612958017736673, "rewards/accuracy_reward": 0.08705357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812649011612, "step": 1497 }, { "completion_length": 723.2567291259766, "epoch": 0.4474647151071615, "grad_norm": 0.47113633155822754, "kl": 0.75390625, "learning_rate": 7.075786668775485e-07, "loss": 0.0302, "reward": 1.1155134290456772, "reward_std": 0.18921110033988953, "rewards/accuracy_reward": 0.14732143562287092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.968191996216774, "step": 1498 }, { "completion_length": 671.1250457763672, "epoch": 0.447763423194683, "grad_norm": 1.0142594575881958, "kl": 0.9990234375, "learning_rate": 7.071389377085777e-07, "loss": 0.0398, "reward": 1.1651786267757416, "reward_std": 0.1922668751794845, "rewards/accuracy_reward": 0.1875000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786267757416, "step": 1499 }, { "completion_length": 576.9419860839844, "epoch": 0.44806213128220446, "grad_norm": 0.8316694498062134, "kl": 0.5302734375, "learning_rate": 7.066990375878439e-07, "loss": 0.0212, "reward": 1.193638414144516, "reward_std": 0.16998617816716433, "rewards/accuracy_reward": 0.2053571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 1500 }, { "completion_length": 588.4598388671875, "epoch": 0.4483608393697259, "grad_norm": 0.5691549777984619, "kl": 0.87451171875, "learning_rate": 7.062589669939154e-07, "loss": 0.0349, "reward": 1.266183078289032, "reward_std": 0.15532956272363663, "rewards/accuracy_reward": 0.285714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 1501 }, { "completion_length": 607.6741333007812, "epoch": 0.4486595474572474, "grad_norm": 0.3528524339199066, "kl": 0.29541015625, "learning_rate": 7.058187264055459e-07, "loss": 0.0118, "reward": 1.127790242433548, "reward_std": 0.15851459838449955, "rewards/accuracy_reward": 0.13839286752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 1502 }, { "completion_length": 679.8772583007812, "epoch": 0.44895825554476887, "grad_norm": 0.8974506258964539, "kl": 0.579833984375, "learning_rate": 7.053783163016739e-07, "loss": 0.0232, "reward": 1.0647321939468384, "reward_std": 0.11128801666200161, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 1503 }, { "completion_length": 675.8951263427734, "epoch": 0.44925696363229034, "grad_norm": 0.4861716032028198, "kl": 0.4638671875, "learning_rate": 7.049377371614224e-07, "loss": 0.0186, "reward": 1.1729911267757416, "reward_std": 0.15948057174682617, "rewards/accuracy_reward": 0.18750000838190317, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 1504 }, { "completion_length": 619.2143020629883, "epoch": 0.4495556717198118, "grad_norm": 0.6232990622520447, "kl": 0.6407470703125, "learning_rate": 7.044969894640984e-07, "loss": 0.0256, "reward": 1.116071492433548, "reward_std": 0.09951354237273335, "rewards/accuracy_reward": 0.12946428963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 1505 }, { "completion_length": 652.8638610839844, "epoch": 0.4498543798073333, "grad_norm": 1.1905795335769653, "kl": 0.513671875, "learning_rate": 7.040560736891922e-07, "loss": 0.0205, "reward": 1.1015625447034836, "reward_std": 0.13553397729992867, "rewards/accuracy_reward": 0.12053572223521769, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 1506 }, { "completion_length": 581.7477874755859, "epoch": 0.45015308789485475, "grad_norm": 0.3822959065437317, "kl": 0.46044921875, "learning_rate": 7.036149903163771e-07, "loss": 0.0184, "reward": 1.1456473767757416, "reward_std": 0.12617485132068396, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 1507 }, { "completion_length": 575.6227798461914, "epoch": 0.4504517959823762, "grad_norm": 0.31901612877845764, "kl": 0.5745849609375, "learning_rate": 7.031737398255083e-07, "loss": 0.023, "reward": 1.2511161267757416, "reward_std": 0.16100951842963696, "rewards/accuracy_reward": 0.2566964402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9944196790456772, "step": 1508 }, { "completion_length": 648.8817291259766, "epoch": 0.4507505040698977, "grad_norm": 0.5892578363418579, "kl": 0.8701171875, "learning_rate": 7.027323226966232e-07, "loss": 0.0348, "reward": 1.106584906578064, "reward_std": 0.14732926338911057, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 1509 }, { "completion_length": 685.0245971679688, "epoch": 0.45104921215741917, "grad_norm": 0.40520912408828735, "kl": 1.0654296875, "learning_rate": 7.022907394099404e-07, "loss": 0.0426, "reward": 1.215401828289032, "reward_std": 0.19009610451757908, "rewards/accuracy_reward": 0.2366071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946939468384, "step": 1510 }, { "completion_length": 630.0312805175781, "epoch": 0.45134792024494064, "grad_norm": 0.6964714527130127, "kl": 1.1484375, "learning_rate": 7.018489904458592e-07, "loss": 0.0459, "reward": 1.1199777126312256, "reward_std": 0.1823432482779026, "rewards/accuracy_reward": 0.14285715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9771205633878708, "step": 1511 }, { "completion_length": 595.0268249511719, "epoch": 0.4516466283324621, "grad_norm": 0.45935508608818054, "kl": 0.56298828125, "learning_rate": 7.014070762849593e-07, "loss": 0.0225, "reward": 1.0959822237491608, "reward_std": 0.1381012760102749, "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1512 }, { "completion_length": 571.428596496582, "epoch": 0.4519453364199836, "grad_norm": 0.747838020324707, "kl": 0.52099609375, "learning_rate": 7.009649974079997e-07, "loss": 0.0208, "reward": 1.364397406578064, "reward_std": 0.14908301830291748, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 1513 }, { "completion_length": 593.3370819091797, "epoch": 0.45224404450750505, "grad_norm": 0.5873208045959473, "kl": 0.93408203125, "learning_rate": 7.005227542959192e-07, "loss": 0.0374, "reward": 1.1796875596046448, "reward_std": 0.1357632353901863, "rewards/accuracy_reward": 0.19642857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 1514 }, { "completion_length": 591.5446548461914, "epoch": 0.4525427525950265, "grad_norm": 0.5923188328742981, "kl": 0.733154296875, "learning_rate": 7.000803474298349e-07, "loss": 0.0293, "reward": 1.1049107611179352, "reward_std": 0.12604091688990593, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 1515 }, { "completion_length": 574.4085083007812, "epoch": 0.452841460682548, "grad_norm": 0.8584862947463989, "kl": 0.76123046875, "learning_rate": 6.99637777291042e-07, "loss": 0.0305, "reward": 1.1975446939468384, "reward_std": 0.16168363625183702, "rewards/accuracy_reward": 0.207589291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 1516 }, { "completion_length": 599.9397735595703, "epoch": 0.45314016877006946, "grad_norm": 0.3902330994606018, "kl": 0.6279296875, "learning_rate": 6.991950443610134e-07, "loss": 0.0251, "reward": 1.1406250596046448, "reward_std": 0.14058991754427552, "rewards/accuracy_reward": 0.15178572293370962, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 1517 }, { "completion_length": 669.2768249511719, "epoch": 0.45343887685759093, "grad_norm": 0.5374045372009277, "kl": 0.91650390625, "learning_rate": 6.987521491213992e-07, "loss": 0.0367, "reward": 1.1763393580913544, "reward_std": 0.12774822860956192, "rewards/accuracy_reward": 0.19642858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 1518 }, { "completion_length": 686.4241333007812, "epoch": 0.4537375849451124, "grad_norm": 0.5297364592552185, "kl": 0.64794921875, "learning_rate": 6.983090920540261e-07, "loss": 0.0259, "reward": 1.0881697088479996, "reward_std": 0.16296186670660973, "rewards/accuracy_reward": 0.10714286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 1519 }, { "completion_length": 637.7946624755859, "epoch": 0.4540362930326339, "grad_norm": 0.674101710319519, "kl": 0.62548828125, "learning_rate": 6.978658736408969e-07, "loss": 0.0251, "reward": 1.1584821939468384, "reward_std": 0.13775984197854996, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 1520 }, { "completion_length": 643.3683166503906, "epoch": 0.45433500112015535, "grad_norm": 0.8872155547142029, "kl": 0.6845703125, "learning_rate": 6.974224943641893e-07, "loss": 0.0273, "reward": 1.1127232909202576, "reward_std": 0.17504994198679924, "rewards/accuracy_reward": 0.13169643562287092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 1521 }, { "completion_length": 645.4219055175781, "epoch": 0.4546337092076768, "grad_norm": 0.3985927999019623, "kl": 0.3004150390625, "learning_rate": 6.969789547062569e-07, "loss": 0.012, "reward": 1.1065848469734192, "reward_std": 0.10209709405899048, "rewards/accuracy_reward": 0.11383929033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 1522 }, { "completion_length": 672.794677734375, "epoch": 0.4549324172951983, "grad_norm": 0.4637318551540375, "kl": 0.54443359375, "learning_rate": 6.965352551496273e-07, "loss": 0.0218, "reward": 1.0742187649011612, "reward_std": 0.11839162278920412, "rewards/accuracy_reward": 0.09375000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 1523 }, { "completion_length": 662.5982513427734, "epoch": 0.45523112538271976, "grad_norm": 1.442086100578308, "kl": 0.54052734375, "learning_rate": 6.960913961770021e-07, "loss": 0.0216, "reward": 1.1194196939468384, "reward_std": 0.1676341462880373, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 1524 }, { "completion_length": 615.3839416503906, "epoch": 0.45552983347024123, "grad_norm": 0.4157986640930176, "kl": 0.643798828125, "learning_rate": 6.956473782712562e-07, "loss": 0.0258, "reward": 1.1545759439468384, "reward_std": 0.1327409315854311, "rewards/accuracy_reward": 0.1674107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871651977300644, "step": 1525 }, { "completion_length": 706.2410888671875, "epoch": 0.4558285415577627, "grad_norm": 0.3739357888698578, "kl": 0.76416015625, "learning_rate": 6.952032019154378e-07, "loss": 0.0306, "reward": 1.166852742433548, "reward_std": 0.2010368574410677, "rewards/accuracy_reward": 0.1941964402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562947034836, "step": 1526 }, { "completion_length": 680.2924346923828, "epoch": 0.45612724964528417, "grad_norm": 0.5745802521705627, "kl": 0.509765625, "learning_rate": 6.947588675927673e-07, "loss": 0.0204, "reward": 1.2405134439468384, "reward_std": 0.1302214376628399, "rewards/accuracy_reward": 0.2566964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 1527 }, { "completion_length": 588.1562652587891, "epoch": 0.45642595773280564, "grad_norm": 0.6137392520904541, "kl": 0.34375, "learning_rate": 6.943143757866365e-07, "loss": 0.0137, "reward": 1.199776828289032, "reward_std": 0.1984895057976246, "rewards/accuracy_reward": 0.212053582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1528 }, { "completion_length": 612.5759201049805, "epoch": 0.45672466582032706, "grad_norm": 0.5690286755561829, "kl": 0.309326171875, "learning_rate": 6.938697269806094e-07, "loss": 0.0124, "reward": 1.1512277126312256, "reward_std": 0.13026561588048935, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9972098469734192, "step": 1529 }, { "completion_length": 607.9509124755859, "epoch": 0.45702337390784853, "grad_norm": 0.5193927884101868, "kl": 0.3662109375, "learning_rate": 6.934249216584202e-07, "loss": 0.0147, "reward": 1.2087054252624512, "reward_std": 0.20683452114462852, "rewards/accuracy_reward": 0.2187500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1530 }, { "completion_length": 642.0245819091797, "epoch": 0.45732208199537, "grad_norm": 0.5763760209083557, "kl": 0.638671875, "learning_rate": 6.929799603039731e-07, "loss": 0.0256, "reward": 1.0909598767757416, "reward_std": 0.15177812427282333, "rewards/accuracy_reward": 0.10714286426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 1531 }, { "completion_length": 590.3259124755859, "epoch": 0.45762079008289147, "grad_norm": 0.5942915081977844, "kl": 0.7021484375, "learning_rate": 6.925348434013428e-07, "loss": 0.028, "reward": 1.3024554252624512, "reward_std": 0.21360939741134644, "rewards/accuracy_reward": 0.3169642984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 1532 }, { "completion_length": 624.7098541259766, "epoch": 0.45791949817041294, "grad_norm": 0.24648791551589966, "kl": 0.1844482421875, "learning_rate": 6.920895714347729e-07, "loss": 0.0074, "reward": 1.0725446939468384, "reward_std": 0.08211474400013685, "rewards/accuracy_reward": 0.07589286169968545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966517984867096, "step": 1533 }, { "completion_length": 612.0111770629883, "epoch": 0.4582182062579344, "grad_norm": 0.7268358469009399, "kl": 0.4808349609375, "learning_rate": 6.916441448886754e-07, "loss": 0.0193, "reward": 1.1791295111179352, "reward_std": 0.1842237040400505, "rewards/accuracy_reward": 0.1897321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.989397332072258, "step": 1534 }, { "completion_length": 720.3147583007812, "epoch": 0.4585169143454559, "grad_norm": 0.7627559900283813, "kl": 0.49951171875, "learning_rate": 6.911985642476309e-07, "loss": 0.02, "reward": 1.1261161267757416, "reward_std": 0.1363976150751114, "rewards/accuracy_reward": 0.15178571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9743303954601288, "step": 1535 }, { "completion_length": 631.1361694335938, "epoch": 0.45881562243297735, "grad_norm": 0.3577114939689636, "kl": 0.3740234375, "learning_rate": 6.907528299963875e-07, "loss": 0.015, "reward": 1.141183078289032, "reward_std": 0.1150731761008501, "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1536 }, { "completion_length": 625.4888763427734, "epoch": 0.4591143305204988, "grad_norm": 0.30673086643218994, "kl": 0.38385009765625, "learning_rate": 6.903069426198605e-07, "loss": 0.0153, "reward": 1.1674107611179352, "reward_std": 0.16101441718637943, "rewards/accuracy_reward": 0.17857143888249993, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1537 }, { "completion_length": 709.8861846923828, "epoch": 0.4594130386080203, "grad_norm": 0.8127787113189697, "kl": 0.70947265625, "learning_rate": 6.898609026031312e-07, "loss": 0.0284, "reward": 1.2064732611179352, "reward_std": 0.14094501361250877, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 1538 }, { "completion_length": 696.4911041259766, "epoch": 0.45971174669554177, "grad_norm": 0.6181342005729675, "kl": 0.79052734375, "learning_rate": 6.894147104314478e-07, "loss": 0.0317, "reward": 1.2126116454601288, "reward_std": 0.17294725961983204, "rewards/accuracy_reward": 0.23437501676380634, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.97823666036129, "step": 1539 }, { "completion_length": 681.8125305175781, "epoch": 0.46001045478306324, "grad_norm": 0.2841760814189911, "kl": 0.556396484375, "learning_rate": 6.889683665902237e-07, "loss": 0.0223, "reward": 1.0864955931901932, "reward_std": 0.15404004231095314, "rewards/accuracy_reward": 0.09821428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 1540 }, { "completion_length": 646.0558319091797, "epoch": 0.4603091628705847, "grad_norm": 0.41096198558807373, "kl": 0.659423828125, "learning_rate": 6.885218715650369e-07, "loss": 0.0264, "reward": 1.1478795409202576, "reward_std": 0.1689961925148964, "rewards/accuracy_reward": 0.16294643888249993, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 1541 }, { "completion_length": 695.3192291259766, "epoch": 0.4606078709581062, "grad_norm": 0.4043578505516052, "kl": 0.685546875, "learning_rate": 6.880752258416306e-07, "loss": 0.0275, "reward": 1.1651786267757416, "reward_std": 0.15362485125660896, "rewards/accuracy_reward": 0.1785714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1542 }, { "completion_length": 698.7924346923828, "epoch": 0.46090657904562765, "grad_norm": 0.41317397356033325, "kl": 0.66650390625, "learning_rate": 6.876284299059113e-07, "loss": 0.0267, "reward": 1.0312500298023224, "reward_std": 0.14415723830461502, "rewards/accuracy_reward": 0.04910714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428805589676, "step": 1543 }, { "completion_length": 653.5111846923828, "epoch": 0.4612052871331491, "grad_norm": 1.1813702583312988, "kl": 0.447998046875, "learning_rate": 6.871814842439494e-07, "loss": 0.0179, "reward": 1.168526828289032, "reward_std": 0.13974337000399828, "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 1544 }, { "completion_length": 600.1964492797852, "epoch": 0.4615039952206706, "grad_norm": 0.6284602880477905, "kl": 0.64404296875, "learning_rate": 6.867343893419778e-07, "loss": 0.0258, "reward": 1.1964286267757416, "reward_std": 0.18296711519360542, "rewards/accuracy_reward": 0.2075892984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 1545 }, { "completion_length": 637.8326110839844, "epoch": 0.46180270330819206, "grad_norm": 0.5706275105476379, "kl": 0.593994140625, "learning_rate": 6.86287145686392e-07, "loss": 0.0237, "reward": 1.082589328289032, "reward_std": 0.13797790929675102, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 1546 }, { "completion_length": 753.5536041259766, "epoch": 0.46210141139571353, "grad_norm": 0.5773928165435791, "kl": 1.00390625, "learning_rate": 6.858397537637492e-07, "loss": 0.0401, "reward": 1.0641741752624512, "reward_std": 0.17449244484305382, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562947034836, "step": 1547 }, { "completion_length": 687.6250305175781, "epoch": 0.462400119483235, "grad_norm": 0.5368198752403259, "kl": 0.55615234375, "learning_rate": 6.853922140607683e-07, "loss": 0.0222, "reward": 1.2087053954601288, "reward_std": 0.08494393900036812, "rewards/accuracy_reward": 0.2187500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 1548 }, { "completion_length": 621.1406555175781, "epoch": 0.4626988275707565, "grad_norm": 0.7521947026252747, "kl": 0.64990234375, "learning_rate": 6.849445270643282e-07, "loss": 0.026, "reward": 1.0680803954601288, "reward_std": 0.05592919234186411, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875149011612, "step": 1549 }, { "completion_length": 616.3036193847656, "epoch": 0.46299753565827795, "grad_norm": 0.4521300196647644, "kl": 0.59375, "learning_rate": 6.844966932614686e-07, "loss": 0.0237, "reward": 1.183035746216774, "reward_std": 0.19111615233123302, "rewards/accuracy_reward": 0.19419643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1550 }, { "completion_length": 691.4085235595703, "epoch": 0.4632962437457994, "grad_norm": 0.728463888168335, "kl": 0.542724609375, "learning_rate": 6.840487131393888e-07, "loss": 0.0218, "reward": 1.1752232611179352, "reward_std": 0.15499882586300373, "rewards/accuracy_reward": 0.1897321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 1551 }, { "completion_length": 608.669677734375, "epoch": 0.4635949518333209, "grad_norm": 0.7114244699478149, "kl": 0.42724609375, "learning_rate": 6.836005871854474e-07, "loss": 0.0171, "reward": 1.1863839626312256, "reward_std": 0.12209324352443218, "rewards/accuracy_reward": 0.19419643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 1552 }, { "completion_length": 618.6942291259766, "epoch": 0.46389365992084236, "grad_norm": 0.9760586023330688, "kl": 0.58349609375, "learning_rate": 6.831523158871612e-07, "loss": 0.0234, "reward": 1.1054687947034836, "reward_std": 0.15511007606983185, "rewards/accuracy_reward": 0.12276786682195961, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 1553 }, { "completion_length": 662.4754791259766, "epoch": 0.46419236800836383, "grad_norm": 0.5801275372505188, "kl": 0.4912109375, "learning_rate": 6.82703899732206e-07, "loss": 0.0197, "reward": 1.050781324505806, "reward_std": 0.09042885806411505, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 1554 }, { "completion_length": 623.5045013427734, "epoch": 0.4644910760958853, "grad_norm": 0.4176848828792572, "kl": 0.569580078125, "learning_rate": 6.82255339208414e-07, "loss": 0.0228, "reward": 1.1696429252624512, "reward_std": 0.14440615475177765, "rewards/accuracy_reward": 0.18526786752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 1555 }, { "completion_length": 712.2232513427734, "epoch": 0.4647897841834068, "grad_norm": 0.8191567063331604, "kl": 0.74951171875, "learning_rate": 6.818066348037755e-07, "loss": 0.03, "reward": 1.1356027126312256, "reward_std": 0.1760016418993473, "rewards/accuracy_reward": 0.1562500111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 1556 }, { "completion_length": 642.1652069091797, "epoch": 0.46508849227092824, "grad_norm": 0.7324750423431396, "kl": 0.65283203125, "learning_rate": 6.813577870064366e-07, "loss": 0.0261, "reward": 1.109375074505806, "reward_std": 0.10661739483475685, "rewards/accuracy_reward": 0.12053572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 1557 }, { "completion_length": 722.7969055175781, "epoch": 0.4653872003584497, "grad_norm": 0.8384799361228943, "kl": 0.6640625, "learning_rate": 6.809087963047e-07, "loss": 0.0266, "reward": 1.1523437798023224, "reward_std": 0.1618170142173767, "rewards/accuracy_reward": 0.1852678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9670759290456772, "step": 1558 }, { "completion_length": 678.9888763427734, "epoch": 0.4656859084459712, "grad_norm": 0.6142306923866272, "kl": 0.55615234375, "learning_rate": 6.804596631870234e-07, "loss": 0.0222, "reward": 1.090401828289032, "reward_std": 0.1784140169620514, "rewards/accuracy_reward": 0.10714286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 1559 }, { "completion_length": 717.0625152587891, "epoch": 0.46598461653349266, "grad_norm": 0.49712151288986206, "kl": 0.70654296875, "learning_rate": 6.800103881420198e-07, "loss": 0.0283, "reward": 1.1116071939468384, "reward_std": 0.14899519085884094, "rewards/accuracy_reward": 0.12500000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 1560 }, { "completion_length": 677.4263763427734, "epoch": 0.46628332462101413, "grad_norm": 0.8657347559928894, "kl": 0.6201171875, "learning_rate": 6.795609716584562e-07, "loss": 0.0248, "reward": 1.1194196939468384, "reward_std": 0.19342823419719934, "rewards/accuracy_reward": 0.14508929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9743303954601288, "step": 1561 }, { "completion_length": 619.8616333007812, "epoch": 0.4665820327085356, "grad_norm": 0.5880889892578125, "kl": 0.58203125, "learning_rate": 6.791114142252538e-07, "loss": 0.0233, "reward": 1.1328125596046448, "reward_std": 0.13349361158907413, "rewards/accuracy_reward": 0.14285714831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98995541036129, "step": 1562 }, { "completion_length": 605.9598541259766, "epoch": 0.46688074079605707, "grad_norm": 0.8149988055229187, "kl": 0.390625, "learning_rate": 6.78661716331487e-07, "loss": 0.0156, "reward": 1.154575914144516, "reward_std": 0.12351745925843716, "rewards/accuracy_reward": 0.165178582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 1563 }, { "completion_length": 669.6495971679688, "epoch": 0.46717944888357854, "grad_norm": 0.6853793263435364, "kl": 0.46728515625, "learning_rate": 6.782118784663829e-07, "loss": 0.0187, "reward": 1.098214328289032, "reward_std": 0.14836113713681698, "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1564 }, { "completion_length": 635.2567291259766, "epoch": 0.4674781569711, "grad_norm": 0.9296780228614807, "kl": 0.48974609375, "learning_rate": 6.777619011193213e-07, "loss": 0.0196, "reward": 1.1378348767757416, "reward_std": 0.13482083939015865, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848767757416, "step": 1565 }, { "completion_length": 666.7254638671875, "epoch": 0.4677768650586215, "grad_norm": 0.46903300285339355, "kl": 0.2650146484375, "learning_rate": 6.773117847798333e-07, "loss": 0.0106, "reward": 1.0887277275323868, "reward_std": 0.13887614756822586, "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 1566 }, { "completion_length": 544.4062652587891, "epoch": 0.46807557314614295, "grad_norm": 0.691399335861206, "kl": 0.41943359375, "learning_rate": 6.768615299376013e-07, "loss": 0.0167, "reward": 1.1869420111179352, "reward_std": 0.20427675172686577, "rewards/accuracy_reward": 0.20089286752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98604916036129, "step": 1567 }, { "completion_length": 646.8326263427734, "epoch": 0.4683742812336644, "grad_norm": 0.5694074630737305, "kl": 0.5872802734375, "learning_rate": 6.764111370824585e-07, "loss": 0.0235, "reward": 1.252790242433548, "reward_std": 0.1384846232831478, "rewards/accuracy_reward": 0.26785715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 1568 }, { "completion_length": 635.1718978881836, "epoch": 0.4686729893211859, "grad_norm": 0.5039140582084656, "kl": 0.40234375, "learning_rate": 6.759606067043882e-07, "loss": 0.0161, "reward": 1.1071428954601288, "reward_std": 0.10985564067959785, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1569 }, { "completion_length": 623.8750152587891, "epoch": 0.46897169740870737, "grad_norm": 0.5183536410331726, "kl": 0.502685546875, "learning_rate": 6.755099392935232e-07, "loss": 0.0201, "reward": 1.1395089626312256, "reward_std": 0.14426334761083126, "rewards/accuracy_reward": 0.15178571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 1570 }, { "completion_length": 651.9241333007812, "epoch": 0.46927040549622884, "grad_norm": 1.3892914056777954, "kl": 0.54833984375, "learning_rate": 6.750591353401456e-07, "loss": 0.0219, "reward": 1.1930803954601288, "reward_std": 0.16854199953377247, "rewards/accuracy_reward": 0.2053571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 1571 }, { "completion_length": 711.0335083007812, "epoch": 0.46956911358375025, "grad_norm": 0.599284291267395, "kl": 0.82861328125, "learning_rate": 6.746081953346858e-07, "loss": 0.0332, "reward": 1.0820313096046448, "reward_std": 0.08651383826509118, "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 1572 }, { "completion_length": 653.3660888671875, "epoch": 0.4698678216712717, "grad_norm": 0.6842381358146667, "kl": 0.845703125, "learning_rate": 6.741571197677225e-07, "loss": 0.0338, "reward": 1.1484375596046448, "reward_std": 0.09108195826411247, "rewards/accuracy_reward": 0.15848214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1573 }, { "completion_length": 681.5223541259766, "epoch": 0.4701665297587932, "grad_norm": 0.8422563672065735, "kl": 1.0419921875, "learning_rate": 6.737059091299817e-07, "loss": 0.0417, "reward": 1.1696428954601288, "reward_std": 0.15535495802760124, "rewards/accuracy_reward": 0.1941964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464775323868, "step": 1574 }, { "completion_length": 720.3482666015625, "epoch": 0.47046523784631467, "grad_norm": 0.3955110013484955, "kl": 0.60546875, "learning_rate": 6.73254563912336e-07, "loss": 0.0243, "reward": 1.08370541036129, "reward_std": 0.08620391273871064, "rewards/accuracy_reward": 0.09821429150179029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 1575 }, { "completion_length": 617.2700958251953, "epoch": 0.47076394593383614, "grad_norm": 0.649599552154541, "kl": 0.8974609375, "learning_rate": 6.728030846058052e-07, "loss": 0.0358, "reward": 1.1372767984867096, "reward_std": 0.15877286717295647, "rewards/accuracy_reward": 0.14955357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1576 }, { "completion_length": 640.8817367553711, "epoch": 0.4710626540213576, "grad_norm": 0.6803287863731384, "kl": 0.8310546875, "learning_rate": 6.723514717015542e-07, "loss": 0.0333, "reward": 1.1289062798023224, "reward_std": 0.09929575771093369, "rewards/accuracy_reward": 0.14508928847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 1577 }, { "completion_length": 564.7544860839844, "epoch": 0.4713613621088791, "grad_norm": 0.6114216446876526, "kl": 0.523193359375, "learning_rate": 6.718997256908938e-07, "loss": 0.021, "reward": 1.1104911267757416, "reward_std": 0.12628401443362236, "rewards/accuracy_reward": 0.12276786402799189, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1578 }, { "completion_length": 675.6495819091797, "epoch": 0.47166007019640055, "grad_norm": 1.424958348274231, "kl": 1.0048828125, "learning_rate": 6.714478470652792e-07, "loss": 0.0402, "reward": 1.023995578289032, "reward_std": 0.08203303068876266, "rewards/accuracy_reward": 0.04910714412108064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.974888414144516, "step": 1579 }, { "completion_length": 572.6540374755859, "epoch": 0.471958778283922, "grad_norm": 0.7296261191368103, "kl": 0.59912109375, "learning_rate": 6.709958363163104e-07, "loss": 0.024, "reward": 1.1752232611179352, "reward_std": 0.11910732463002205, "rewards/accuracy_reward": 0.18303572572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 1580 }, { "completion_length": 587.1183319091797, "epoch": 0.4722574863714435, "grad_norm": 0.9964141249656677, "kl": 0.808837890625, "learning_rate": 6.705436939357304e-07, "loss": 0.0323, "reward": 1.1104911267757416, "reward_std": 0.1709994487464428, "rewards/accuracy_reward": 0.12723214668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 1581 }, { "completion_length": 600.9040374755859, "epoch": 0.47255619445896496, "grad_norm": 0.7930851578712463, "kl": 0.436767578125, "learning_rate": 6.700914204154258e-07, "loss": 0.0174, "reward": 1.1802456080913544, "reward_std": 0.1243143449537456, "rewards/accuracy_reward": 0.1875000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 1582 }, { "completion_length": 622.4710083007812, "epoch": 0.47285490254648643, "grad_norm": 0.6234930157661438, "kl": 0.7333984375, "learning_rate": 6.696390162474261e-07, "loss": 0.0294, "reward": 1.2181920409202576, "reward_std": 0.15397686697542667, "rewards/accuracy_reward": 0.23437500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 1583 }, { "completion_length": 636.6518096923828, "epoch": 0.4731536106340079, "grad_norm": 0.33287447690963745, "kl": 0.4326171875, "learning_rate": 6.691864819239028e-07, "loss": 0.0173, "reward": 1.1662946939468384, "reward_std": 0.18318082764744759, "rewards/accuracy_reward": 0.17410715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 1584 }, { "completion_length": 675.2143402099609, "epoch": 0.4734523187215294, "grad_norm": 0.6881682872772217, "kl": 0.52978515625, "learning_rate": 6.687338179371686e-07, "loss": 0.0212, "reward": 1.2304687798023224, "reward_std": 0.15792496502399445, "rewards/accuracy_reward": 0.2477678693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 1585 }, { "completion_length": 552.8214569091797, "epoch": 0.47375102680905085, "grad_norm": 0.29909971356391907, "kl": 0.24560546875, "learning_rate": 6.682810247796776e-07, "loss": 0.0098, "reward": 1.2728795111179352, "reward_std": 0.19104483351111412, "rewards/accuracy_reward": 0.2812500111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916294813156128, "step": 1586 }, { "completion_length": 595.3013763427734, "epoch": 0.4740497348965723, "grad_norm": 0.23003076016902924, "kl": 0.30615234375, "learning_rate": 6.678281029440243e-07, "loss": 0.0123, "reward": 1.137276828289032, "reward_std": 0.10671233013272285, "rewards/accuracy_reward": 0.14508929569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875149011612, "step": 1587 }, { "completion_length": 613.0379638671875, "epoch": 0.4743484429840938, "grad_norm": 0.9841751456260681, "kl": 0.279541015625, "learning_rate": 6.673750529229437e-07, "loss": 0.0112, "reward": 1.209263414144516, "reward_std": 0.15326758846640587, "rewards/accuracy_reward": 0.2209821566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 1588 }, { "completion_length": 634.1116333007812, "epoch": 0.47464715107161526, "grad_norm": 1.3412662744522095, "kl": 0.42919921875, "learning_rate": 6.669218752093093e-07, "loss": 0.0172, "reward": 1.135044664144516, "reward_std": 0.09963824693113565, "rewards/accuracy_reward": 0.15625000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946790456772, "step": 1589 }, { "completion_length": 633.0558319091797, "epoch": 0.47494585915913673, "grad_norm": 0.2072514295578003, "kl": 0.406494140625, "learning_rate": 6.664685702961344e-07, "loss": 0.0163, "reward": 1.0820312947034836, "reward_std": 0.14413230679929256, "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 1590 }, { "completion_length": 639.4576263427734, "epoch": 0.4752445672466582, "grad_norm": 0.6289888620376587, "kl": 0.58447265625, "learning_rate": 6.6601513867657e-07, "loss": 0.0234, "reward": 1.1858259737491608, "reward_std": 0.12955747777596116, "rewards/accuracy_reward": 0.2008928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 1591 }, { "completion_length": 584.7745819091797, "epoch": 0.47554327533417967, "grad_norm": 0.6325693726539612, "kl": 0.5184326171875, "learning_rate": 6.655615808439055e-07, "loss": 0.0207, "reward": 1.209821492433548, "reward_std": 0.1563247460871935, "rewards/accuracy_reward": 0.2254464402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 1592 }, { "completion_length": 574.7902069091797, "epoch": 0.47584198342170114, "grad_norm": 0.24569383263587952, "kl": 0.347900390625, "learning_rate": 6.651078972915672e-07, "loss": 0.0139, "reward": 1.1997768580913544, "reward_std": 0.1610939707607031, "rewards/accuracy_reward": 0.21205358137376606, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1593 }, { "completion_length": 693.841552734375, "epoch": 0.4761406915092226, "grad_norm": 0.5067805647850037, "kl": 0.7218017578125, "learning_rate": 6.646540885131185e-07, "loss": 0.0289, "reward": 1.1238840073347092, "reward_std": 0.15108967572450638, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589477300644, "step": 1594 }, { "completion_length": 631.5045013427734, "epoch": 0.4764393995967441, "grad_norm": 0.552502453327179, "kl": 0.68115234375, "learning_rate": 6.642001550022589e-07, "loss": 0.0272, "reward": 1.1149554252624512, "reward_std": 0.17706714756786823, "rewards/accuracy_reward": 0.1272321450524032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1595 }, { "completion_length": 697.435302734375, "epoch": 0.47673810768426556, "grad_norm": 0.7829424738883972, "kl": 0.62890625, "learning_rate": 6.637460972528234e-07, "loss": 0.0252, "reward": 1.1501116454601288, "reward_std": 0.16195371374487877, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973767757416, "step": 1596 }, { "completion_length": 668.5379791259766, "epoch": 0.477036815771787, "grad_norm": 1.112359881401062, "kl": 0.6455078125, "learning_rate": 6.632919157587825e-07, "loss": 0.0259, "reward": 1.1272321939468384, "reward_std": 0.09974666126072407, "rewards/accuracy_reward": 0.13392858183942735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 1597 }, { "completion_length": 635.5022583007812, "epoch": 0.4773355238593085, "grad_norm": 0.5097032189369202, "kl": 0.9716796875, "learning_rate": 6.628376110142407e-07, "loss": 0.0389, "reward": 1.2126116454601288, "reward_std": 0.24205591902136803, "rewards/accuracy_reward": 0.238839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723618745804, "step": 1598 }, { "completion_length": 755.2745819091797, "epoch": 0.47763423194682997, "grad_norm": 1.2599409818649292, "kl": 1.5439453125, "learning_rate": 6.623831835134377e-07, "loss": 0.0618, "reward": 1.0530134588479996, "reward_std": 0.18347032740712166, "rewards/accuracy_reward": 0.08928572060540318, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9637277275323868, "step": 1599 }, { "completion_length": 668.3214569091797, "epoch": 0.47793294003435144, "grad_norm": 0.2989734709262848, "kl": 0.48095703125, "learning_rate": 6.619286337507457e-07, "loss": 0.0193, "reward": 1.1238839626312256, "reward_std": 0.17462840862572193, "rewards/accuracy_reward": 0.13839286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911267757416, "step": 1600 }, { "completion_length": 650.2187805175781, "epoch": 0.4782316481218729, "grad_norm": 0.928888738155365, "kl": 0.564453125, "learning_rate": 6.614739622206704e-07, "loss": 0.0226, "reward": 1.1244420111179352, "reward_std": 0.11547192186117172, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455484867096, "step": 1601 }, { "completion_length": 633.6763610839844, "epoch": 0.4785303562093944, "grad_norm": 0.25255903601646423, "kl": 0.3349609375, "learning_rate": 6.610191694178499e-07, "loss": 0.0134, "reward": 1.098214328289032, "reward_std": 0.08723372081294656, "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 1602 }, { "completion_length": 677.0491333007812, "epoch": 0.47882906429691585, "grad_norm": 0.3900318443775177, "kl": 0.41015625, "learning_rate": 6.605642558370539e-07, "loss": 0.0164, "reward": 1.1037946939468384, "reward_std": 0.12871775217354298, "rewards/accuracy_reward": 0.11383928824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 1603 }, { "completion_length": 748.6093902587891, "epoch": 0.4791277723844373, "grad_norm": 0.6014900207519531, "kl": 0.8232421875, "learning_rate": 6.601092219731842e-07, "loss": 0.0329, "reward": 1.0312500298023224, "reward_std": 0.11777269653975964, "rewards/accuracy_reward": 0.055803573690354824, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464626312256, "step": 1604 }, { "completion_length": 661.1138763427734, "epoch": 0.4794264804719588, "grad_norm": 1.8760234117507935, "kl": 0.26904296875, "learning_rate": 6.596540683212728e-07, "loss": 0.0107, "reward": 1.1177455484867096, "reward_std": 0.11904658935964108, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134439468384, "step": 1605 }, { "completion_length": 663.225471496582, "epoch": 0.47972518855948026, "grad_norm": 0.739327073097229, "kl": 0.35205078125, "learning_rate": 6.591987953764824e-07, "loss": 0.0141, "reward": 1.1707589626312256, "reward_std": 0.09678955283015966, "rewards/accuracy_reward": 0.1875000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 1606 }, { "completion_length": 686.1205596923828, "epoch": 0.48002389664700174, "grad_norm": 1.0096826553344727, "kl": 0.62841796875, "learning_rate": 6.587434036341051e-07, "loss": 0.0252, "reward": 1.1852678954601288, "reward_std": 0.1980300359427929, "rewards/accuracy_reward": 0.2142857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.970982164144516, "step": 1607 }, { "completion_length": 741.1027069091797, "epoch": 0.4803226047345232, "grad_norm": 0.932421863079071, "kl": 0.627197265625, "learning_rate": 6.582878935895627e-07, "loss": 0.0251, "reward": 1.0954241752624512, "reward_std": 0.19413290545344353, "rewards/accuracy_reward": 0.12276786286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562798023224, "step": 1608 }, { "completion_length": 601.7969207763672, "epoch": 0.4806213128220447, "grad_norm": 0.37690678238868713, "kl": 0.3388671875, "learning_rate": 6.578322657384055e-07, "loss": 0.0136, "reward": 1.2075893580913544, "reward_std": 0.1721203587949276, "rewards/accuracy_reward": 0.2165178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1609 }, { "completion_length": 626.8817291259766, "epoch": 0.48092002090956615, "grad_norm": 0.68938809633255, "kl": 0.35546875, "learning_rate": 6.573765205763118e-07, "loss": 0.0142, "reward": 1.0948661267757416, "reward_std": 0.14758562482893467, "rewards/accuracy_reward": 0.10714286100119352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1610 }, { "completion_length": 607.6473541259766, "epoch": 0.4812187289970876, "grad_norm": 0.476176381111145, "kl": 0.295166015625, "learning_rate": 6.569206585990878e-07, "loss": 0.0118, "reward": 1.2103795111179352, "reward_std": 0.1602681539952755, "rewards/accuracy_reward": 0.2321428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 1611 }, { "completion_length": 736.6205749511719, "epoch": 0.4815174370846091, "grad_norm": 0.6524980664253235, "kl": 0.545166015625, "learning_rate": 6.564646803026666e-07, "loss": 0.0218, "reward": 1.1802456080913544, "reward_std": 0.16862563136965036, "rewards/accuracy_reward": 0.1986607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 1612 }, { "completion_length": 695.9152069091797, "epoch": 0.48181614517213056, "grad_norm": 0.39930176734924316, "kl": 0.64208984375, "learning_rate": 6.560085861831078e-07, "loss": 0.0257, "reward": 1.1774553954601288, "reward_std": 0.21092582866549492, "rewards/accuracy_reward": 0.19642857951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 1613 }, { "completion_length": 688.6629791259766, "epoch": 0.48211485325965203, "grad_norm": 0.5333260297775269, "kl": 0.5162353515625, "learning_rate": 6.555523767365973e-07, "loss": 0.0206, "reward": 1.1953125447034836, "reward_std": 0.15421756356954575, "rewards/accuracy_reward": 0.21651786612346768, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946939468384, "step": 1614 }, { "completion_length": 654.9018249511719, "epoch": 0.48241356134717345, "grad_norm": 0.5577230453491211, "kl": 0.63525390625, "learning_rate": 6.55096052459446e-07, "loss": 0.0254, "reward": 1.1071428954601288, "reward_std": 0.17573334462940693, "rewards/accuracy_reward": 0.12276786053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 1615 }, { "completion_length": 664.0692291259766, "epoch": 0.4827122694346949, "grad_norm": 0.4164632558822632, "kl": 0.7060546875, "learning_rate": 6.546396138480904e-07, "loss": 0.0283, "reward": 1.125558078289032, "reward_std": 0.18004146963357925, "rewards/accuracy_reward": 0.14285715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 1616 }, { "completion_length": 629.7388610839844, "epoch": 0.4830109775222164, "grad_norm": 0.5887446403503418, "kl": 0.4111328125, "learning_rate": 6.541830613990904e-07, "loss": 0.0164, "reward": 1.1897321939468384, "reward_std": 0.09371955366805196, "rewards/accuracy_reward": 0.19866072246804833, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1617 }, { "completion_length": 662.4844055175781, "epoch": 0.48330968560973786, "grad_norm": 0.6230341196060181, "kl": 0.395263671875, "learning_rate": 6.53726395609131e-07, "loss": 0.0158, "reward": 1.1495536267757416, "reward_std": 0.10481368191540241, "rewards/accuracy_reward": 0.16741071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 1618 }, { "completion_length": 722.5736846923828, "epoch": 0.48360839369725933, "grad_norm": 0.5123399496078491, "kl": 0.8203125, "learning_rate": 6.532696169750192e-07, "loss": 0.0328, "reward": 1.2220982611179352, "reward_std": 0.19953331351280212, "rewards/accuracy_reward": 0.24776787124574184, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9743303954601288, "step": 1619 }, { "completion_length": 643.5714569091797, "epoch": 0.4839071017847808, "grad_norm": 0.5136784315109253, "kl": 0.591796875, "learning_rate": 6.528127259936856e-07, "loss": 0.0236, "reward": 1.1768973767757416, "reward_std": 0.21658339584246278, "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 1620 }, { "completion_length": 677.9464569091797, "epoch": 0.4842058098723023, "grad_norm": 1.1173495054244995, "kl": 0.91357421875, "learning_rate": 6.52355723162183e-07, "loss": 0.0367, "reward": 1.0976563096046448, "reward_std": 0.16242489963769913, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 1621 }, { "completion_length": 744.1964569091797, "epoch": 0.48450451795982374, "grad_norm": 1.3767096996307373, "kl": 1.15185546875, "learning_rate": 6.518986089776854e-07, "loss": 0.0461, "reward": 1.1936384439468384, "reward_std": 0.19931641593575478, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.97042416036129, "step": 1622 }, { "completion_length": 618.5178909301758, "epoch": 0.4848032260473452, "grad_norm": 0.8467552065849304, "kl": 0.986328125, "learning_rate": 6.514413839374886e-07, "loss": 0.0394, "reward": 1.2806920111179352, "reward_std": 0.191010020673275, "rewards/accuracy_reward": 0.3035714440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.977120578289032, "step": 1623 }, { "completion_length": 584.5647583007812, "epoch": 0.4851019341348667, "grad_norm": 0.3006381392478943, "kl": 0.358154296875, "learning_rate": 6.509840485390081e-07, "loss": 0.0143, "reward": 1.1986607909202576, "reward_std": 0.18224995583295822, "rewards/accuracy_reward": 0.2098214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1624 }, { "completion_length": 700.2768249511719, "epoch": 0.48540064222238816, "grad_norm": 0.5258913040161133, "kl": 0.787109375, "learning_rate": 6.505266032797805e-07, "loss": 0.0315, "reward": 1.0273438096046448, "reward_std": 0.18082240596413612, "rewards/accuracy_reward": 0.055803573690354824, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9715402126312256, "step": 1625 }, { "completion_length": 587.7879638671875, "epoch": 0.4856993503099096, "grad_norm": 0.3487643301486969, "kl": 0.2744140625, "learning_rate": 6.500690486574611e-07, "loss": 0.011, "reward": 1.2327009439468384, "reward_std": 0.14128456450998783, "rewards/accuracy_reward": 0.24330358393490314, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 1626 }, { "completion_length": 703.1986999511719, "epoch": 0.4859980583974311, "grad_norm": 1.0126630067825317, "kl": 0.40185546875, "learning_rate": 6.496113851698247e-07, "loss": 0.0161, "reward": 1.1049107611179352, "reward_std": 0.15911360830068588, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 1627 }, { "completion_length": 722.2835235595703, "epoch": 0.48629676648495257, "grad_norm": 0.9670879244804382, "kl": 0.41436767578125, "learning_rate": 6.49153613314764e-07, "loss": 0.0166, "reward": 1.1300223767757416, "reward_std": 0.1411410914734006, "rewards/accuracy_reward": 0.14508929220028222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 1628 }, { "completion_length": 579.3058166503906, "epoch": 0.48659547457247404, "grad_norm": 0.3133770227432251, "kl": 0.1358642578125, "learning_rate": 6.486957335902904e-07, "loss": 0.0054, "reward": 1.2550223767757416, "reward_std": 0.20715874433517456, "rewards/accuracy_reward": 0.2589285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937649011612, "step": 1629 }, { "completion_length": 625.3594055175781, "epoch": 0.4868941826599955, "grad_norm": 0.5605428814888, "kl": 0.45166015625, "learning_rate": 6.482377464945316e-07, "loss": 0.018, "reward": 1.1149553954601288, "reward_std": 0.16124228574335575, "rewards/accuracy_reward": 0.13169643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 1630 }, { "completion_length": 607.6719055175781, "epoch": 0.487192890747517, "grad_norm": 0.6449914574623108, "kl": 0.67822265625, "learning_rate": 6.477796525257331e-07, "loss": 0.0271, "reward": 1.1233259439468384, "reward_std": 0.16773954965174198, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.976004496216774, "step": 1631 }, { "completion_length": 692.2678833007812, "epoch": 0.48749159883503845, "grad_norm": 0.5904121994972229, "kl": 0.7490234375, "learning_rate": 6.473214521822561e-07, "loss": 0.0299, "reward": 1.137276828289032, "reward_std": 0.13282499834895134, "rewards/accuracy_reward": 0.15848214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946939468384, "step": 1632 }, { "completion_length": 653.1942291259766, "epoch": 0.4877903069225599, "grad_norm": 0.3461383283138275, "kl": 0.5186767578125, "learning_rate": 6.468631459625775e-07, "loss": 0.0207, "reward": 1.1049107909202576, "reward_std": 0.15603209659457207, "rewards/accuracy_reward": 0.11830358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 1633 }, { "completion_length": 656.6763763427734, "epoch": 0.4880890150100814, "grad_norm": 0.5593435764312744, "kl": 0.508056640625, "learning_rate": 6.464047343652898e-07, "loss": 0.0203, "reward": 1.1891741752624512, "reward_std": 0.18276648968458176, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 1634 }, { "completion_length": 555.0982437133789, "epoch": 0.48838772309760287, "grad_norm": 0.5200849175453186, "kl": 0.345703125, "learning_rate": 6.459462178890998e-07, "loss": 0.0138, "reward": 1.1411831080913544, "reward_std": 0.1513664461672306, "rewards/accuracy_reward": 0.14955357927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 1635 }, { "completion_length": 619.4665374755859, "epoch": 0.48868643118512434, "grad_norm": 1.0804359912872314, "kl": 0.930908203125, "learning_rate": 6.454875970328285e-07, "loss": 0.0373, "reward": 1.2460938096046448, "reward_std": 0.17862926051020622, "rewards/accuracy_reward": 0.2589285857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1636 }, { "completion_length": 645.9241409301758, "epoch": 0.4889851392726458, "grad_norm": 0.8586986660957336, "kl": 0.73095703125, "learning_rate": 6.450288722954103e-07, "loss": 0.0293, "reward": 1.162946492433548, "reward_std": 0.14292002003639936, "rewards/accuracy_reward": 0.1763392984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071939468384, "step": 1637 }, { "completion_length": 605.5670013427734, "epoch": 0.4892838473601673, "grad_norm": 0.550063967704773, "kl": 0.71240234375, "learning_rate": 6.44570044175893e-07, "loss": 0.0285, "reward": 1.127790242433548, "reward_std": 0.14710952155292034, "rewards/accuracy_reward": 0.14285714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 1638 }, { "completion_length": 670.950927734375, "epoch": 0.48958255544768875, "grad_norm": 0.6442111134529114, "kl": 1.06982421875, "learning_rate": 6.441111131734364e-07, "loss": 0.0428, "reward": 1.1110491752624512, "reward_std": 0.15659465454518795, "rewards/accuracy_reward": 0.13616071874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884290456772, "step": 1639 }, { "completion_length": 647.0625305175781, "epoch": 0.4898812635352102, "grad_norm": 1.160496711730957, "kl": 0.533203125, "learning_rate": 6.436520797873128e-07, "loss": 0.0213, "reward": 1.127790242433548, "reward_std": 0.17356102354824543, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871651977300644, "step": 1640 }, { "completion_length": 691.5379943847656, "epoch": 0.4901799716227317, "grad_norm": 0.7385382056236267, "kl": 0.76171875, "learning_rate": 6.431929445169051e-07, "loss": 0.0305, "reward": 1.1875000596046448, "reward_std": 0.15675761550664902, "rewards/accuracy_reward": 0.212053582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464626312256, "step": 1641 }, { "completion_length": 650.0491485595703, "epoch": 0.49047867971025316, "grad_norm": 0.4041045606136322, "kl": 0.55322265625, "learning_rate": 6.427337078617076e-07, "loss": 0.0221, "reward": 1.194196492433548, "reward_std": 0.14351879991590977, "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 1642 }, { "completion_length": 655.4352874755859, "epoch": 0.49077738779777463, "grad_norm": 0.6450632810592651, "kl": 0.45166015625, "learning_rate": 6.422743703213248e-07, "loss": 0.0181, "reward": 1.2248884439468384, "reward_std": 0.18250145576894283, "rewards/accuracy_reward": 0.23437500977888703, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 1643 }, { "completion_length": 684.9799499511719, "epoch": 0.4910760958852961, "grad_norm": 1.0506137609481812, "kl": 1.1533203125, "learning_rate": 6.41814932395471e-07, "loss": 0.0461, "reward": 1.1406250596046448, "reward_std": 0.21534224972128868, "rewards/accuracy_reward": 0.18303572246804833, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.957589328289032, "step": 1644 }, { "completion_length": 683.2433319091797, "epoch": 0.4913748039728176, "grad_norm": 0.836383581161499, "kl": 0.41796875, "learning_rate": 6.413553945839696e-07, "loss": 0.0167, "reward": 1.037388414144516, "reward_std": 0.13895923644304276, "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 1645 }, { "completion_length": 559.5245895385742, "epoch": 0.49167351206033905, "grad_norm": 0.8196337223052979, "kl": 0.257568359375, "learning_rate": 6.408957573867527e-07, "loss": 0.0103, "reward": 1.1261161118745804, "reward_std": 0.1072020661085844, "rewards/accuracy_reward": 0.14285714528523386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 1646 }, { "completion_length": 592.2500305175781, "epoch": 0.4919722201478605, "grad_norm": 0.4670889973640442, "kl": 0.4434814453125, "learning_rate": 6.404360213038605e-07, "loss": 0.0177, "reward": 1.2087053656578064, "reward_std": 0.09584835264831781, "rewards/accuracy_reward": 0.2232142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 1647 }, { "completion_length": 633.0647735595703, "epoch": 0.492270928235382, "grad_norm": 0.4044514000415802, "kl": 0.326171875, "learning_rate": 6.399761868354409e-07, "loss": 0.013, "reward": 1.209821492433548, "reward_std": 0.15963290259242058, "rewards/accuracy_reward": 0.22544643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 1648 }, { "completion_length": 631.5536041259766, "epoch": 0.49256963632290346, "grad_norm": 0.28895848989486694, "kl": 0.326171875, "learning_rate": 6.395162544817484e-07, "loss": 0.013, "reward": 1.1908482611179352, "reward_std": 0.14784935489296913, "rewards/accuracy_reward": 0.2053571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 1649 }, { "completion_length": 711.8036041259766, "epoch": 0.49286834441042493, "grad_norm": 0.4421224296092987, "kl": 0.64599609375, "learning_rate": 6.390562247431449e-07, "loss": 0.0258, "reward": 1.1132813394069672, "reward_std": 0.17831257171928883, "rewards/accuracy_reward": 0.1361607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.977120578289032, "step": 1650 }, { "completion_length": 624.7678833007812, "epoch": 0.4931670524979464, "grad_norm": 0.32341018319129944, "kl": 0.47296142578125, "learning_rate": 6.385960981200969e-07, "loss": 0.0189, "reward": 1.1439732909202576, "reward_std": 0.15928757935762405, "rewards/accuracy_reward": 0.15401786752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 1651 }, { "completion_length": 609.8504791259766, "epoch": 0.49346576058546787, "grad_norm": 1.0450026988983154, "kl": 0.473876953125, "learning_rate": 6.381358751131778e-07, "loss": 0.019, "reward": 1.233816996216774, "reward_std": 0.16411610785871744, "rewards/accuracy_reward": 0.2566964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9771205633878708, "step": 1652 }, { "completion_length": 666.5758972167969, "epoch": 0.49376446867298934, "grad_norm": 0.3980810344219208, "kl": 0.402099609375, "learning_rate": 6.376755562230646e-07, "loss": 0.0161, "reward": 1.1690848469734192, "reward_std": 0.2597667723894119, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 1653 }, { "completion_length": 724.4330749511719, "epoch": 0.4940631767605108, "grad_norm": 0.7514843344688416, "kl": 0.49462890625, "learning_rate": 6.372151419505397e-07, "loss": 0.0198, "reward": 1.0675223767757416, "reward_std": 0.12414667941629887, "rewards/accuracy_reward": 0.08258928940631449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 1654 }, { "completion_length": 542.7433242797852, "epoch": 0.4943618848480323, "grad_norm": 0.6487753391265869, "kl": 0.2703857421875, "learning_rate": 6.367546327964882e-07, "loss": 0.0108, "reward": 1.1049107909202576, "reward_std": 0.1020752964541316, "rewards/accuracy_reward": 0.11607143515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1655 }, { "completion_length": 679.2009124755859, "epoch": 0.49466059293555376, "grad_norm": 0.4509682059288025, "kl": 0.4093017578125, "learning_rate": 6.362940292618989e-07, "loss": 0.0164, "reward": 1.129464328289032, "reward_std": 0.11044390872120857, "rewards/accuracy_reward": 0.14285715040750802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 1656 }, { "completion_length": 635.9643249511719, "epoch": 0.4949593010230752, "grad_norm": 0.2553585171699524, "kl": 0.29443359375, "learning_rate": 6.358333318478637e-07, "loss": 0.0118, "reward": 1.1512277126312256, "reward_std": 0.12425514031201601, "rewards/accuracy_reward": 0.16294644074514508, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 1657 }, { "completion_length": 552.5647659301758, "epoch": 0.49525800911059664, "grad_norm": 0.45961064100265503, "kl": 0.520751953125, "learning_rate": 6.35372541055576e-07, "loss": 0.0208, "reward": 1.2321428954601288, "reward_std": 0.15481351502239704, "rewards/accuracy_reward": 0.247767873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 1658 }, { "completion_length": 691.5982513427734, "epoch": 0.4955567171981181, "grad_norm": 0.4255906641483307, "kl": 0.736083984375, "learning_rate": 6.349116573863309e-07, "loss": 0.0295, "reward": 1.084821492433548, "reward_std": 0.19356642849743366, "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776785969734192, "step": 1659 }, { "completion_length": 610.8460083007812, "epoch": 0.4958554252856396, "grad_norm": 0.40814095735549927, "kl": 0.70263671875, "learning_rate": 6.344506813415249e-07, "loss": 0.0281, "reward": 1.1573661267757416, "reward_std": 0.1377621553838253, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 1660 }, { "completion_length": 761.9977874755859, "epoch": 0.49615413337316105, "grad_norm": 0.5725954174995422, "kl": 0.80224609375, "learning_rate": 6.339896134226546e-07, "loss": 0.0321, "reward": 1.0792411267757416, "reward_std": 0.1443611215800047, "rewards/accuracy_reward": 0.11383929033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9654018133878708, "step": 1661 }, { "completion_length": 763.9553833007812, "epoch": 0.4964528414606825, "grad_norm": 0.4486847519874573, "kl": 0.9619140625, "learning_rate": 6.335284541313168e-07, "loss": 0.0385, "reward": 1.1311384439468384, "reward_std": 0.17642296478152275, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9704241454601288, "step": 1662 }, { "completion_length": 655.8125305175781, "epoch": 0.496751549548204, "grad_norm": 0.46560636162757874, "kl": 0.485595703125, "learning_rate": 6.330672039692077e-07, "loss": 0.0194, "reward": 1.1389509439468384, "reward_std": 0.1793962400406599, "rewards/accuracy_reward": 0.15625000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 1663 }, { "completion_length": 722.0156402587891, "epoch": 0.49705025763572547, "grad_norm": 1.0441583395004272, "kl": 1.0166015625, "learning_rate": 6.326058634381219e-07, "loss": 0.0407, "reward": 1.0937500298023224, "reward_std": 0.15471546910703182, "rewards/accuracy_reward": 0.11160714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98214291036129, "step": 1664 }, { "completion_length": 642.1451263427734, "epoch": 0.49734896572324694, "grad_norm": 0.7516884207725525, "kl": 0.54248046875, "learning_rate": 6.321444330399531e-07, "loss": 0.0218, "reward": 1.151785746216774, "reward_std": 0.19103392772376537, "rewards/accuracy_reward": 0.17410715413279831, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 1665 }, { "completion_length": 749.2678985595703, "epoch": 0.4976476738107684, "grad_norm": 0.45047545433044434, "kl": 0.8623046875, "learning_rate": 6.316829132766921e-07, "loss": 0.0345, "reward": 1.1227679252624512, "reward_std": 0.2202421836555004, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9709821790456772, "step": 1666 }, { "completion_length": 703.6696624755859, "epoch": 0.4979463818982899, "grad_norm": 0.45875880122184753, "kl": 0.639892578125, "learning_rate": 6.312213046504273e-07, "loss": 0.0256, "reward": 1.2243303954601288, "reward_std": 0.20392993837594986, "rewards/accuracy_reward": 0.24553572945296764, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946939468384, "step": 1667 }, { "completion_length": 698.341552734375, "epoch": 0.49824508998581135, "grad_norm": 0.5523280501365662, "kl": 0.64501953125, "learning_rate": 6.307596076633434e-07, "loss": 0.0258, "reward": 1.0898438096046448, "reward_std": 0.16016435250639915, "rewards/accuracy_reward": 0.10491072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 1668 }, { "completion_length": 620.8951110839844, "epoch": 0.4985437980733328, "grad_norm": 0.4195149838924408, "kl": 0.46728515625, "learning_rate": 6.302978228177221e-07, "loss": 0.0187, "reward": 1.1328125596046448, "reward_std": 0.1458871942013502, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1669 }, { "completion_length": 706.1852874755859, "epoch": 0.4988425061608543, "grad_norm": 0.6972272396087646, "kl": 0.71337890625, "learning_rate": 6.298359506159392e-07, "loss": 0.0285, "reward": 1.1127232760190964, "reward_std": 0.1506785824894905, "rewards/accuracy_reward": 0.13839286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.97433041036129, "step": 1670 }, { "completion_length": 646.7723541259766, "epoch": 0.49914121424837576, "grad_norm": 0.8579990267753601, "kl": 0.48046875, "learning_rate": 6.293739915604668e-07, "loss": 0.0192, "reward": 1.1054688096046448, "reward_std": 0.13228006940335035, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 1671 }, { "completion_length": 635.7991333007812, "epoch": 0.49943992233589724, "grad_norm": 0.3027392625808716, "kl": 0.425537109375, "learning_rate": 6.289119461538712e-07, "loss": 0.017, "reward": 1.109933078289032, "reward_std": 0.18937743082642555, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 1672 }, { "completion_length": 672.7701110839844, "epoch": 0.4997386304234187, "grad_norm": 0.6069291830062866, "kl": 0.623291015625, "learning_rate": 6.284498148988123e-07, "loss": 0.0249, "reward": 1.089285746216774, "reward_std": 0.11023499816656113, "rewards/accuracy_reward": 0.10937500488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107313156128, "step": 1673 }, { "completion_length": 641.6674194335938, "epoch": 0.5000373385109402, "grad_norm": 0.35966309905052185, "kl": 0.3681640625, "learning_rate": 6.279875982980439e-07, "loss": 0.0147, "reward": 1.1997768580913544, "reward_std": 0.21056389063596725, "rewards/accuracy_reward": 0.2165178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 1674 }, { "completion_length": 763.0379791259766, "epoch": 0.5003360465984616, "grad_norm": 0.8048264384269714, "kl": 0.6513671875, "learning_rate": 6.275252968544119e-07, "loss": 0.0261, "reward": 1.1300223767757416, "reward_std": 0.18451089784502983, "rewards/accuracy_reward": 0.16294643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9670759290456772, "step": 1675 }, { "completion_length": 677.8236846923828, "epoch": 0.5006347546859832, "grad_norm": 0.985861599445343, "kl": 0.50732421875, "learning_rate": 6.270629110708554e-07, "loss": 0.0203, "reward": 1.1679687798023224, "reward_std": 0.14243588782846928, "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.976004496216774, "step": 1676 }, { "completion_length": 598.6094055175781, "epoch": 0.5009334627735046, "grad_norm": 0.3149518668651581, "kl": 0.1749267578125, "learning_rate": 6.266004414504044e-07, "loss": 0.007, "reward": 1.2165179252624512, "reward_std": 0.1881512701511383, "rewards/accuracy_reward": 0.2187500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9977678656578064, "step": 1677 }, { "completion_length": 692.6875457763672, "epoch": 0.5012321708610261, "grad_norm": 0.53229820728302, "kl": 0.3477783203125, "learning_rate": 6.261378884961811e-07, "loss": 0.0139, "reward": 1.1266741752624512, "reward_std": 0.1355691272765398, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 1678 }, { "completion_length": 554.4866256713867, "epoch": 0.5015308789485475, "grad_norm": 0.745934247970581, "kl": 0.34234619140625, "learning_rate": 6.256752527113973e-07, "loss": 0.0137, "reward": 1.0731026828289032, "reward_std": 0.12947128154337406, "rewards/accuracy_reward": 0.0825892873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 1679 }, { "completion_length": 724.8995819091797, "epoch": 0.501829587036069, "grad_norm": 0.2891250252723694, "kl": 0.4091796875, "learning_rate": 6.252125345993555e-07, "loss": 0.0164, "reward": 1.1886161267757416, "reward_std": 0.15818206826224923, "rewards/accuracy_reward": 0.19866072479635477, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 1680 }, { "completion_length": 677.0803833007812, "epoch": 0.5021282951235905, "grad_norm": 0.3848031461238861, "kl": 0.5596923828125, "learning_rate": 6.247497346634475e-07, "loss": 0.0224, "reward": 1.2159598469734192, "reward_std": 0.1394837312400341, "rewards/accuracy_reward": 0.2343750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 1681 }, { "completion_length": 708.5044860839844, "epoch": 0.5024270032111119, "grad_norm": 0.486890971660614, "kl": 0.38623046875, "learning_rate": 6.242868534071547e-07, "loss": 0.0155, "reward": 1.1166295111179352, "reward_std": 0.17737079598009586, "rewards/accuracy_reward": 0.12500000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 1682 }, { "completion_length": 603.9375152587891, "epoch": 0.5027257112986334, "grad_norm": 0.6569130420684814, "kl": 0.2430419921875, "learning_rate": 6.238238913340461e-07, "loss": 0.0097, "reward": 1.195870578289032, "reward_std": 0.16086182557046413, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.992745578289032, "step": 1683 }, { "completion_length": 705.6562652587891, "epoch": 0.5030244193861548, "grad_norm": 0.2159164994955063, "kl": 0.4775390625, "learning_rate": 6.233608489477793e-07, "loss": 0.0191, "reward": 1.215401828289032, "reward_std": 0.14861003495752811, "rewards/accuracy_reward": 0.232142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 1684 }, { "completion_length": 600.5312728881836, "epoch": 0.5033231274736764, "grad_norm": 0.5251451730728149, "kl": 0.54638671875, "learning_rate": 6.228977267520991e-07, "loss": 0.0219, "reward": 1.1657366454601288, "reward_std": 0.20291019417345524, "rewards/accuracy_reward": 0.1830357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 1685 }, { "completion_length": 638.8214721679688, "epoch": 0.5036218355611978, "grad_norm": 0.44569528102874756, "kl": 0.435546875, "learning_rate": 6.224345252508368e-07, "loss": 0.0174, "reward": 1.1389509737491608, "reward_std": 0.17471468076109886, "rewards/accuracy_reward": 0.15625000861473382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 1686 }, { "completion_length": 726.091552734375, "epoch": 0.5039205436487193, "grad_norm": 0.4684152603149414, "kl": 0.503662109375, "learning_rate": 6.219712449479105e-07, "loss": 0.0202, "reward": 1.1116071939468384, "reward_std": 0.14365565218031406, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1687 }, { "completion_length": 547.9576110839844, "epoch": 0.5042192517362407, "grad_norm": 0.4885670244693756, "kl": 0.1617431640625, "learning_rate": 6.215078863473234e-07, "loss": 0.0065, "reward": 1.1439732313156128, "reward_std": 0.12183477357029915, "rewards/accuracy_reward": 0.15178572246804833, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875149011612, "step": 1688 }, { "completion_length": 609.7500152587891, "epoch": 0.5045179598237622, "grad_norm": 0.6716877222061157, "kl": 0.3526611328125, "learning_rate": 6.210444499531647e-07, "loss": 0.0141, "reward": 1.1534598767757416, "reward_std": 0.1320192338898778, "rewards/accuracy_reward": 0.16517857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812649011612, "step": 1689 }, { "completion_length": 644.9598693847656, "epoch": 0.5048166679112837, "grad_norm": 0.5382274389266968, "kl": 0.4345703125, "learning_rate": 6.205809362696076e-07, "loss": 0.0174, "reward": 1.1400669813156128, "reward_std": 0.11956567317247391, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 1690 }, { "completion_length": 691.2299346923828, "epoch": 0.5051153759988052, "grad_norm": 0.8891240954399109, "kl": 0.5, "learning_rate": 6.201173458009093e-07, "loss": 0.02, "reward": 1.1261161118745804, "reward_std": 0.15888206660747528, "rewards/accuracy_reward": 0.14062500861473382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 1691 }, { "completion_length": 676.6317291259766, "epoch": 0.5054140840863266, "grad_norm": 0.3059857189655304, "kl": 0.4130859375, "learning_rate": 6.196536790514112e-07, "loss": 0.0165, "reward": 1.1233259737491608, "reward_std": 0.13991557992994785, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 1692 }, { "completion_length": 726.7589416503906, "epoch": 0.5057127921738481, "grad_norm": 0.8824033737182617, "kl": 0.643798828125, "learning_rate": 6.19189936525537e-07, "loss": 0.0258, "reward": 1.0703125596046448, "reward_std": 0.14162713382393122, "rewards/accuracy_reward": 0.09151786309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946790456772, "step": 1693 }, { "completion_length": 583.2835083007812, "epoch": 0.5060115002613695, "grad_norm": 0.8621372580528259, "kl": 0.625, "learning_rate": 6.187261187277931e-07, "loss": 0.025, "reward": 1.2801340222358704, "reward_std": 0.15843011252582073, "rewards/accuracy_reward": 0.2946428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 1694 }, { "completion_length": 688.700927734375, "epoch": 0.5063102083488911, "grad_norm": 0.7840409874916077, "kl": 0.60205078125, "learning_rate": 6.18262226162768e-07, "loss": 0.0241, "reward": 1.174665242433548, "reward_std": 0.17226405441761017, "rewards/accuracy_reward": 0.18303572572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916295111179352, "step": 1695 }, { "completion_length": 631.6428985595703, "epoch": 0.5066089164364125, "grad_norm": 0.30859819054603577, "kl": 0.4083251953125, "learning_rate": 6.177982593351313e-07, "loss": 0.0163, "reward": 1.1261160969734192, "reward_std": 0.10619966173544526, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 1696 }, { "completion_length": 604.1004943847656, "epoch": 0.506907624523934, "grad_norm": 0.49610304832458496, "kl": 0.462646484375, "learning_rate": 6.173342187496333e-07, "loss": 0.0185, "reward": 1.1875000596046448, "reward_std": 0.13615364767611027, "rewards/accuracy_reward": 0.1964285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714775323868, "step": 1697 }, { "completion_length": 649.7120819091797, "epoch": 0.5072063326114554, "grad_norm": 0.38224557042121887, "kl": 0.5341796875, "learning_rate": 6.168701049111048e-07, "loss": 0.0214, "reward": 1.1562500298023224, "reward_std": 0.20172176510095596, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 1698 }, { "completion_length": 646.4196624755859, "epoch": 0.507505040698977, "grad_norm": 0.2969953119754791, "kl": 0.39208984375, "learning_rate": 6.164059183244562e-07, "loss": 0.0157, "reward": 1.209821492433548, "reward_std": 0.19480478018522263, "rewards/accuracy_reward": 0.2299107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 1699 }, { "completion_length": 714.9777069091797, "epoch": 0.5078037487864984, "grad_norm": 0.46118640899658203, "kl": 1.171875, "learning_rate": 6.159416594946769e-07, "loss": 0.0469, "reward": 1.0440848916769028, "reward_std": 0.15681350976228714, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.97042416036129, "step": 1700 }, { "completion_length": 642.1495666503906, "epoch": 0.5081024568740199, "grad_norm": 0.39375826716423035, "kl": 0.39990234375, "learning_rate": 6.15477328926835e-07, "loss": 0.016, "reward": 1.2126116752624512, "reward_std": 0.1915285438299179, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 1701 }, { "completion_length": 607.7143249511719, "epoch": 0.5084011649615413, "grad_norm": 0.2979508340358734, "kl": 0.310546875, "learning_rate": 6.150129271260768e-07, "loss": 0.0124, "reward": 1.1261160969734192, "reward_std": 0.10154800908640027, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966517984867096, "step": 1702 }, { "completion_length": 618.0156555175781, "epoch": 0.5086998730490628, "grad_norm": 0.440426230430603, "kl": 0.28173828125, "learning_rate": 6.145484545976257e-07, "loss": 0.0113, "reward": 1.198102742433548, "reward_std": 0.08669026382267475, "rewards/accuracy_reward": 0.20089286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.997209832072258, "step": 1703 }, { "completion_length": 649.1027069091797, "epoch": 0.5089985811365842, "grad_norm": 1.030515193939209, "kl": 0.8291015625, "learning_rate": 6.140839118467825e-07, "loss": 0.0331, "reward": 1.094308078289032, "reward_std": 0.14750026911497116, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 1704 }, { "completion_length": 713.8772735595703, "epoch": 0.5092972892241058, "grad_norm": 1.4054499864578247, "kl": 0.611328125, "learning_rate": 6.13619299378924e-07, "loss": 0.0245, "reward": 1.1333706080913544, "reward_std": 0.16641556099057198, "rewards/accuracy_reward": 0.15401786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 1705 }, { "completion_length": 672.5335083007812, "epoch": 0.5095959973116272, "grad_norm": 0.6739136576652527, "kl": 0.5865478515625, "learning_rate": 6.131546176995033e-07, "loss": 0.0235, "reward": 1.0970982611179352, "reward_std": 0.17716419324278831, "rewards/accuracy_reward": 0.11383929033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 1706 }, { "completion_length": 700.4464569091797, "epoch": 0.5098947053991487, "grad_norm": 0.5559543967247009, "kl": 0.666015625, "learning_rate": 6.126898673140483e-07, "loss": 0.0266, "reward": 1.1138393431901932, "reward_std": 0.1574202971532941, "rewards/accuracy_reward": 0.12946429592557251, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 1707 }, { "completion_length": 638.7611846923828, "epoch": 0.5101934134866701, "grad_norm": 0.8141440153121948, "kl": 0.1756591796875, "learning_rate": 6.122250487281621e-07, "loss": 0.007, "reward": 1.139508992433548, "reward_std": 0.11593508441001177, "rewards/accuracy_reward": 0.14285715389996767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966517984867096, "step": 1708 }, { "completion_length": 682.8839569091797, "epoch": 0.5104921215741917, "grad_norm": 1.2668899297714233, "kl": 0.854736328125, "learning_rate": 6.117601624475214e-07, "loss": 0.0342, "reward": 1.2477678954601288, "reward_std": 0.24958008155226707, "rewards/accuracy_reward": 0.2767857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9709821790456772, "step": 1709 }, { "completion_length": 622.1763610839844, "epoch": 0.5107908296617131, "grad_norm": 0.5799228549003601, "kl": 0.337158203125, "learning_rate": 6.11295208977877e-07, "loss": 0.0135, "reward": 1.2756697237491608, "reward_std": 0.1525908624753356, "rewards/accuracy_reward": 0.283482150407508, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875447034836, "step": 1710 }, { "completion_length": 587.0803833007812, "epoch": 0.5110895377492346, "grad_norm": 0.553458571434021, "kl": 0.484130859375, "learning_rate": 6.10830188825053e-07, "loss": 0.0194, "reward": 1.271763414144516, "reward_std": 0.21146350912749767, "rewards/accuracy_reward": 0.2834821464493871, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 1711 }, { "completion_length": 673.1830596923828, "epoch": 0.511388245836756, "grad_norm": 1.4395973682403564, "kl": 0.8955078125, "learning_rate": 6.103651024949454e-07, "loss": 0.0358, "reward": 1.1478794813156128, "reward_std": 0.1704997755587101, "rewards/accuracy_reward": 0.1674107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 1712 }, { "completion_length": 580.8147583007812, "epoch": 0.5116869539242775, "grad_norm": 0.6744803786277771, "kl": 0.65283203125, "learning_rate": 6.098999504935228e-07, "loss": 0.0261, "reward": 1.1238839775323868, "reward_std": 0.10057204961776733, "rewards/accuracy_reward": 0.13392857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1713 }, { "completion_length": 705.4821624755859, "epoch": 0.511985662011799, "grad_norm": 0.4829126298427582, "kl": 0.65380859375, "learning_rate": 6.094347333268251e-07, "loss": 0.0262, "reward": 1.1919643133878708, "reward_std": 0.1813643779605627, "rewards/accuracy_reward": 0.2053571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 1714 }, { "completion_length": 608.9174499511719, "epoch": 0.5122843700993205, "grad_norm": 0.7112649083137512, "kl": 0.390869140625, "learning_rate": 6.089694515009624e-07, "loss": 0.0157, "reward": 1.1718750596046448, "reward_std": 0.19382111728191376, "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1715 }, { "completion_length": 603.2009124755859, "epoch": 0.5125830781868419, "grad_norm": 0.38433220982551575, "kl": 0.814453125, "learning_rate": 6.085041055221161e-07, "loss": 0.0325, "reward": 1.1629464328289032, "reward_std": 0.1570736113935709, "rewards/accuracy_reward": 0.1830357164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107611179352, "step": 1716 }, { "completion_length": 684.3236846923828, "epoch": 0.5128817862743634, "grad_norm": 1.401312232017517, "kl": 0.907470703125, "learning_rate": 6.080386958965374e-07, "loss": 0.0363, "reward": 1.2092634439468384, "reward_std": 0.1256523970514536, "rewards/accuracy_reward": 0.2209821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 1717 }, { "completion_length": 680.2187805175781, "epoch": 0.5131804943618848, "grad_norm": 0.4089798927307129, "kl": 0.419677734375, "learning_rate": 6.075732231305457e-07, "loss": 0.0168, "reward": 1.1378348767757416, "reward_std": 0.12139743939042091, "rewards/accuracy_reward": 0.14955358067527413, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 1718 }, { "completion_length": 658.8705596923828, "epoch": 0.5134792024494064, "grad_norm": 0.5855483412742615, "kl": 0.7154541015625, "learning_rate": 6.071076877305299e-07, "loss": 0.0286, "reward": 1.1763393580913544, "reward_std": 0.1623552218079567, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107611179352, "step": 1719 }, { "completion_length": 666.7076263427734, "epoch": 0.5137779105369278, "grad_norm": 0.4334624409675598, "kl": 0.43994140625, "learning_rate": 6.066420902029472e-07, "loss": 0.0176, "reward": 1.1244420111179352, "reward_std": 0.17688178271055222, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 1720 }, { "completion_length": 688.9933319091797, "epoch": 0.5140766186244493, "grad_norm": 0.5140188932418823, "kl": 0.50390625, "learning_rate": 6.061764310543219e-07, "loss": 0.0202, "reward": 1.1099331080913544, "reward_std": 0.13555711321532726, "rewards/accuracy_reward": 0.12723215389996767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 1721 }, { "completion_length": 674.4911041259766, "epoch": 0.5143753267119707, "grad_norm": 0.6301897168159485, "kl": 0.31591796875, "learning_rate": 6.057107107912453e-07, "loss": 0.0127, "reward": 1.1841518580913544, "reward_std": 0.15767767652869225, "rewards/accuracy_reward": 0.19642857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1722 }, { "completion_length": 648.7210083007812, "epoch": 0.5146740347994921, "grad_norm": 0.528436005115509, "kl": 0.278076171875, "learning_rate": 6.052449299203758e-07, "loss": 0.0111, "reward": 1.1489955484867096, "reward_std": 0.08991318754851818, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 1723 }, { "completion_length": 656.200927734375, "epoch": 0.5149727428870137, "grad_norm": 0.4526270627975464, "kl": 0.41552734375, "learning_rate": 6.047790889484369e-07, "loss": 0.0167, "reward": 1.2639509737491608, "reward_std": 0.2091105543076992, "rewards/accuracy_reward": 0.27678572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1724 }, { "completion_length": 717.1629791259766, "epoch": 0.5152714509745351, "grad_norm": 0.6266877055168152, "kl": 0.677734375, "learning_rate": 6.043131883822185e-07, "loss": 0.0271, "reward": 1.1255580484867096, "reward_std": 0.1844976209104061, "rewards/accuracy_reward": 0.1495535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045111179352, "step": 1725 }, { "completion_length": 654.6763610839844, "epoch": 0.5155701590620566, "grad_norm": 0.5564864873886108, "kl": 0.56396484375, "learning_rate": 6.038472287285741e-07, "loss": 0.0225, "reward": 1.1545759439468384, "reward_std": 0.18617111071944237, "rewards/accuracy_reward": 0.16294643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916295111179352, "step": 1726 }, { "completion_length": 641.8348541259766, "epoch": 0.515868867149578, "grad_norm": 1.1150379180908203, "kl": 0.891845703125, "learning_rate": 6.033812104944227e-07, "loss": 0.0357, "reward": 1.08370541036129, "reward_std": 0.11992630735039711, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9720982313156128, "step": 1727 }, { "completion_length": 657.6629791259766, "epoch": 0.5161675752370996, "grad_norm": 0.4714948236942291, "kl": 0.471435546875, "learning_rate": 6.02915134186746e-07, "loss": 0.0189, "reward": 1.2756697237491608, "reward_std": 0.20285427570343018, "rewards/accuracy_reward": 0.2924107387661934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589477300644, "step": 1728 }, { "completion_length": 643.8660888671875, "epoch": 0.516466283324621, "grad_norm": 0.3078688085079193, "kl": 0.1669921875, "learning_rate": 6.024490003125896e-07, "loss": 0.0067, "reward": 1.1166294813156128, "reward_std": 0.08673432841897011, "rewards/accuracy_reward": 0.1227678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616454601288, "step": 1729 }, { "completion_length": 579.0223541259766, "epoch": 0.5167649914121425, "grad_norm": 0.2956479787826538, "kl": 0.383056640625, "learning_rate": 6.019828093790613e-07, "loss": 0.0153, "reward": 1.3258929252624512, "reward_std": 0.12077471986413002, "rewards/accuracy_reward": 0.3325892984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933036118745804, "step": 1730 }, { "completion_length": 680.5826110839844, "epoch": 0.5170636994996639, "grad_norm": 0.38780030608177185, "kl": 0.4788818359375, "learning_rate": 6.015165618933315e-07, "loss": 0.0192, "reward": 1.1668527126312256, "reward_std": 0.15686978865414858, "rewards/accuracy_reward": 0.17633929336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 1731 }, { "completion_length": 754.1294860839844, "epoch": 0.5173624075871854, "grad_norm": 0.7403196692466736, "kl": 0.634765625, "learning_rate": 6.010502583626314e-07, "loss": 0.0254, "reward": 1.148995578289032, "reward_std": 0.2200388703495264, "rewards/accuracy_reward": 0.176339291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562798023224, "step": 1732 }, { "completion_length": 601.6897583007812, "epoch": 0.5176611156747069, "grad_norm": 0.4426736831665039, "kl": 0.57080078125, "learning_rate": 6.005838992942536e-07, "loss": 0.0228, "reward": 1.1489956080913544, "reward_std": 0.18539527244865894, "rewards/accuracy_reward": 0.1674107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 1733 }, { "completion_length": 628.8549346923828, "epoch": 0.5179598237622284, "grad_norm": 0.5545042157173157, "kl": 0.28076171875, "learning_rate": 6.001174851955512e-07, "loss": 0.0112, "reward": 1.2087053954601288, "reward_std": 0.09274304565042257, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 1734 }, { "completion_length": 656.5223541259766, "epoch": 0.5182585318497498, "grad_norm": 0.4734697937965393, "kl": 0.54296875, "learning_rate": 5.99651016573937e-07, "loss": 0.0217, "reward": 1.0859375447034836, "reward_std": 0.07525120303034782, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232313156128, "step": 1735 }, { "completion_length": 635.6741180419922, "epoch": 0.5185572399372713, "grad_norm": 0.32030189037323, "kl": 0.2728271484375, "learning_rate": 5.99184493936883e-07, "loss": 0.0109, "reward": 1.2181920409202576, "reward_std": 0.2063816636800766, "rewards/accuracy_reward": 0.2254464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455484867096, "step": 1736 }, { "completion_length": 713.575927734375, "epoch": 0.5188559480247927, "grad_norm": 0.6755461096763611, "kl": 0.75830078125, "learning_rate": 5.987179177919202e-07, "loss": 0.0303, "reward": 1.0518973469734192, "reward_std": 0.1192231085151434, "rewards/accuracy_reward": 0.08035714877769351, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9715402275323868, "step": 1737 }, { "completion_length": 645.404052734375, "epoch": 0.5191546561123143, "grad_norm": 0.4938809871673584, "kl": 0.640869140625, "learning_rate": 5.982512886466377e-07, "loss": 0.0257, "reward": 1.1043526977300644, "reward_std": 0.16578946076333523, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 1738 }, { "completion_length": 603.0491180419922, "epoch": 0.5194533641998357, "grad_norm": 0.36461520195007324, "kl": 0.3033447265625, "learning_rate": 5.977846070086823e-07, "loss": 0.0121, "reward": 1.131696492433548, "reward_std": 0.09714781027287245, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1739 }, { "completion_length": 698.6406555175781, "epoch": 0.5197520722873572, "grad_norm": 0.7139224410057068, "kl": 0.830078125, "learning_rate": 5.973178733857578e-07, "loss": 0.0332, "reward": 1.1489955633878708, "reward_std": 0.20949284732341766, "rewards/accuracy_reward": 0.18303572246804833, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9659598618745804, "step": 1740 }, { "completion_length": 599.5402069091797, "epoch": 0.5200507803748786, "grad_norm": 0.40753936767578125, "kl": 0.6513671875, "learning_rate": 5.968510882856249e-07, "loss": 0.026, "reward": 1.1702009439468384, "reward_std": 0.16148490644991398, "rewards/accuracy_reward": 0.19196429662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 1741 }, { "completion_length": 674.8393249511719, "epoch": 0.5203494884624001, "grad_norm": 0.6569154262542725, "kl": 0.59375, "learning_rate": 5.963842522160997e-07, "loss": 0.0237, "reward": 1.145089328289032, "reward_std": 0.17874113842844963, "rewards/accuracy_reward": 0.1718750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9732143133878708, "step": 1742 }, { "completion_length": 671.654052734375, "epoch": 0.5206481965499216, "grad_norm": 0.3429394066333771, "kl": 0.4697265625, "learning_rate": 5.959173656850543e-07, "loss": 0.0188, "reward": 1.085379496216774, "reward_std": 0.13831805624067783, "rewards/accuracy_reward": 0.09821429220028222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 1743 }, { "completion_length": 709.8036041259766, "epoch": 0.5209469046374431, "grad_norm": 0.5150386691093445, "kl": 0.443359375, "learning_rate": 5.954504292004154e-07, "loss": 0.0178, "reward": 1.0725446939468384, "reward_std": 0.10412666574120522, "rewards/accuracy_reward": 0.0848214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 1744 }, { "completion_length": 667.4665374755859, "epoch": 0.5212456127249645, "grad_norm": 0.4753444492816925, "kl": 0.4697265625, "learning_rate": 5.949834432701641e-07, "loss": 0.0188, "reward": 1.1501116752624512, "reward_std": 0.15937378350645304, "rewards/accuracy_reward": 0.1696428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 1745 }, { "completion_length": 688.3013763427734, "epoch": 0.521544320812486, "grad_norm": 1.0014581680297852, "kl": 0.720703125, "learning_rate": 5.945164084023355e-07, "loss": 0.0289, "reward": 1.1780134588479996, "reward_std": 0.15524416975677013, "rewards/accuracy_reward": 0.2008928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.977120578289032, "step": 1746 }, { "completion_length": 655.5714569091797, "epoch": 0.5218430289000074, "grad_norm": 0.814373254776001, "kl": 0.466064453125, "learning_rate": 5.940493251050174e-07, "loss": 0.0186, "reward": 1.0697545260190964, "reward_std": 0.10246901493519545, "rewards/accuracy_reward": 0.08482143515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 1747 }, { "completion_length": 741.9464569091797, "epoch": 0.522141736987529, "grad_norm": 0.35339275002479553, "kl": 0.73095703125, "learning_rate": 5.93582193886351e-07, "loss": 0.0293, "reward": 1.0379464626312256, "reward_std": 0.1506412159651518, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500298023224, "step": 1748 }, { "completion_length": 672.2433319091797, "epoch": 0.5224404450750504, "grad_norm": 1.021483063697815, "kl": 0.53857421875, "learning_rate": 5.931150152545292e-07, "loss": 0.0216, "reward": 1.1835938096046448, "reward_std": 0.1329372450709343, "rewards/accuracy_reward": 0.191964291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916295111179352, "step": 1749 }, { "completion_length": 591.8169860839844, "epoch": 0.5227391531625719, "grad_norm": 0.29672500491142273, "kl": 0.347412109375, "learning_rate": 5.926477897177967e-07, "loss": 0.0139, "reward": 1.0864956080913544, "reward_std": 0.11441207025200129, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 1750 }, { "completion_length": 669.9152069091797, "epoch": 0.5230378612500933, "grad_norm": 0.5903239250183105, "kl": 0.7254638671875, "learning_rate": 5.921805177844486e-07, "loss": 0.0291, "reward": 1.0100447237491608, "reward_std": 0.08627788256853819, "rewards/accuracy_reward": 0.022321429336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1751 }, { "completion_length": 745.1161041259766, "epoch": 0.5233365693376149, "grad_norm": 0.6844322681427002, "kl": 0.9521484375, "learning_rate": 5.917131999628315e-07, "loss": 0.0381, "reward": 1.1729911267757416, "reward_std": 0.19809923879802227, "rewards/accuracy_reward": 0.20312500861473382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9698661267757416, "step": 1752 }, { "completion_length": 667.2678985595703, "epoch": 0.5236352774251363, "grad_norm": 0.5718821287155151, "kl": 0.55419921875, "learning_rate": 5.912458367613409e-07, "loss": 0.0222, "reward": 1.1623884737491608, "reward_std": 0.2176392897963524, "rewards/accuracy_reward": 0.1897321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562947034836, "step": 1753 }, { "completion_length": 697.4464569091797, "epoch": 0.5239339855126578, "grad_norm": 1.2454196214675903, "kl": 1.01904296875, "learning_rate": 5.907784286884228e-07, "loss": 0.0406, "reward": 1.1768973767757416, "reward_std": 0.1611199863255024, "rewards/accuracy_reward": 0.19642857951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687649011612, "step": 1754 }, { "completion_length": 665.0223388671875, "epoch": 0.5242326936001792, "grad_norm": 0.7267901301383972, "kl": 0.7568359375, "learning_rate": 5.903109762525707e-07, "loss": 0.0303, "reward": 1.178571492433548, "reward_std": 0.21911903098225594, "rewards/accuracy_reward": 0.19642858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98214291036129, "step": 1755 }, { "completion_length": 634.6294860839844, "epoch": 0.5245314016877007, "grad_norm": 0.5158233642578125, "kl": 0.650390625, "learning_rate": 5.898434799623276e-07, "loss": 0.026, "reward": 1.0948661118745804, "reward_std": 0.13770475424826145, "rewards/accuracy_reward": 0.11160715157166123, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 1756 }, { "completion_length": 676.872802734375, "epoch": 0.5248301097752222, "grad_norm": 0.33897995948791504, "kl": 0.53466796875, "learning_rate": 5.893759403262832e-07, "loss": 0.0213, "reward": 1.1406250596046448, "reward_std": 0.09679117612540722, "rewards/accuracy_reward": 0.14732143748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933036267757416, "step": 1757 }, { "completion_length": 611.7143096923828, "epoch": 0.5251288178627437, "grad_norm": 0.3351786434650421, "kl": 0.33056640625, "learning_rate": 5.889083578530752e-07, "loss": 0.0132, "reward": 1.1735491454601288, "reward_std": 0.117575129494071, "rewards/accuracy_reward": 0.18526786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 1758 }, { "completion_length": 637.8035888671875, "epoch": 0.5254275259502651, "grad_norm": 0.2813557982444763, "kl": 0.185546875, "learning_rate": 5.88440733051387e-07, "loss": 0.0074, "reward": 1.1847098767757416, "reward_std": 0.16380434669554234, "rewards/accuracy_reward": 0.1897321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949777126312256, "step": 1759 }, { "completion_length": 619.5201110839844, "epoch": 0.5257262340377866, "grad_norm": 0.2634652554988861, "kl": 0.36346435546875, "learning_rate": 5.87973066429949e-07, "loss": 0.0146, "reward": 1.211495578289032, "reward_std": 0.1579978782683611, "rewards/accuracy_reward": 0.2187500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.992745578289032, "step": 1760 }, { "completion_length": 695.5357513427734, "epoch": 0.526024942125308, "grad_norm": 0.6444151997566223, "kl": 0.504638671875, "learning_rate": 5.875053584975365e-07, "loss": 0.0202, "reward": 1.1439732611179352, "reward_std": 0.18626972287893295, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946790456772, "step": 1761 }, { "completion_length": 639.6964569091797, "epoch": 0.5263236502128296, "grad_norm": 0.33076855540275574, "kl": 0.3978271484375, "learning_rate": 5.870376097629698e-07, "loss": 0.0159, "reward": 1.2304688394069672, "reward_std": 0.16092894691973925, "rewards/accuracy_reward": 0.2366071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616454601288, "step": 1762 }, { "completion_length": 693.7165374755859, "epoch": 0.526622358300351, "grad_norm": 1.0729507207870483, "kl": 0.4627685546875, "learning_rate": 5.865698207351138e-07, "loss": 0.0185, "reward": 1.1261161267757416, "reward_std": 0.17835631594061852, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 1763 }, { "completion_length": 621.3951263427734, "epoch": 0.5269210663878725, "grad_norm": 0.39166194200515747, "kl": 0.3330078125, "learning_rate": 5.861019919228769e-07, "loss": 0.0133, "reward": 1.1534598469734192, "reward_std": 0.14745066780596972, "rewards/accuracy_reward": 0.1629464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 1764 }, { "completion_length": 703.7768096923828, "epoch": 0.5272197744753939, "grad_norm": 0.23867444694042206, "kl": 0.458740234375, "learning_rate": 5.856341238352114e-07, "loss": 0.0184, "reward": 1.1026786267757416, "reward_std": 0.12336608022451401, "rewards/accuracy_reward": 0.12276786426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107611179352, "step": 1765 }, { "completion_length": 615.2232513427734, "epoch": 0.5275184825629153, "grad_norm": 0.43753862380981445, "kl": 0.511962890625, "learning_rate": 5.851662169811116e-07, "loss": 0.0204, "reward": 1.1289062947034836, "reward_std": 0.17106243036687374, "rewards/accuracy_reward": 0.14285715040750802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 1766 }, { "completion_length": 732.4397583007812, "epoch": 0.5278171906504369, "grad_norm": 0.38698357343673706, "kl": 0.362060546875, "learning_rate": 5.846982718696143e-07, "loss": 0.0145, "reward": 1.1713170111179352, "reward_std": 0.17307674512267113, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884290456772, "step": 1767 }, { "completion_length": 730.4352874755859, "epoch": 0.5281158987379583, "grad_norm": 0.7709253430366516, "kl": 0.653076171875, "learning_rate": 5.842302890097981e-07, "loss": 0.0261, "reward": 1.0697545409202576, "reward_std": 0.15627989545464516, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.982700914144516, "step": 1768 }, { "completion_length": 666.2143249511719, "epoch": 0.5284146068254798, "grad_norm": 0.3534386157989502, "kl": 0.370361328125, "learning_rate": 5.837622689107823e-07, "loss": 0.0148, "reward": 1.1244420111179352, "reward_std": 0.13748328387737274, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 1769 }, { "completion_length": 626.7790374755859, "epoch": 0.5287133149130012, "grad_norm": 0.43472936749458313, "kl": 0.68896484375, "learning_rate": 5.83294212081727e-07, "loss": 0.0276, "reward": 1.1629464626312256, "reward_std": 0.1756544578820467, "rewards/accuracy_reward": 0.18303571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107611179352, "step": 1770 }, { "completion_length": 703.4576263427734, "epoch": 0.5290120230005227, "grad_norm": 0.8349222540855408, "kl": 0.634765625, "learning_rate": 5.828261190318323e-07, "loss": 0.0254, "reward": 1.1534598618745804, "reward_std": 0.12214847467839718, "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562798023224, "step": 1771 }, { "completion_length": 619.1986846923828, "epoch": 0.5293107310880442, "grad_norm": 0.5296047329902649, "kl": 0.1904296875, "learning_rate": 5.823579902703373e-07, "loss": 0.0076, "reward": 1.2025670111179352, "reward_std": 0.16695973463356495, "rewards/accuracy_reward": 0.20758930081501603, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949777126312256, "step": 1772 }, { "completion_length": 638.1451110839844, "epoch": 0.5296094391755657, "grad_norm": 0.39389491081237793, "kl": 0.58984375, "learning_rate": 5.818898263065203e-07, "loss": 0.0236, "reward": 1.2109375298023224, "reward_std": 0.18040997721254826, "rewards/accuracy_reward": 0.22767858812585473, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 1773 }, { "completion_length": 655.0044860839844, "epoch": 0.5299081472630871, "grad_norm": 0.403281033039093, "kl": 0.4375, "learning_rate": 5.814216276496978e-07, "loss": 0.0175, "reward": 1.1540179252624512, "reward_std": 0.16641448996961117, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1774 }, { "completion_length": 650.2701263427734, "epoch": 0.5302068553506086, "grad_norm": 0.3618859648704529, "kl": 0.486328125, "learning_rate": 5.809533948092241e-07, "loss": 0.0195, "reward": 1.1283482909202576, "reward_std": 0.17092561349272728, "rewards/accuracy_reward": 0.13839286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1775 }, { "completion_length": 697.3103179931641, "epoch": 0.53050556343813, "grad_norm": 0.34174129366874695, "kl": 0.682861328125, "learning_rate": 5.804851282944905e-07, "loss": 0.0273, "reward": 1.1512277126312256, "reward_std": 0.1790682077407837, "rewards/accuracy_reward": 0.17857143841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562947034836, "step": 1776 }, { "completion_length": 646.3928985595703, "epoch": 0.5308042715256516, "grad_norm": 0.6518713235855103, "kl": 0.51416015625, "learning_rate": 5.800168286149254e-07, "loss": 0.0206, "reward": 1.3136160969734192, "reward_std": 0.166208166629076, "rewards/accuracy_reward": 0.3258928693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 1777 }, { "completion_length": 675.966552734375, "epoch": 0.531102979613173, "grad_norm": 1.1718307733535767, "kl": 0.8961181640625, "learning_rate": 5.795484962799924e-07, "loss": 0.0358, "reward": 1.1752232611179352, "reward_std": 0.21008703485131264, "rewards/accuracy_reward": 0.20982144586741924, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9654018133878708, "step": 1778 }, { "completion_length": 604.928596496582, "epoch": 0.5314016877006945, "grad_norm": 0.9429442286491394, "kl": 0.2452392578125, "learning_rate": 5.790801317991919e-07, "loss": 0.0098, "reward": 1.2416295111179352, "reward_std": 0.15114595368504524, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 1779 }, { "completion_length": 719.6786041259766, "epoch": 0.5317003957882159, "grad_norm": 0.7239096164703369, "kl": 0.62158203125, "learning_rate": 5.786117356820579e-07, "loss": 0.0249, "reward": 1.1579241156578064, "reward_std": 0.20827532187104225, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 1780 }, { "completion_length": 603.9844055175781, "epoch": 0.5319991038757375, "grad_norm": 0.4136126935482025, "kl": 0.4796142578125, "learning_rate": 5.781433084381599e-07, "loss": 0.0192, "reward": 1.1808035969734192, "reward_std": 0.1422005519270897, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9955357313156128, "step": 1781 }, { "completion_length": 683.5268096923828, "epoch": 0.5322978119632589, "grad_norm": 0.40509533882141113, "kl": 0.556640625, "learning_rate": 5.776748505771005e-07, "loss": 0.0223, "reward": 1.3030134439468384, "reward_std": 0.21253128722310066, "rewards/accuracy_reward": 0.3281250149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884290456772, "step": 1782 }, { "completion_length": 710.3214721679688, "epoch": 0.5325965200507804, "grad_norm": 0.6207461357116699, "kl": 0.82421875, "learning_rate": 5.77206362608516e-07, "loss": 0.033, "reward": 1.1238839775323868, "reward_std": 0.14658226072788239, "rewards/accuracy_reward": 0.15625000861473382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9676339775323868, "step": 1783 }, { "completion_length": 754.1875305175781, "epoch": 0.5328952281383018, "grad_norm": 0.5872459411621094, "kl": 0.794921875, "learning_rate": 5.767378450420758e-07, "loss": 0.0318, "reward": 1.1406250596046448, "reward_std": 0.23161472380161285, "rewards/accuracy_reward": 0.1629464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 1784 }, { "completion_length": 619.5960083007812, "epoch": 0.5331939362258233, "grad_norm": 0.5266631245613098, "kl": 0.5439453125, "learning_rate": 5.762692983874806e-07, "loss": 0.0218, "reward": 1.4441965222358704, "reward_std": 0.20198483020067215, "rewards/accuracy_reward": 0.4531250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1785 }, { "completion_length": 633.2745819091797, "epoch": 0.5334926443133448, "grad_norm": 0.5263822674751282, "kl": 0.6630859375, "learning_rate": 5.758007231544636e-07, "loss": 0.0266, "reward": 1.1255581080913544, "reward_std": 0.11869444698095322, "rewards/accuracy_reward": 0.14062500093132257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330931901932, "step": 1786 }, { "completion_length": 648.9598693847656, "epoch": 0.5337913524008663, "grad_norm": 0.45075467228889465, "kl": 0.705078125, "learning_rate": 5.753321198527883e-07, "loss": 0.0282, "reward": 1.20870541036129, "reward_std": 0.13222387805581093, "rewards/accuracy_reward": 0.2209821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1787 }, { "completion_length": 649.5491333007812, "epoch": 0.5340900604883877, "grad_norm": 0.5451816916465759, "kl": 0.46044921875, "learning_rate": 5.748634889922494e-07, "loss": 0.0185, "reward": 1.1964286267757416, "reward_std": 0.13733707182109356, "rewards/accuracy_reward": 0.2075893022119999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1788 }, { "completion_length": 657.5915374755859, "epoch": 0.5343887685759092, "grad_norm": 0.8347994089126587, "kl": 0.47119140625, "learning_rate": 5.743948310826716e-07, "loss": 0.0188, "reward": 1.145089328289032, "reward_std": 0.12477049976587296, "rewards/accuracy_reward": 0.15625000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1789 }, { "completion_length": 682.5491333007812, "epoch": 0.5346874766634306, "grad_norm": 0.6222599744796753, "kl": 0.77978515625, "learning_rate": 5.739261466339083e-07, "loss": 0.0312, "reward": 1.1462053954601288, "reward_std": 0.15160690620541573, "rewards/accuracy_reward": 0.1808035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.965401828289032, "step": 1790 }, { "completion_length": 624.2768249511719, "epoch": 0.5349861847509522, "grad_norm": 0.9122955799102783, "kl": 0.38037109375, "learning_rate": 5.734574361558427e-07, "loss": 0.0152, "reward": 1.1573660969734192, "reward_std": 0.1466425433754921, "rewards/accuracy_reward": 0.17187500838190317, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 1791 }, { "completion_length": 644.7946701049805, "epoch": 0.5352848928384736, "grad_norm": 0.8713271021842957, "kl": 0.49798583984375, "learning_rate": 5.729887001583857e-07, "loss": 0.0199, "reward": 1.2059152126312256, "reward_std": 0.18775583244860172, "rewards/accuracy_reward": 0.22098215762525797, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330484867096, "step": 1792 }, { "completion_length": 709.2343902587891, "epoch": 0.5355836009259951, "grad_norm": 0.528500497341156, "kl": 0.57275390625, "learning_rate": 5.725199391514757e-07, "loss": 0.0229, "reward": 1.2181920409202576, "reward_std": 0.17097614705562592, "rewards/accuracy_reward": 0.2343750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 1793 }, { "completion_length": 603.841552734375, "epoch": 0.5358823090135165, "grad_norm": 0.23703637719154358, "kl": 0.392822265625, "learning_rate": 5.720511536450793e-07, "loss": 0.0157, "reward": 1.0485491454601288, "reward_std": 0.12902180664241314, "rewards/accuracy_reward": 0.05803571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 1794 }, { "completion_length": 664.4375305175781, "epoch": 0.536181017101038, "grad_norm": 0.4804369807243347, "kl": 0.694580078125, "learning_rate": 5.715823441491889e-07, "loss": 0.0278, "reward": 1.3337054252624512, "reward_std": 0.21838952973484993, "rewards/accuracy_reward": 0.3504464477300644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 1795 }, { "completion_length": 624.9844207763672, "epoch": 0.5364797251885595, "grad_norm": 1.1466532945632935, "kl": 0.5830078125, "learning_rate": 5.711135111738236e-07, "loss": 0.0233, "reward": 1.170758992433548, "reward_std": 0.261391993612051, "rewards/accuracy_reward": 0.1941964402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625447034836, "step": 1796 }, { "completion_length": 605.1741333007812, "epoch": 0.536778433276081, "grad_norm": 0.2812540531158447, "kl": 0.3729248046875, "learning_rate": 5.706446552290272e-07, "loss": 0.0149, "reward": 1.1852679252624512, "reward_std": 0.18656756170094013, "rewards/accuracy_reward": 0.1986607275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1797 }, { "completion_length": 684.8594055175781, "epoch": 0.5370771413636024, "grad_norm": 0.38225147128105164, "kl": 0.46826171875, "learning_rate": 5.701757768248693e-07, "loss": 0.0188, "reward": 1.1127232611179352, "reward_std": 0.11467876471579075, "rewards/accuracy_reward": 0.12500000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 1798 }, { "completion_length": 664.904052734375, "epoch": 0.5373758494511239, "grad_norm": 0.6207275390625, "kl": 0.650390625, "learning_rate": 5.697068764714439e-07, "loss": 0.026, "reward": 1.1378348767757416, "reward_std": 0.17688430286943913, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 1799 }, { "completion_length": 661.3348541259766, "epoch": 0.5376745575386453, "grad_norm": 0.8326931595802307, "kl": 0.442138671875, "learning_rate": 5.692379546788683e-07, "loss": 0.0177, "reward": 1.1768973469734192, "reward_std": 0.16030918806791306, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616454601288, "step": 1800 }, { "completion_length": 669.8504791259766, "epoch": 0.5379732656261669, "grad_norm": 0.9670313596725464, "kl": 0.47900390625, "learning_rate": 5.687690119572835e-07, "loss": 0.0191, "reward": 1.1400670409202576, "reward_std": 0.17946675419807434, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848767757416, "step": 1801 }, { "completion_length": 715.294677734375, "epoch": 0.5382719737136883, "grad_norm": 0.4054868817329407, "kl": 0.75927734375, "learning_rate": 5.683000488168533e-07, "loss": 0.0304, "reward": 1.123883992433548, "reward_std": 0.1555708423256874, "rewards/accuracy_reward": 0.14732143841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625447034836, "step": 1802 }, { "completion_length": 674.4799499511719, "epoch": 0.5385706818012098, "grad_norm": 0.6698505282402039, "kl": 0.363525390625, "learning_rate": 5.678310657677634e-07, "loss": 0.0145, "reward": 1.1635045409202576, "reward_std": 0.1148405484855175, "rewards/accuracy_reward": 0.1741071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 1803 }, { "completion_length": 617.919677734375, "epoch": 0.5388693898887312, "grad_norm": 0.33156266808509827, "kl": 0.310302734375, "learning_rate": 5.673620633202217e-07, "loss": 0.0124, "reward": 1.111607164144516, "reward_std": 0.1420690007507801, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 1804 }, { "completion_length": 675.4866333007812, "epoch": 0.5391680979762528, "grad_norm": 0.6119936108589172, "kl": 0.6162109375, "learning_rate": 5.668930419844568e-07, "loss": 0.0246, "reward": 1.0128348767757416, "reward_std": 0.1156998397782445, "rewards/accuracy_reward": 0.0334821455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 1805 }, { "completion_length": 674.1428833007812, "epoch": 0.5394668060637742, "grad_norm": 0.39939138293266296, "kl": 0.3892822265625, "learning_rate": 5.664240022707179e-07, "loss": 0.0156, "reward": 1.1495536267757416, "reward_std": 0.12424618192017078, "rewards/accuracy_reward": 0.16517858067527413, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 1806 }, { "completion_length": 637.5893249511719, "epoch": 0.5397655141512957, "grad_norm": 0.5051243901252747, "kl": 0.45166015625, "learning_rate": 5.659549446892743e-07, "loss": 0.0181, "reward": 1.2500000298023224, "reward_std": 0.22568823397159576, "rewards/accuracy_reward": 0.2566964402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933036118745804, "step": 1807 }, { "completion_length": 577.2678833007812, "epoch": 0.5400642222388171, "grad_norm": 1.0008211135864258, "kl": 0.57177734375, "learning_rate": 5.654858697504145e-07, "loss": 0.0228, "reward": 1.1456473469734192, "reward_std": 0.19376244582235813, "rewards/accuracy_reward": 0.16741071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 1808 }, { "completion_length": 616.7388763427734, "epoch": 0.5403629303263385, "grad_norm": 0.2620394825935364, "kl": 0.38482666015625, "learning_rate": 5.650167779644464e-07, "loss": 0.0154, "reward": 1.122209906578064, "reward_std": 0.1380556793883443, "rewards/accuracy_reward": 0.12946429220028222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455484867096, "step": 1809 }, { "completion_length": 595.6629638671875, "epoch": 0.5406616384138601, "grad_norm": 0.518854558467865, "kl": 0.74267578125, "learning_rate": 5.645476698416954e-07, "loss": 0.0297, "reward": 1.0842634737491608, "reward_std": 0.16139954887330532, "rewards/accuracy_reward": 0.09821429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 1810 }, { "completion_length": 653.6875305175781, "epoch": 0.5409603465013815, "grad_norm": 0.34282681345939636, "kl": 0.4755859375, "learning_rate": 5.640785458925057e-07, "loss": 0.019, "reward": 1.1462054401636124, "reward_std": 0.09672635700553656, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1811 }, { "completion_length": 640.7053680419922, "epoch": 0.541259054588903, "grad_norm": 0.45988982915878296, "kl": 0.4140625, "learning_rate": 5.636094066272377e-07, "loss": 0.0166, "reward": 1.162388414144516, "reward_std": 0.1416509710252285, "rewards/accuracy_reward": 0.1674107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949776977300644, "step": 1812 }, { "completion_length": 591.5089569091797, "epoch": 0.5415577626764244, "grad_norm": 0.3756006062030792, "kl": 0.528076171875, "learning_rate": 5.631402525562693e-07, "loss": 0.0211, "reward": 1.2109375596046448, "reward_std": 0.17003550380468369, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1813 }, { "completion_length": 624.9598541259766, "epoch": 0.5418564707639459, "grad_norm": 0.5914662480354309, "kl": 0.2393798828125, "learning_rate": 5.62671084189994e-07, "loss": 0.0096, "reward": 1.1478795111179352, "reward_std": 0.17885836400091648, "rewards/accuracy_reward": 0.1562500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916295111179352, "step": 1814 }, { "completion_length": 624.3236770629883, "epoch": 0.5421551788514674, "grad_norm": 0.33849185705184937, "kl": 0.4869384765625, "learning_rate": 5.622019020388208e-07, "loss": 0.0195, "reward": 1.1819196939468384, "reward_std": 0.1776493676006794, "rewards/accuracy_reward": 0.1964285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.985491082072258, "step": 1815 }, { "completion_length": 669.5178985595703, "epoch": 0.5424538869389889, "grad_norm": 0.5520827174186707, "kl": 0.341064453125, "learning_rate": 5.617327066131744e-07, "loss": 0.0136, "reward": 1.1054688096046448, "reward_std": 0.17137041687965393, "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937798023224, "step": 1816 }, { "completion_length": 660.2946929931641, "epoch": 0.5427525950265103, "grad_norm": 0.8564183115959167, "kl": 0.3990478515625, "learning_rate": 5.612634984234929e-07, "loss": 0.016, "reward": 1.1428572237491608, "reward_std": 0.12092740554362535, "rewards/accuracy_reward": 0.15401786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1817 }, { "completion_length": 630.700927734375, "epoch": 0.5430513031140318, "grad_norm": 0.5770827531814575, "kl": 0.173583984375, "learning_rate": 5.607942779802292e-07, "loss": 0.0069, "reward": 1.1830357611179352, "reward_std": 0.14962000586092472, "rewards/accuracy_reward": 0.18973215110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 1818 }, { "completion_length": 671.7254638671875, "epoch": 0.5433500112015532, "grad_norm": 0.9474579095840454, "kl": 0.54833984375, "learning_rate": 5.603250457938489e-07, "loss": 0.0219, "reward": 1.1875000298023224, "reward_std": 0.20731915161013603, "rewards/accuracy_reward": 0.2053571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 1819 }, { "completion_length": 710.0759124755859, "epoch": 0.5436487192890748, "grad_norm": 0.6794065833091736, "kl": 0.22137451171875, "learning_rate": 5.598558023748307e-07, "loss": 0.0089, "reward": 1.1065848469734192, "reward_std": 0.13669652305543423, "rewards/accuracy_reward": 0.11383929010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 1820 }, { "completion_length": 703.0312805175781, "epoch": 0.5439474273765962, "grad_norm": 0.5448546409606934, "kl": 0.437255859375, "learning_rate": 5.593865482336657e-07, "loss": 0.0175, "reward": 1.2449777126312256, "reward_std": 0.16133557446300983, "rewards/accuracy_reward": 0.2611607313156128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 1821 }, { "completion_length": 688.7455596923828, "epoch": 0.5442461354641177, "grad_norm": 0.6677109003067017, "kl": 0.486572265625, "learning_rate": 5.589172838808561e-07, "loss": 0.0195, "reward": 1.1930804252624512, "reward_std": 0.15323399752378464, "rewards/accuracy_reward": 0.20982143841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 1822 }, { "completion_length": 653.4576110839844, "epoch": 0.5445448435516391, "grad_norm": 0.46452221274375916, "kl": 0.528076171875, "learning_rate": 5.584480098269155e-07, "loss": 0.0212, "reward": 1.145647406578064, "reward_std": 0.16468500345945358, "rewards/accuracy_reward": 0.1562500111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.989397332072258, "step": 1823 }, { "completion_length": 611.2924270629883, "epoch": 0.5448435516391607, "grad_norm": 0.8273898363113403, "kl": 0.4520263671875, "learning_rate": 5.579787265823684e-07, "loss": 0.0181, "reward": 1.1618303954601288, "reward_std": 0.10881450166925788, "rewards/accuracy_reward": 0.1674107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 1824 }, { "completion_length": 631.7053833007812, "epoch": 0.5451422597266821, "grad_norm": 0.38280436396598816, "kl": 0.474609375, "learning_rate": 5.575094346577487e-07, "loss": 0.0189, "reward": 1.2784598767757416, "reward_std": 0.17633394338190556, "rewards/accuracy_reward": 0.2946428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 1825 }, { "completion_length": 654.0736999511719, "epoch": 0.5454409678142036, "grad_norm": 0.44716718792915344, "kl": 0.7890625, "learning_rate": 5.570401345636e-07, "loss": 0.0316, "reward": 1.0731027126312256, "reward_std": 0.18142453767359257, "rewards/accuracy_reward": 0.10044642956927419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562947034836, "step": 1826 }, { "completion_length": 641.0402069091797, "epoch": 0.545739675901725, "grad_norm": 0.5873451232910156, "kl": 0.338623046875, "learning_rate": 5.56570826810475e-07, "loss": 0.0135, "reward": 1.2059152126312256, "reward_std": 0.13096509873867035, "rewards/accuracy_reward": 0.2120535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 1827 }, { "completion_length": 688.6004638671875, "epoch": 0.5460383839892465, "grad_norm": 0.5492969751358032, "kl": 0.6212158203125, "learning_rate": 5.561015119089345e-07, "loss": 0.0249, "reward": 1.2354911267757416, "reward_std": 0.13769317511469126, "rewards/accuracy_reward": 0.2477678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232313156128, "step": 1828 }, { "completion_length": 614.5825958251953, "epoch": 0.546337092076768, "grad_norm": 0.5612823963165283, "kl": 0.47064208984375, "learning_rate": 5.556321903695469e-07, "loss": 0.0189, "reward": 1.0429687798023224, "reward_std": 0.10300444671884179, "rewards/accuracy_reward": 0.04910714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616156578064, "step": 1829 }, { "completion_length": 652.2500305175781, "epoch": 0.5466358001642895, "grad_norm": 0.32984262704849243, "kl": 0.37841796875, "learning_rate": 5.551628627028883e-07, "loss": 0.0151, "reward": 1.1886161267757416, "reward_std": 0.07389958132989705, "rewards/accuracy_reward": 0.19866072572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1830 }, { "completion_length": 652.6161041259766, "epoch": 0.5469345082518109, "grad_norm": 0.9777703285217285, "kl": 0.896484375, "learning_rate": 5.546935294195411e-07, "loss": 0.0359, "reward": 1.2042411267757416, "reward_std": 0.2253781072795391, "rewards/accuracy_reward": 0.2165178693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 1831 }, { "completion_length": 716.1562805175781, "epoch": 0.5472332163393324, "grad_norm": 0.5782091617584229, "kl": 0.715576171875, "learning_rate": 5.542241910300939e-07, "loss": 0.0287, "reward": 1.1981027126312256, "reward_std": 0.16996091976761818, "rewards/accuracy_reward": 0.21651787124574184, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 1832 }, { "completion_length": 634.7544860839844, "epoch": 0.5475319244268538, "grad_norm": 1.1592339277267456, "kl": 0.3583984375, "learning_rate": 5.537548480451408e-07, "loss": 0.0143, "reward": 1.1835937798023224, "reward_std": 0.21153897419571877, "rewards/accuracy_reward": 0.19866071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 1833 }, { "completion_length": 662.5089569091797, "epoch": 0.5478306325143754, "grad_norm": 0.3548508882522583, "kl": 0.7021484375, "learning_rate": 5.53285500975281e-07, "loss": 0.0281, "reward": 1.1679688096046448, "reward_std": 0.1665295995771885, "rewards/accuracy_reward": 0.1852678705472499, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 1834 }, { "completion_length": 590.4844055175781, "epoch": 0.5481293406018968, "grad_norm": 0.21766158938407898, "kl": 0.3218994140625, "learning_rate": 5.528161503311184e-07, "loss": 0.0129, "reward": 1.172433078289032, "reward_std": 0.11367389559745789, "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 1835 }, { "completion_length": 604.0491485595703, "epoch": 0.5484280486894183, "grad_norm": 0.2579664885997772, "kl": 0.328857421875, "learning_rate": 5.5234679662326e-07, "loss": 0.0131, "reward": 1.203125074505806, "reward_std": 0.07231695321388543, "rewards/accuracy_reward": 0.2120535895228386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714775323868, "step": 1836 }, { "completion_length": 649.0134124755859, "epoch": 0.5487267567769397, "grad_norm": 0.676496684551239, "kl": 0.61572265625, "learning_rate": 5.518774403623169e-07, "loss": 0.0247, "reward": 1.1612723767757416, "reward_std": 0.1664970926940441, "rewards/accuracy_reward": 0.1785714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 1837 }, { "completion_length": 734.7634124755859, "epoch": 0.5490254648644612, "grad_norm": 0.40767937898635864, "kl": 0.50830078125, "learning_rate": 5.514080820589029e-07, "loss": 0.0203, "reward": 1.1250000298023224, "reward_std": 0.15206770319491625, "rewards/accuracy_reward": 0.13839286426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1838 }, { "completion_length": 709.9777221679688, "epoch": 0.5493241729519827, "grad_norm": 0.28653597831726074, "kl": 0.484130859375, "learning_rate": 5.509387222236336e-07, "loss": 0.0194, "reward": 1.1757813394069672, "reward_std": 0.19241687282919884, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 1839 }, { "completion_length": 672.2545013427734, "epoch": 0.5496228810395042, "grad_norm": 0.662946343421936, "kl": 0.39501953125, "learning_rate": 5.504693613671263e-07, "loss": 0.0158, "reward": 1.151227742433548, "reward_std": 0.16178925335407257, "rewards/accuracy_reward": 0.16741072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 1840 }, { "completion_length": 648.7120819091797, "epoch": 0.5499215891270256, "grad_norm": 0.4257197678089142, "kl": 0.45489501953125, "learning_rate": 5.5e-07, "loss": 0.0182, "reward": 1.1171875447034836, "reward_std": 0.1087794890627265, "rewards/accuracy_reward": 0.12723214901052415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 1841 }, { "completion_length": 609.9018249511719, "epoch": 0.5502202972145471, "grad_norm": 0.2831263244152069, "kl": 0.53759765625, "learning_rate": 5.495306386328738e-07, "loss": 0.0215, "reward": 1.159040242433548, "reward_std": 0.13555825874209404, "rewards/accuracy_reward": 0.1718750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 1842 }, { "completion_length": 629.6629791259766, "epoch": 0.5505190053020685, "grad_norm": 0.5120300650596619, "kl": 0.4461669921875, "learning_rate": 5.490612777763665e-07, "loss": 0.0179, "reward": 1.1428571939468384, "reward_std": 0.07782099209725857, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 1843 }, { "completion_length": 601.3593902587891, "epoch": 0.5508177133895901, "grad_norm": 0.5411847829818726, "kl": 0.47705078125, "learning_rate": 5.48591917941097e-07, "loss": 0.0191, "reward": 1.1752232611179352, "reward_std": 0.14797746017575264, "rewards/accuracy_reward": 0.18526786682195961, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1844 }, { "completion_length": 668.0558319091797, "epoch": 0.5511164214771115, "grad_norm": 0.33638086915016174, "kl": 0.4482421875, "learning_rate": 5.48122559637683e-07, "loss": 0.018, "reward": 1.1584821939468384, "reward_std": 0.10135083086788654, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 1845 }, { "completion_length": 693.8504638671875, "epoch": 0.551415129564633, "grad_norm": 0.5629646182060242, "kl": 0.471923828125, "learning_rate": 5.476532033767401e-07, "loss": 0.0189, "reward": 1.1930804252624512, "reward_std": 0.14241190161556005, "rewards/accuracy_reward": 0.2031250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 1846 }, { "completion_length": 769.0067291259766, "epoch": 0.5517138376521544, "grad_norm": 0.392559289932251, "kl": 0.57861328125, "learning_rate": 5.471838496688817e-07, "loss": 0.0231, "reward": 1.133370578289032, "reward_std": 0.0706683243624866, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 1847 }, { "completion_length": 640.9464416503906, "epoch": 0.552012545739676, "grad_norm": 0.6554203033447266, "kl": 0.458740234375, "learning_rate": 5.467144990247189e-07, "loss": 0.0183, "reward": 1.087053656578064, "reward_std": 0.14940583053976297, "rewards/accuracy_reward": 0.1004464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1848 }, { "completion_length": 581.1027069091797, "epoch": 0.5523112538271974, "grad_norm": 0.403330534696579, "kl": 0.2218017578125, "learning_rate": 5.462451519548592e-07, "loss": 0.0089, "reward": 1.1054687798023224, "reward_std": 0.11184239760041237, "rewards/accuracy_reward": 0.11160715227015316, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616454601288, "step": 1849 }, { "completion_length": 688.0803985595703, "epoch": 0.5526099619147189, "grad_norm": 0.3251369297504425, "kl": 0.55810546875, "learning_rate": 5.45775808969906e-07, "loss": 0.0223, "reward": 1.1785714626312256, "reward_std": 0.17138813622295856, "rewards/accuracy_reward": 0.196428582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 1850 }, { "completion_length": 675.0379791259766, "epoch": 0.5529086700022403, "grad_norm": 0.39184460043907166, "kl": 0.500244140625, "learning_rate": 5.453064705804588e-07, "loss": 0.02, "reward": 1.2075893431901932, "reward_std": 0.1360583733767271, "rewards/accuracy_reward": 0.2142857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.993303582072258, "step": 1851 }, { "completion_length": 666.9777069091797, "epoch": 0.5532073780897617, "grad_norm": 0.43481582403182983, "kl": 0.8544921875, "learning_rate": 5.448371372971116e-07, "loss": 0.0342, "reward": 1.178571492433548, "reward_std": 0.2102656029164791, "rewards/accuracy_reward": 0.1986607275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107611179352, "step": 1852 }, { "completion_length": 687.6495819091797, "epoch": 0.5535060861772833, "grad_norm": 0.7882028818130493, "kl": 0.386962890625, "learning_rate": 5.44367809630453e-07, "loss": 0.0155, "reward": 1.1875000894069672, "reward_std": 0.1787452418357134, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 1853 }, { "completion_length": 642.1585083007812, "epoch": 0.5538047942648047, "grad_norm": 0.5873395204544067, "kl": 0.5458984375, "learning_rate": 5.438984880910656e-07, "loss": 0.0218, "reward": 1.2684152126312256, "reward_std": 0.16846301034092903, "rewards/accuracy_reward": 0.2812500223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 1854 }, { "completion_length": 651.8839569091797, "epoch": 0.5541035023523262, "grad_norm": 0.3115120232105255, "kl": 0.257080078125, "learning_rate": 5.43429173189525e-07, "loss": 0.0103, "reward": 1.1439732909202576, "reward_std": 0.15561959147453308, "rewards/accuracy_reward": 0.15178571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 1855 }, { "completion_length": 653.7544937133789, "epoch": 0.5544022104398476, "grad_norm": 0.4692508578300476, "kl": 0.45947265625, "learning_rate": 5.429598654364e-07, "loss": 0.0184, "reward": 1.080915242433548, "reward_std": 0.1436563478782773, "rewards/accuracy_reward": 0.09151786402799189, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 1856 }, { "completion_length": 584.2879638671875, "epoch": 0.5547009185273691, "grad_norm": 0.7986082434654236, "kl": 0.27294921875, "learning_rate": 5.424905653422514e-07, "loss": 0.0109, "reward": 1.1981027126312256, "reward_std": 0.12438295222818851, "rewards/accuracy_reward": 0.20089286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.997209832072258, "step": 1857 }, { "completion_length": 692.5580596923828, "epoch": 0.5549996266148906, "grad_norm": 0.4385168254375458, "kl": 0.52484130859375, "learning_rate": 5.420212734176315e-07, "loss": 0.021, "reward": 1.2014509439468384, "reward_std": 0.14725758880376816, "rewards/accuracy_reward": 0.2187500111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.982700914144516, "step": 1858 }, { "completion_length": 657.9330596923828, "epoch": 0.5552983347024121, "grad_norm": 0.3783135712146759, "kl": 0.38836669921875, "learning_rate": 5.415519901730845e-07, "loss": 0.0156, "reward": 1.166852742433548, "reward_std": 0.1610253043472767, "rewards/accuracy_reward": 0.1808035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 1859 }, { "completion_length": 620.5491333007812, "epoch": 0.5555970427899335, "grad_norm": 0.9624544382095337, "kl": 0.273193359375, "learning_rate": 5.410827161191441e-07, "loss": 0.0109, "reward": 1.1579241454601288, "reward_std": 0.15287522412836552, "rewards/accuracy_reward": 0.165178582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 1860 }, { "completion_length": 663.2277069091797, "epoch": 0.555895750877455, "grad_norm": 0.29405322670936584, "kl": 0.46722412109375, "learning_rate": 5.406134517663344e-07, "loss": 0.0187, "reward": 1.2901786267757416, "reward_std": 0.17551894672214985, "rewards/accuracy_reward": 0.3035714477300644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 1861 }, { "completion_length": 620.5536041259766, "epoch": 0.5561944589649764, "grad_norm": 0.7466765642166138, "kl": 0.312255859375, "learning_rate": 5.401441976251691e-07, "loss": 0.0125, "reward": 1.2031250298023224, "reward_std": 0.12084508407860994, "rewards/accuracy_reward": 0.20758930081501603, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9955357611179352, "step": 1862 }, { "completion_length": 587.1652069091797, "epoch": 0.556493167052498, "grad_norm": 0.9371073842048645, "kl": 0.44189453125, "learning_rate": 5.396749542061512e-07, "loss": 0.0177, "reward": 1.1026786267757416, "reward_std": 0.15942766144871712, "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1863 }, { "completion_length": 750.1116485595703, "epoch": 0.5567918751400194, "grad_norm": 0.49088820815086365, "kl": 0.399658203125, "learning_rate": 5.392057220197708e-07, "loss": 0.016, "reward": 1.1484375447034836, "reward_std": 0.20447667222470045, "rewards/accuracy_reward": 0.17187500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625447034836, "step": 1864 }, { "completion_length": 685.2745971679688, "epoch": 0.5570905832275409, "grad_norm": 0.280200719833374, "kl": 0.583251953125, "learning_rate": 5.387365015765071e-07, "loss": 0.0234, "reward": 1.1316965222358704, "reward_std": 0.08145188260823488, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 1865 }, { "completion_length": 597.8571701049805, "epoch": 0.5573892913150623, "grad_norm": 0.6505685448646545, "kl": 0.6947021484375, "learning_rate": 5.382672933868258e-07, "loss": 0.0278, "reward": 1.1244420409202576, "reward_std": 0.17186390981078148, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 1866 }, { "completion_length": 659.7701263427734, "epoch": 0.5576879994025838, "grad_norm": 0.6966601014137268, "kl": 0.59765625, "learning_rate": 5.377980979611792e-07, "loss": 0.0239, "reward": 1.2790179252624512, "reward_std": 0.20057544484734535, "rewards/accuracy_reward": 0.29464287124574184, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 1867 }, { "completion_length": 616.9620742797852, "epoch": 0.5579867074901053, "grad_norm": 0.7050462365150452, "kl": 0.6119384765625, "learning_rate": 5.373289158100061e-07, "loss": 0.0245, "reward": 1.0803571790456772, "reward_std": 0.12283297441899776, "rewards/accuracy_reward": 0.1004464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 1868 }, { "completion_length": 612.8728103637695, "epoch": 0.5582854155776268, "grad_norm": 0.44501814246177673, "kl": 0.4188232421875, "learning_rate": 5.368597474437308e-07, "loss": 0.0168, "reward": 1.1484375596046448, "reward_std": 0.12627042550593615, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1869 }, { "completion_length": 707.5446624755859, "epoch": 0.5585841236651482, "grad_norm": 0.34792762994766235, "kl": 0.544921875, "learning_rate": 5.363905933727624e-07, "loss": 0.0218, "reward": 1.172433078289032, "reward_std": 0.11231587827205658, "rewards/accuracy_reward": 0.1808035804424435, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 1870 }, { "completion_length": 687.7656555175781, "epoch": 0.5588828317526697, "grad_norm": 1.0273373126983643, "kl": 0.4912109375, "learning_rate": 5.359214541074943e-07, "loss": 0.0197, "reward": 1.1422991454601288, "reward_std": 0.19575710967183113, "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 1871 }, { "completion_length": 598.435302734375, "epoch": 0.5591815398401911, "grad_norm": 0.3209948241710663, "kl": 0.34375, "learning_rate": 5.354523301583046e-07, "loss": 0.0138, "reward": 1.1607143580913544, "reward_std": 0.14086453057825565, "rewards/accuracy_reward": 0.16741072246804833, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 1872 }, { "completion_length": 612.7634124755859, "epoch": 0.5594802479277127, "grad_norm": 0.7103597521781921, "kl": 0.648193359375, "learning_rate": 5.349832220355537e-07, "loss": 0.0259, "reward": 1.2008929252624512, "reward_std": 0.16427123732864857, "rewards/accuracy_reward": 0.2120535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1873 }, { "completion_length": 641.2902069091797, "epoch": 0.5597789560152341, "grad_norm": 0.23496992886066437, "kl": 0.318115234375, "learning_rate": 5.345141302495855e-07, "loss": 0.0127, "reward": 1.1713170111179352, "reward_std": 0.1062358058989048, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 1874 }, { "completion_length": 674.3036041259766, "epoch": 0.5600776641027556, "grad_norm": 0.41162532567977905, "kl": 0.395263671875, "learning_rate": 5.340450553107257e-07, "loss": 0.0158, "reward": 1.175223246216774, "reward_std": 0.18925925809890032, "rewards/accuracy_reward": 0.1875000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1875 }, { "completion_length": 676.138427734375, "epoch": 0.560376372190277, "grad_norm": 0.6052908897399902, "kl": 0.54248046875, "learning_rate": 5.335759977292821e-07, "loss": 0.0217, "reward": 1.1629464626312256, "reward_std": 0.16392246447503567, "rewards/accuracy_reward": 0.16964286752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 1876 }, { "completion_length": 661.2723693847656, "epoch": 0.5606750802777986, "grad_norm": 0.43464019894599915, "kl": 0.507568359375, "learning_rate": 5.331069580155431e-07, "loss": 0.0203, "reward": 1.178571492433548, "reward_std": 0.1422538347542286, "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1877 }, { "completion_length": 616.7522583007812, "epoch": 0.56097378836532, "grad_norm": 0.40539199113845825, "kl": 0.366943359375, "learning_rate": 5.326379366797782e-07, "loss": 0.0147, "reward": 1.1489956080913544, "reward_std": 0.11483588558621705, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949777126312256, "step": 1878 }, { "completion_length": 664.1473541259766, "epoch": 0.5612724964528415, "grad_norm": 0.7480473518371582, "kl": 0.5869140625, "learning_rate": 5.321689342322366e-07, "loss": 0.0234, "reward": 1.1651786267757416, "reward_std": 0.19095627963542938, "rewards/accuracy_reward": 0.1852678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107760190964, "step": 1879 }, { "completion_length": 611.0602951049805, "epoch": 0.5615712045403629, "grad_norm": 0.5969061851501465, "kl": 0.70458984375, "learning_rate": 5.316999511831468e-07, "loss": 0.0282, "reward": 1.1635045409202576, "reward_std": 0.1845200890675187, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 1880 }, { "completion_length": 627.6473693847656, "epoch": 0.5618699126278844, "grad_norm": 0.3559882938861847, "kl": 0.29833984375, "learning_rate": 5.312309880427166e-07, "loss": 0.0119, "reward": 1.1261161267757416, "reward_std": 0.13109402917325497, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 1881 }, { "completion_length": 739.4375457763672, "epoch": 0.5621686207154059, "grad_norm": 0.7297037243843079, "kl": 0.74755859375, "learning_rate": 5.307620453211317e-07, "loss": 0.0299, "reward": 1.2025670111179352, "reward_std": 0.22712826915085316, "rewards/accuracy_reward": 0.2254464402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9771205633878708, "step": 1882 }, { "completion_length": 679.716552734375, "epoch": 0.5624673288029274, "grad_norm": 0.29706892371177673, "kl": 0.6845703125, "learning_rate": 5.30293123528556e-07, "loss": 0.0274, "reward": 1.2399554252624512, "reward_std": 0.13639103481546044, "rewards/accuracy_reward": 0.2566964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589477300644, "step": 1883 }, { "completion_length": 666.4643249511719, "epoch": 0.5627660368904488, "grad_norm": 0.42825424671173096, "kl": 0.50927734375, "learning_rate": 5.298242231751305e-07, "loss": 0.0204, "reward": 1.2103795111179352, "reward_std": 0.14800026267766953, "rewards/accuracy_reward": 0.2209821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 1884 }, { "completion_length": 722.6205749511719, "epoch": 0.5630647449779703, "grad_norm": 0.9497143626213074, "kl": 0.830322265625, "learning_rate": 5.293553447709729e-07, "loss": 0.0333, "reward": 1.1199777126312256, "reward_std": 0.20681652054190636, "rewards/accuracy_reward": 0.1406250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 1885 }, { "completion_length": 560.4955673217773, "epoch": 0.5633634530654917, "grad_norm": 0.7383110523223877, "kl": 0.339111328125, "learning_rate": 5.288864888261765e-07, "loss": 0.0136, "reward": 1.2405134737491608, "reward_std": 0.16158168017864227, "rewards/accuracy_reward": 0.2455357238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949776977300644, "step": 1886 }, { "completion_length": 632.2076263427734, "epoch": 0.5636621611530133, "grad_norm": 0.3208366334438324, "kl": 0.4190673828125, "learning_rate": 5.28417655850811e-07, "loss": 0.0168, "reward": 1.1635045409202576, "reward_std": 0.13194098137319088, "rewards/accuracy_reward": 0.1763392984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 1887 }, { "completion_length": 637.3772583007812, "epoch": 0.5639608692405347, "grad_norm": 0.42613011598587036, "kl": 0.36767578125, "learning_rate": 5.279488463549208e-07, "loss": 0.0147, "reward": 1.1093750298023224, "reward_std": 0.15325486287474632, "rewards/accuracy_reward": 0.12276786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071939468384, "step": 1888 }, { "completion_length": 602.029052734375, "epoch": 0.5642595773280562, "grad_norm": 0.291652649641037, "kl": 0.262939453125, "learning_rate": 5.274800608485243e-07, "loss": 0.0105, "reward": 1.1679687798023224, "reward_std": 0.11966456472873688, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916294813156128, "step": 1889 }, { "completion_length": 600.0178909301758, "epoch": 0.5645582854155776, "grad_norm": 0.39807766675949097, "kl": 0.1617431640625, "learning_rate": 5.270112998416145e-07, "loss": 0.0065, "reward": 1.2410715222358704, "reward_std": 0.13946010544896126, "rewards/accuracy_reward": 0.24776786286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 1890 }, { "completion_length": 695.4308471679688, "epoch": 0.5648569935030991, "grad_norm": 0.48094603419303894, "kl": 0.290283203125, "learning_rate": 5.265425638441574e-07, "loss": 0.0116, "reward": 1.1160714626312256, "reward_std": 0.1762685328722, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 1891 }, { "completion_length": 677.732177734375, "epoch": 0.5651557015906206, "grad_norm": 0.528839111328125, "kl": 0.4493408203125, "learning_rate": 5.260738533660917e-07, "loss": 0.018, "reward": 1.1729910969734192, "reward_std": 0.13616662099957466, "rewards/accuracy_reward": 0.19642857951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625447034836, "step": 1892 }, { "completion_length": 612.5580749511719, "epoch": 0.5654544096781421, "grad_norm": 0.3422158658504486, "kl": 0.322265625, "learning_rate": 5.256051689173284e-07, "loss": 0.0129, "reward": 1.2220982909202576, "reward_std": 0.11983959469944239, "rewards/accuracy_reward": 0.2276785857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 1893 }, { "completion_length": 639.7879791259766, "epoch": 0.5657531177656635, "grad_norm": 0.4374244213104248, "kl": 0.391845703125, "learning_rate": 5.251365110077506e-07, "loss": 0.0156, "reward": 1.1969866752624512, "reward_std": 0.14470139518380165, "rewards/accuracy_reward": 0.20982143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871651977300644, "step": 1894 }, { "completion_length": 620.4799423217773, "epoch": 0.5660518258531849, "grad_norm": 0.23393966257572174, "kl": 0.24658203125, "learning_rate": 5.246678801472118e-07, "loss": 0.0099, "reward": 1.1540179252624512, "reward_std": 0.14929198659956455, "rewards/accuracy_reward": 0.16294643376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 1895 }, { "completion_length": 725.560302734375, "epoch": 0.5663505339407064, "grad_norm": 0.49525749683380127, "kl": 0.294921875, "learning_rate": 5.241992768455366e-07, "loss": 0.0118, "reward": 1.1741071939468384, "reward_std": 0.059854544000700116, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9977678805589676, "step": 1896 }, { "completion_length": 665.9732437133789, "epoch": 0.5666492420282279, "grad_norm": 0.5334664583206177, "kl": 0.4638671875, "learning_rate": 5.237307016125194e-07, "loss": 0.0186, "reward": 1.0228795111179352, "reward_std": 0.06401307042688131, "rewards/accuracy_reward": 0.04017857206054032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.982700914144516, "step": 1897 }, { "completion_length": 661.8839569091797, "epoch": 0.5669479501157494, "grad_norm": 0.6064989566802979, "kl": 0.49072265625, "learning_rate": 5.232621549579242e-07, "loss": 0.0196, "reward": 1.1406250596046448, "reward_std": 0.17391308769583702, "rewards/accuracy_reward": 0.15848215110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98214291036129, "step": 1898 }, { "completion_length": 605.5223541259766, "epoch": 0.5672466582032708, "grad_norm": 0.54103022813797, "kl": 0.38153076171875, "learning_rate": 5.22793637391484e-07, "loss": 0.0152, "reward": 1.1065848767757416, "reward_std": 0.15898361429572105, "rewards/accuracy_reward": 0.11383929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 1899 }, { "completion_length": 648.6406478881836, "epoch": 0.5675453662907923, "grad_norm": 0.7249769568443298, "kl": 0.497314453125, "learning_rate": 5.223251494228995e-07, "loss": 0.0199, "reward": 1.1763393580913544, "reward_std": 0.14010326005518436, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1900 }, { "completion_length": 675.6138763427734, "epoch": 0.5678440743783137, "grad_norm": 0.43524307012557983, "kl": 0.273681640625, "learning_rate": 5.218566915618402e-07, "loss": 0.011, "reward": 1.1523437798023224, "reward_std": 0.12150735780596733, "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616454601288, "step": 1901 }, { "completion_length": 597.3817367553711, "epoch": 0.5681427824658353, "grad_norm": 0.3685872256755829, "kl": 0.34619140625, "learning_rate": 5.213882643179422e-07, "loss": 0.0138, "reward": 1.1847098767757416, "reward_std": 0.19783813133835793, "rewards/accuracy_reward": 0.1941964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134439468384, "step": 1902 }, { "completion_length": 700.825927734375, "epoch": 0.5684414905533567, "grad_norm": 0.4907711148262024, "kl": 0.4951171875, "learning_rate": 5.209198682008081e-07, "loss": 0.0198, "reward": 1.1406250298023224, "reward_std": 0.127073148265481, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 1903 }, { "completion_length": 674.4888610839844, "epoch": 0.5687401986408782, "grad_norm": 0.6805552244186401, "kl": 0.337646484375, "learning_rate": 5.204515037200074e-07, "loss": 0.0135, "reward": 1.2064732313156128, "reward_std": 0.1265758213121444, "rewards/accuracy_reward": 0.2209821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 1904 }, { "completion_length": 661.591552734375, "epoch": 0.5690389067283996, "grad_norm": 0.47569912672042847, "kl": 0.2890625, "learning_rate": 5.199831713850748e-07, "loss": 0.0116, "reward": 1.1735491752624512, "reward_std": 0.14988607913255692, "rewards/accuracy_reward": 0.1830357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 1905 }, { "completion_length": 621.6652069091797, "epoch": 0.5693376148159212, "grad_norm": 0.3711089491844177, "kl": 0.2794189453125, "learning_rate": 5.195148717055094e-07, "loss": 0.0112, "reward": 1.1802455931901932, "reward_std": 0.11736956425011158, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 1906 }, { "completion_length": 613.7857513427734, "epoch": 0.5696363229034426, "grad_norm": 0.5654884576797485, "kl": 0.457275390625, "learning_rate": 5.19046605190776e-07, "loss": 0.0183, "reward": 1.1088170111179352, "reward_std": 0.14658940397202969, "rewards/accuracy_reward": 0.12946428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 1907 }, { "completion_length": 668.4732360839844, "epoch": 0.5699350309909641, "grad_norm": 0.36273616552352905, "kl": 0.271728515625, "learning_rate": 5.185783723503022e-07, "loss": 0.0109, "reward": 1.2282366156578064, "reward_std": 0.22523143887519836, "rewards/accuracy_reward": 0.2410714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1908 }, { "completion_length": 670.3661193847656, "epoch": 0.5702337390784855, "grad_norm": 0.3236788213253021, "kl": 0.2659912109375, "learning_rate": 5.181101736934798e-07, "loss": 0.0106, "reward": 1.1819196939468384, "reward_std": 0.19006889313459396, "rewards/accuracy_reward": 0.19419644214212894, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1909 }, { "completion_length": 548.4576034545898, "epoch": 0.570532447166007, "grad_norm": 0.44102784991264343, "kl": 0.2081298828125, "learning_rate": 5.176420097296627e-07, "loss": 0.0083, "reward": 1.1741071939468384, "reward_std": 0.1457194723188877, "rewards/accuracy_reward": 0.17857143748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9955357313156128, "step": 1910 }, { "completion_length": 686.2723541259766, "epoch": 0.5708311552535285, "grad_norm": 0.39460042119026184, "kl": 0.474365234375, "learning_rate": 5.171738809681677e-07, "loss": 0.019, "reward": 1.1992187798023224, "reward_std": 0.15787188336253166, "rewards/accuracy_reward": 0.2165178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 1911 }, { "completion_length": 664.3661041259766, "epoch": 0.57112986334105, "grad_norm": 0.3210844099521637, "kl": 0.2874755859375, "learning_rate": 5.167057879182729e-07, "loss": 0.0115, "reward": 1.193638414144516, "reward_std": 0.18054029531776905, "rewards/accuracy_reward": 0.2053571566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 1912 }, { "completion_length": 651.8616485595703, "epoch": 0.5714285714285714, "grad_norm": 0.46923646330833435, "kl": 0.270751953125, "learning_rate": 5.162377310892177e-07, "loss": 0.0108, "reward": 1.2265625298023224, "reward_std": 0.20154014229774475, "rewards/accuracy_reward": 0.2366071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 1913 }, { "completion_length": 616.7120819091797, "epoch": 0.5717272795160929, "grad_norm": 0.27090421319007874, "kl": 0.3544921875, "learning_rate": 5.15769710990202e-07, "loss": 0.0142, "reward": 1.2209821939468384, "reward_std": 0.20329941250383854, "rewards/accuracy_reward": 0.23214287124574184, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 1914 }, { "completion_length": 668.5111846923828, "epoch": 0.5720259876036143, "grad_norm": 0.4329092800617218, "kl": 0.418701171875, "learning_rate": 5.153017281303858e-07, "loss": 0.0167, "reward": 1.2025670111179352, "reward_std": 0.125105245038867, "rewards/accuracy_reward": 0.212053582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 1915 }, { "completion_length": 638.2343902587891, "epoch": 0.5723246956911359, "grad_norm": 0.6592311859130859, "kl": 0.44677734375, "learning_rate": 5.148337830188885e-07, "loss": 0.0179, "reward": 1.2047991752624512, "reward_std": 0.14141440577805042, "rewards/accuracy_reward": 0.216517873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 1916 }, { "completion_length": 690.841552734375, "epoch": 0.5726234037786573, "grad_norm": 0.9612416625022888, "kl": 0.70166015625, "learning_rate": 5.143658761647887e-07, "loss": 0.028, "reward": 1.184151828289032, "reward_std": 0.15482945553958416, "rewards/accuracy_reward": 0.20312500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268431901932, "step": 1917 }, { "completion_length": 755.2232360839844, "epoch": 0.5729221118661788, "grad_norm": 0.5556712746620178, "kl": 0.39501953125, "learning_rate": 5.13898008077123e-07, "loss": 0.0158, "reward": 1.0401786267757416, "reward_std": 0.10570356249809265, "rewards/accuracy_reward": 0.06473214528523386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464775323868, "step": 1918 }, { "completion_length": 663.4710083007812, "epoch": 0.5732208199537002, "grad_norm": 0.7541987895965576, "kl": 0.3984375, "learning_rate": 5.134301792648864e-07, "loss": 0.0159, "reward": 1.1629465222358704, "reward_std": 0.16136403568089008, "rewards/accuracy_reward": 0.1718750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 1919 }, { "completion_length": 593.6964569091797, "epoch": 0.5735195280412217, "grad_norm": 0.5010515451431274, "kl": 0.25146484375, "learning_rate": 5.129623902370304e-07, "loss": 0.0101, "reward": 1.1646206080913544, "reward_std": 0.18017053976655006, "rewards/accuracy_reward": 0.17187501350417733, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 1920 }, { "completion_length": 674.6116333007812, "epoch": 0.5738182361287432, "grad_norm": 0.5133348107337952, "kl": 0.50830078125, "learning_rate": 5.124946415024635e-07, "loss": 0.0203, "reward": 1.1367188096046448, "reward_std": 0.14212366193532944, "rewards/accuracy_reward": 0.149553582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1921 }, { "completion_length": 662.8928833007812, "epoch": 0.5741169442162647, "grad_norm": 0.4867524802684784, "kl": 0.45361328125, "learning_rate": 5.120269335700511e-07, "loss": 0.0182, "reward": 1.3002232909202576, "reward_std": 0.17867417633533478, "rewards/accuracy_reward": 0.305803582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 1922 }, { "completion_length": 661.7433319091797, "epoch": 0.5744156523037861, "grad_norm": 0.32420170307159424, "kl": 0.423828125, "learning_rate": 5.115592669486131e-07, "loss": 0.0169, "reward": 1.2650670409202576, "reward_std": 0.1010410888120532, "rewards/accuracy_reward": 0.27678573690354824, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 1923 }, { "completion_length": 693.7835083007812, "epoch": 0.5747143603913076, "grad_norm": 0.39753565192222595, "kl": 0.716796875, "learning_rate": 5.110916421469249e-07, "loss": 0.0287, "reward": 1.1902902126312256, "reward_std": 0.15809595212340355, "rewards/accuracy_reward": 0.20758929569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 1924 }, { "completion_length": 605.6339721679688, "epoch": 0.575013068478829, "grad_norm": 0.5451651215553284, "kl": 0.37109375, "learning_rate": 5.106240596737168e-07, "loss": 0.0148, "reward": 1.1847098767757416, "reward_std": 0.13664106838405132, "rewards/accuracy_reward": 0.1941964402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 1925 }, { "completion_length": 581.4018096923828, "epoch": 0.5753117765663506, "grad_norm": 0.3698814809322357, "kl": 0.47314453125, "learning_rate": 5.101565200376725e-07, "loss": 0.0189, "reward": 1.1824777126312256, "reward_std": 0.14576679654419422, "rewards/accuracy_reward": 0.18973215017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 1926 }, { "completion_length": 583.0312652587891, "epoch": 0.575610484653872, "grad_norm": 0.4160662293434143, "kl": 0.2431640625, "learning_rate": 5.096890237474292e-07, "loss": 0.0097, "reward": 1.1015625447034836, "reward_std": 0.08979155402630568, "rewards/accuracy_reward": 0.1071428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 1927 }, { "completion_length": 692.8393096923828, "epoch": 0.5759091927413935, "grad_norm": 0.9557473063468933, "kl": 1.0439453125, "learning_rate": 5.092215713115772e-07, "loss": 0.0418, "reward": 1.1529018580913544, "reward_std": 0.16342365741729736, "rewards/accuracy_reward": 0.1718750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268431901932, "step": 1928 }, { "completion_length": 669.6160888671875, "epoch": 0.5762079008289149, "grad_norm": 0.42537423968315125, "kl": 0.470703125, "learning_rate": 5.08754163238659e-07, "loss": 0.0188, "reward": 1.1227678954601288, "reward_std": 0.15802357159554958, "rewards/accuracy_reward": 0.1428571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 1929 }, { "completion_length": 708.4040374755859, "epoch": 0.5765066089164365, "grad_norm": 0.5971729755401611, "kl": 0.465576171875, "learning_rate": 5.082868000371686e-07, "loss": 0.0186, "reward": 1.1685268133878708, "reward_std": 0.20447508618235588, "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1930 }, { "completion_length": 649.716552734375, "epoch": 0.5768053170039579, "grad_norm": 1.4341514110565186, "kl": 0.283447265625, "learning_rate": 5.078194822155513e-07, "loss": 0.0113, "reward": 1.0943080484867096, "reward_std": 0.11598595604300499, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937798023224, "step": 1931 }, { "completion_length": 715.8348541259766, "epoch": 0.5771040250914794, "grad_norm": 0.6176071763038635, "kl": 0.58154296875, "learning_rate": 5.073522102822034e-07, "loss": 0.0233, "reward": 1.2706473767757416, "reward_std": 0.16624676622450352, "rewards/accuracy_reward": 0.2857142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330931901932, "step": 1932 }, { "completion_length": 682.7142944335938, "epoch": 0.5774027331790008, "grad_norm": 0.5198960304260254, "kl": 0.597900390625, "learning_rate": 5.068849847454709e-07, "loss": 0.0239, "reward": 1.217633992433548, "reward_std": 0.12848859652876854, "rewards/accuracy_reward": 0.2410714440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625298023224, "step": 1933 }, { "completion_length": 604.6473236083984, "epoch": 0.5777014412665223, "grad_norm": 0.6930674314498901, "kl": 0.2593994140625, "learning_rate": 5.06417806113649e-07, "loss": 0.0104, "reward": 1.1367188096046448, "reward_std": 0.17783885076642036, "rewards/accuracy_reward": 0.14732143515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 1934 }, { "completion_length": 562.1428833007812, "epoch": 0.5780001493540438, "grad_norm": 0.15657225251197815, "kl": 0.1590576171875, "learning_rate": 5.059506748949825e-07, "loss": 0.0064, "reward": 1.1696429252624512, "reward_std": 0.10026032011955976, "rewards/accuracy_reward": 0.17187500488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9977678656578064, "step": 1935 }, { "completion_length": 670.8795013427734, "epoch": 0.5782988574415653, "grad_norm": 0.9079229831695557, "kl": 0.71435546875, "learning_rate": 5.054835915976646e-07, "loss": 0.0285, "reward": 1.2611607611179352, "reward_std": 0.2211020439863205, "rewards/accuracy_reward": 0.2924107350409031, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500447034836, "step": 1936 }, { "completion_length": 746.8661041259766, "epoch": 0.5785975655290867, "grad_norm": 0.4074627161026001, "kl": 0.330810546875, "learning_rate": 5.05016556729836e-07, "loss": 0.0132, "reward": 1.1110491752624512, "reward_std": 0.17811029218137264, "rewards/accuracy_reward": 0.12500000838190317, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 1937 }, { "completion_length": 619.1294860839844, "epoch": 0.5788962736166081, "grad_norm": 0.6996570229530334, "kl": 0.576904296875, "learning_rate": 5.045495707995847e-07, "loss": 0.0231, "reward": 1.1986607611179352, "reward_std": 0.1332932710647583, "rewards/accuracy_reward": 0.2075892984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1938 }, { "completion_length": 663.4576034545898, "epoch": 0.5791949817041296, "grad_norm": 0.6517667770385742, "kl": 0.45068359375, "learning_rate": 5.040826343149458e-07, "loss": 0.018, "reward": 1.2025670111179352, "reward_std": 0.1713742557913065, "rewards/accuracy_reward": 0.212053582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 1939 }, { "completion_length": 663.6295013427734, "epoch": 0.5794936897916511, "grad_norm": 0.32197466492652893, "kl": 0.50048828125, "learning_rate": 5.036157477839004e-07, "loss": 0.02, "reward": 1.1568080484867096, "reward_std": 0.14848658815026283, "rewards/accuracy_reward": 0.165178582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916294813156128, "step": 1940 }, { "completion_length": 690.0513610839844, "epoch": 0.5797923978791726, "grad_norm": 0.779393196105957, "kl": 0.506103515625, "learning_rate": 5.031489117143753e-07, "loss": 0.0203, "reward": 1.1891741752624512, "reward_std": 0.24132990092039108, "rewards/accuracy_reward": 0.20312500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 1941 }, { "completion_length": 649.685302734375, "epoch": 0.580091105966694, "grad_norm": 0.39750227332115173, "kl": 0.67822265625, "learning_rate": 5.026821266142422e-07, "loss": 0.0272, "reward": 1.2924107611179352, "reward_std": 0.19896375853568316, "rewards/accuracy_reward": 0.30580359185114503, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1942 }, { "completion_length": 657.0781707763672, "epoch": 0.5803898140542155, "grad_norm": 1.420432448387146, "kl": 0.5711669921875, "learning_rate": 5.022153929913178e-07, "loss": 0.0228, "reward": 1.231026828289032, "reward_std": 0.15401358529925346, "rewards/accuracy_reward": 0.24553572619333863, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 1943 }, { "completion_length": 707.0312805175781, "epoch": 0.5806885221417369, "grad_norm": 0.6891472339630127, "kl": 0.503662109375, "learning_rate": 5.017487113533625e-07, "loss": 0.0202, "reward": 1.1065848767757416, "reward_std": 0.21634309738874435, "rewards/accuracy_reward": 0.12946429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.977120578289032, "step": 1944 }, { "completion_length": 719.2723541259766, "epoch": 0.5809872302292585, "grad_norm": 0.5713549852371216, "kl": 0.560546875, "learning_rate": 5.012820822080799e-07, "loss": 0.0224, "reward": 1.1389509439468384, "reward_std": 0.1840471439063549, "rewards/accuracy_reward": 0.15178572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1945 }, { "completion_length": 637.4754791259766, "epoch": 0.5812859383167799, "grad_norm": 1.0034829378128052, "kl": 0.5986328125, "learning_rate": 5.008155060631171e-07, "loss": 0.0239, "reward": 1.1986607611179352, "reward_std": 0.2165310736745596, "rewards/accuracy_reward": 0.22544643189758062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.973214328289032, "step": 1946 }, { "completion_length": 664.9241638183594, "epoch": 0.5815846464043014, "grad_norm": 1.223605990409851, "kl": 0.869140625, "learning_rate": 5.003489834260631e-07, "loss": 0.0348, "reward": 1.1774553954601288, "reward_std": 0.1300571747124195, "rewards/accuracy_reward": 0.1941964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 1947 }, { "completion_length": 520.8393096923828, "epoch": 0.5818833544918228, "grad_norm": 0.4270680248737335, "kl": 0.4208984375, "learning_rate": 4.99882514804449e-07, "loss": 0.0169, "reward": 1.295758992433548, "reward_std": 0.17499591410160065, "rewards/accuracy_reward": 0.3013392984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 1948 }, { "completion_length": 681.5446624755859, "epoch": 0.5821820625793444, "grad_norm": 0.3394615948200226, "kl": 0.343505859375, "learning_rate": 4.994161007057465e-07, "loss": 0.0137, "reward": 1.1869420111179352, "reward_std": 0.16176720708608627, "rewards/accuracy_reward": 0.2053571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 1949 }, { "completion_length": 654.2991333007812, "epoch": 0.5824807706668658, "grad_norm": 0.9884783029556274, "kl": 0.8779296875, "learning_rate": 4.989497416373687e-07, "loss": 0.0351, "reward": 1.1914063096046448, "reward_std": 0.16643663123250008, "rewards/accuracy_reward": 0.2209821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.97042416036129, "step": 1950 }, { "completion_length": 657.5960235595703, "epoch": 0.5827794787543873, "grad_norm": 1.9373722076416016, "kl": 0.69970703125, "learning_rate": 4.984834381066687e-07, "loss": 0.028, "reward": 1.2706473767757416, "reward_std": 0.16843270137906075, "rewards/accuracy_reward": 0.294642873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045111179352, "step": 1951 }, { "completion_length": 634.0602951049805, "epoch": 0.5830781868419087, "grad_norm": 0.4846351146697998, "kl": 0.506103515625, "learning_rate": 4.980171906209386e-07, "loss": 0.0202, "reward": 1.2254464626312256, "reward_std": 0.12297599203884602, "rewards/accuracy_reward": 0.2366071566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1952 }, { "completion_length": 687.1986846923828, "epoch": 0.5833768949294302, "grad_norm": 0.9160714745521545, "kl": 0.669677734375, "learning_rate": 4.975509996874106e-07, "loss": 0.0268, "reward": 1.147879496216774, "reward_std": 0.20166026428341866, "rewards/accuracy_reward": 0.17187500349245965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045111179352, "step": 1953 }, { "completion_length": 650.3080596923828, "epoch": 0.5836756030169516, "grad_norm": 0.4691300094127655, "kl": 0.74072265625, "learning_rate": 4.970848658132541e-07, "loss": 0.0297, "reward": 1.2979911118745804, "reward_std": 0.1621055919677019, "rewards/accuracy_reward": 0.3102678693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1954 }, { "completion_length": 669.8638610839844, "epoch": 0.5839743111044732, "grad_norm": 0.27959805727005005, "kl": 0.500732421875, "learning_rate": 4.966187895055776e-07, "loss": 0.02, "reward": 1.2181920111179352, "reward_std": 0.16207006573677063, "rewards/accuracy_reward": 0.2299107275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812649011612, "step": 1955 }, { "completion_length": 681.3839569091797, "epoch": 0.5842730191919946, "grad_norm": 1.10115385055542, "kl": 0.67578125, "learning_rate": 4.961527712714259e-07, "loss": 0.0271, "reward": 1.1188616454601288, "reward_std": 0.10121178906410933, "rewards/accuracy_reward": 0.13392857764847577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 1956 }, { "completion_length": 660.8259124755859, "epoch": 0.5845717272795161, "grad_norm": 2.3141133785247803, "kl": 1.0361328125, "learning_rate": 4.956868116177816e-07, "loss": 0.0415, "reward": 1.2578125596046448, "reward_std": 0.19894453696906567, "rewards/accuracy_reward": 0.290178582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9676339626312256, "step": 1957 }, { "completion_length": 580.3259124755859, "epoch": 0.5848704353670375, "grad_norm": 0.5615019798278809, "kl": 0.302734375, "learning_rate": 4.95220911051563e-07, "loss": 0.0121, "reward": 1.1953125149011612, "reward_std": 0.15743201412260532, "rewards/accuracy_reward": 0.20758929592557251, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 1958 }, { "completion_length": 628.5402069091797, "epoch": 0.5851691434545591, "grad_norm": 0.5948117971420288, "kl": 0.66748046875, "learning_rate": 4.947550700796242e-07, "loss": 0.0267, "reward": 1.233258992433548, "reward_std": 0.2016987968236208, "rewards/accuracy_reward": 0.2522321566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 1959 }, { "completion_length": 674.7366485595703, "epoch": 0.5854678515420805, "grad_norm": 0.7077656388282776, "kl": 1.01904296875, "learning_rate": 4.942892892087546e-07, "loss": 0.0407, "reward": 1.0915178954601288, "reward_std": 0.1632572275120765, "rewards/accuracy_reward": 0.11383928847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786267757416, "step": 1960 }, { "completion_length": 745.5446624755859, "epoch": 0.585766559629602, "grad_norm": 0.48370447754859924, "kl": 0.6865234375, "learning_rate": 4.938235689456782e-07, "loss": 0.0275, "reward": 1.147321492433548, "reward_std": 0.18923757039010525, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 1961 }, { "completion_length": 669.0469131469727, "epoch": 0.5860652677171234, "grad_norm": 0.30462175607681274, "kl": 0.58154296875, "learning_rate": 4.933579097970529e-07, "loss": 0.0232, "reward": 1.2304687798023224, "reward_std": 0.20217080414295197, "rewards/accuracy_reward": 0.2410714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 1962 }, { "completion_length": 691.810302734375, "epoch": 0.5863639758046449, "grad_norm": 0.5676645636558533, "kl": 0.3782958984375, "learning_rate": 4.9289231226947e-07, "loss": 0.0152, "reward": 1.1194197237491608, "reward_std": 0.10132561158388853, "rewards/accuracy_reward": 0.133928582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 1963 }, { "completion_length": 677.7902069091797, "epoch": 0.5866626838921664, "grad_norm": 0.33188366889953613, "kl": 0.71630859375, "learning_rate": 4.924267768694544e-07, "loss": 0.0286, "reward": 1.130022406578064, "reward_std": 0.15889139845967293, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366305589676, "step": 1964 }, { "completion_length": 674.5089569091797, "epoch": 0.5869613919796879, "grad_norm": 0.6959500312805176, "kl": 0.60498046875, "learning_rate": 4.919613041034628e-07, "loss": 0.0242, "reward": 1.168526828289032, "reward_std": 0.17643028497695923, "rewards/accuracy_reward": 0.1785714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 1965 }, { "completion_length": 610.9531555175781, "epoch": 0.5872601000672093, "grad_norm": 0.2897852659225464, "kl": 0.375732421875, "learning_rate": 4.914958944778837e-07, "loss": 0.0151, "reward": 1.2215402722358704, "reward_std": 0.16298011038452387, "rewards/accuracy_reward": 0.2299107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 1966 }, { "completion_length": 618.5245742797852, "epoch": 0.5875588081547308, "grad_norm": 0.5014230608940125, "kl": 0.42431640625, "learning_rate": 4.910305484990377e-07, "loss": 0.0169, "reward": 1.2873884439468384, "reward_std": 0.15316476859152317, "rewards/accuracy_reward": 0.29687500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 1967 }, { "completion_length": 657.7522583007812, "epoch": 0.5878575162422522, "grad_norm": 0.5543004870414734, "kl": 0.327880859375, "learning_rate": 4.905652666731751e-07, "loss": 0.0131, "reward": 1.0435268580913544, "reward_std": 0.12777280062437057, "rewards/accuracy_reward": 0.06250000116415322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 1968 }, { "completion_length": 702.8973541259766, "epoch": 0.5881562243297738, "grad_norm": 0.3732345402240753, "kl": 0.37109375, "learning_rate": 4.901000495064774e-07, "loss": 0.0149, "reward": 1.1389509439468384, "reward_std": 0.1870052982121706, "rewards/accuracy_reward": 0.15178572619333863, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 1969 }, { "completion_length": 744.9732360839844, "epoch": 0.5884549324172952, "grad_norm": 0.43117383122444153, "kl": 0.693359375, "learning_rate": 4.896348975050546e-07, "loss": 0.0278, "reward": 1.094308078289032, "reward_std": 0.13876152504235506, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723618745804, "step": 1970 }, { "completion_length": 726.1585235595703, "epoch": 0.5887536405048167, "grad_norm": 0.6626588106155396, "kl": 0.581787109375, "learning_rate": 4.891698111749471e-07, "loss": 0.0233, "reward": 1.1171875298023224, "reward_std": 0.17243101447820663, "rewards/accuracy_reward": 0.14062500419095159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625596046448, "step": 1971 }, { "completion_length": 674.6986846923828, "epoch": 0.5890523485923381, "grad_norm": 0.3434937298297882, "kl": 0.333984375, "learning_rate": 4.88704791022123e-07, "loss": 0.0134, "reward": 1.2812500596046448, "reward_std": 0.12712948210537434, "rewards/accuracy_reward": 0.2924107275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 1972 }, { "completion_length": 703.169677734375, "epoch": 0.5893510566798597, "grad_norm": 0.810050368309021, "kl": 0.33447265625, "learning_rate": 4.882398375524789e-07, "loss": 0.0134, "reward": 1.1875000298023224, "reward_std": 0.1748334150761366, "rewards/accuracy_reward": 0.2098214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786267757416, "step": 1973 }, { "completion_length": 660.2611999511719, "epoch": 0.5896497647673811, "grad_norm": 0.3671497702598572, "kl": 0.8173828125, "learning_rate": 4.877749512718381e-07, "loss": 0.0327, "reward": 1.1484375894069672, "reward_std": 0.16355600208044052, "rewards/accuracy_reward": 0.1674107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 1974 }, { "completion_length": 642.8661041259766, "epoch": 0.5899484728549026, "grad_norm": 1.214159369468689, "kl": 0.26171875, "learning_rate": 4.873101326859517e-07, "loss": 0.0105, "reward": 1.2700893580913544, "reward_std": 0.17837894149124622, "rewards/accuracy_reward": 0.28125001303851604, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 1975 }, { "completion_length": 643.5803833007812, "epoch": 0.590247180942424, "grad_norm": 0.4251362383365631, "kl": 0.330810546875, "learning_rate": 4.868453823004967e-07, "loss": 0.0133, "reward": 1.2014509439468384, "reward_std": 0.14761760085821152, "rewards/accuracy_reward": 0.2187500111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 1976 }, { "completion_length": 663.3125305175781, "epoch": 0.5905458890299455, "grad_norm": 0.45031049847602844, "kl": 0.4755859375, "learning_rate": 4.86380700621076e-07, "loss": 0.019, "reward": 1.1562500596046448, "reward_std": 0.14077585097402334, "rewards/accuracy_reward": 0.1674107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 1977 }, { "completion_length": 712.7946929931641, "epoch": 0.590844597117467, "grad_norm": 0.9343342185020447, "kl": 0.72509765625, "learning_rate": 4.859160881532176e-07, "loss": 0.029, "reward": 1.1350447088479996, "reward_std": 0.13279239274561405, "rewards/accuracy_reward": 0.1584821566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625596046448, "step": 1978 }, { "completion_length": 659.5580596923828, "epoch": 0.5911433052049885, "grad_norm": 0.8116529583930969, "kl": 0.44189453125, "learning_rate": 4.854515454023745e-07, "loss": 0.0177, "reward": 1.1250000596046448, "reward_std": 0.15067769214510918, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 1979 }, { "completion_length": 685.3169860839844, "epoch": 0.5914420132925099, "grad_norm": 0.44212764501571655, "kl": 0.84130859375, "learning_rate": 4.849870728739234e-07, "loss": 0.0336, "reward": 1.119419664144516, "reward_std": 0.19316503033041954, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9720982611179352, "step": 1980 }, { "completion_length": 626.9308395385742, "epoch": 0.5917407213800313, "grad_norm": 0.24147716164588928, "kl": 0.3192138671875, "learning_rate": 4.845226710731651e-07, "loss": 0.0128, "reward": 1.2187500298023224, "reward_std": 0.1482689566910267, "rewards/accuracy_reward": 0.22767858393490314, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1981 }, { "completion_length": 656.2232360839844, "epoch": 0.5920394294675528, "grad_norm": 0.32874056696891785, "kl": 0.396484375, "learning_rate": 4.840583405053233e-07, "loss": 0.0159, "reward": 1.1305803954601288, "reward_std": 0.1629613135010004, "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 1982 }, { "completion_length": 591.1696624755859, "epoch": 0.5923381375550743, "grad_norm": 0.3001185357570648, "kl": 0.398193359375, "learning_rate": 4.83594081675544e-07, "loss": 0.0159, "reward": 1.1718750596046448, "reward_std": 0.17558522894978523, "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 1983 }, { "completion_length": 734.5469055175781, "epoch": 0.5926368456425958, "grad_norm": 1.2763088941574097, "kl": 0.80029296875, "learning_rate": 4.831298950888953e-07, "loss": 0.032, "reward": 1.129464328289032, "reward_std": 0.19976842869073153, "rewards/accuracy_reward": 0.14732143515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428805589676, "step": 1984 }, { "completion_length": 704.9219055175781, "epoch": 0.5929355537301172, "grad_norm": 0.8684783577919006, "kl": 1.0146484375, "learning_rate": 4.826657812503668e-07, "loss": 0.0405, "reward": 1.3247768580913544, "reward_std": 0.2091875784099102, "rewards/accuracy_reward": 0.3415178768336773, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 1985 }, { "completion_length": 583.1451110839844, "epoch": 0.5932342618176387, "grad_norm": 0.416262686252594, "kl": 0.492919921875, "learning_rate": 4.822017406648689e-07, "loss": 0.0197, "reward": 1.1445313096046448, "reward_std": 0.12944343127310276, "rewards/accuracy_reward": 0.15401786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 1986 }, { "completion_length": 625.9799499511719, "epoch": 0.5935329699051601, "grad_norm": 0.6468381285667419, "kl": 0.4501953125, "learning_rate": 4.817377738372321e-07, "loss": 0.018, "reward": 1.1065848767757416, "reward_std": 0.15147051215171814, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 1987 }, { "completion_length": 587.4085083007812, "epoch": 0.5938316779926817, "grad_norm": 0.3278358280658722, "kl": 0.381103515625, "learning_rate": 4.812738812722069e-07, "loss": 0.0152, "reward": 1.2036831080913544, "reward_std": 0.14690298587083817, "rewards/accuracy_reward": 0.2120535857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916294813156128, "step": 1988 }, { "completion_length": 623.5535888671875, "epoch": 0.5941303860802031, "grad_norm": 0.5096176266670227, "kl": 0.356201171875, "learning_rate": 4.808100634744632e-07, "loss": 0.0143, "reward": 1.1997767984867096, "reward_std": 0.17441920563578606, "rewards/accuracy_reward": 0.2098214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 1989 }, { "completion_length": 623.5335083007812, "epoch": 0.5944290941677246, "grad_norm": 0.30400317907333374, "kl": 0.29052734375, "learning_rate": 4.803463209485891e-07, "loss": 0.0116, "reward": 1.215401828289032, "reward_std": 0.16002847626805305, "rewards/accuracy_reward": 0.2299107275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 1990 }, { "completion_length": 682.8660888671875, "epoch": 0.594727802255246, "grad_norm": 0.6980232000350952, "kl": 0.4345703125, "learning_rate": 4.798826541990908e-07, "loss": 0.0174, "reward": 1.1138392984867096, "reward_std": 0.16519411560148, "rewards/accuracy_reward": 0.12053571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 1991 }, { "completion_length": 599.3236846923828, "epoch": 0.5950265103427675, "grad_norm": 0.4113408029079437, "kl": 0.36328125, "learning_rate": 4.794190637303926e-07, "loss": 0.0145, "reward": 1.2433036267757416, "reward_std": 0.19329262245446444, "rewards/accuracy_reward": 0.2522321541327983, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 1992 }, { "completion_length": 653.5714569091797, "epoch": 0.595325218430289, "grad_norm": 0.571906566619873, "kl": 0.376220703125, "learning_rate": 4.789555500468354e-07, "loss": 0.015, "reward": 1.096540242433548, "reward_std": 0.15851814486086369, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 1993 }, { "completion_length": 634.9286041259766, "epoch": 0.5956239265178105, "grad_norm": 0.44373178482055664, "kl": 0.26708984375, "learning_rate": 4.784921136526767e-07, "loss": 0.0107, "reward": 1.1099331080913544, "reward_std": 0.1492699459195137, "rewards/accuracy_reward": 0.1227678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 1994 }, { "completion_length": 647.7477874755859, "epoch": 0.5959226346053319, "grad_norm": 0.31289005279541016, "kl": 0.260986328125, "learning_rate": 4.780287550520896e-07, "loss": 0.0104, "reward": 1.184151828289032, "reward_std": 0.18141926638782024, "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 1995 }, { "completion_length": 685.3058319091797, "epoch": 0.5962213426928534, "grad_norm": 0.5082846283912659, "kl": 0.280517578125, "learning_rate": 4.775654747491633e-07, "loss": 0.0112, "reward": 1.199776828289032, "reward_std": 0.1524613844230771, "rewards/accuracy_reward": 0.2120535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 1996 }, { "completion_length": 594.2143096923828, "epoch": 0.5965200507803748, "grad_norm": 0.5069788098335266, "kl": 0.416748046875, "learning_rate": 4.771022732479011e-07, "loss": 0.0167, "reward": 1.2237723767757416, "reward_std": 0.1473130825906992, "rewards/accuracy_reward": 0.22991071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616454601288, "step": 1997 }, { "completion_length": 637.2098541259766, "epoch": 0.5968187588678964, "grad_norm": 0.31756705045700073, "kl": 0.3486328125, "learning_rate": 4.7663915105222075e-07, "loss": 0.0139, "reward": 1.147321492433548, "reward_std": 0.10596262477338314, "rewards/accuracy_reward": 0.15401786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 1998 }, { "completion_length": 623.935302734375, "epoch": 0.5971174669554178, "grad_norm": 0.4403603672981262, "kl": 0.46240234375, "learning_rate": 4.76176108665954e-07, "loss": 0.0185, "reward": 1.0831473767757416, "reward_std": 0.14818331599235535, "rewards/accuracy_reward": 0.10267857788130641, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 1999 }, { "completion_length": 599.8928680419922, "epoch": 0.5974161750429393, "grad_norm": 0.4342116713523865, "kl": 0.270263671875, "learning_rate": 4.7571314659284545e-07, "loss": 0.0108, "reward": 1.2209821939468384, "reward_std": 0.19935410469770432, "rewards/accuracy_reward": 0.2299107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 2000 }, { "completion_length": 629.3527069091797, "epoch": 0.5977148831304607, "grad_norm": 0.3719198703765869, "kl": 0.3203125, "learning_rate": 4.7525026533655264e-07, "loss": 0.0128, "reward": 1.1311384439468384, "reward_std": 0.11972678825259209, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.992745578289032, "step": 2001 }, { "completion_length": 598.997802734375, "epoch": 0.5980135912179823, "grad_norm": 0.4336593449115753, "kl": 0.36279296875, "learning_rate": 4.747874654006447e-07, "loss": 0.0145, "reward": 1.301339328289032, "reward_std": 0.14826526679098606, "rewards/accuracy_reward": 0.3080357313156128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933036118745804, "step": 2002 }, { "completion_length": 630.6049194335938, "epoch": 0.5983122993055037, "grad_norm": 0.45542004704475403, "kl": 0.459716796875, "learning_rate": 4.7432474728860286e-07, "loss": 0.0184, "reward": 1.3772322237491608, "reward_std": 0.2035442627966404, "rewards/accuracy_reward": 0.3906250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 2003 }, { "completion_length": 663.216552734375, "epoch": 0.5986110073930252, "grad_norm": 0.45368653535842896, "kl": 0.4381103515625, "learning_rate": 4.73862111503819e-07, "loss": 0.0175, "reward": 1.0697545111179352, "reward_std": 0.10399209335446358, "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2004 }, { "completion_length": 595.9464569091797, "epoch": 0.5989097154805466, "grad_norm": 0.42493975162506104, "kl": 0.357177734375, "learning_rate": 4.7339955854959545e-07, "loss": 0.0143, "reward": 1.2689732909202576, "reward_std": 0.07066072151064873, "rewards/accuracy_reward": 0.274553582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9944196492433548, "step": 2005 }, { "completion_length": 664.6562805175781, "epoch": 0.5992084235680681, "grad_norm": 1.5838932991027832, "kl": 0.488037109375, "learning_rate": 4.729370889291446e-07, "loss": 0.0195, "reward": 1.2003348767757416, "reward_std": 0.1788315735757351, "rewards/accuracy_reward": 0.21875000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 2006 }, { "completion_length": 706.0536041259766, "epoch": 0.5995071316555896, "grad_norm": 0.4966599941253662, "kl": 0.54443359375, "learning_rate": 4.7247470314558814e-07, "loss": 0.0218, "reward": 1.1166295111179352, "reward_std": 0.14454389177262783, "rewards/accuracy_reward": 0.12500000302679837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916295111179352, "step": 2007 }, { "completion_length": 649.685302734375, "epoch": 0.5998058397431111, "grad_norm": 0.39373719692230225, "kl": 0.5087890625, "learning_rate": 4.7201240170195624e-07, "loss": 0.0204, "reward": 1.1127232909202576, "reward_std": 0.22367727756500244, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625447034836, "step": 2008 }, { "completion_length": 697.1138763427734, "epoch": 0.6001045478306325, "grad_norm": 0.486427366733551, "kl": 0.377685546875, "learning_rate": 4.715501851011877e-07, "loss": 0.0151, "reward": 1.1964285969734192, "reward_std": 0.1685766987502575, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 2009 }, { "completion_length": 652.6875305175781, "epoch": 0.600403255918154, "grad_norm": 0.7062864303588867, "kl": 0.7080078125, "learning_rate": 4.7108805384612884e-07, "loss": 0.0283, "reward": 1.2087053954601288, "reward_std": 0.2141515351831913, "rewards/accuracy_reward": 0.2254464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 2010 }, { "completion_length": 656.4263610839844, "epoch": 0.6007019640056754, "grad_norm": 0.5553514957427979, "kl": 0.5234375, "learning_rate": 4.706260084395333e-07, "loss": 0.021, "reward": 1.1595982611179352, "reward_std": 0.15179445454850793, "rewards/accuracy_reward": 0.16964286798611283, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 2011 }, { "completion_length": 673.5045013427734, "epoch": 0.601000672093197, "grad_norm": 0.382623553276062, "kl": 0.4808349609375, "learning_rate": 4.701640493840608e-07, "loss": 0.0192, "reward": 1.2672991752624512, "reward_std": 0.18083404563367367, "rewards/accuracy_reward": 0.2812500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 2012 }, { "completion_length": 626.8705520629883, "epoch": 0.6012993801807184, "grad_norm": 0.403555303812027, "kl": 0.739013671875, "learning_rate": 4.697021771822781e-07, "loss": 0.0295, "reward": 1.2003348767757416, "reward_std": 0.22344396635890007, "rewards/accuracy_reward": 0.22767857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562798023224, "step": 2013 }, { "completion_length": 657.1562652587891, "epoch": 0.6015980882682399, "grad_norm": 0.5825003981590271, "kl": 0.34912109375, "learning_rate": 4.6924039233665656e-07, "loss": 0.0139, "reward": 1.1969866752624512, "reward_std": 0.16130965016782284, "rewards/accuracy_reward": 0.2053571604192257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 2014 }, { "completion_length": 596.6518020629883, "epoch": 0.6018967963557613, "grad_norm": 0.8997962474822998, "kl": 0.39990234375, "learning_rate": 4.687786953495728e-07, "loss": 0.016, "reward": 1.156808078289032, "reward_std": 0.14175301790237427, "rewards/accuracy_reward": 0.16741072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2015 }, { "completion_length": 653.0045013427734, "epoch": 0.6021955044432828, "grad_norm": 0.45428624749183655, "kl": 0.496826171875, "learning_rate": 4.683170867233079e-07, "loss": 0.0199, "reward": 1.188058078289032, "reward_std": 0.1659446321427822, "rewards/accuracy_reward": 0.20089286752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 2016 }, { "completion_length": 614.8951110839844, "epoch": 0.6024942125308043, "grad_norm": 0.3793104887008667, "kl": 0.350341796875, "learning_rate": 4.67855566960047e-07, "loss": 0.014, "reward": 1.1595982909202576, "reward_std": 0.15400453098118305, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 2017 }, { "completion_length": 689.4308319091797, "epoch": 0.6027929206183258, "grad_norm": 0.7750604152679443, "kl": 0.52880859375, "learning_rate": 4.673941365618781e-07, "loss": 0.0211, "reward": 1.2427456080913544, "reward_std": 0.24245310947299004, "rewards/accuracy_reward": 0.2611607275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 2018 }, { "completion_length": 759.6629943847656, "epoch": 0.6030916287058472, "grad_norm": 0.4849381446838379, "kl": 0.62451171875, "learning_rate": 4.669327960307924e-07, "loss": 0.025, "reward": 1.158482164144516, "reward_std": 0.1545394193381071, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 2019 }, { "completion_length": 674.3080444335938, "epoch": 0.6033903367933687, "grad_norm": 0.8310149908065796, "kl": 0.58642578125, "learning_rate": 4.6647154586868323e-07, "loss": 0.0235, "reward": 1.2103795409202576, "reward_std": 0.1408237051218748, "rewards/accuracy_reward": 0.22767858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 2020 }, { "completion_length": 567.9509124755859, "epoch": 0.6036890448808901, "grad_norm": 0.613699197769165, "kl": 0.30859375, "learning_rate": 4.660103865773455e-07, "loss": 0.0123, "reward": 1.2338170111179352, "reward_std": 0.1303330734372139, "rewards/accuracy_reward": 0.2410714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455484867096, "step": 2021 }, { "completion_length": 584.4888610839844, "epoch": 0.6039877529684117, "grad_norm": 0.31926873326301575, "kl": 0.209716796875, "learning_rate": 4.6554931865847526e-07, "loss": 0.0084, "reward": 1.1138393580913544, "reward_std": 0.10694535076618195, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9977678656578064, "step": 2022 }, { "completion_length": 682.2344207763672, "epoch": 0.6042864610559331, "grad_norm": 0.8888502717018127, "kl": 0.48974609375, "learning_rate": 4.6508834261366914e-07, "loss": 0.0196, "reward": 1.1372768431901932, "reward_std": 0.1291529443114996, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98995541036129, "step": 2023 }, { "completion_length": 588.3549423217773, "epoch": 0.6045851691434545, "grad_norm": 0.27083998918533325, "kl": 0.3206787109375, "learning_rate": 4.646274589444241e-07, "loss": 0.0128, "reward": 1.0753348767757416, "reward_std": 0.0772212203592062, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949776977300644, "step": 2024 }, { "completion_length": 626.5647583007812, "epoch": 0.604883877230976, "grad_norm": 0.4027782678604126, "kl": 0.364990234375, "learning_rate": 4.641666681521365e-07, "loss": 0.0146, "reward": 1.231026828289032, "reward_std": 0.21050311252474785, "rewards/accuracy_reward": 0.2433035857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2025 }, { "completion_length": 689.6652069091797, "epoch": 0.6051825853184974, "grad_norm": 0.7071657180786133, "kl": 0.433837890625, "learning_rate": 4.6370597073810113e-07, "loss": 0.0174, "reward": 1.208147406578064, "reward_std": 0.21474355459213257, "rewards/accuracy_reward": 0.22321429662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 2026 }, { "completion_length": 690.9553756713867, "epoch": 0.605481293406019, "grad_norm": 0.5012272596359253, "kl": 0.380859375, "learning_rate": 4.6324536720351205e-07, "loss": 0.0153, "reward": 1.0898438394069672, "reward_std": 0.16227765567600727, "rewards/accuracy_reward": 0.10267857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 2027 }, { "completion_length": 608.997802734375, "epoch": 0.6057800014935404, "grad_norm": 0.7111762762069702, "kl": 0.501708984375, "learning_rate": 4.6278485804946044e-07, "loss": 0.0201, "reward": 1.1986607313156128, "reward_std": 0.18361111916601658, "rewards/accuracy_reward": 0.2165178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 2028 }, { "completion_length": 647.2924499511719, "epoch": 0.6060787095810619, "grad_norm": 0.632790207862854, "kl": 0.47900390625, "learning_rate": 4.6232444377693536e-07, "loss": 0.0192, "reward": 1.1852679252624512, "reward_std": 0.16208385676145554, "rewards/accuracy_reward": 0.1941964440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 2029 }, { "completion_length": 603.1250305175781, "epoch": 0.6063774176685833, "grad_norm": 0.35647204518318176, "kl": 0.311279296875, "learning_rate": 4.618641248868221e-07, "loss": 0.0125, "reward": 1.2472098767757416, "reward_std": 0.21583093330264091, "rewards/accuracy_reward": 0.25669644214212894, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 2030 }, { "completion_length": 627.678596496582, "epoch": 0.6066761257561049, "grad_norm": 0.31310054659843445, "kl": 0.47509765625, "learning_rate": 4.6140390187990286e-07, "loss": 0.019, "reward": 1.2812500596046448, "reward_std": 0.1937135010957718, "rewards/accuracy_reward": 0.2924107313156128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393431901932, "step": 2031 }, { "completion_length": 639.9084930419922, "epoch": 0.6069748338436263, "grad_norm": 0.3890296518802643, "kl": 0.704345703125, "learning_rate": 4.6094377525685515e-07, "loss": 0.0282, "reward": 1.0954241454601288, "reward_std": 0.15297459438443184, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 2032 }, { "completion_length": 688.8058319091797, "epoch": 0.6072735419311478, "grad_norm": 0.44253242015838623, "kl": 0.73486328125, "learning_rate": 4.6048374551825143e-07, "loss": 0.0294, "reward": 1.1166295111179352, "reward_std": 0.1334381364285946, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 2033 }, { "completion_length": 687.154052734375, "epoch": 0.6075722500186692, "grad_norm": 0.4153061807155609, "kl": 0.634765625, "learning_rate": 4.600238131645592e-07, "loss": 0.0254, "reward": 1.0909598767757416, "reward_std": 0.1644089538604021, "rewards/accuracy_reward": 0.10714286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 2034 }, { "completion_length": 620.7388763427734, "epoch": 0.6078709581061907, "grad_norm": 0.6961273550987244, "kl": 0.48095703125, "learning_rate": 4.5956397869613937e-07, "loss": 0.0192, "reward": 1.1696428954601288, "reward_std": 0.11973716923967004, "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 2035 }, { "completion_length": 664.2053985595703, "epoch": 0.6081696661937122, "grad_norm": 0.5713018178939819, "kl": 0.8330078125, "learning_rate": 4.591042426132472e-07, "loss": 0.0333, "reward": 1.0870536267757416, "reward_std": 0.1806101519614458, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107313156128, "step": 2036 }, { "completion_length": 635.8661041259766, "epoch": 0.6084683742812337, "grad_norm": 0.991352379322052, "kl": 0.927734375, "learning_rate": 4.5864460541603025e-07, "loss": 0.0372, "reward": 1.1383929252624512, "reward_std": 0.2115812636911869, "rewards/accuracy_reward": 0.16071429569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776785969734192, "step": 2037 }, { "completion_length": 664.0044860839844, "epoch": 0.6087670823687551, "grad_norm": 0.435152143239975, "kl": 0.60009765625, "learning_rate": 4.581850676045288e-07, "loss": 0.0241, "reward": 1.1897321939468384, "reward_std": 0.1289786547422409, "rewards/accuracy_reward": 0.1941964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9955357313156128, "step": 2038 }, { "completion_length": 625.5513763427734, "epoch": 0.6090657904562766, "grad_norm": 1.2368110418319702, "kl": 0.716064453125, "learning_rate": 4.577256296786751e-07, "loss": 0.0287, "reward": 1.2176339626312256, "reward_std": 0.1751029845327139, "rewards/accuracy_reward": 0.238839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946790456772, "step": 2039 }, { "completion_length": 624.9062728881836, "epoch": 0.609364498543798, "grad_norm": 0.7706602811813354, "kl": 0.667236328125, "learning_rate": 4.572662921382924e-07, "loss": 0.0267, "reward": 1.2494420111179352, "reward_std": 0.12783771753311157, "rewards/accuracy_reward": 0.2633928768336773, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 2040 }, { "completion_length": 627.763427734375, "epoch": 0.6096632066313196, "grad_norm": 1.0027375221252441, "kl": 0.896484375, "learning_rate": 4.5680705548309495e-07, "loss": 0.0359, "reward": 1.0585937947034836, "reward_std": 0.15682686492800713, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.982700914144516, "step": 2041 }, { "completion_length": 623.857177734375, "epoch": 0.609961914718841, "grad_norm": 0.5614262819290161, "kl": 0.6748046875, "learning_rate": 4.5634792021268717e-07, "loss": 0.027, "reward": 1.220982164144516, "reward_std": 0.15262319706380367, "rewards/accuracy_reward": 0.2276785857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933036118745804, "step": 2042 }, { "completion_length": 654.3861999511719, "epoch": 0.6102606228063625, "grad_norm": 0.45928069949150085, "kl": 0.4716796875, "learning_rate": 4.5588888682656336e-07, "loss": 0.0188, "reward": 1.086495578289032, "reward_std": 0.12883598916232586, "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134439468384, "step": 2043 }, { "completion_length": 720.7835083007812, "epoch": 0.6105593308938839, "grad_norm": 0.6242275834083557, "kl": 0.42919921875, "learning_rate": 4.5542995582410693e-07, "loss": 0.0172, "reward": 1.1222098767757416, "reward_std": 0.11114107817411423, "rewards/accuracy_reward": 0.13169643329456449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 2044 }, { "completion_length": 657.8303833007812, "epoch": 0.6108580389814054, "grad_norm": 0.4120122492313385, "kl": 0.31640625, "learning_rate": 4.5497112770458944e-07, "loss": 0.0127, "reward": 1.1422991454601288, "reward_std": 0.16637017205357552, "rewards/accuracy_reward": 0.15625000605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 2045 }, { "completion_length": 672.4174346923828, "epoch": 0.6111567470689269, "grad_norm": 0.3775024116039276, "kl": 0.343017578125, "learning_rate": 4.5451240296717143e-07, "loss": 0.0137, "reward": 1.1077009737491608, "reward_std": 0.17808764800429344, "rewards/accuracy_reward": 0.1205357238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2046 }, { "completion_length": 610.8080673217773, "epoch": 0.6114554551564484, "grad_norm": 0.4207998216152191, "kl": 0.416015625, "learning_rate": 4.5405378211090004e-07, "loss": 0.0166, "reward": 1.1322545111179352, "reward_std": 0.13714788295328617, "rewards/accuracy_reward": 0.14285715413279831, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973767757416, "step": 2047 }, { "completion_length": 612.1942291259766, "epoch": 0.6117541632439698, "grad_norm": 0.5460286140441895, "kl": 0.404296875, "learning_rate": 4.5359526563471007e-07, "loss": 0.0162, "reward": 1.180803656578064, "reward_std": 0.20153719559311867, "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 2048 }, { "completion_length": 702.0982513427734, "epoch": 0.6120528713314913, "grad_norm": 0.63337242603302, "kl": 0.33203125, "learning_rate": 4.531368540374223e-07, "loss": 0.0133, "reward": 1.106026828289032, "reward_std": 0.147038321942091, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 2049 }, { "completion_length": 709.5000305175781, "epoch": 0.6123515794190127, "grad_norm": 0.7403742074966431, "kl": 0.548828125, "learning_rate": 4.5267854781774384e-07, "loss": 0.022, "reward": 1.1417411267757416, "reward_std": 0.12318946537561715, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 2050 }, { "completion_length": 646.7768096923828, "epoch": 0.6126502875065343, "grad_norm": 0.4526778757572174, "kl": 0.60498046875, "learning_rate": 4.5222034747426687e-07, "loss": 0.0242, "reward": 1.266183078289032, "reward_std": 0.2023260034620762, "rewards/accuracy_reward": 0.2857142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 2051 }, { "completion_length": 632.8861999511719, "epoch": 0.6129489955940557, "grad_norm": 1.2520776987075806, "kl": 0.366455078125, "learning_rate": 4.517622535054684e-07, "loss": 0.0147, "reward": 1.1662946939468384, "reward_std": 0.12264220789074898, "rewards/accuracy_reward": 0.17857143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2052 }, { "completion_length": 630.2053985595703, "epoch": 0.6132477036815772, "grad_norm": 0.7197838425636292, "kl": 0.2763671875, "learning_rate": 4.5130426640970967e-07, "loss": 0.011, "reward": 1.180245578289032, "reward_std": 0.15777246095240116, "rewards/accuracy_reward": 0.1875000069849193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.992745578289032, "step": 2053 }, { "completion_length": 596.1004638671875, "epoch": 0.6135464117690986, "grad_norm": 0.2476133108139038, "kl": 0.34716796875, "learning_rate": 4.508463866852358e-07, "loss": 0.0139, "reward": 1.1618303954601288, "reward_std": 0.10389046743512154, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 2054 }, { "completion_length": 687.872802734375, "epoch": 0.6138451198566202, "grad_norm": 0.3841504752635956, "kl": 0.3074951171875, "learning_rate": 4.503886148301753e-07, "loss": 0.0123, "reward": 1.1021206080913544, "reward_std": 0.13987999968230724, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 2055 }, { "completion_length": 698.4285888671875, "epoch": 0.6141438279441416, "grad_norm": 0.6723121404647827, "kl": 0.5810546875, "learning_rate": 4.4993095134253876e-07, "loss": 0.0233, "reward": 1.0909598767757416, "reward_std": 0.16313673183321953, "rewards/accuracy_reward": 0.10714285960420966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 2056 }, { "completion_length": 596.8549499511719, "epoch": 0.6144425360316631, "grad_norm": 0.31836074590682983, "kl": 0.26513671875, "learning_rate": 4.4947339672021946e-07, "loss": 0.0106, "reward": 1.1735491752624512, "reward_std": 0.13634912855923176, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949776977300644, "step": 2057 }, { "completion_length": 610.3303680419922, "epoch": 0.6147412441191845, "grad_norm": 0.7635383009910583, "kl": 0.2193603515625, "learning_rate": 4.490159514609918e-07, "loss": 0.0088, "reward": 1.2003348469734192, "reward_std": 0.10326340235769749, "rewards/accuracy_reward": 0.20312500977888703, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9972098469734192, "step": 2058 }, { "completion_length": 551.4018096923828, "epoch": 0.615039952206706, "grad_norm": 0.8683878779411316, "kl": 0.364501953125, "learning_rate": 4.4855861606251156e-07, "loss": 0.0146, "reward": 1.2566964626312256, "reward_std": 0.1961119920015335, "rewards/accuracy_reward": 0.2656250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 2059 }, { "completion_length": 621.7098388671875, "epoch": 0.6153386602942275, "grad_norm": 0.3701191842556, "kl": 0.21466064453125, "learning_rate": 4.4810139102231446e-07, "loss": 0.0086, "reward": 1.0909598767757416, "reward_std": 0.09806814510375261, "rewards/accuracy_reward": 0.09821429033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.992745578289032, "step": 2060 }, { "completion_length": 681.7299346923828, "epoch": 0.615637368381749, "grad_norm": 0.671312153339386, "kl": 0.9677734375, "learning_rate": 4.47644276837817e-07, "loss": 0.0387, "reward": 1.0597098469734192, "reward_std": 0.18250783160328865, "rewards/accuracy_reward": 0.08258928847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.977120578289032, "step": 2061 }, { "completion_length": 666.7098388671875, "epoch": 0.6159360764692704, "grad_norm": 0.39385631680488586, "kl": 0.414794921875, "learning_rate": 4.471872740063144e-07, "loss": 0.0166, "reward": 1.2087054252624512, "reward_std": 0.1525461385026574, "rewards/accuracy_reward": 0.22544643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2062 }, { "completion_length": 593.0111923217773, "epoch": 0.6162347845567919, "grad_norm": 0.3984127640724182, "kl": 0.4327392578125, "learning_rate": 4.467303830249808e-07, "loss": 0.0173, "reward": 1.2159598767757416, "reward_std": 0.13551309891045094, "rewards/accuracy_reward": 0.227678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 2063 }, { "completion_length": 565.7678909301758, "epoch": 0.6165334926443133, "grad_norm": 0.4187617301940918, "kl": 0.3447265625, "learning_rate": 4.4627360439086905e-07, "loss": 0.0138, "reward": 1.1378348767757416, "reward_std": 0.1417143978178501, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 2064 }, { "completion_length": 654.4933166503906, "epoch": 0.6168322007318349, "grad_norm": 0.4658054709434509, "kl": 0.35546875, "learning_rate": 4.4581693860090954e-07, "loss": 0.0142, "reward": 1.2851562798023224, "reward_std": 0.205013245344162, "rewards/accuracy_reward": 0.2946428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 2065 }, { "completion_length": 721.1674499511719, "epoch": 0.6171309088193563, "grad_norm": 0.6601637601852417, "kl": 0.51611328125, "learning_rate": 4.453603861519096e-07, "loss": 0.0206, "reward": 1.0563616752624512, "reward_std": 0.16856191493570805, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 2066 }, { "completion_length": 701.9263610839844, "epoch": 0.6174296169068777, "grad_norm": 0.9107874631881714, "kl": 0.65234375, "learning_rate": 4.449039475405538e-07, "loss": 0.0261, "reward": 1.0803572088479996, "reward_std": 0.1852826252579689, "rewards/accuracy_reward": 0.11160715040750802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500298023224, "step": 2067 }, { "completion_length": 702.0491333007812, "epoch": 0.6177283249943992, "grad_norm": 0.7702666521072388, "kl": 0.690185546875, "learning_rate": 4.444476232634027e-07, "loss": 0.0275, "reward": 1.1512277126312256, "reward_std": 0.14099622750654817, "rewards/accuracy_reward": 0.16517858137376606, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 2068 }, { "completion_length": 589.3504791259766, "epoch": 0.6180270330819206, "grad_norm": 0.8808326125144958, "kl": 0.48046875, "learning_rate": 4.439914138168922e-07, "loss": 0.0192, "reward": 1.1902902126312256, "reward_std": 0.174365121871233, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 2069 }, { "completion_length": 593.7031555175781, "epoch": 0.6183257411694422, "grad_norm": 0.49771493673324585, "kl": 0.61669921875, "learning_rate": 4.435353196973334e-07, "loss": 0.0246, "reward": 1.1674107909202576, "reward_std": 0.14559611305594444, "rewards/accuracy_reward": 0.17633929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 2070 }, { "completion_length": 621.3995971679688, "epoch": 0.6186244492569636, "grad_norm": 0.516185998916626, "kl": 0.73583984375, "learning_rate": 4.430793414009122e-07, "loss": 0.0295, "reward": 1.1607143580913544, "reward_std": 0.08489705715328455, "rewards/accuracy_reward": 0.17857143515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 2071 }, { "completion_length": 620.8638763427734, "epoch": 0.6189231573444851, "grad_norm": 0.49081775546073914, "kl": 0.502685546875, "learning_rate": 4.4262347942368815e-07, "loss": 0.0202, "reward": 1.0920759439468384, "reward_std": 0.11146495980210602, "rewards/accuracy_reward": 0.10267857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973767757416, "step": 2072 }, { "completion_length": 675.0848388671875, "epoch": 0.6192218654320065, "grad_norm": 0.5935240983963013, "kl": 0.7255859375, "learning_rate": 4.4216773426159446e-07, "loss": 0.029, "reward": 1.1121652722358704, "reward_std": 0.208980280905962, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 2073 }, { "completion_length": 742.3616333007812, "epoch": 0.619520573519528, "grad_norm": 0.816258430480957, "kl": 0.5078125, "learning_rate": 4.417121064104372e-07, "loss": 0.0203, "reward": 1.0948660969734192, "reward_std": 0.10271228104829788, "rewards/accuracy_reward": 0.10714286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232313156128, "step": 2074 }, { "completion_length": 652.6674346923828, "epoch": 0.6198192816070495, "grad_norm": 0.42379647493362427, "kl": 0.578125, "learning_rate": 4.4125659636589484e-07, "loss": 0.0231, "reward": 1.168526828289032, "reward_std": 0.13128156960010529, "rewards/accuracy_reward": 0.17633928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 2075 }, { "completion_length": 639.4062805175781, "epoch": 0.620117989694571, "grad_norm": 0.505681574344635, "kl": 0.61181640625, "learning_rate": 4.408012046235177e-07, "loss": 0.0245, "reward": 1.2829241752624512, "reward_std": 0.1827071364969015, "rewards/accuracy_reward": 0.3035714514553547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 2076 }, { "completion_length": 682.5625305175781, "epoch": 0.6204166977820924, "grad_norm": 0.595797061920166, "kl": 0.7119140625, "learning_rate": 4.4034593167872714e-07, "loss": 0.0285, "reward": 1.2070313096046448, "reward_std": 0.15491684898734093, "rewards/accuracy_reward": 0.22544644260779023, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 2077 }, { "completion_length": 593.6830596923828, "epoch": 0.6207154058696139, "grad_norm": 1.2742832899093628, "kl": 0.512451171875, "learning_rate": 4.3989077802681576e-07, "loss": 0.0205, "reward": 1.1339286118745804, "reward_std": 0.10877872444689274, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 2078 }, { "completion_length": 705.8482360839844, "epoch": 0.6210141139571353, "grad_norm": 0.6235899925231934, "kl": 0.359619140625, "learning_rate": 4.3943574416294605e-07, "loss": 0.0144, "reward": 1.0881697237491608, "reward_std": 0.1341477306559682, "rewards/accuracy_reward": 0.10044643515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232313156128, "step": 2079 }, { "completion_length": 637.4419860839844, "epoch": 0.6213128220446569, "grad_norm": 0.7058842778205872, "kl": 0.54541015625, "learning_rate": 4.389808305821502e-07, "loss": 0.0219, "reward": 1.182477742433548, "reward_std": 0.21384519711136818, "rewards/accuracy_reward": 0.2008928693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 2080 }, { "completion_length": 633.9308319091797, "epoch": 0.6216115301321783, "grad_norm": 0.3672351539134979, "kl": 0.4619140625, "learning_rate": 4.385260377793295e-07, "loss": 0.0185, "reward": 1.1127232611179352, "reward_std": 0.17489511705935, "rewards/accuracy_reward": 0.12723214644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911267757416, "step": 2081 }, { "completion_length": 720.9955596923828, "epoch": 0.6219102382196998, "grad_norm": 0.3969343900680542, "kl": 0.41796875, "learning_rate": 4.380713662492541e-07, "loss": 0.0167, "reward": 1.258928656578064, "reward_std": 0.1298125982284546, "rewards/accuracy_reward": 0.2700892984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 2082 }, { "completion_length": 671.4777221679688, "epoch": 0.6222089463072212, "grad_norm": 0.5921757817268372, "kl": 0.46630859375, "learning_rate": 4.376168164865622e-07, "loss": 0.0187, "reward": 1.139508992433548, "reward_std": 0.16807472333312035, "rewards/accuracy_reward": 0.15178572060540318, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 2083 }, { "completion_length": 663.950927734375, "epoch": 0.6225076543947428, "grad_norm": 0.7034007906913757, "kl": 0.574462890625, "learning_rate": 4.3716238898575906e-07, "loss": 0.023, "reward": 1.1372768431901932, "reward_std": 0.1568907145410776, "rewards/accuracy_reward": 0.1562500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268431901932, "step": 2084 }, { "completion_length": 638.4665451049805, "epoch": 0.6228063624822642, "grad_norm": 0.4945627450942993, "kl": 0.640869140625, "learning_rate": 4.3670808424121765e-07, "loss": 0.0257, "reward": 1.2025670409202576, "reward_std": 0.15919784642755985, "rewards/accuracy_reward": 0.21428572945296764, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 2085 }, { "completion_length": 647.0625305175781, "epoch": 0.6231050705697857, "grad_norm": 0.31339526176452637, "kl": 0.3690185546875, "learning_rate": 4.362539027471767e-07, "loss": 0.0148, "reward": 1.1662946939468384, "reward_std": 0.15677275136113167, "rewards/accuracy_reward": 0.17410715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 2086 }, { "completion_length": 627.107177734375, "epoch": 0.6234037786573071, "grad_norm": 0.28556573390960693, "kl": 0.22406005859375, "learning_rate": 4.357998449977411e-07, "loss": 0.0089, "reward": 1.2265625596046448, "reward_std": 0.12462856853380799, "rewards/accuracy_reward": 0.2299107313156128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966517984867096, "step": 2087 }, { "completion_length": 616.5245819091797, "epoch": 0.6237024867448286, "grad_norm": 0.49790072441101074, "kl": 0.2608642578125, "learning_rate": 4.353459114868814e-07, "loss": 0.0104, "reward": 1.147321492433548, "reward_std": 0.11962565151043236, "rewards/accuracy_reward": 0.1540178693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933036118745804, "step": 2088 }, { "completion_length": 667.9330749511719, "epoch": 0.6240011948323501, "grad_norm": 0.34909841418266296, "kl": 0.62109375, "learning_rate": 4.348921027084327e-07, "loss": 0.0248, "reward": 1.180245578289032, "reward_std": 0.14576493576169014, "rewards/accuracy_reward": 0.18973214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 2089 }, { "completion_length": 625.4152069091797, "epoch": 0.6242999029198716, "grad_norm": 0.31671667098999023, "kl": 0.1180419921875, "learning_rate": 4.3443841915609457e-07, "loss": 0.0047, "reward": 1.112165242433548, "reward_std": 0.12073807884007692, "rewards/accuracy_reward": 0.11383929033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9983258992433548, "step": 2090 }, { "completion_length": 686.1183471679688, "epoch": 0.624598611007393, "grad_norm": 0.49538567662239075, "kl": 0.548828125, "learning_rate": 4.339848613234299e-07, "loss": 0.022, "reward": 1.0998884290456772, "reward_std": 0.1114681139588356, "rewards/accuracy_reward": 0.1205357164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 2091 }, { "completion_length": 620.0915374755859, "epoch": 0.6248973190949145, "grad_norm": 0.28129151463508606, "kl": 0.43896484375, "learning_rate": 4.3353142970386557e-07, "loss": 0.0175, "reward": 1.2282366454601288, "reward_std": 0.2194816805422306, "rewards/accuracy_reward": 0.238839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 2092 }, { "completion_length": 649.1741180419922, "epoch": 0.6251960271824359, "grad_norm": 0.40082019567489624, "kl": 0.344482421875, "learning_rate": 4.3307812479069063e-07, "loss": 0.0138, "reward": 1.2287946939468384, "reward_std": 0.15004471503198147, "rewards/accuracy_reward": 0.2388393022119999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 2093 }, { "completion_length": 681.1473541259766, "epoch": 0.6254947352699575, "grad_norm": 0.5132942199707031, "kl": 0.339111328125, "learning_rate": 4.326249470770563e-07, "loss": 0.0136, "reward": 1.2014509439468384, "reward_std": 0.13689203560352325, "rewards/accuracy_reward": 0.2142857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871651977300644, "step": 2094 }, { "completion_length": 605.7076110839844, "epoch": 0.6257934433574789, "grad_norm": 0.8000829219818115, "kl": 0.34423828125, "learning_rate": 4.3217189705597547e-07, "loss": 0.0138, "reward": 1.1043527275323868, "reward_std": 0.13647491484880447, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455484867096, "step": 2095 }, { "completion_length": 723.029052734375, "epoch": 0.6260921514450004, "grad_norm": 0.621508777141571, "kl": 0.938232421875, "learning_rate": 4.317189752203224e-07, "loss": 0.0375, "reward": 1.113839328289032, "reward_std": 0.1741129867732525, "rewards/accuracy_reward": 0.13839286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464775323868, "step": 2096 }, { "completion_length": 655.8772735595703, "epoch": 0.6263908595325218, "grad_norm": 0.7934581637382507, "kl": 0.401611328125, "learning_rate": 4.3126618206283136e-07, "loss": 0.0161, "reward": 1.2656250894069672, "reward_std": 0.1712692342698574, "rewards/accuracy_reward": 0.2767857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 2097 }, { "completion_length": 701.904052734375, "epoch": 0.6266895676200434, "grad_norm": 0.724223792552948, "kl": 0.43408203125, "learning_rate": 4.308135180760971e-07, "loss": 0.0174, "reward": 1.1127232611179352, "reward_std": 0.19134380295872688, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 2098 }, { "completion_length": 613.9308319091797, "epoch": 0.6269882757075648, "grad_norm": 0.8348001837730408, "kl": 0.3662109375, "learning_rate": 4.303609837525737e-07, "loss": 0.0146, "reward": 1.1562500298023224, "reward_std": 0.12296980246901512, "rewards/accuracy_reward": 0.16517858440056443, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 2099 }, { "completion_length": 697.0893249511719, "epoch": 0.6272869837950863, "grad_norm": 0.693773090839386, "kl": 0.750244140625, "learning_rate": 4.2990857958457407e-07, "loss": 0.0301, "reward": 1.1975446939468384, "reward_std": 0.19980616588145494, "rewards/accuracy_reward": 0.21428572130389512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589477300644, "step": 2100 }, { "completion_length": 701.9375305175781, "epoch": 0.6275856918826077, "grad_norm": 0.5934595465660095, "kl": 0.5908203125, "learning_rate": 4.2945630606426966e-07, "loss": 0.0236, "reward": 1.1417411267757416, "reward_std": 0.15099174017086625, "rewards/accuracy_reward": 0.15625000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 2101 }, { "completion_length": 706.6495971679688, "epoch": 0.6278843999701292, "grad_norm": 1.882936954498291, "kl": 1.031982421875, "learning_rate": 4.2900416368368963e-07, "loss": 0.0413, "reward": 1.2047991454601288, "reward_std": 0.17814648896455765, "rewards/accuracy_reward": 0.2209821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 2102 }, { "completion_length": 604.0736923217773, "epoch": 0.6281831080576507, "grad_norm": 0.2658762037754059, "kl": 0.529296875, "learning_rate": 4.285521529347207e-07, "loss": 0.0212, "reward": 1.198102742433548, "reward_std": 0.17320766113698483, "rewards/accuracy_reward": 0.2053571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.992745578289032, "step": 2103 }, { "completion_length": 641.7522430419922, "epoch": 0.6284818161451722, "grad_norm": 0.6487276554107666, "kl": 0.3876953125, "learning_rate": 4.281002743091062e-07, "loss": 0.0155, "reward": 1.1830357611179352, "reward_std": 0.21993782743811607, "rewards/accuracy_reward": 0.2031250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 2104 }, { "completion_length": 627.6339721679688, "epoch": 0.6287805242326936, "grad_norm": 0.6965638995170593, "kl": 0.43994140625, "learning_rate": 4.2764852829844566e-07, "loss": 0.0176, "reward": 1.2354910969734192, "reward_std": 0.12403353489935398, "rewards/accuracy_reward": 0.2410714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 2105 }, { "completion_length": 687.3058319091797, "epoch": 0.6290792323202151, "grad_norm": 0.6234681010246277, "kl": 0.60986328125, "learning_rate": 4.271969153941948e-07, "loss": 0.0244, "reward": 1.2047991752624512, "reward_std": 0.20182612165808678, "rewards/accuracy_reward": 0.2165178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 2106 }, { "completion_length": 616.2902069091797, "epoch": 0.6293779404077365, "grad_norm": 0.5624808669090271, "kl": 0.59423828125, "learning_rate": 4.267454360876639e-07, "loss": 0.0237, "reward": 1.2064732611179352, "reward_std": 0.1737554520368576, "rewards/accuracy_reward": 0.22321429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2107 }, { "completion_length": 711.1339569091797, "epoch": 0.6296766484952581, "grad_norm": 1.004698634147644, "kl": 0.861328125, "learning_rate": 4.2629409087001835e-07, "loss": 0.0344, "reward": 1.1863839626312256, "reward_std": 0.19205829873681068, "rewards/accuracy_reward": 0.207589291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946939468384, "step": 2108 }, { "completion_length": 593.9643249511719, "epoch": 0.6299753565827795, "grad_norm": 0.44888725876808167, "kl": 0.558349609375, "learning_rate": 4.258428802322773e-07, "loss": 0.0223, "reward": 1.1835938096046448, "reward_std": 0.11831308249384165, "rewards/accuracy_reward": 0.1941964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 2109 }, { "completion_length": 737.5045013427734, "epoch": 0.6302740646703009, "grad_norm": 0.5511274933815002, "kl": 0.8212890625, "learning_rate": 4.25391804665314e-07, "loss": 0.0329, "reward": 1.0569196790456772, "reward_std": 0.1683340985327959, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625298023224, "step": 2110 }, { "completion_length": 591.6451187133789, "epoch": 0.6305727727578224, "grad_norm": 0.6945956945419312, "kl": 0.294189453125, "learning_rate": 4.2494086465985434e-07, "loss": 0.0118, "reward": 1.135602742433548, "reward_std": 0.11920256027951837, "rewards/accuracy_reward": 0.14285715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 2111 }, { "completion_length": 612.5000305175781, "epoch": 0.6308714808453438, "grad_norm": 0.38796892762184143, "kl": 0.3345947265625, "learning_rate": 4.2449006070647663e-07, "loss": 0.0134, "reward": 1.112165242433548, "reward_std": 0.13264933787286282, "rewards/accuracy_reward": 0.12053572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916295111179352, "step": 2112 }, { "completion_length": 607.7232284545898, "epoch": 0.6311701889328654, "grad_norm": 0.8299607038497925, "kl": 0.5264892578125, "learning_rate": 4.240393932956117e-07, "loss": 0.0211, "reward": 1.1741072237491608, "reward_std": 0.13244497030973434, "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 2113 }, { "completion_length": 675.3616333007812, "epoch": 0.6314688970203868, "grad_norm": 0.36301079392433167, "kl": 0.31787109375, "learning_rate": 4.2358886291754134e-07, "loss": 0.0127, "reward": 1.194196492433548, "reward_std": 0.18667857721447945, "rewards/accuracy_reward": 0.2075892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 2114 }, { "completion_length": 731.232177734375, "epoch": 0.6317676051079083, "grad_norm": 0.5944640040397644, "kl": 0.650390625, "learning_rate": 4.2313847006239867e-07, "loss": 0.026, "reward": 1.227120578289032, "reward_std": 0.19271844625473022, "rewards/accuracy_reward": 0.2455357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848767757416, "step": 2115 }, { "completion_length": 641.6227874755859, "epoch": 0.6320663131954297, "grad_norm": 0.3902953565120697, "kl": 0.470947265625, "learning_rate": 4.2268821522016665e-07, "loss": 0.0188, "reward": 1.2460938096046448, "reward_std": 0.182336394675076, "rewards/accuracy_reward": 0.2589285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2116 }, { "completion_length": 706.4375457763672, "epoch": 0.6323650212829512, "grad_norm": 0.8562049269676208, "kl": 0.48095703125, "learning_rate": 4.222380988806786e-07, "loss": 0.0193, "reward": 1.1780134737491608, "reward_std": 0.16372276470065117, "rewards/accuracy_reward": 0.1919642984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 2117 }, { "completion_length": 768.0580749511719, "epoch": 0.6326637293704727, "grad_norm": 1.5247855186462402, "kl": 0.8397216796875, "learning_rate": 4.2178812153361697e-07, "loss": 0.0336, "reward": 1.0518973767757416, "reward_std": 0.1292201764881611, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 2118 }, { "completion_length": 691.272346496582, "epoch": 0.6329624374579942, "grad_norm": 0.4841682016849518, "kl": 0.3582763671875, "learning_rate": 4.21338283668513e-07, "loss": 0.0143, "reward": 1.1378348767757416, "reward_std": 0.16325191035866737, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9972098618745804, "step": 2119 }, { "completion_length": 669.1406631469727, "epoch": 0.6332611455455156, "grad_norm": 0.39894676208496094, "kl": 0.4622802734375, "learning_rate": 4.2088858577474616e-07, "loss": 0.0185, "reward": 1.0775670111179352, "reward_std": 0.12159994058310986, "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 2120 }, { "completion_length": 682.4241333007812, "epoch": 0.6335598536330371, "grad_norm": 0.3228803277015686, "kl": 0.40869140625, "learning_rate": 4.2043902834154374e-07, "loss": 0.0164, "reward": 1.1573661267757416, "reward_std": 0.16982943564653397, "rewards/accuracy_reward": 0.1763392984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 2121 }, { "completion_length": 625.8125305175781, "epoch": 0.6338585617205585, "grad_norm": 0.8445684313774109, "kl": 0.579345703125, "learning_rate": 4.199896118579802e-07, "loss": 0.0232, "reward": 1.1183036267757416, "reward_std": 0.2068548183888197, "rewards/accuracy_reward": 0.13616071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 2122 }, { "completion_length": 656.0424652099609, "epoch": 0.6341572698080801, "grad_norm": 0.6751101016998291, "kl": 0.5341796875, "learning_rate": 4.195403368129764e-07, "loss": 0.0214, "reward": 1.2416295409202576, "reward_std": 0.19170450046658516, "rewards/accuracy_reward": 0.2633928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 2123 }, { "completion_length": 634.7009124755859, "epoch": 0.6344559778956015, "grad_norm": 0.6721422076225281, "kl": 0.682373046875, "learning_rate": 4.190912036952999e-07, "loss": 0.0273, "reward": 1.1484375298023224, "reward_std": 0.10746617242693901, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 2124 }, { "completion_length": 689.5402221679688, "epoch": 0.634754685983123, "grad_norm": 0.5503096580505371, "kl": 0.2838134765625, "learning_rate": 4.1864221299356337e-07, "loss": 0.0114, "reward": 1.1540178954601288, "reward_std": 0.11218840861693025, "rewards/accuracy_reward": 0.16071429662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 2125 }, { "completion_length": 678.4129791259766, "epoch": 0.6350533940706444, "grad_norm": 1.0905253887176514, "kl": 0.438232421875, "learning_rate": 4.181933651962245e-07, "loss": 0.0175, "reward": 1.1462054252624512, "reward_std": 0.11856637615710497, "rewards/accuracy_reward": 0.14955357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966517984867096, "step": 2126 }, { "completion_length": 709.4933471679688, "epoch": 0.635352102158166, "grad_norm": 0.4314996898174286, "kl": 0.380859375, "learning_rate": 4.177446607915859e-07, "loss": 0.0153, "reward": 1.1584821939468384, "reward_std": 0.2120959721505642, "rewards/accuracy_reward": 0.17410715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 2127 }, { "completion_length": 728.4129943847656, "epoch": 0.6356508102456874, "grad_norm": 0.4390586316585541, "kl": 0.3734130859375, "learning_rate": 4.1729610026779407e-07, "loss": 0.0149, "reward": 1.2522321939468384, "reward_std": 0.14385760575532913, "rewards/accuracy_reward": 0.2566964477300644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.995535746216774, "step": 2128 }, { "completion_length": 585.0781402587891, "epoch": 0.6359495183332089, "grad_norm": 0.5058538913726807, "kl": 0.366455078125, "learning_rate": 4.1684768411283865e-07, "loss": 0.0147, "reward": 1.1049107611179352, "reward_std": 0.13014912605285645, "rewards/accuracy_reward": 0.11607143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 2129 }, { "completion_length": 656.9219055175781, "epoch": 0.6362482264207303, "grad_norm": 0.5159469842910767, "kl": 0.555908203125, "learning_rate": 4.163994128145526e-07, "loss": 0.0223, "reward": 1.2142857909202576, "reward_std": 0.18204228021204472, "rewards/accuracy_reward": 0.22321430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 2130 }, { "completion_length": 627.716552734375, "epoch": 0.6365469345082518, "grad_norm": 0.5380465388298035, "kl": 0.578125, "learning_rate": 4.159512868606112e-07, "loss": 0.0231, "reward": 1.1992187798023224, "reward_std": 0.19423234649002552, "rewards/accuracy_reward": 0.2187500111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 2131 }, { "completion_length": 667.1763610839844, "epoch": 0.6368456425957733, "grad_norm": 0.7573733925819397, "kl": 0.348388671875, "learning_rate": 4.155033067385314e-07, "loss": 0.0139, "reward": 1.2578125894069672, "reward_std": 0.14527566358447075, "rewards/accuracy_reward": 0.2656250149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 2132 }, { "completion_length": 552.7745819091797, "epoch": 0.6371443506832948, "grad_norm": 0.4349260628223419, "kl": 0.32421875, "learning_rate": 4.1505547293567177e-07, "loss": 0.013, "reward": 1.276227742433548, "reward_std": 0.16437804186716676, "rewards/accuracy_reward": 0.2857143059372902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 2133 }, { "completion_length": 661.7611999511719, "epoch": 0.6374430587708162, "grad_norm": 0.46220582723617554, "kl": 0.44775390625, "learning_rate": 4.1460778593923173e-07, "loss": 0.0179, "reward": 1.2047991752624512, "reward_std": 0.12740565836429596, "rewards/accuracy_reward": 0.2165178693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 2134 }, { "completion_length": 638.7388763427734, "epoch": 0.6377417668583377, "grad_norm": 0.3501216769218445, "kl": 0.224365234375, "learning_rate": 4.1416024623625066e-07, "loss": 0.009, "reward": 1.203683078289032, "reward_std": 0.17205951362848282, "rewards/accuracy_reward": 0.2075892984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937649011612, "step": 2135 }, { "completion_length": 675.5714721679688, "epoch": 0.6380404749458591, "grad_norm": 0.41526705026626587, "kl": 0.696533203125, "learning_rate": 4.13712854313608e-07, "loss": 0.0279, "reward": 1.1953125596046448, "reward_std": 0.15794361755251884, "rewards/accuracy_reward": 0.2142857287544757, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 2136 }, { "completion_length": 633.1674499511719, "epoch": 0.6383391830333807, "grad_norm": 0.24800026416778564, "kl": 0.2105712890625, "learning_rate": 4.132656106580221e-07, "loss": 0.0084, "reward": 1.2003348767757416, "reward_std": 0.18334773741662502, "rewards/accuracy_reward": 0.20982143748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 2137 }, { "completion_length": 581.2768173217773, "epoch": 0.6386378911209021, "grad_norm": 0.6486968398094177, "kl": 0.266357421875, "learning_rate": 4.128185157560506e-07, "loss": 0.0106, "reward": 1.3694197237491608, "reward_std": 0.2091376669704914, "rewards/accuracy_reward": 0.3906250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946790456772, "step": 2138 }, { "completion_length": 607.0960235595703, "epoch": 0.6389365992084236, "grad_norm": 0.4457811117172241, "kl": 0.32373046875, "learning_rate": 4.1237157009408864e-07, "loss": 0.0129, "reward": 1.125558078289032, "reward_std": 0.11743925884366035, "rewards/accuracy_reward": 0.1294642877765, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937798023224, "step": 2139 }, { "completion_length": 623.1585235595703, "epoch": 0.639235307295945, "grad_norm": 0.8147462606430054, "kl": 0.317626953125, "learning_rate": 4.1192477415836944e-07, "loss": 0.0127, "reward": 1.1741071939468384, "reward_std": 0.1565065011382103, "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714775323868, "step": 2140 }, { "completion_length": 647.1383972167969, "epoch": 0.6395340153834665, "grad_norm": 1.0064328908920288, "kl": 0.591796875, "learning_rate": 4.114781284349631e-07, "loss": 0.0237, "reward": 1.1489956080913544, "reward_std": 0.17117947712540627, "rewards/accuracy_reward": 0.1741071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884439468384, "step": 2141 }, { "completion_length": 642.9777069091797, "epoch": 0.639832723470988, "grad_norm": 0.8069965839385986, "kl": 0.5322265625, "learning_rate": 4.110316334097764e-07, "loss": 0.0213, "reward": 1.1992188096046448, "reward_std": 0.14592047780752182, "rewards/accuracy_reward": 0.21875001583248377, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 2142 }, { "completion_length": 702.3259124755859, "epoch": 0.6401314315585095, "grad_norm": 0.4804580807685852, "kl": 0.7183837890625, "learning_rate": 4.105852895685522e-07, "loss": 0.0288, "reward": 1.0318080931901932, "reward_std": 0.1419275151565671, "rewards/accuracy_reward": 0.055803571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.976004496216774, "step": 2143 }, { "completion_length": 615.8973541259766, "epoch": 0.6404301396460309, "grad_norm": 0.6302512288093567, "kl": 0.403564453125, "learning_rate": 4.101390973968688e-07, "loss": 0.0161, "reward": 1.1233259439468384, "reward_std": 0.09806377673521638, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 2144 }, { "completion_length": 698.0045013427734, "epoch": 0.6407288477335524, "grad_norm": 0.4589228928089142, "kl": 0.52734375, "learning_rate": 4.096930573801396e-07, "loss": 0.0211, "reward": 1.0463170111179352, "reward_std": 0.15564442798495293, "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 2145 }, { "completion_length": 626.3281478881836, "epoch": 0.6410275558210738, "grad_norm": 0.4141339063644409, "kl": 0.6376953125, "learning_rate": 4.0924717000361243e-07, "loss": 0.0255, "reward": 1.1261161267757416, "reward_std": 0.16066963598132133, "rewards/accuracy_reward": 0.13839286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 2146 }, { "completion_length": 669.0580749511719, "epoch": 0.6413262639085954, "grad_norm": 0.7216053009033203, "kl": 0.720703125, "learning_rate": 4.0880143575236915e-07, "loss": 0.0289, "reward": 1.1724331080913544, "reward_std": 0.11937380582094193, "rewards/accuracy_reward": 0.1919642984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 2147 }, { "completion_length": 668.0312957763672, "epoch": 0.6416249719961168, "grad_norm": 0.4360397756099701, "kl": 0.4052734375, "learning_rate": 4.083558551113245e-07, "loss": 0.0162, "reward": 1.0602679252624512, "reward_std": 0.14721323549747467, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 2148 }, { "completion_length": 667.9576110839844, "epoch": 0.6419236800836383, "grad_norm": 1.0600789785385132, "kl": 1.04833984375, "learning_rate": 4.0791042856522717e-07, "loss": 0.042, "reward": 1.2628348767757416, "reward_std": 0.20172745361924171, "rewards/accuracy_reward": 0.2834821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 2149 }, { "completion_length": 618.2768096923828, "epoch": 0.6422223881711597, "grad_norm": 0.29229816794395447, "kl": 0.239501953125, "learning_rate": 4.074651565986572e-07, "loss": 0.0096, "reward": 1.2025670111179352, "reward_std": 0.1133370716124773, "rewards/accuracy_reward": 0.212053582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134439468384, "step": 2150 }, { "completion_length": 636.8638763427734, "epoch": 0.6425210962586813, "grad_norm": 0.6358823776245117, "kl": 0.52392578125, "learning_rate": 4.070200396960269e-07, "loss": 0.0209, "reward": 1.1054687649011612, "reward_std": 0.1494390331208706, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2151 }, { "completion_length": 671.8928985595703, "epoch": 0.6428198043462027, "grad_norm": 0.6359153985977173, "kl": 0.332275390625, "learning_rate": 4.0657507834158e-07, "loss": 0.0133, "reward": 1.188058078289032, "reward_std": 0.16473522037267685, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 2152 }, { "completion_length": 644.310302734375, "epoch": 0.6431185124337241, "grad_norm": 0.38290005922317505, "kl": 0.62744140625, "learning_rate": 4.0613027301939063e-07, "loss": 0.0251, "reward": 1.1372768133878708, "reward_std": 0.13549303263425827, "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2153 }, { "completion_length": 690.372802734375, "epoch": 0.6434172205212456, "grad_norm": 0.3313271999359131, "kl": 0.4326171875, "learning_rate": 4.056856242133634e-07, "loss": 0.0173, "reward": 1.1445313096046448, "reward_std": 0.10976172797381878, "rewards/accuracy_reward": 0.15178572572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 2154 }, { "completion_length": 693.4576110839844, "epoch": 0.643715928608767, "grad_norm": 1.1193463802337646, "kl": 0.91796875, "learning_rate": 4.0524113240723266e-07, "loss": 0.0368, "reward": 1.2181920111179352, "reward_std": 0.21386170014739037, "rewards/accuracy_reward": 0.2366071566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 2155 }, { "completion_length": 664.3080596923828, "epoch": 0.6440146366962886, "grad_norm": 0.5942164659500122, "kl": 0.7294921875, "learning_rate": 4.047967980845621e-07, "loss": 0.0292, "reward": 1.2522321939468384, "reward_std": 0.16804632171988487, "rewards/accuracy_reward": 0.2678571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 2156 }, { "completion_length": 712.1629791259766, "epoch": 0.64431334478381, "grad_norm": 0.6431336998939514, "kl": 0.7109375, "learning_rate": 4.0435262172874376e-07, "loss": 0.0284, "reward": 1.0976563096046448, "reward_std": 0.169888474047184, "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884439468384, "step": 2157 }, { "completion_length": 661.5960235595703, "epoch": 0.6446120528713315, "grad_norm": 0.7363964915275574, "kl": 0.63427734375, "learning_rate": 4.0390860382299795e-07, "loss": 0.0254, "reward": 1.0452009290456772, "reward_std": 0.0651178639382124, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 2158 }, { "completion_length": 604.4799346923828, "epoch": 0.6449107609588529, "grad_norm": 0.49134862422943115, "kl": 0.262939453125, "learning_rate": 4.0346474485037274e-07, "loss": 0.0105, "reward": 1.1925223767757416, "reward_std": 0.1333174780011177, "rewards/accuracy_reward": 0.19866072572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616454601288, "step": 2159 }, { "completion_length": 681.919677734375, "epoch": 0.6452094690463744, "grad_norm": 0.6040561199188232, "kl": 1.08203125, "learning_rate": 4.0302104529374314e-07, "loss": 0.0433, "reward": 1.1099331080913544, "reward_std": 0.2133239433169365, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9715402126312256, "step": 2160 }, { "completion_length": 689.0335083007812, "epoch": 0.6455081771338959, "grad_norm": 0.9970811605453491, "kl": 1.09375, "learning_rate": 4.025775056358107e-07, "loss": 0.0438, "reward": 1.1277902275323868, "reward_std": 0.14494079910218716, "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723767757416, "step": 2161 }, { "completion_length": 698.8370819091797, "epoch": 0.6458068852214174, "grad_norm": 0.5508258938789368, "kl": 0.66796875, "learning_rate": 4.0213412635910316e-07, "loss": 0.0267, "reward": 1.1411831080913544, "reward_std": 0.14136908808723092, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 2162 }, { "completion_length": 706.8750381469727, "epoch": 0.6461055933089388, "grad_norm": 0.7021234631538391, "kl": 0.70263671875, "learning_rate": 4.016909079459738e-07, "loss": 0.0281, "reward": 1.1166295260190964, "reward_std": 0.15093236044049263, "rewards/accuracy_reward": 0.1406250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.976004496216774, "step": 2163 }, { "completion_length": 711.9620971679688, "epoch": 0.6464043013964603, "grad_norm": 0.4240538775920868, "kl": 0.5576171875, "learning_rate": 4.012478508786008e-07, "loss": 0.0223, "reward": 1.0825893729925156, "reward_std": 0.17131490632891655, "rewards/accuracy_reward": 0.10267857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 2164 }, { "completion_length": 701.810302734375, "epoch": 0.6467030094839817, "grad_norm": 1.4157328605651855, "kl": 0.6982421875, "learning_rate": 4.0080495563898664e-07, "loss": 0.0279, "reward": 1.0803571939468384, "reward_std": 0.1477520354092121, "rewards/accuracy_reward": 0.10267857881262898, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776785969734192, "step": 2165 }, { "completion_length": 701.1719055175781, "epoch": 0.6470017175715033, "grad_norm": 0.6263561248779297, "kl": 0.47509765625, "learning_rate": 4.0036222270895803e-07, "loss": 0.019, "reward": 1.2081473767757416, "reward_std": 0.18560531735420227, "rewards/accuracy_reward": 0.2232142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 2166 }, { "completion_length": 649.7924346923828, "epoch": 0.6473004256590247, "grad_norm": 0.37330782413482666, "kl": 0.278564453125, "learning_rate": 3.9991965257016525e-07, "loss": 0.0111, "reward": 1.252790242433548, "reward_std": 0.10425719479098916, "rewards/accuracy_reward": 0.2633928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2167 }, { "completion_length": 633.2701110839844, "epoch": 0.6475991337465462, "grad_norm": 0.45056796073913574, "kl": 0.64453125, "learning_rate": 3.9947724570408093e-07, "loss": 0.0258, "reward": 1.1662946939468384, "reward_std": 0.19342529401183128, "rewards/accuracy_reward": 0.18303572060540318, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 2168 }, { "completion_length": 591.169677734375, "epoch": 0.6478978418340676, "grad_norm": 0.5337097644805908, "kl": 0.53271484375, "learning_rate": 3.990350025920003e-07, "loss": 0.0213, "reward": 1.1194196939468384, "reward_std": 0.13877664413303137, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 2169 }, { "completion_length": 701.7701110839844, "epoch": 0.6481965499215891, "grad_norm": 0.734242856502533, "kl": 0.34912109375, "learning_rate": 3.9859292371504085e-07, "loss": 0.014, "reward": 1.0948661267757416, "reward_std": 0.08499679830856621, "rewards/accuracy_reward": 0.09821428963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966518133878708, "step": 2170 }, { "completion_length": 640.7857360839844, "epoch": 0.6484952580091106, "grad_norm": 0.4719983637332916, "kl": 0.4107666015625, "learning_rate": 3.981510095541408e-07, "loss": 0.0164, "reward": 1.1607143580913544, "reward_std": 0.1477663591504097, "rewards/accuracy_reward": 0.17187500861473382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 2171 }, { "completion_length": 587.5290374755859, "epoch": 0.6487939660966321, "grad_norm": 0.5249384045600891, "kl": 0.26904296875, "learning_rate": 3.977092605900596e-07, "loss": 0.0108, "reward": 1.1450893580913544, "reward_std": 0.12229655496776104, "rewards/accuracy_reward": 0.15401786286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 2172 }, { "completion_length": 681.2076110839844, "epoch": 0.6490926741841535, "grad_norm": 0.41654881834983826, "kl": 0.5191650390625, "learning_rate": 3.9726767730337687e-07, "loss": 0.0208, "reward": 1.1579241454601288, "reward_std": 0.2302294559776783, "rewards/accuracy_reward": 0.17633929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 2173 }, { "completion_length": 723.1138763427734, "epoch": 0.649391382271675, "grad_norm": 1.214815616607666, "kl": 0.56982421875, "learning_rate": 3.968262601744917e-07, "loss": 0.0228, "reward": 1.0948661267757416, "reward_std": 0.11687901616096497, "rewards/accuracy_reward": 0.10937500582076609, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 2174 }, { "completion_length": 640.3794860839844, "epoch": 0.6496900903591964, "grad_norm": 0.7395460605621338, "kl": 0.4453125, "learning_rate": 3.96385009683623e-07, "loss": 0.0178, "reward": 1.3203125596046448, "reward_std": 0.21359092369675636, "rewards/accuracy_reward": 0.3236607275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966518133878708, "step": 2175 }, { "completion_length": 708.200927734375, "epoch": 0.649988798446718, "grad_norm": 0.6154245734214783, "kl": 0.580078125, "learning_rate": 3.9594392631080766e-07, "loss": 0.0232, "reward": 1.1261160969734192, "reward_std": 0.18577606603503227, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 2176 }, { "completion_length": 713.3214721679688, "epoch": 0.6502875065342394, "grad_norm": 1.353237271308899, "kl": 0.46240234375, "learning_rate": 3.9550301053590163e-07, "loss": 0.0185, "reward": 1.0357143580913544, "reward_std": 0.11581929214298725, "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98214291036129, "step": 2177 }, { "completion_length": 786.950927734375, "epoch": 0.6505862146217609, "grad_norm": 0.649966835975647, "kl": 1.2197265625, "learning_rate": 3.950622628385777e-07, "loss": 0.0487, "reward": 1.0680804252624512, "reward_std": 0.20750263519585133, "rewards/accuracy_reward": 0.10491071920841932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9631696939468384, "step": 2178 }, { "completion_length": 654.6004638671875, "epoch": 0.6508849227092823, "grad_norm": 0.48228299617767334, "kl": 0.55859375, "learning_rate": 3.9462168369832614e-07, "loss": 0.0224, "reward": 1.1835938096046448, "reward_std": 0.2086997888982296, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 2179 }, { "completion_length": 674.4598541259766, "epoch": 0.6511836307968039, "grad_norm": 0.49964621663093567, "kl": 0.9462890625, "learning_rate": 3.941812735944542e-07, "loss": 0.0379, "reward": 1.1166295409202576, "reward_std": 0.18872517719864845, "rewards/accuracy_reward": 0.13616071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804688096046448, "step": 2180 }, { "completion_length": 764.2187805175781, "epoch": 0.6514823388843253, "grad_norm": 0.4237072467803955, "kl": 0.6630859375, "learning_rate": 3.9374103300608463e-07, "loss": 0.0265, "reward": 1.1462054252624512, "reward_std": 0.1791003793478012, "rewards/accuracy_reward": 0.16294643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2181 }, { "completion_length": 606.0045013427734, "epoch": 0.6517810469718468, "grad_norm": 0.8520134091377258, "kl": 0.52880859375, "learning_rate": 3.933009624121562e-07, "loss": 0.0212, "reward": 1.248883992433548, "reward_std": 0.18700524512678385, "rewards/accuracy_reward": 0.2611607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232313156128, "step": 2182 }, { "completion_length": 678.5402221679688, "epoch": 0.6520797550593682, "grad_norm": 1.009979009628296, "kl": 0.9560546875, "learning_rate": 3.9286106229142224e-07, "loss": 0.0381, "reward": 1.1478795260190964, "reward_std": 0.15424437448382378, "rewards/accuracy_reward": 0.17187500977888703, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045111179352, "step": 2183 }, { "completion_length": 714.6250152587891, "epoch": 0.6523784631468897, "grad_norm": 0.857051432132721, "kl": 0.882568359375, "learning_rate": 3.924213331224515e-07, "loss": 0.0352, "reward": 1.096540242433548, "reward_std": 0.16859738714993, "rewards/accuracy_reward": 0.12276786286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723469734192, "step": 2184 }, { "completion_length": 680.3973541259766, "epoch": 0.6526771712344112, "grad_norm": 0.6215050220489502, "kl": 0.7041015625, "learning_rate": 3.9198177538362585e-07, "loss": 0.0281, "reward": 1.1222098767757416, "reward_std": 0.148010466247797, "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 2185 }, { "completion_length": 722.9933319091797, "epoch": 0.6529758793219327, "grad_norm": 0.8160883188247681, "kl": 0.8369140625, "learning_rate": 3.915423895531411e-07, "loss": 0.0335, "reward": 1.2165179252624512, "reward_std": 0.20041286945343018, "rewards/accuracy_reward": 0.2388392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 2186 }, { "completion_length": 705.8527069091797, "epoch": 0.6532745874094541, "grad_norm": 0.6771142482757568, "kl": 0.77783203125, "learning_rate": 3.9110317610900613e-07, "loss": 0.0311, "reward": 1.1573661267757416, "reward_std": 0.18527483008801937, "rewards/accuracy_reward": 0.18080357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625447034836, "step": 2187 }, { "completion_length": 669.1116485595703, "epoch": 0.6535732954969756, "grad_norm": 0.4583927392959595, "kl": 0.739501953125, "learning_rate": 3.90664135529042e-07, "loss": 0.0296, "reward": 1.1601562798023224, "reward_std": 0.1738608404994011, "rewards/accuracy_reward": 0.18080358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 2188 }, { "completion_length": 635.8661041259766, "epoch": 0.653872003584497, "grad_norm": 0.8665212392807007, "kl": 0.666015625, "learning_rate": 3.9022526829088176e-07, "loss": 0.0266, "reward": 1.1266741454601288, "reward_std": 0.1924309842288494, "rewards/accuracy_reward": 0.1495535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9771205633878708, "step": 2189 }, { "completion_length": 627.8013610839844, "epoch": 0.6541707116720186, "grad_norm": 0.42165765166282654, "kl": 0.51513671875, "learning_rate": 3.8978657487196987e-07, "loss": 0.0206, "reward": 1.1266741752624512, "reward_std": 0.1567845195531845, "rewards/accuracy_reward": 0.14955357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.977120578289032, "step": 2190 }, { "completion_length": 674.8482666015625, "epoch": 0.65446941975954, "grad_norm": 0.4189852774143219, "kl": 0.85986328125, "learning_rate": 3.893480557495621e-07, "loss": 0.0344, "reward": 1.092075914144516, "reward_std": 0.17098821885883808, "rewards/accuracy_reward": 0.11160714784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804688096046448, "step": 2191 }, { "completion_length": 706.7768096923828, "epoch": 0.6547681278470615, "grad_norm": 0.5812774300575256, "kl": 0.8837890625, "learning_rate": 3.8890971140072405e-07, "loss": 0.0354, "reward": 1.1992187798023224, "reward_std": 0.2061171755194664, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045111179352, "step": 2192 }, { "completion_length": 745.8504791259766, "epoch": 0.6550668359345829, "grad_norm": 1.3969553709030151, "kl": 1.00244140625, "learning_rate": 3.884715423023314e-07, "loss": 0.0401, "reward": 1.1746652126312256, "reward_std": 0.2620616815984249, "rewards/accuracy_reward": 0.2075892984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9670759290456772, "step": 2193 }, { "completion_length": 668.8549499511719, "epoch": 0.6553655440221045, "grad_norm": 0.4831981658935547, "kl": 0.77099609375, "learning_rate": 3.8803354893106933e-07, "loss": 0.0309, "reward": 1.2862723469734192, "reward_std": 0.13356562703847885, "rewards/accuracy_reward": 0.2946428768336773, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 2194 }, { "completion_length": 658.5178680419922, "epoch": 0.6556642521096259, "grad_norm": 0.3697880208492279, "kl": 0.316162109375, "learning_rate": 3.875957317634315e-07, "loss": 0.0126, "reward": 1.1383928954601288, "reward_std": 0.13497847877442837, "rewards/accuracy_reward": 0.14955357927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393431901932, "step": 2195 }, { "completion_length": 777.3393249511719, "epoch": 0.6559629601971473, "grad_norm": 0.5321431756019592, "kl": 0.634765625, "learning_rate": 3.871580912757203e-07, "loss": 0.0254, "reward": 1.1026785969734192, "reward_std": 0.13094273954629898, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 2196 }, { "completion_length": 746.3080749511719, "epoch": 0.6562616682846688, "grad_norm": 0.4104750454425812, "kl": 0.462158203125, "learning_rate": 3.867206279440455e-07, "loss": 0.0185, "reward": 1.163504496216774, "reward_std": 0.14277069456875324, "rewards/accuracy_reward": 0.176339291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2197 }, { "completion_length": 652.4107360839844, "epoch": 0.6565603763721902, "grad_norm": 0.5951827168464661, "kl": 0.4287109375, "learning_rate": 3.8628334224432437e-07, "loss": 0.0172, "reward": 1.1568081080913544, "reward_std": 0.1547691971063614, "rewards/accuracy_reward": 0.1696428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2198 }, { "completion_length": 672.9643249511719, "epoch": 0.6568590844597118, "grad_norm": 1.078956961631775, "kl": 0.6630859375, "learning_rate": 3.8584623465228094e-07, "loss": 0.0265, "reward": 1.1936384439468384, "reward_std": 0.24539217352867126, "rewards/accuracy_reward": 0.21428572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 2199 }, { "completion_length": 647.4732360839844, "epoch": 0.6571577925472332, "grad_norm": 0.4965885281562805, "kl": 0.90478515625, "learning_rate": 3.854093056434453e-07, "loss": 0.0361, "reward": 1.1149553954601288, "reward_std": 0.13636750355362892, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946939468384, "step": 2200 }, { "completion_length": 704.0022735595703, "epoch": 0.6574565006347547, "grad_norm": 0.5578575134277344, "kl": 0.68896484375, "learning_rate": 3.849725556931537e-07, "loss": 0.0275, "reward": 1.152901828289032, "reward_std": 0.17601131834089756, "rewards/accuracy_reward": 0.16741072316654027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911267757416, "step": 2201 }, { "completion_length": 654.9509124755859, "epoch": 0.6577552087222761, "grad_norm": 1.7626385688781738, "kl": 1.38671875, "learning_rate": 3.8453598527654696e-07, "loss": 0.0555, "reward": 1.1752232909202576, "reward_std": 0.14481504261493683, "rewards/accuracy_reward": 0.1986607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625447034836, "step": 2202 }, { "completion_length": 699.4598541259766, "epoch": 0.6580539168097976, "grad_norm": 0.8457069396972656, "kl": 0.52490234375, "learning_rate": 3.8409959486857116e-07, "loss": 0.021, "reward": 1.1244420111179352, "reward_std": 0.12975095584988594, "rewards/accuracy_reward": 0.13616071734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 2203 }, { "completion_length": 577.8058242797852, "epoch": 0.658352624897319, "grad_norm": 0.6122873425483704, "kl": 0.29638671875, "learning_rate": 3.836633849439759e-07, "loss": 0.0118, "reward": 1.193638414144516, "reward_std": 0.14830287918448448, "rewards/accuracy_reward": 0.19642858067527413, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.997209832072258, "step": 2204 }, { "completion_length": 674.8817291259766, "epoch": 0.6586513329848406, "grad_norm": 0.4746233820915222, "kl": 0.59033203125, "learning_rate": 3.8322735597731526e-07, "loss": 0.0236, "reward": 1.0703125447034836, "reward_std": 0.15719245374202728, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946939468384, "step": 2205 }, { "completion_length": 659.9263763427734, "epoch": 0.658950041072362, "grad_norm": 0.24163813889026642, "kl": 0.39501953125, "learning_rate": 3.8279150844294595e-07, "loss": 0.0158, "reward": 1.1077009290456772, "reward_std": 0.09363269805908203, "rewards/accuracy_reward": 0.11607143585570157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916294813156128, "step": 2206 }, { "completion_length": 732.6808319091797, "epoch": 0.6592487491598835, "grad_norm": 0.9015367031097412, "kl": 0.622802734375, "learning_rate": 3.8235584281502696e-07, "loss": 0.0249, "reward": 1.0613839775323868, "reward_std": 0.1385907307267189, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2207 }, { "completion_length": 565.0669860839844, "epoch": 0.6595474572474049, "grad_norm": 0.2957952916622162, "kl": 0.332275390625, "learning_rate": 3.8192035956752033e-07, "loss": 0.0133, "reward": 1.1813616752624512, "reward_std": 0.15130778588354588, "rewards/accuracy_reward": 0.19866072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 2208 }, { "completion_length": 672.1094055175781, "epoch": 0.6598461653349265, "grad_norm": 0.9155352711677551, "kl": 0.636474609375, "learning_rate": 3.814850591741889e-07, "loss": 0.0254, "reward": 1.2215402126312256, "reward_std": 0.19069792330265045, "rewards/accuracy_reward": 0.2410714440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 2209 }, { "completion_length": 748.0670013427734, "epoch": 0.6601448734224479, "grad_norm": 0.48253530263900757, "kl": 0.473876953125, "learning_rate": 3.8104994210859687e-07, "loss": 0.019, "reward": 1.1266741305589676, "reward_std": 0.14502913318574429, "rewards/accuracy_reward": 0.14732143771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 2210 }, { "completion_length": 663.1964569091797, "epoch": 0.6604435815099694, "grad_norm": 2.7159066200256348, "kl": 0.560791015625, "learning_rate": 3.806150088441087e-07, "loss": 0.0224, "reward": 1.1255580484867096, "reward_std": 0.10446021938696504, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2211 }, { "completion_length": 651.716552734375, "epoch": 0.6607422895974908, "grad_norm": 0.5239043235778809, "kl": 0.371337890625, "learning_rate": 3.8018025985388957e-07, "loss": 0.0149, "reward": 1.2315848469734192, "reward_std": 0.21446016058325768, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 2212 }, { "completion_length": 622.6004638671875, "epoch": 0.6610409976850123, "grad_norm": 0.36069244146347046, "kl": 0.5439453125, "learning_rate": 3.7974569561090366e-07, "loss": 0.0218, "reward": 1.2795759439468384, "reward_std": 0.1657116673886776, "rewards/accuracy_reward": 0.3058035895228386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723618745804, "step": 2213 }, { "completion_length": 686.1451110839844, "epoch": 0.6613397057725338, "grad_norm": 1.480942964553833, "kl": 0.48779296875, "learning_rate": 3.7931131658791406e-07, "loss": 0.0195, "reward": 1.1127232611179352, "reward_std": 0.20933926478028297, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 2214 }, { "completion_length": 742.8013763427734, "epoch": 0.6616384138600553, "grad_norm": 0.9098065495491028, "kl": 0.57421875, "learning_rate": 3.788771232574828e-07, "loss": 0.023, "reward": 1.305245578289032, "reward_std": 0.27732453867793083, "rewards/accuracy_reward": 0.3415178805589676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9637277126312256, "step": 2215 }, { "completion_length": 681.7366333007812, "epoch": 0.6619371219475767, "grad_norm": 0.37386125326156616, "kl": 0.265380859375, "learning_rate": 3.7844311609196964e-07, "loss": 0.0106, "reward": 1.1997768580913544, "reward_std": 0.13219659589231014, "rewards/accuracy_reward": 0.20758928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875447034836, "step": 2216 }, { "completion_length": 670.8370971679688, "epoch": 0.6622358300350982, "grad_norm": 0.779879093170166, "kl": 0.69580078125, "learning_rate": 3.780092955635318e-07, "loss": 0.0279, "reward": 1.1277902126312256, "reward_std": 0.1940794698894024, "rewards/accuracy_reward": 0.14955358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366305589676, "step": 2217 }, { "completion_length": 718.3861999511719, "epoch": 0.6625345381226196, "grad_norm": 1.052168607711792, "kl": 0.5458984375, "learning_rate": 3.775756621441233e-07, "loss": 0.0219, "reward": 1.1774553954601288, "reward_std": 0.21800090000033379, "rewards/accuracy_reward": 0.19419643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2218 }, { "completion_length": 615.6986846923828, "epoch": 0.6628332462101412, "grad_norm": 0.3233179748058319, "kl": 0.530517578125, "learning_rate": 3.7714221630549513e-07, "loss": 0.0212, "reward": 1.1289062798023224, "reward_std": 0.08669052552431822, "rewards/accuracy_reward": 0.13616071874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 2219 }, { "completion_length": 686.4174499511719, "epoch": 0.6631319542976626, "grad_norm": 0.3762674033641815, "kl": 0.2974853515625, "learning_rate": 3.767089585191937e-07, "loss": 0.0119, "reward": 1.1774554252624512, "reward_std": 0.14970761723816395, "rewards/accuracy_reward": 0.1897321566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2220 }, { "completion_length": 633.6250305175781, "epoch": 0.6634306623851841, "grad_norm": 0.4733186960220337, "kl": 0.3974609375, "learning_rate": 3.762758892565612e-07, "loss": 0.0159, "reward": 1.280133992433548, "reward_std": 0.17894714511930943, "rewards/accuracy_reward": 0.2924107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2221 }, { "completion_length": 707.6004791259766, "epoch": 0.6637293704727055, "grad_norm": 0.873540997505188, "kl": 0.56787109375, "learning_rate": 3.758430089887341e-07, "loss": 0.0227, "reward": 1.169084906578064, "reward_std": 0.1974303051829338, "rewards/accuracy_reward": 0.18303572479635477, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 2222 }, { "completion_length": 678.8951110839844, "epoch": 0.664028078560227, "grad_norm": 0.3512565791606903, "kl": 0.5634765625, "learning_rate": 3.754103181866443e-07, "loss": 0.0226, "reward": 1.1824777126312256, "reward_std": 0.1380501314997673, "rewards/accuracy_reward": 0.19642858486622572, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 2223 }, { "completion_length": 660.8795013427734, "epoch": 0.6643267866477485, "grad_norm": 0.48399898409843445, "kl": 0.70849609375, "learning_rate": 3.749778173210165e-07, "loss": 0.0284, "reward": 1.1813616752624512, "reward_std": 0.16618647519499063, "rewards/accuracy_reward": 0.1986607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 2224 }, { "completion_length": 639.325927734375, "epoch": 0.66462549473527, "grad_norm": 0.3766624629497528, "kl": 0.446533203125, "learning_rate": 3.745455068623694e-07, "loss": 0.0179, "reward": 1.1478795111179352, "reward_std": 0.12871980667114258, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2225 }, { "completion_length": 698.1183319091797, "epoch": 0.6649242028227914, "grad_norm": 1.0169199705123901, "kl": 0.55029296875, "learning_rate": 3.741133872810146e-07, "loss": 0.022, "reward": 1.198102742433548, "reward_std": 0.2228015698492527, "rewards/accuracy_reward": 0.2209821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.977120578289032, "step": 2226 }, { "completion_length": 598.6361770629883, "epoch": 0.6652229109103129, "grad_norm": 0.5745580196380615, "kl": 0.2177734375, "learning_rate": 3.7368145904705564e-07, "loss": 0.0087, "reward": 1.1601563394069672, "reward_std": 0.12013013660907745, "rewards/accuracy_reward": 0.16741071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 2227 }, { "completion_length": 694.794677734375, "epoch": 0.6655216189978344, "grad_norm": 0.4121044874191284, "kl": 0.854248046875, "learning_rate": 3.732497226303881e-07, "loss": 0.0342, "reward": 1.2377232760190964, "reward_std": 0.25021858513355255, "rewards/accuracy_reward": 0.2700892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9676339626312256, "step": 2228 }, { "completion_length": 719.1518096923828, "epoch": 0.6658203270853559, "grad_norm": 0.6263669729232788, "kl": 0.809326171875, "learning_rate": 3.728181785006991e-07, "loss": 0.0324, "reward": 1.0993304252624512, "reward_std": 0.11673249863088131, "rewards/accuracy_reward": 0.12053572246804833, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946790456772, "step": 2229 }, { "completion_length": 665.0580749511719, "epoch": 0.6661190351728773, "grad_norm": 0.7324029803276062, "kl": 0.51953125, "learning_rate": 3.7238682712746606e-07, "loss": 0.0208, "reward": 1.148995578289032, "reward_std": 0.19775832444429398, "rewards/accuracy_reward": 0.160714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 2230 }, { "completion_length": 678.122802734375, "epoch": 0.6664177432603988, "grad_norm": 0.45013564825057983, "kl": 0.7802734375, "learning_rate": 3.719556689799572e-07, "loss": 0.0313, "reward": 1.1523437947034836, "reward_std": 0.1984379030764103, "rewards/accuracy_reward": 0.18526786286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9670759290456772, "step": 2231 }, { "completion_length": 549.9107360839844, "epoch": 0.6667164513479202, "grad_norm": 0.35214635729789734, "kl": 0.224853515625, "learning_rate": 3.7152470452723015e-07, "loss": 0.009, "reward": 1.1908482611179352, "reward_std": 0.10274824313819408, "rewards/accuracy_reward": 0.19642858440056443, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9944196790456772, "step": 2232 }, { "completion_length": 693.669677734375, "epoch": 0.6670151594354418, "grad_norm": 1.061846137046814, "kl": 0.8701171875, "learning_rate": 3.710939342381324e-07, "loss": 0.0347, "reward": 1.0630581080913544, "reward_std": 0.1832900047302246, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.976004496216774, "step": 2233 }, { "completion_length": 788.9777069091797, "epoch": 0.6673138675229632, "grad_norm": 0.39327284693717957, "kl": 0.804931640625, "learning_rate": 3.7066335858129925e-07, "loss": 0.0322, "reward": 1.0859375447034836, "reward_std": 0.19251112639904022, "rewards/accuracy_reward": 0.11383928777649999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9720982611179352, "step": 2234 }, { "completion_length": 641.3839416503906, "epoch": 0.6676125756104847, "grad_norm": 0.4393138289451599, "kl": 0.6416015625, "learning_rate": 3.702329780251552e-07, "loss": 0.0256, "reward": 1.1462054550647736, "reward_std": 0.1517640221863985, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 2235 }, { "completion_length": 755.1027069091797, "epoch": 0.6679112836980061, "grad_norm": 0.6446362733840942, "kl": 0.574462890625, "learning_rate": 3.6980279303791193e-07, "loss": 0.023, "reward": 1.086495578289032, "reward_std": 0.14376276545226574, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 2236 }, { "completion_length": 632.3527069091797, "epoch": 0.6682099917855276, "grad_norm": 0.4197699725627899, "kl": 0.47802734375, "learning_rate": 3.693728040875688e-07, "loss": 0.0191, "reward": 1.0976562798023224, "reward_std": 0.18840929958969355, "rewards/accuracy_reward": 0.11160714784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 2237 }, { "completion_length": 812.7701263427734, "epoch": 0.6685086998730491, "grad_norm": 0.45854344964027405, "kl": 0.76416015625, "learning_rate": 3.689430116419112e-07, "loss": 0.0306, "reward": 1.12667416036129, "reward_std": 0.11580377630889416, "rewards/accuracy_reward": 0.1450892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 2238 }, { "completion_length": 606.5558166503906, "epoch": 0.6688074079605706, "grad_norm": 0.5468735694885254, "kl": 0.5869140625, "learning_rate": 3.685134161685115e-07, "loss": 0.0234, "reward": 1.2120535969734192, "reward_std": 0.2278108410537243, "rewards/accuracy_reward": 0.2232143022119999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 2239 }, { "completion_length": 649.1317291259766, "epoch": 0.669106116048092, "grad_norm": 0.6328897476196289, "kl": 0.193603515625, "learning_rate": 3.6808401813472754e-07, "loss": 0.0077, "reward": 1.1238839626312256, "reward_std": 0.13667956925928593, "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 2240 }, { "completion_length": 618.6406555175781, "epoch": 0.6694048241356134, "grad_norm": 0.7206335067749023, "kl": 0.433837890625, "learning_rate": 3.6765481800770217e-07, "loss": 0.0174, "reward": 1.194196492433548, "reward_std": 0.17674890346825123, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 2241 }, { "completion_length": 643.5290374755859, "epoch": 0.669703532223135, "grad_norm": 0.4011751115322113, "kl": 0.3839111328125, "learning_rate": 3.67225816254363e-07, "loss": 0.0154, "reward": 1.2087054252624512, "reward_std": 0.1024878453463316, "rewards/accuracy_reward": 0.22098215483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232313156128, "step": 2242 }, { "completion_length": 619.4084930419922, "epoch": 0.6700022403106564, "grad_norm": 0.33688127994537354, "kl": 0.46044921875, "learning_rate": 3.6679701334142177e-07, "loss": 0.0184, "reward": 1.3002232313156128, "reward_std": 0.09754435252398252, "rewards/accuracy_reward": 0.3147321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 2243 }, { "completion_length": 683.5893249511719, "epoch": 0.6703009483981779, "grad_norm": 0.8616107702255249, "kl": 0.416259765625, "learning_rate": 3.6636840973537443e-07, "loss": 0.0167, "reward": 1.1400670111179352, "reward_std": 0.14912305772304535, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 2244 }, { "completion_length": 603.1942443847656, "epoch": 0.6705996564856993, "grad_norm": 0.4223703145980835, "kl": 0.357421875, "learning_rate": 3.659400059024994e-07, "loss": 0.0143, "reward": 1.2360491752624512, "reward_std": 0.1292467936873436, "rewards/accuracy_reward": 0.247767873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 2245 }, { "completion_length": 702.4486846923828, "epoch": 0.6708983645732208, "grad_norm": 0.41938111186027527, "kl": 0.611328125, "learning_rate": 3.6551180230885814e-07, "loss": 0.0245, "reward": 1.1635045111179352, "reward_std": 0.11445498187094927, "rewards/accuracy_reward": 0.17633929196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 2246 }, { "completion_length": 646.5647583007812, "epoch": 0.6711970726607422, "grad_norm": 0.3262552320957184, "kl": 0.3876953125, "learning_rate": 3.650837994202942e-07, "loss": 0.0155, "reward": 1.2433036267757416, "reward_std": 0.1337076686322689, "rewards/accuracy_reward": 0.254464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393431901932, "step": 2247 }, { "completion_length": 653.6830749511719, "epoch": 0.6714957807482638, "grad_norm": 0.3545220196247101, "kl": 0.277587890625, "learning_rate": 3.646559977024327e-07, "loss": 0.0111, "reward": 1.0781250298023224, "reward_std": 0.11881025996990502, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714775323868, "step": 2248 }, { "completion_length": 664.5736999511719, "epoch": 0.6717944888357852, "grad_norm": 0.7468522191047668, "kl": 0.765625, "learning_rate": 3.6422839762068016e-07, "loss": 0.0306, "reward": 1.0870535969734192, "reward_std": 0.19508523121476173, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500447034836, "step": 2249 }, { "completion_length": 708.0736846923828, "epoch": 0.6720931969233067, "grad_norm": 1.091046690940857, "kl": 0.4052734375, "learning_rate": 3.638009996402233e-07, "loss": 0.0162, "reward": 1.2327009439468384, "reward_std": 0.22422432526946068, "rewards/accuracy_reward": 0.2455357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2250 }, { "completion_length": 731.1339569091797, "epoch": 0.6723919050108281, "grad_norm": 0.7161775827407837, "kl": 0.5986328125, "learning_rate": 3.6337380422602935e-07, "loss": 0.024, "reward": 1.1657366454601288, "reward_std": 0.16986897960305214, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 2251 }, { "completion_length": 672.2031402587891, "epoch": 0.6726906130983497, "grad_norm": 0.5717853307723999, "kl": 0.52294921875, "learning_rate": 3.6294681184284514e-07, "loss": 0.021, "reward": 1.127790242433548, "reward_std": 0.22296488657593727, "rewards/accuracy_reward": 0.14955358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 2252 }, { "completion_length": 734.5580596923828, "epoch": 0.6729893211858711, "grad_norm": 0.9730808734893799, "kl": 0.525390625, "learning_rate": 3.625200229551966e-07, "loss": 0.021, "reward": 1.219308078289032, "reward_std": 0.22315068542957306, "rewards/accuracy_reward": 0.2343750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2253 }, { "completion_length": 631.9710083007812, "epoch": 0.6732880292733926, "grad_norm": 0.28357821702957153, "kl": 0.597900390625, "learning_rate": 3.6209343802738776e-07, "loss": 0.0239, "reward": 1.128348246216774, "reward_std": 0.13323556818068027, "rewards/accuracy_reward": 0.14732143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 2254 }, { "completion_length": 659.4888763427734, "epoch": 0.673586737360914, "grad_norm": 0.44739988446235657, "kl": 0.359619140625, "learning_rate": 3.6166705752350167e-07, "loss": 0.0143, "reward": 1.1623884737491608, "reward_std": 0.18654615432024002, "rewards/accuracy_reward": 0.16741072572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949777126312256, "step": 2255 }, { "completion_length": 687.0513610839844, "epoch": 0.6738854454484355, "grad_norm": 1.0895094871520996, "kl": 0.544189453125, "learning_rate": 3.6124088190739843e-07, "loss": 0.0218, "reward": 1.0959821939468384, "reward_std": 0.15450122859328985, "rewards/accuracy_reward": 0.11607143399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 2256 }, { "completion_length": 691.6652221679688, "epoch": 0.674184153535957, "grad_norm": 0.4244364798069, "kl": 0.6298828125, "learning_rate": 3.6081491164271525e-07, "loss": 0.0252, "reward": 1.0987723469734192, "reward_std": 0.1102019278332591, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 2257 }, { "completion_length": 665.6830596923828, "epoch": 0.6744828616234785, "grad_norm": 0.25554928183555603, "kl": 0.3515625, "learning_rate": 3.6038914719286606e-07, "loss": 0.0141, "reward": 1.0937500596046448, "reward_std": 0.09205561876296997, "rewards/accuracy_reward": 0.10937500093132257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 2258 }, { "completion_length": 641.435302734375, "epoch": 0.6747815697109999, "grad_norm": 0.3996293246746063, "kl": 0.515625, "learning_rate": 3.59963589021041e-07, "loss": 0.0206, "reward": 1.1791295111179352, "reward_std": 0.18410402908921242, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916295111179352, "step": 2259 }, { "completion_length": 675.4687805175781, "epoch": 0.6750802777985214, "grad_norm": 0.40319085121154785, "kl": 0.41259765625, "learning_rate": 3.595382375902053e-07, "loss": 0.0165, "reward": 1.1802456080913544, "reward_std": 0.12338549643754959, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 2260 }, { "completion_length": 616.9754791259766, "epoch": 0.6753789858860428, "grad_norm": 0.30337828397750854, "kl": 0.451171875, "learning_rate": 3.5911309336310004e-07, "loss": 0.0181, "reward": 1.2008928954601288, "reward_std": 0.1434051264077425, "rewards/accuracy_reward": 0.2075892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 2261 }, { "completion_length": 687.6361846923828, "epoch": 0.6756776939735644, "grad_norm": 0.7498324513435364, "kl": 0.66943359375, "learning_rate": 3.5868815680224007e-07, "loss": 0.0268, "reward": 1.0468750596046448, "reward_std": 0.11872186325490475, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107611179352, "step": 2262 }, { "completion_length": 669.6808319091797, "epoch": 0.6759764020610858, "grad_norm": 0.5112208127975464, "kl": 0.951171875, "learning_rate": 3.582634283699151e-07, "loss": 0.038, "reward": 1.1768973767757416, "reward_std": 0.23253438621759415, "rewards/accuracy_reward": 0.2053571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9715402275323868, "step": 2263 }, { "completion_length": 669.6674194335938, "epoch": 0.6762751101486073, "grad_norm": 0.4590159058570862, "kl": 0.215087890625, "learning_rate": 3.5783890852818777e-07, "loss": 0.0086, "reward": 1.2734375298023224, "reward_std": 0.20252204686403275, "rewards/accuracy_reward": 0.29241072945296764, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 2264 }, { "completion_length": 645.0178833007812, "epoch": 0.6765738182361287, "grad_norm": 0.4186525344848633, "kl": 0.38232421875, "learning_rate": 3.574145977388942e-07, "loss": 0.0153, "reward": 1.2561384439468384, "reward_std": 0.19800018519163132, "rewards/accuracy_reward": 0.26562500977888703, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 2265 }, { "completion_length": 642.9062652587891, "epoch": 0.6768725263236502, "grad_norm": 0.7580711245536804, "kl": 0.4945068359375, "learning_rate": 3.569904964636428e-07, "loss": 0.0198, "reward": 1.176339328289032, "reward_std": 0.13234323309734464, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 2266 }, { "completion_length": 684.5513763427734, "epoch": 0.6771712344111717, "grad_norm": 0.6209385991096497, "kl": 0.69970703125, "learning_rate": 3.565666051638144e-07, "loss": 0.028, "reward": 1.188058078289032, "reward_std": 0.13911480642855167, "rewards/accuracy_reward": 0.2053571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 2267 }, { "completion_length": 634.122802734375, "epoch": 0.6774699424986932, "grad_norm": 0.591657280921936, "kl": 0.542724609375, "learning_rate": 3.5614292430056094e-07, "loss": 0.0217, "reward": 1.2628348767757416, "reward_std": 0.18300987780094147, "rewards/accuracy_reward": 0.274553582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 2268 }, { "completion_length": 669.8326263427734, "epoch": 0.6777686505862146, "grad_norm": 0.41613253951072693, "kl": 0.37353515625, "learning_rate": 3.5571945433480586e-07, "loss": 0.015, "reward": 1.2388392984867096, "reward_std": 0.1874837651848793, "rewards/accuracy_reward": 0.2477678693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 2269 }, { "completion_length": 702.919677734375, "epoch": 0.6780673586737361, "grad_norm": 0.9143329858779907, "kl": 0.40673828125, "learning_rate": 3.5529619572724303e-07, "loss": 0.0162, "reward": 1.1127232611179352, "reward_std": 0.1795841045677662, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946939468384, "step": 2270 }, { "completion_length": 654.3861770629883, "epoch": 0.6783660667612575, "grad_norm": 0.38300731778144836, "kl": 0.303466796875, "learning_rate": 3.548731489383361e-07, "loss": 0.0121, "reward": 1.1322544813156128, "reward_std": 0.16018901392817497, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871651977300644, "step": 2271 }, { "completion_length": 717.9732360839844, "epoch": 0.6786647748487791, "grad_norm": 0.257844477891922, "kl": 0.36572265625, "learning_rate": 3.5445031442831876e-07, "loss": 0.0146, "reward": 1.2220982611179352, "reward_std": 0.15130354324355721, "rewards/accuracy_reward": 0.2366071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 2272 }, { "completion_length": 699.6205596923828, "epoch": 0.6789634829363005, "grad_norm": 0.8857427835464478, "kl": 0.466796875, "learning_rate": 3.540276926571932e-07, "loss": 0.0186, "reward": 1.159040242433548, "reward_std": 0.15301972813904285, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2273 }, { "completion_length": 618.9955596923828, "epoch": 0.679262191023822, "grad_norm": 0.6843724250793457, "kl": 0.451904296875, "learning_rate": 3.5360528408473076e-07, "loss": 0.0181, "reward": 1.1930803954601288, "reward_std": 0.1778108850121498, "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2274 }, { "completion_length": 653.2187652587891, "epoch": 0.6795608991113434, "grad_norm": 0.22693674266338348, "kl": 0.37109375, "learning_rate": 3.531830891704707e-07, "loss": 0.0148, "reward": 1.0870536267757416, "reward_std": 0.1107848018873483, "rewards/accuracy_reward": 0.09821428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 2275 }, { "completion_length": 660.6585083007812, "epoch": 0.679859607198865, "grad_norm": 0.48283588886260986, "kl": 0.53466796875, "learning_rate": 3.527611083737192e-07, "loss": 0.0214, "reward": 1.125558078289032, "reward_std": 0.138392997905612, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2276 }, { "completion_length": 683.9777069091797, "epoch": 0.6801583152863864, "grad_norm": 0.6074633598327637, "kl": 0.935302734375, "learning_rate": 3.5233934215355033e-07, "loss": 0.0375, "reward": 1.2767857611179352, "reward_std": 0.2142954207956791, "rewards/accuracy_reward": 0.2991071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 2277 }, { "completion_length": 712.4799652099609, "epoch": 0.6804570233739079, "grad_norm": 0.36975014209747314, "kl": 0.6494140625, "learning_rate": 3.519177909688046e-07, "loss": 0.026, "reward": 1.1841518431901932, "reward_std": 0.16823190450668335, "rewards/accuracy_reward": 0.1986607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 2278 }, { "completion_length": 586.0379791259766, "epoch": 0.6807557314614293, "grad_norm": 1.02152681350708, "kl": 0.33642578125, "learning_rate": 3.514964552780879e-07, "loss": 0.0134, "reward": 1.1049107909202576, "reward_std": 0.20850055292248726, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071939468384, "step": 2279 }, { "completion_length": 641.6160888671875, "epoch": 0.6810544395489508, "grad_norm": 0.5248803496360779, "kl": 0.50439453125, "learning_rate": 3.5107533553977244e-07, "loss": 0.0202, "reward": 1.1941964626312256, "reward_std": 0.20682628825306892, "rewards/accuracy_reward": 0.2142857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 2280 }, { "completion_length": 646.022346496582, "epoch": 0.6813531476364723, "grad_norm": 0.4219398498535156, "kl": 0.55615234375, "learning_rate": 3.506544322119954e-07, "loss": 0.0223, "reward": 1.156808078289032, "reward_std": 0.1165022924542427, "rewards/accuracy_reward": 0.16741072130389512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 2281 }, { "completion_length": 636.4486999511719, "epoch": 0.6816518557239938, "grad_norm": 0.36758941411972046, "kl": 0.743896484375, "learning_rate": 3.50233745752658e-07, "loss": 0.0298, "reward": 1.1233259439468384, "reward_std": 0.1599388998001814, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 2282 }, { "completion_length": 658.6183395385742, "epoch": 0.6819505638115152, "grad_norm": 0.647700309753418, "kl": 0.63427734375, "learning_rate": 3.4981327661942594e-07, "loss": 0.0253, "reward": 1.1941964626312256, "reward_std": 0.12044198624789715, "rewards/accuracy_reward": 0.2075892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 2283 }, { "completion_length": 624.9330596923828, "epoch": 0.6822492718990366, "grad_norm": 0.3744043707847595, "kl": 0.4383544921875, "learning_rate": 3.4939302526972874e-07, "loss": 0.0175, "reward": 1.1400670111179352, "reward_std": 0.11246487172320485, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 2284 }, { "completion_length": 781.075927734375, "epoch": 0.6825479799865581, "grad_norm": 0.6921916604042053, "kl": 0.8828125, "learning_rate": 3.489729921607583e-07, "loss": 0.0354, "reward": 1.145089328289032, "reward_std": 0.1976143643260002, "rewards/accuracy_reward": 0.18303572572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9620535969734192, "step": 2285 }, { "completion_length": 625.6473541259766, "epoch": 0.6828466880740796, "grad_norm": 0.503641664981842, "kl": 0.66357421875, "learning_rate": 3.485531777494698e-07, "loss": 0.0266, "reward": 1.1556920260190964, "reward_std": 0.17441847547888756, "rewards/accuracy_reward": 0.16964286682195961, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 2286 }, { "completion_length": 651.8393096923828, "epoch": 0.6831453961616011, "grad_norm": 0.5779915452003479, "kl": 0.68408203125, "learning_rate": 3.4813358249257973e-07, "loss": 0.0274, "reward": 1.0842634737491608, "reward_std": 0.17449062131345272, "rewards/accuracy_reward": 0.10714286426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.977120578289032, "step": 2287 }, { "completion_length": 637.6272583007812, "epoch": 0.6834441042491225, "grad_norm": 0.5089426636695862, "kl": 0.6416015625, "learning_rate": 3.47714206846567e-07, "loss": 0.0257, "reward": 1.1735491156578064, "reward_std": 0.18324212729930878, "rewards/accuracy_reward": 0.18973215855658054, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 2288 }, { "completion_length": 673.9710083007812, "epoch": 0.683742812336644, "grad_norm": 1.0148558616638184, "kl": 0.453369140625, "learning_rate": 3.472950512676712e-07, "loss": 0.0181, "reward": 1.2126116752624512, "reward_std": 0.18130704388022423, "rewards/accuracy_reward": 0.227678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330931901932, "step": 2289 }, { "completion_length": 642.7433319091797, "epoch": 0.6840415204241654, "grad_norm": 0.484261155128479, "kl": 0.625244140625, "learning_rate": 3.4687611621189216e-07, "loss": 0.025, "reward": 1.1651786118745804, "reward_std": 0.054776369128376245, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 2290 }, { "completion_length": 618.8482513427734, "epoch": 0.684340228511687, "grad_norm": 0.544169545173645, "kl": 0.7899169921875, "learning_rate": 3.4645740213499047e-07, "loss": 0.0316, "reward": 1.237165242433548, "reward_std": 0.1599214794114232, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871651977300644, "step": 2291 }, { "completion_length": 730.0156707763672, "epoch": 0.6846389365992084, "grad_norm": 0.9032988548278809, "kl": 0.91064453125, "learning_rate": 3.460389094924861e-07, "loss": 0.0364, "reward": 1.1657366752624512, "reward_std": 0.23999125510454178, "rewards/accuracy_reward": 0.21205358393490314, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.953683078289032, "step": 2292 }, { "completion_length": 607.6495819091797, "epoch": 0.6849376446867299, "grad_norm": 0.5479968786239624, "kl": 0.451171875, "learning_rate": 3.4562063873965774e-07, "loss": 0.0181, "reward": 1.1819196939468384, "reward_std": 0.1742946282029152, "rewards/accuracy_reward": 0.1986607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2293 }, { "completion_length": 720.1451110839844, "epoch": 0.6852363527742513, "grad_norm": 0.38239622116088867, "kl": 0.67041015625, "learning_rate": 3.4520259033154296e-07, "loss": 0.0268, "reward": 1.1110491752624512, "reward_std": 0.21419401839375496, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884439468384, "step": 2294 }, { "completion_length": 571.9710083007812, "epoch": 0.6855350608617728, "grad_norm": 0.6585218906402588, "kl": 0.4248046875, "learning_rate": 3.447847647229379e-07, "loss": 0.017, "reward": 1.1674107909202576, "reward_std": 0.1375804473645985, "rewards/accuracy_reward": 0.1741071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 2295 }, { "completion_length": 586.2902145385742, "epoch": 0.6858337689492943, "grad_norm": 0.42170822620391846, "kl": 0.31829833984375, "learning_rate": 3.443671623683954e-07, "loss": 0.0127, "reward": 1.270089328289032, "reward_std": 0.19203783199191093, "rewards/accuracy_reward": 0.2879464402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428805589676, "step": 2296 }, { "completion_length": 649.7969055175781, "epoch": 0.6861324770368158, "grad_norm": 0.2834652066230774, "kl": 0.2451171875, "learning_rate": 3.43949783722226e-07, "loss": 0.0098, "reward": 1.2522321939468384, "reward_std": 0.12848996557295322, "rewards/accuracy_reward": 0.2633928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 2297 }, { "completion_length": 690.7344055175781, "epoch": 0.6864311851243372, "grad_norm": 0.3862066864967346, "kl": 0.435791015625, "learning_rate": 3.43532629238497e-07, "loss": 0.0174, "reward": 1.0753348916769028, "reward_std": 0.15905370563268661, "rewards/accuracy_reward": 0.09598215040750802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793526977300644, "step": 2298 }, { "completion_length": 724.122802734375, "epoch": 0.6867298932118587, "grad_norm": 0.6219689846038818, "kl": 0.3720703125, "learning_rate": 3.431156993710312e-07, "loss": 0.0149, "reward": 1.1556919813156128, "reward_std": 0.17732944898307323, "rewards/accuracy_reward": 0.1785714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.977120578289032, "step": 2299 }, { "completion_length": 631.0067291259766, "epoch": 0.6870286012993801, "grad_norm": 0.4325084686279297, "kl": 0.283447265625, "learning_rate": 3.42698994573408e-07, "loss": 0.0114, "reward": 1.1478795409202576, "reward_std": 0.1637396840378642, "rewards/accuracy_reward": 0.15625001047737896, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916294813156128, "step": 2300 }, { "completion_length": 650.7455673217773, "epoch": 0.6873273093869017, "grad_norm": 0.6465408205986023, "kl": 0.7353515625, "learning_rate": 3.422825152989606e-07, "loss": 0.0294, "reward": 1.1389509737491608, "reward_std": 0.12848623655736446, "rewards/accuracy_reward": 0.1674107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9715402126312256, "step": 2301 }, { "completion_length": 679.9977874755859, "epoch": 0.6876260174744231, "grad_norm": 0.6918081045150757, "kl": 0.41552734375, "learning_rate": 3.418662620007782e-07, "loss": 0.0166, "reward": 1.184151828289032, "reward_std": 0.1573163066059351, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 2302 }, { "completion_length": 639.4129791259766, "epoch": 0.6879247255619446, "grad_norm": 0.5518285036087036, "kl": 0.44580078125, "learning_rate": 3.4145023513170355e-07, "loss": 0.0178, "reward": 1.182477742433548, "reward_std": 0.18448578380048275, "rewards/accuracy_reward": 0.1941964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 2303 }, { "completion_length": 707.9375305175781, "epoch": 0.688223433649466, "grad_norm": 0.33681538701057434, "kl": 0.548828125, "learning_rate": 3.410344351443329e-07, "loss": 0.0219, "reward": 1.1127232611179352, "reward_std": 0.18761966563761234, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589477300644, "step": 2304 }, { "completion_length": 706.450927734375, "epoch": 0.6885221417369876, "grad_norm": 1.0736850500106812, "kl": 0.532470703125, "learning_rate": 3.4061886249101594e-07, "loss": 0.0213, "reward": 1.2232143580913544, "reward_std": 0.1833713036030531, "rewards/accuracy_reward": 0.2366071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 2305 }, { "completion_length": 600.8303985595703, "epoch": 0.688820849824509, "grad_norm": 0.5217152237892151, "kl": 0.3369140625, "learning_rate": 3.4020351762385527e-07, "loss": 0.0135, "reward": 1.2126116454601288, "reward_std": 0.15875808522105217, "rewards/accuracy_reward": 0.22544644214212894, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2306 }, { "completion_length": 685.8326263427734, "epoch": 0.6891195579120305, "grad_norm": 0.42247945070266724, "kl": 0.30908203125, "learning_rate": 3.397884009947049e-07, "loss": 0.0124, "reward": 1.1434152275323868, "reward_std": 0.17887084186077118, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 2307 }, { "completion_length": 662.6094055175781, "epoch": 0.6894182659995519, "grad_norm": 0.6613858342170715, "kl": 0.525634765625, "learning_rate": 3.3937351305517137e-07, "loss": 0.021, "reward": 1.1601563096046448, "reward_std": 0.2338860183954239, "rewards/accuracy_reward": 0.18526786752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884290456772, "step": 2308 }, { "completion_length": 671.5424346923828, "epoch": 0.6897169740870734, "grad_norm": 0.4266843795776367, "kl": 0.531982421875, "learning_rate": 3.3895885425661206e-07, "loss": 0.0213, "reward": 1.1668527126312256, "reward_std": 0.20223019272089005, "rewards/accuracy_reward": 0.1897321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9771205633878708, "step": 2309 }, { "completion_length": 660.122802734375, "epoch": 0.6900156821745949, "grad_norm": 0.39782199263572693, "kl": 0.32421875, "learning_rate": 3.38544425050135e-07, "loss": 0.0129, "reward": 1.15011166036129, "reward_std": 0.1267025675624609, "rewards/accuracy_reward": 0.1674107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 2310 }, { "completion_length": 593.279052734375, "epoch": 0.6903143902621164, "grad_norm": 0.416043758392334, "kl": 0.6689453125, "learning_rate": 3.3813022588659864e-07, "loss": 0.0267, "reward": 1.1718750596046448, "reward_std": 0.13368067145347595, "rewards/accuracy_reward": 0.1785714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933036118745804, "step": 2311 }, { "completion_length": 684.3281402587891, "epoch": 0.6906130983496378, "grad_norm": 0.6901879906654358, "kl": 0.56396484375, "learning_rate": 3.3771625721661116e-07, "loss": 0.0226, "reward": 1.1702009439468384, "reward_std": 0.12451012805104256, "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.989397332072258, "step": 2312 }, { "completion_length": 626.0870819091797, "epoch": 0.6909118064371593, "grad_norm": 0.3835359513759613, "kl": 0.256591796875, "learning_rate": 3.3730251949052966e-07, "loss": 0.0102, "reward": 1.2466518580913544, "reward_std": 0.14822658244520426, "rewards/accuracy_reward": 0.25446429336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 2313 }, { "completion_length": 617.3549499511719, "epoch": 0.6912105145246807, "grad_norm": 0.4745554029941559, "kl": 0.25146484375, "learning_rate": 3.3688901315846045e-07, "loss": 0.0101, "reward": 1.1361607909202576, "reward_std": 0.15566978976130486, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9955357313156128, "step": 2314 }, { "completion_length": 625.2723388671875, "epoch": 0.6915092226122023, "grad_norm": 0.3713350296020508, "kl": 0.5283203125, "learning_rate": 3.364757386702577e-07, "loss": 0.0211, "reward": 1.1953125596046448, "reward_std": 0.17273222468793392, "rewards/accuracy_reward": 0.2120535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2315 }, { "completion_length": 616.8504638671875, "epoch": 0.6918079306997237, "grad_norm": 0.6140142679214478, "kl": 0.283203125, "learning_rate": 3.3606269647552365e-07, "loss": 0.0113, "reward": 1.1986607611179352, "reward_std": 0.1740307081490755, "rewards/accuracy_reward": 0.2053571566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933036118745804, "step": 2316 }, { "completion_length": 665.1339721679688, "epoch": 0.6921066387872452, "grad_norm": 0.7308787703514099, "kl": 0.685791015625, "learning_rate": 3.3564988702360785e-07, "loss": 0.0274, "reward": 1.1383929252624512, "reward_std": 0.18473666161298752, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071939468384, "step": 2317 }, { "completion_length": 727.3638763427734, "epoch": 0.6924053468747666, "grad_norm": 0.5595221519470215, "kl": 0.44677734375, "learning_rate": 3.352373107636063e-07, "loss": 0.0179, "reward": 1.2014509439468384, "reward_std": 0.23863646760582924, "rewards/accuracy_reward": 0.2187500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 2318 }, { "completion_length": 696.2254791259766, "epoch": 0.6927040549622882, "grad_norm": 1.0239696502685547, "kl": 0.33984375, "learning_rate": 3.3482496814436157e-07, "loss": 0.0136, "reward": 1.127790242433548, "reward_std": 0.17136183753609657, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 2319 }, { "completion_length": 677.2187805175781, "epoch": 0.6930027630498096, "grad_norm": 0.5502526164054871, "kl": 0.49755859375, "learning_rate": 3.344128596144623e-07, "loss": 0.0199, "reward": 1.160714328289032, "reward_std": 0.1855195313692093, "rewards/accuracy_reward": 0.176339291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 2320 }, { "completion_length": 679.6741333007812, "epoch": 0.6933014711373311, "grad_norm": 0.46501052379608154, "kl": 0.49169921875, "learning_rate": 3.340009856222417e-07, "loss": 0.0197, "reward": 1.1344866752624512, "reward_std": 0.2011480825021863, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 2321 }, { "completion_length": 611.169677734375, "epoch": 0.6936001792248525, "grad_norm": 0.49228087067604065, "kl": 0.4283447265625, "learning_rate": 3.3358934661577863e-07, "loss": 0.0171, "reward": 1.0641741752624512, "reward_std": 0.09961958788335323, "rewards/accuracy_reward": 0.07589286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 2322 }, { "completion_length": 681.8236846923828, "epoch": 0.693898887312374, "grad_norm": 0.6644508838653564, "kl": 0.55859375, "learning_rate": 3.331779430428961e-07, "loss": 0.0224, "reward": 1.3264509439468384, "reward_std": 0.19565399736166, "rewards/accuracy_reward": 0.3482142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.97823666036129, "step": 2323 }, { "completion_length": 748.904052734375, "epoch": 0.6941975953998955, "grad_norm": 0.6191872954368591, "kl": 0.57177734375, "learning_rate": 3.3276677535116047e-07, "loss": 0.0229, "reward": 1.1194197088479996, "reward_std": 0.15497989766299725, "rewards/accuracy_reward": 0.14285715157166123, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625447034836, "step": 2324 }, { "completion_length": 683.279052734375, "epoch": 0.694496303487417, "grad_norm": 1.1856110095977783, "kl": 0.423583984375, "learning_rate": 3.323558439878822e-07, "loss": 0.017, "reward": 1.1183035969734192, "reward_std": 0.11082203686237335, "rewards/accuracy_reward": 0.13169643515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 2325 }, { "completion_length": 611.3169860839844, "epoch": 0.6947950115749384, "grad_norm": 0.38274064660072327, "kl": 0.519287109375, "learning_rate": 3.3194514940011437e-07, "loss": 0.0207, "reward": 1.1344866454601288, "reward_std": 0.1387349870055914, "rewards/accuracy_reward": 0.1495535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 2326 }, { "completion_length": 725.4352874755859, "epoch": 0.6950937196624598, "grad_norm": 0.4746192395687103, "kl": 0.783203125, "learning_rate": 3.315346920346521e-07, "loss": 0.0313, "reward": 1.158482164144516, "reward_std": 0.21246575564146042, "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464775323868, "step": 2327 }, { "completion_length": 709.6964416503906, "epoch": 0.6953924277499813, "grad_norm": 0.5887142419815063, "kl": 0.52978515625, "learning_rate": 3.311244723380332e-07, "loss": 0.0211, "reward": 1.1015625298023224, "reward_std": 0.13117839395999908, "rewards/accuracy_reward": 0.11830357741564512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2328 }, { "completion_length": 663.7634124755859, "epoch": 0.6956911358375027, "grad_norm": 0.6191689372062683, "kl": 1.0068359375, "learning_rate": 3.3071449075653617e-07, "loss": 0.0403, "reward": 1.1858259737491608, "reward_std": 0.1667478960007429, "rewards/accuracy_reward": 0.2053571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 2329 }, { "completion_length": 647.2209930419922, "epoch": 0.6959898439250243, "grad_norm": 0.5271393656730652, "kl": 0.57275390625, "learning_rate": 3.303047477361809e-07, "loss": 0.0229, "reward": 1.1729911267757416, "reward_std": 0.1411107126623392, "rewards/accuracy_reward": 0.19419643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946939468384, "step": 2330 }, { "completion_length": 673.3415374755859, "epoch": 0.6962885520125457, "grad_norm": 0.5956228971481323, "kl": 0.5615234375, "learning_rate": 3.298952437227278e-07, "loss": 0.0224, "reward": 1.1690848767757416, "reward_std": 0.22931889817118645, "rewards/accuracy_reward": 0.18750000977888703, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 2331 }, { "completion_length": 729.5781707763672, "epoch": 0.6965872601000672, "grad_norm": 1.056908369064331, "kl": 1.197265625, "learning_rate": 3.2948597916167677e-07, "loss": 0.0479, "reward": 1.2025670111179352, "reward_std": 0.21256034076213837, "rewards/accuracy_reward": 0.2254464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9771205633878708, "step": 2332 }, { "completion_length": 639.0535888671875, "epoch": 0.6968859681875886, "grad_norm": 1.0342059135437012, "kl": 0.42919921875, "learning_rate": 3.2907695449826766e-07, "loss": 0.0172, "reward": 1.1618304252624512, "reward_std": 0.13305442593991756, "rewards/accuracy_reward": 0.1763392984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 2333 }, { "completion_length": 651.1227874755859, "epoch": 0.6971846762751102, "grad_norm": 0.5884181261062622, "kl": 0.365234375, "learning_rate": 3.2866817017747947e-07, "loss": 0.0146, "reward": 1.2008929550647736, "reward_std": 0.15070844627916813, "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 2334 }, { "completion_length": 678.9821624755859, "epoch": 0.6974833843626316, "grad_norm": 1.013664960861206, "kl": 0.912109375, "learning_rate": 3.2825962664402914e-07, "loss": 0.0364, "reward": 1.162946492433548, "reward_std": 0.1437363363802433, "rewards/accuracy_reward": 0.1808035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98214291036129, "step": 2335 }, { "completion_length": 738.3817291259766, "epoch": 0.6977820924501531, "grad_norm": 1.1947189569473267, "kl": 1.208740234375, "learning_rate": 3.2785132434237215e-07, "loss": 0.0484, "reward": 1.207589328289032, "reward_std": 0.25435473397374153, "rewards/accuracy_reward": 0.2455357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9620536267757416, "step": 2336 }, { "completion_length": 660.3393096923828, "epoch": 0.6980808005376745, "grad_norm": 0.5196542143821716, "kl": 0.44970703125, "learning_rate": 3.2744326371670153e-07, "loss": 0.018, "reward": 1.0814732313156128, "reward_std": 0.12041330710053444, "rewards/accuracy_reward": 0.09375000605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2337 }, { "completion_length": 722.966552734375, "epoch": 0.698379508625196, "grad_norm": 0.8017103672027588, "kl": 0.89111328125, "learning_rate": 3.270354452109468e-07, "loss": 0.0356, "reward": 1.1188616156578064, "reward_std": 0.1495901457965374, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 2338 }, { "completion_length": 759.3906707763672, "epoch": 0.6986782167127175, "grad_norm": 0.9962562918663025, "kl": 1.4248046875, "learning_rate": 3.2662786926877494e-07, "loss": 0.057, "reward": 1.0552455633878708, "reward_std": 0.16164319962263107, "rewards/accuracy_reward": 0.08482143306173384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9704241454601288, "step": 2339 }, { "completion_length": 630.3928833007812, "epoch": 0.698976924800239, "grad_norm": 0.40534305572509766, "kl": 0.57275390625, "learning_rate": 3.262205363335885e-07, "loss": 0.0229, "reward": 1.1657366752624512, "reward_std": 0.15202689403668046, "rewards/accuracy_reward": 0.1741071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 2340 }, { "completion_length": 656.7991333007812, "epoch": 0.6992756328877604, "grad_norm": 0.33302804827690125, "kl": 0.68798828125, "learning_rate": 3.258134468485258e-07, "loss": 0.0275, "reward": 1.1422991752624512, "reward_std": 0.17135540768504143, "rewards/accuracy_reward": 0.16517857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9771205633878708, "step": 2341 }, { "completion_length": 659.0759124755859, "epoch": 0.6995743409752819, "grad_norm": 0.584369957447052, "kl": 0.796875, "learning_rate": 3.2540660125646035e-07, "loss": 0.0319, "reward": 1.0468750298023224, "reward_std": 0.18235351517796516, "rewards/accuracy_reward": 0.0758928582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9709821939468384, "step": 2342 }, { "completion_length": 694.3504791259766, "epoch": 0.6998730490628033, "grad_norm": 0.7129337191581726, "kl": 0.73388671875, "learning_rate": 3.250000000000001e-07, "loss": 0.0294, "reward": 1.241071492433548, "reward_std": 0.15080557018518448, "rewards/accuracy_reward": 0.2656250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464775323868, "step": 2343 }, { "completion_length": 696.0401916503906, "epoch": 0.7001717571503249, "grad_norm": 0.9485521912574768, "kl": 0.67724609375, "learning_rate": 3.2459364352148743e-07, "loss": 0.0271, "reward": 1.1674107611179352, "reward_std": 0.15503185987472534, "rewards/accuracy_reward": 0.18303572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 2344 }, { "completion_length": 742.8683471679688, "epoch": 0.7004704652378463, "grad_norm": 0.3126676678657532, "kl": 0.681640625, "learning_rate": 3.2418753226299853e-07, "loss": 0.0272, "reward": 1.1489956080913544, "reward_std": 0.17630625516176224, "rewards/accuracy_reward": 0.1696428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 2345 }, { "completion_length": 723.0268096923828, "epoch": 0.7007691733253678, "grad_norm": 0.7735111713409424, "kl": 1.29052734375, "learning_rate": 3.2378166666634257e-07, "loss": 0.0517, "reward": 1.0920759588479996, "reward_std": 0.1307613030076027, "rewards/accuracy_reward": 0.1227678619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9693080633878708, "step": 2346 }, { "completion_length": 737.9710083007812, "epoch": 0.7010678814128892, "grad_norm": 0.4651695787906647, "kl": 0.5048828125, "learning_rate": 3.233760471730613e-07, "loss": 0.0202, "reward": 1.1702009439468384, "reward_std": 0.2336510643362999, "rewards/accuracy_reward": 0.191964291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 2347 }, { "completion_length": 652.4821624755859, "epoch": 0.7013665895004108, "grad_norm": 0.6926632523536682, "kl": 0.316162109375, "learning_rate": 3.2297067422442937e-07, "loss": 0.0126, "reward": 1.108258992433548, "reward_std": 0.12783564254641533, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 2348 }, { "completion_length": 673.3169937133789, "epoch": 0.7016652975879322, "grad_norm": 0.5778827667236328, "kl": 0.703125, "learning_rate": 3.2256554826145255e-07, "loss": 0.0282, "reward": 1.1679687798023224, "reward_std": 0.22731972485780716, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 2349 }, { "completion_length": 691.0736846923828, "epoch": 0.7019640056754537, "grad_norm": 0.5053426623344421, "kl": 0.57421875, "learning_rate": 3.221606697248681e-07, "loss": 0.0229, "reward": 1.1523438096046448, "reward_std": 0.176797516644001, "rewards/accuracy_reward": 0.17633929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.976004496216774, "step": 2350 }, { "completion_length": 658.4085235595703, "epoch": 0.7022627137629751, "grad_norm": 1.085326075553894, "kl": 0.56494140625, "learning_rate": 3.2175603905514457e-07, "loss": 0.0225, "reward": 1.0680804252624512, "reward_std": 0.15874860249459743, "rewards/accuracy_reward": 0.08705357438884676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 2351 }, { "completion_length": 556.8616256713867, "epoch": 0.7025614218504966, "grad_norm": 0.3343418836593628, "kl": 0.3216552734375, "learning_rate": 3.213516566924801e-07, "loss": 0.0129, "reward": 1.3058035969734192, "reward_std": 0.20880779065191746, "rewards/accuracy_reward": 0.3169643022119999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 2352 }, { "completion_length": 625.1652069091797, "epoch": 0.702860129938018, "grad_norm": 0.4423394799232483, "kl": 0.476318359375, "learning_rate": 3.209475230768034e-07, "loss": 0.0191, "reward": 1.244977742433548, "reward_std": 0.18933769688010216, "rewards/accuracy_reward": 0.2656250186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 2353 }, { "completion_length": 775.279052734375, "epoch": 0.7031588380255396, "grad_norm": 0.5651177167892456, "kl": 0.822265625, "learning_rate": 3.205436386477718e-07, "loss": 0.0329, "reward": 1.0920759737491608, "reward_std": 0.19765349105000496, "rewards/accuracy_reward": 0.12276786286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9693080931901932, "step": 2354 }, { "completion_length": 582.2991333007812, "epoch": 0.703457546113061, "grad_norm": 0.5187432765960693, "kl": 0.287109375, "learning_rate": 3.2014000384477223e-07, "loss": 0.0115, "reward": 1.180245578289032, "reward_std": 0.14191873371601105, "rewards/accuracy_reward": 0.1875000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455484867096, "step": 2355 }, { "completion_length": 647.1562652587891, "epoch": 0.7037562542005825, "grad_norm": 0.8697028160095215, "kl": 0.32080078125, "learning_rate": 3.197366191069199e-07, "loss": 0.0128, "reward": 1.1065848469734192, "reward_std": 0.16291911993175745, "rewards/accuracy_reward": 0.11830357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 2356 }, { "completion_length": 646.7991333007812, "epoch": 0.7040549622881039, "grad_norm": 0.7140395641326904, "kl": 0.5103759765625, "learning_rate": 3.193334848730577e-07, "loss": 0.0205, "reward": 1.1506696790456772, "reward_std": 0.12040416523814201, "rewards/accuracy_reward": 0.16517858440056443, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 2357 }, { "completion_length": 683.638427734375, "epoch": 0.7043536703756255, "grad_norm": 0.6177530288696289, "kl": 0.63916015625, "learning_rate": 3.1893060158175607e-07, "loss": 0.0256, "reward": 1.2031250596046448, "reward_std": 0.146495227701962, "rewards/accuracy_reward": 0.2209821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428805589676, "step": 2358 }, { "completion_length": 684.591552734375, "epoch": 0.7046523784631469, "grad_norm": 0.5927422046661377, "kl": 0.41015625, "learning_rate": 3.185279696713129e-07, "loss": 0.0164, "reward": 1.1835938096046448, "reward_std": 0.21062232181429863, "rewards/accuracy_reward": 0.1986607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2359 }, { "completion_length": 654.2857360839844, "epoch": 0.7049510865506684, "grad_norm": 0.9022149443626404, "kl": 0.582275390625, "learning_rate": 3.181255895797519e-07, "loss": 0.0233, "reward": 1.2589286267757416, "reward_std": 0.14318148791790009, "rewards/accuracy_reward": 0.270089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 2360 }, { "completion_length": 697.3192291259766, "epoch": 0.7052497946381898, "grad_norm": 0.6671894192695618, "kl": 0.399169921875, "learning_rate": 3.1772346174482325e-07, "loss": 0.016, "reward": 1.0920759439468384, "reward_std": 0.15058785676956177, "rewards/accuracy_reward": 0.11607143026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760044813156128, "step": 2361 }, { "completion_length": 689.3817291259766, "epoch": 0.7055485027257113, "grad_norm": 0.5627826452255249, "kl": 0.62158203125, "learning_rate": 3.1732158660400286e-07, "loss": 0.0249, "reward": 1.161272406578064, "reward_std": 0.1390739567577839, "rewards/accuracy_reward": 0.17633929196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 2362 }, { "completion_length": 662.6607360839844, "epoch": 0.7058472108132328, "grad_norm": 0.44565293192863464, "kl": 0.654541015625, "learning_rate": 3.169199645944912e-07, "loss": 0.0262, "reward": 1.1456473469734192, "reward_std": 0.1721078585833311, "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2363 }, { "completion_length": 684.8817291259766, "epoch": 0.7061459189007543, "grad_norm": 0.5028607249259949, "kl": 0.57421875, "learning_rate": 3.1651859615321367e-07, "loss": 0.0229, "reward": 1.1015625298023224, "reward_std": 0.15935473330318928, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2364 }, { "completion_length": 702.1161193847656, "epoch": 0.7064446269882757, "grad_norm": 0.9143485426902771, "kl": 0.786376953125, "learning_rate": 3.161174817168202e-07, "loss": 0.0314, "reward": 1.1121652126312256, "reward_std": 0.17104613780975342, "rewards/accuracy_reward": 0.12946429150179029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 2365 }, { "completion_length": 621.0736999511719, "epoch": 0.7067433350757972, "grad_norm": 0.4340303838253021, "kl": 0.1876220703125, "learning_rate": 3.1571662172168334e-07, "loss": 0.0075, "reward": 1.2522321939468384, "reward_std": 0.11027080751955509, "rewards/accuracy_reward": 0.2544642984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9977678805589676, "step": 2366 }, { "completion_length": 724.8125305175781, "epoch": 0.7070420431633186, "grad_norm": 0.8462481498718262, "kl": 0.40869140625, "learning_rate": 3.153160166039e-07, "loss": 0.0164, "reward": 1.1618304252624512, "reward_std": 0.11033247038722038, "rewards/accuracy_reward": 0.1785714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 2367 }, { "completion_length": 684.7745971679688, "epoch": 0.7073407512508402, "grad_norm": 0.22198502719402313, "kl": 0.287109375, "learning_rate": 3.1491566679928895e-07, "loss": 0.0115, "reward": 1.1478795409202576, "reward_std": 0.11673073098063469, "rewards/accuracy_reward": 0.15625000861473382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916295111179352, "step": 2368 }, { "completion_length": 604.2031631469727, "epoch": 0.7076394593383616, "grad_norm": 0.5811465978622437, "kl": 0.8553466796875, "learning_rate": 3.145155727433917e-07, "loss": 0.0343, "reward": 1.1830357760190964, "reward_std": 0.17699044570326805, "rewards/accuracy_reward": 0.20758928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464626312256, "step": 2369 }, { "completion_length": 796.9486846923828, "epoch": 0.707938167425883, "grad_norm": 0.6837474703788757, "kl": 1.0771484375, "learning_rate": 3.141157348714716e-07, "loss": 0.043, "reward": 1.1372768580913544, "reward_std": 0.22500445321202278, "rewards/accuracy_reward": 0.17410715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9631696939468384, "step": 2370 }, { "completion_length": 630.5245666503906, "epoch": 0.7082368755134045, "grad_norm": 0.8532623052597046, "kl": 0.462646484375, "learning_rate": 3.1371615361851246e-07, "loss": 0.0185, "reward": 1.217633992433548, "reward_std": 0.2230624444782734, "rewards/accuracy_reward": 0.23437500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 2371 }, { "completion_length": 604.138427734375, "epoch": 0.7085355836009259, "grad_norm": 0.6351072788238525, "kl": 0.250244140625, "learning_rate": 3.1331682941922e-07, "loss": 0.01, "reward": 1.1947545111179352, "reward_std": 0.13783417362719774, "rewards/accuracy_reward": 0.20089285937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616156578064, "step": 2372 }, { "completion_length": 773.2656555175781, "epoch": 0.7088342916884475, "grad_norm": 0.3803645670413971, "kl": 0.66064453125, "learning_rate": 3.129177627080198e-07, "loss": 0.0264, "reward": 1.079241156578064, "reward_std": 0.13366305083036423, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946939468384, "step": 2373 }, { "completion_length": 686.5156555175781, "epoch": 0.7091329997759689, "grad_norm": 0.4023769795894623, "kl": 0.3818359375, "learning_rate": 3.125189539190571e-07, "loss": 0.0153, "reward": 1.2460937798023224, "reward_std": 0.17554089799523354, "rewards/accuracy_reward": 0.2566964402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 2374 }, { "completion_length": 550.2009048461914, "epoch": 0.7094317078634904, "grad_norm": 0.5140553712844849, "kl": 0.37841796875, "learning_rate": 3.121204034861969e-07, "loss": 0.0151, "reward": 1.2092634439468384, "reward_std": 0.15760095790028572, "rewards/accuracy_reward": 0.2209821566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 2375 }, { "completion_length": 626.0178833007812, "epoch": 0.7097304159510118, "grad_norm": 0.8240669369697571, "kl": 0.51495361328125, "learning_rate": 3.117221118430231e-07, "loss": 0.0206, "reward": 1.1908482611179352, "reward_std": 0.14501736499369144, "rewards/accuracy_reward": 0.1986607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875447034836, "step": 2376 }, { "completion_length": 662.4821624755859, "epoch": 0.7100291240385334, "grad_norm": 0.3727743327617645, "kl": 0.4091796875, "learning_rate": 3.1132407942283777e-07, "loss": 0.0164, "reward": 1.1785714626312256, "reward_std": 0.13599021965637803, "rewards/accuracy_reward": 0.1897321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 2377 }, { "completion_length": 665.5111999511719, "epoch": 0.7103278321260548, "grad_norm": 0.7867696285247803, "kl": 0.2540283203125, "learning_rate": 3.109263066586613e-07, "loss": 0.0102, "reward": 1.1612723469734192, "reward_std": 0.15079933032393456, "rewards/accuracy_reward": 0.16741072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 2378 }, { "completion_length": 829.3147735595703, "epoch": 0.7106265402135763, "grad_norm": 1.1507467031478882, "kl": 0.908203125, "learning_rate": 3.105287939832316e-07, "loss": 0.0363, "reward": 1.1155134439468384, "reward_std": 0.12993014603853226, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562947034836, "step": 2379 }, { "completion_length": 635.8661193847656, "epoch": 0.7109252483010977, "grad_norm": 0.3716336190700531, "kl": 0.38427734375, "learning_rate": 3.1013154182900307e-07, "loss": 0.0154, "reward": 1.1713170111179352, "reward_std": 0.1533975750207901, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 2380 }, { "completion_length": 678.6428909301758, "epoch": 0.7112239563886192, "grad_norm": 0.545291543006897, "kl": 0.53466796875, "learning_rate": 3.0973455062814767e-07, "loss": 0.0214, "reward": 1.1523438096046448, "reward_std": 0.17684771306812763, "rewards/accuracy_reward": 0.16294644121080637, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 2381 }, { "completion_length": 673.7344055175781, "epoch": 0.7115226644761407, "grad_norm": 0.8204124569892883, "kl": 0.330078125, "learning_rate": 3.0933782081255243e-07, "loss": 0.0132, "reward": 1.1579241454601288, "reward_std": 0.14442112855613232, "rewards/accuracy_reward": 0.16964286682195961, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 2382 }, { "completion_length": 727.0625305175781, "epoch": 0.7118213725636622, "grad_norm": 0.7092128992080688, "kl": 0.6015625, "learning_rate": 3.089413528138207e-07, "loss": 0.024, "reward": 1.1149554252624512, "reward_std": 0.20186252146959305, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2383 }, { "completion_length": 736.3102874755859, "epoch": 0.7121200806511836, "grad_norm": 0.5488457679748535, "kl": 0.806640625, "learning_rate": 3.0854514706327105e-07, "loss": 0.0323, "reward": 1.093191996216774, "reward_std": 0.2170930951833725, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.961495578289032, "step": 2384 }, { "completion_length": 665.763427734375, "epoch": 0.7124187887387051, "grad_norm": 0.9240119457244873, "kl": 0.4537353515625, "learning_rate": 3.081492039919361e-07, "loss": 0.0181, "reward": 1.126116156578064, "reward_std": 0.16306137293577194, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2385 }, { "completion_length": 693.0245819091797, "epoch": 0.7127174968262265, "grad_norm": 1.114945888519287, "kl": 0.482421875, "learning_rate": 3.077535240305632e-07, "loss": 0.0193, "reward": 1.1757813394069672, "reward_std": 0.2071743682026863, "rewards/accuracy_reward": 0.18973214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98604916036129, "step": 2386 }, { "completion_length": 708.0111846923828, "epoch": 0.7130162049137481, "grad_norm": 0.3271197974681854, "kl": 0.48193359375, "learning_rate": 3.0735810760961367e-07, "loss": 0.0193, "reward": 1.1088170111179352, "reward_std": 0.123127281665802, "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 2387 }, { "completion_length": 637.5067443847656, "epoch": 0.7133149130012695, "grad_norm": 0.3790363073348999, "kl": 0.376953125, "learning_rate": 3.069629551592615e-07, "loss": 0.0151, "reward": 1.299665242433548, "reward_std": 0.16863951459527016, "rewards/accuracy_reward": 0.3147321604192257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 2388 }, { "completion_length": 614.1361846923828, "epoch": 0.713613621088791, "grad_norm": 0.27142369747161865, "kl": 0.3916015625, "learning_rate": 3.065680671093939e-07, "loss": 0.0157, "reward": 1.174665242433548, "reward_std": 0.15175586752593517, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2389 }, { "completion_length": 652.1652221679688, "epoch": 0.7139123291763124, "grad_norm": 0.41833987832069397, "kl": 0.511474609375, "learning_rate": 3.061734438896104e-07, "loss": 0.0204, "reward": 1.131138414144516, "reward_std": 0.16174760460853577, "rewards/accuracy_reward": 0.1495535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 2390 }, { "completion_length": 607.1920013427734, "epoch": 0.714211037263834, "grad_norm": 0.37101781368255615, "kl": 0.4300537109375, "learning_rate": 3.0577908592922235e-07, "loss": 0.0172, "reward": 1.2299107611179352, "reward_std": 0.19012343510985374, "rewards/accuracy_reward": 0.24107143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 2391 }, { "completion_length": 622.4687805175781, "epoch": 0.7145097453513554, "grad_norm": 0.5508427023887634, "kl": 0.54052734375, "learning_rate": 3.053849936572526e-07, "loss": 0.0216, "reward": 1.198660746216774, "reward_std": 0.11699402518570423, "rewards/accuracy_reward": 0.21428571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 2392 }, { "completion_length": 633.122802734375, "epoch": 0.7148084534388769, "grad_norm": 0.6454895734786987, "kl": 0.462890625, "learning_rate": 3.0499116750243504e-07, "loss": 0.0185, "reward": 1.164620578289032, "reward_std": 0.17186135612428188, "rewards/accuracy_reward": 0.18080358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 2393 }, { "completion_length": 704.3705749511719, "epoch": 0.7151071615263983, "grad_norm": 0.8968779444694519, "kl": 0.431640625, "learning_rate": 3.0459760789321357e-07, "loss": 0.0173, "reward": 1.1277902126312256, "reward_std": 0.17784049920737743, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2394 }, { "completion_length": 688.9174499511719, "epoch": 0.7154058696139198, "grad_norm": 1.5242491960525513, "kl": 0.533203125, "learning_rate": 3.0420431525774293e-07, "loss": 0.0213, "reward": 1.102120578289032, "reward_std": 0.17947758361697197, "rewards/accuracy_reward": 0.13169643143191934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9704241454601288, "step": 2395 }, { "completion_length": 637.4219131469727, "epoch": 0.7157045777014412, "grad_norm": 0.7748288512229919, "kl": 0.55535888671875, "learning_rate": 3.0381129002388653e-07, "loss": 0.0222, "reward": 1.2410714626312256, "reward_std": 0.2458823099732399, "rewards/accuracy_reward": 0.2633928693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 2396 }, { "completion_length": 589.2232513427734, "epoch": 0.7160032857889628, "grad_norm": 0.6973243355751038, "kl": 0.2890625, "learning_rate": 3.0341853261921753e-07, "loss": 0.0116, "reward": 1.182477742433548, "reward_std": 0.15717919170856476, "rewards/accuracy_reward": 0.1897321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455484867096, "step": 2397 }, { "completion_length": 577.959846496582, "epoch": 0.7163019938764842, "grad_norm": 0.38035279512405396, "kl": 0.3299560546875, "learning_rate": 3.0302604347101763e-07, "loss": 0.0132, "reward": 1.2790178656578064, "reward_std": 0.2006077915430069, "rewards/accuracy_reward": 0.2924107275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071939468384, "step": 2398 }, { "completion_length": 655.6808471679688, "epoch": 0.7166007019640057, "grad_norm": 0.5496160387992859, "kl": 0.68212890625, "learning_rate": 3.0263382300627615e-07, "loss": 0.0273, "reward": 1.0931920111179352, "reward_std": 0.14177726581692696, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562947034836, "step": 2399 }, { "completion_length": 648.8527069091797, "epoch": 0.7168994100515271, "grad_norm": 0.43605920672416687, "kl": 0.5118408203125, "learning_rate": 3.022418716516908e-07, "loss": 0.0205, "reward": 1.3152902126312256, "reward_std": 0.1568480972200632, "rewards/accuracy_reward": 0.3258928768336773, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 2400 }, { "completion_length": 762.341552734375, "epoch": 0.7171981181390487, "grad_norm": 0.5842763185501099, "kl": 0.59033203125, "learning_rate": 3.018501898336664e-07, "loss": 0.0236, "reward": 1.1545759737491608, "reward_std": 0.14105918630957603, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 2401 }, { "completion_length": 684.4509124755859, "epoch": 0.7174968262265701, "grad_norm": 0.7680397629737854, "kl": 0.75732421875, "learning_rate": 3.0145877797831385e-07, "loss": 0.0302, "reward": 1.0920759439468384, "reward_std": 0.22132233157753944, "rewards/accuracy_reward": 0.11607143515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045111179352, "step": 2402 }, { "completion_length": 689.4375305175781, "epoch": 0.7177955343140916, "grad_norm": 0.5447782278060913, "kl": 0.9189453125, "learning_rate": 3.0106763651145134e-07, "loss": 0.0367, "reward": 1.2014509737491608, "reward_std": 0.21877088397741318, "rewards/accuracy_reward": 0.2410714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.960379496216774, "step": 2403 }, { "completion_length": 659.5201110839844, "epoch": 0.718094242401613, "grad_norm": 0.5093882083892822, "kl": 0.547119140625, "learning_rate": 3.006767658586024e-07, "loss": 0.0218, "reward": 1.1445313096046448, "reward_std": 0.1990182213485241, "rewards/accuracy_reward": 0.16294643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 2404 }, { "completion_length": 723.1094055175781, "epoch": 0.7183929504891345, "grad_norm": 0.42530277371406555, "kl": 0.7734375, "learning_rate": 3.002861664449957e-07, "loss": 0.0309, "reward": 1.1188616454601288, "reward_std": 0.1752729658037424, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 2405 }, { "completion_length": 705.529052734375, "epoch": 0.718691658576656, "grad_norm": 0.5124548673629761, "kl": 0.91650390625, "learning_rate": 2.998958386955654e-07, "loss": 0.0367, "reward": 1.1328125596046448, "reward_std": 0.1352465357631445, "rewards/accuracy_reward": 0.16517857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9676339626312256, "step": 2406 }, { "completion_length": 621.7209930419922, "epoch": 0.7189903666641775, "grad_norm": 0.40570908784866333, "kl": 0.4407958984375, "learning_rate": 2.9950578303494976e-07, "loss": 0.0176, "reward": 1.060825914144516, "reward_std": 0.15396713837981224, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723469734192, "step": 2407 }, { "completion_length": 639.2879791259766, "epoch": 0.7192890747516989, "grad_norm": 0.9462034106254578, "kl": 0.67138671875, "learning_rate": 2.9911599988749114e-07, "loss": 0.0268, "reward": 1.1640625298023224, "reward_std": 0.1908876858651638, "rewards/accuracy_reward": 0.18080358393490314, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 2408 }, { "completion_length": 749.5357513427734, "epoch": 0.7195877828392204, "grad_norm": 0.4760220944881439, "kl": 0.73681640625, "learning_rate": 2.9872648967723545e-07, "loss": 0.0295, "reward": 1.089285746216774, "reward_std": 0.16554349474608898, "rewards/accuracy_reward": 0.11383929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464775323868, "step": 2409 }, { "completion_length": 614.9129638671875, "epoch": 0.7198864909267418, "grad_norm": 0.38819727301597595, "kl": 0.488525390625, "learning_rate": 2.9833725282793145e-07, "loss": 0.0195, "reward": 1.1255580931901932, "reward_std": 0.1366370990872383, "rewards/accuracy_reward": 0.13616072107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 2410 }, { "completion_length": 669.0000305175781, "epoch": 0.7201851990142634, "grad_norm": 0.4934050440788269, "kl": 0.3740234375, "learning_rate": 2.979482897630307e-07, "loss": 0.015, "reward": 1.1222098767757416, "reward_std": 0.1496394444257021, "rewards/accuracy_reward": 0.13392857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 2411 }, { "completion_length": 700.0379791259766, "epoch": 0.7204839071017848, "grad_norm": 0.8868652582168579, "kl": 0.7381591796875, "learning_rate": 2.975596009056871e-07, "loss": 0.0296, "reward": 1.1378348618745804, "reward_std": 0.12727986462414265, "rewards/accuracy_reward": 0.16294643213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884290456772, "step": 2412 }, { "completion_length": 619.0692138671875, "epoch": 0.7207826151893062, "grad_norm": 0.3220604658126831, "kl": 0.2047119140625, "learning_rate": 2.9717118667875584e-07, "loss": 0.0082, "reward": 1.109933078289032, "reward_std": 0.132086843252182, "rewards/accuracy_reward": 0.11830357555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 2413 }, { "completion_length": 617.2366333007812, "epoch": 0.7210813232768277, "grad_norm": 0.5479776859283447, "kl": 0.439453125, "learning_rate": 2.9678304750479366e-07, "loss": 0.0176, "reward": 1.2047991454601288, "reward_std": 0.24222875759005547, "rewards/accuracy_reward": 0.2209821566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 2414 }, { "completion_length": 642.5736999511719, "epoch": 0.7213800313643491, "grad_norm": 0.8895737528800964, "kl": 0.6435546875, "learning_rate": 2.9639518380605776e-07, "loss": 0.0257, "reward": 1.1936384439468384, "reward_std": 0.19038019701838493, "rewards/accuracy_reward": 0.2120535857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 2415 }, { "completion_length": 700.8326263427734, "epoch": 0.7216787394518707, "grad_norm": 0.9623447060585022, "kl": 0.35205078125, "learning_rate": 2.96007596004506e-07, "loss": 0.0141, "reward": 1.1947545111179352, "reward_std": 0.14629819244146347, "rewards/accuracy_reward": 0.2142857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 2416 }, { "completion_length": 684.4777221679688, "epoch": 0.7219774475393921, "grad_norm": 0.8925963044166565, "kl": 0.69140625, "learning_rate": 2.956202845217959e-07, "loss": 0.0277, "reward": 1.1166295111179352, "reward_std": 0.15859886817634106, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.969308078289032, "step": 2417 }, { "completion_length": 674.2388763427734, "epoch": 0.7222761556269136, "grad_norm": 0.7834488749504089, "kl": 0.648681640625, "learning_rate": 2.952332497792842e-07, "loss": 0.0259, "reward": 1.1071429252624512, "reward_std": 0.14838719926774502, "rewards/accuracy_reward": 0.12946429196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786267757416, "step": 2418 }, { "completion_length": 616.4375305175781, "epoch": 0.722574863714435, "grad_norm": 0.342614084482193, "kl": 0.34912109375, "learning_rate": 2.94846492198027e-07, "loss": 0.014, "reward": 1.20089291036129, "reward_std": 0.13192778080701828, "rewards/accuracy_reward": 0.2142857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071939468384, "step": 2419 }, { "completion_length": 602.7433319091797, "epoch": 0.7228735718019565, "grad_norm": 0.6146829724311829, "kl": 0.63134765625, "learning_rate": 2.944600121987786e-07, "loss": 0.0253, "reward": 1.1674107611179352, "reward_std": 0.13303334498777986, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 2420 }, { "completion_length": 608.9754791259766, "epoch": 0.723172279889478, "grad_norm": 0.5750711560249329, "kl": 0.47607421875, "learning_rate": 2.9407381020199125e-07, "loss": 0.019, "reward": 1.2645090222358704, "reward_std": 0.18188991025090218, "rewards/accuracy_reward": 0.2790178768336773, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911267757416, "step": 2421 }, { "completion_length": 737.9776916503906, "epoch": 0.7234709879769995, "grad_norm": 0.6689335107803345, "kl": 0.8349609375, "learning_rate": 2.9368788662781493e-07, "loss": 0.0335, "reward": 1.05636166036129, "reward_std": 0.1838175132870674, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723618745804, "step": 2422 }, { "completion_length": 673.216552734375, "epoch": 0.7237696960645209, "grad_norm": 0.3838225305080414, "kl": 0.59033203125, "learning_rate": 2.9330224189609674e-07, "loss": 0.0236, "reward": 1.108258992433548, "reward_std": 0.12512248195707798, "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 2423 }, { "completion_length": 703.7120819091797, "epoch": 0.7240684041520424, "grad_norm": 0.47517216205596924, "kl": 0.55517578125, "learning_rate": 2.929168764263802e-07, "loss": 0.0222, "reward": 1.1266741454601288, "reward_std": 0.1898755244910717, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848767757416, "step": 2424 }, { "completion_length": 664.4888610839844, "epoch": 0.7243671122395638, "grad_norm": 0.45075860619544983, "kl": 0.60791015625, "learning_rate": 2.9253179063790525e-07, "loss": 0.0243, "reward": 1.1741071939468384, "reward_std": 0.16495573334395885, "rewards/accuracy_reward": 0.18750000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 2425 }, { "completion_length": 689.6272583007812, "epoch": 0.7246658203270854, "grad_norm": 0.31885939836502075, "kl": 0.66943359375, "learning_rate": 2.921469849496077e-07, "loss": 0.0268, "reward": 1.1579241454601288, "reward_std": 0.15715443529188633, "rewards/accuracy_reward": 0.17857143771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 2426 }, { "completion_length": 629.810302734375, "epoch": 0.7249645284146068, "grad_norm": 1.3981578350067139, "kl": 0.724853515625, "learning_rate": 2.917624597801179e-07, "loss": 0.029, "reward": 1.1104911416769028, "reward_std": 0.15207409486174583, "rewards/accuracy_reward": 0.13616072316654027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9743303954601288, "step": 2427 }, { "completion_length": 693.450927734375, "epoch": 0.7252632365021283, "grad_norm": 0.5202295184135437, "kl": 0.943359375, "learning_rate": 2.913782155477622e-07, "loss": 0.0377, "reward": 1.2282366752624512, "reward_std": 0.20403716154396534, "rewards/accuracy_reward": 0.2522321578580886, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.976004496216774, "step": 2428 }, { "completion_length": 691.2544860839844, "epoch": 0.7255619445896497, "grad_norm": 0.594882607460022, "kl": 0.70751953125, "learning_rate": 2.909942526705601e-07, "loss": 0.0283, "reward": 1.0803571790456772, "reward_std": 0.0981718385592103, "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 2429 }, { "completion_length": 664.294677734375, "epoch": 0.7258606526771713, "grad_norm": 0.6099300384521484, "kl": 0.4990234375, "learning_rate": 2.906105715662257e-07, "loss": 0.02, "reward": 1.1925223767757416, "reward_std": 0.15233573503792286, "rewards/accuracy_reward": 0.2075892984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2430 }, { "completion_length": 647.3348388671875, "epoch": 0.7261593607646927, "grad_norm": 1.0583909749984741, "kl": 0.75390625, "learning_rate": 2.902271726521668e-07, "loss": 0.0302, "reward": 1.0580357760190964, "reward_std": 0.14598862640559673, "rewards/accuracy_reward": 0.07589286123402417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98214291036129, "step": 2431 }, { "completion_length": 664.4955749511719, "epoch": 0.7264580688522142, "grad_norm": 0.6976466774940491, "kl": 0.93017578125, "learning_rate": 2.898440563454834e-07, "loss": 0.0372, "reward": 1.195870578289032, "reward_std": 0.15161737171001732, "rewards/accuracy_reward": 0.2098214440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98604916036129, "step": 2432 }, { "completion_length": 636.5469055175781, "epoch": 0.7267567769397356, "grad_norm": 0.4727076292037964, "kl": 0.399658203125, "learning_rate": 2.8946122306296874e-07, "loss": 0.016, "reward": 1.1830357611179352, "reward_std": 0.16286448389291763, "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 2433 }, { "completion_length": 685.4977874755859, "epoch": 0.7270554850272571, "grad_norm": 0.680952250957489, "kl": 0.572021484375, "learning_rate": 2.890786732211079e-07, "loss": 0.0229, "reward": 1.1439732611179352, "reward_std": 0.18211625888943672, "rewards/accuracy_reward": 0.16294643469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 2434 }, { "completion_length": 561.0335083007812, "epoch": 0.7273541931147786, "grad_norm": 0.3455101549625397, "kl": 0.21923828125, "learning_rate": 2.886964072360775e-07, "loss": 0.0088, "reward": 1.1735491454601288, "reward_std": 0.21862948685884476, "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 2435 }, { "completion_length": 708.0268249511719, "epoch": 0.7276529012023001, "grad_norm": 1.0478179454803467, "kl": 0.583251953125, "learning_rate": 2.883144255237454e-07, "loss": 0.0234, "reward": 1.1422991454601288, "reward_std": 0.16756990179419518, "rewards/accuracy_reward": 0.16294643515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793526977300644, "step": 2436 }, { "completion_length": 729.0402069091797, "epoch": 0.7279516092898215, "grad_norm": 0.4012419283390045, "kl": 0.552978515625, "learning_rate": 2.879327284996706e-07, "loss": 0.0221, "reward": 1.1305804252624512, "reward_std": 0.11799791548401117, "rewards/accuracy_reward": 0.13839286426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 2437 }, { "completion_length": 637.7187805175781, "epoch": 0.728250317377343, "grad_norm": 0.42716702818870544, "kl": 0.540283203125, "learning_rate": 2.875513165791017e-07, "loss": 0.0216, "reward": 1.156808078289032, "reward_std": 0.18348083645105362, "rewards/accuracy_reward": 0.1718750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2438 }, { "completion_length": 592.6317138671875, "epoch": 0.7285490254648644, "grad_norm": 0.8715078830718994, "kl": 0.385498046875, "learning_rate": 2.8717019017697774e-07, "loss": 0.0155, "reward": 1.1250000447034836, "reward_std": 0.13989360630512238, "rewards/accuracy_reward": 0.1406250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 2439 }, { "completion_length": 653.7165451049805, "epoch": 0.728847733552386, "grad_norm": 0.36432400345802307, "kl": 0.47265625, "learning_rate": 2.867893497079267e-07, "loss": 0.0189, "reward": 1.2656250596046448, "reward_std": 0.2352246306836605, "rewards/accuracy_reward": 0.28125002095475793, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 2440 }, { "completion_length": 658.9196624755859, "epoch": 0.7291464416399074, "grad_norm": 0.6060393452644348, "kl": 0.498779296875, "learning_rate": 2.864087955862657e-07, "loss": 0.02, "reward": 1.1445313096046448, "reward_std": 0.12966248579323292, "rewards/accuracy_reward": 0.16071429569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 2441 }, { "completion_length": 727.6138763427734, "epoch": 0.7294451497274289, "grad_norm": 0.6045852899551392, "kl": 0.833984375, "learning_rate": 2.8602852822600055e-07, "loss": 0.0334, "reward": 1.1166295111179352, "reward_std": 0.17283577099442482, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045111179352, "step": 2442 }, { "completion_length": 655.4911041259766, "epoch": 0.7297438578149503, "grad_norm": 0.5622548460960388, "kl": 0.2470703125, "learning_rate": 2.8564854804082455e-07, "loss": 0.0099, "reward": 1.2064732909202576, "reward_std": 0.22036585211753845, "rewards/accuracy_reward": 0.2165178693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 2443 }, { "completion_length": 640.3861846923828, "epoch": 0.7300425659024719, "grad_norm": 0.3574458956718445, "kl": 0.30810546875, "learning_rate": 2.8526885544411906e-07, "loss": 0.0123, "reward": 1.1506697237491608, "reward_std": 0.16149957850575447, "rewards/accuracy_reward": 0.16294643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 2444 }, { "completion_length": 629.8638763427734, "epoch": 0.7303412739899933, "grad_norm": 0.3264460861682892, "kl": 0.4091796875, "learning_rate": 2.8488945084895256e-07, "loss": 0.0164, "reward": 1.0959821939468384, "reward_std": 0.11238122172653675, "rewards/accuracy_reward": 0.11160714831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 2445 }, { "completion_length": 654.1964569091797, "epoch": 0.7306399820775148, "grad_norm": 0.405087947845459, "kl": 0.265869140625, "learning_rate": 2.8451033466807976e-07, "loss": 0.0106, "reward": 1.0887277126312256, "reward_std": 0.08836942166090012, "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 2446 }, { "completion_length": 751.6652221679688, "epoch": 0.7309386901650362, "grad_norm": 0.43111366033554077, "kl": 0.96435546875, "learning_rate": 2.8413150731394207e-07, "loss": 0.0386, "reward": 1.1093750298023224, "reward_std": 0.19686784595251083, "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464626312256, "step": 2447 }, { "completion_length": 602.7500305175781, "epoch": 0.7312373982525577, "grad_norm": 0.725508987903595, "kl": 0.699951171875, "learning_rate": 2.8375296919866666e-07, "loss": 0.028, "reward": 1.0842634439468384, "reward_std": 0.17933557368814945, "rewards/accuracy_reward": 0.1049107147846371, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 2448 }, { "completion_length": 651.0290679931641, "epoch": 0.7315361063400792, "grad_norm": 0.888671338558197, "kl": 0.591064453125, "learning_rate": 2.8337472073406554e-07, "loss": 0.0236, "reward": 1.1595982611179352, "reward_std": 0.1882731132209301, "rewards/accuracy_reward": 0.18526786752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9743303954601288, "step": 2449 }, { "completion_length": 623.5513610839844, "epoch": 0.7318348144276007, "grad_norm": 0.2109786570072174, "kl": 0.295654296875, "learning_rate": 2.829967623316362e-07, "loss": 0.0118, "reward": 1.1411831080913544, "reward_std": 0.12879267055541277, "rewards/accuracy_reward": 0.14732143632136285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 2450 }, { "completion_length": 674.0491333007812, "epoch": 0.7321335225151221, "grad_norm": 0.45082607865333557, "kl": 0.34130859375, "learning_rate": 2.8261909440256053e-07, "loss": 0.0136, "reward": 1.1294643580913544, "reward_std": 0.12178853899240494, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 2451 }, { "completion_length": 621.122802734375, "epoch": 0.7324322306026436, "grad_norm": 0.4954417049884796, "kl": 0.355224609375, "learning_rate": 2.822417173577038e-07, "loss": 0.0142, "reward": 1.1674107611179352, "reward_std": 0.10137096792459488, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 2452 }, { "completion_length": 701.7745819091797, "epoch": 0.732730938690165, "grad_norm": 0.3723377585411072, "kl": 0.575927734375, "learning_rate": 2.818646316076156e-07, "loss": 0.0231, "reward": 1.1568081080913544, "reward_std": 0.15209506824612617, "rewards/accuracy_reward": 0.17857143515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 2453 }, { "completion_length": 662.6920013427734, "epoch": 0.7330296467776866, "grad_norm": 0.6553583741188049, "kl": 0.42523193359375, "learning_rate": 2.8148783756252803e-07, "loss": 0.017, "reward": 1.2460937798023224, "reward_std": 0.1941849421709776, "rewards/accuracy_reward": 0.258928582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2454 }, { "completion_length": 720.8861999511719, "epoch": 0.733328354865208, "grad_norm": 0.3435068428516388, "kl": 0.734619140625, "learning_rate": 2.8111133563235613e-07, "loss": 0.0294, "reward": 1.1484375298023224, "reward_std": 0.21749213337898254, "rewards/accuracy_reward": 0.1674107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 2455 }, { "completion_length": 654.7076110839844, "epoch": 0.7336270629527294, "grad_norm": 0.36126449704170227, "kl": 0.79052734375, "learning_rate": 2.8073512622669726e-07, "loss": 0.0316, "reward": 1.2042411267757416, "reward_std": 0.17168943583965302, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 2456 }, { "completion_length": 575.6004791259766, "epoch": 0.7339257710402509, "grad_norm": 0.346926748752594, "kl": 0.292694091796875, "learning_rate": 2.803592097548301e-07, "loss": 0.0117, "reward": 1.2321429252624512, "reward_std": 0.14931496046483517, "rewards/accuracy_reward": 0.2366071566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9955357313156128, "step": 2457 }, { "completion_length": 563.5803833007812, "epoch": 0.7342244791277723, "grad_norm": 0.5474554896354675, "kl": 0.34912109375, "learning_rate": 2.7998358662571513e-07, "loss": 0.014, "reward": 1.1635045111179352, "reward_std": 0.1273167785257101, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916294813156128, "step": 2458 }, { "completion_length": 679.4911041259766, "epoch": 0.7345231872152939, "grad_norm": 1.048106074333191, "kl": 0.7236328125, "learning_rate": 2.796082572479936e-07, "loss": 0.029, "reward": 1.1735491752624512, "reward_std": 0.18457341566681862, "rewards/accuracy_reward": 0.18750000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 2459 }, { "completion_length": 592.2611846923828, "epoch": 0.7348218953028153, "grad_norm": 0.39796000719070435, "kl": 0.46533203125, "learning_rate": 2.7923322202998685e-07, "loss": 0.0186, "reward": 1.2299107611179352, "reward_std": 0.1413571573793888, "rewards/accuracy_reward": 0.2433035857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 2460 }, { "completion_length": 640.9174346923828, "epoch": 0.7351206033903368, "grad_norm": 0.43393832445144653, "kl": 0.6884765625, "learning_rate": 2.7885848137969643e-07, "loss": 0.0276, "reward": 1.2700893580913544, "reward_std": 0.18829024583101273, "rewards/accuracy_reward": 0.2857143022119999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 2461 }, { "completion_length": 728.9397735595703, "epoch": 0.7354193114778582, "grad_norm": 0.5466667413711548, "kl": 0.9326171875, "learning_rate": 2.784840357048038e-07, "loss": 0.0374, "reward": 1.0731027275323868, "reward_std": 0.1892985086888075, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562947034836, "step": 2462 }, { "completion_length": 613.3214416503906, "epoch": 0.7357180195653797, "grad_norm": 0.37570974230766296, "kl": 0.4267578125, "learning_rate": 2.781098854126687e-07, "loss": 0.0171, "reward": 1.3147322237491608, "reward_std": 0.11907562613487244, "rewards/accuracy_reward": 0.3236607275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 2463 }, { "completion_length": 694.6183471679688, "epoch": 0.7360167276529012, "grad_norm": 0.8106791377067566, "kl": 0.79931640625, "learning_rate": 2.777360309103301e-07, "loss": 0.032, "reward": 1.0479911118745804, "reward_std": 0.13388493098318577, "rewards/accuracy_reward": 0.07142857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625447034836, "step": 2464 }, { "completion_length": 674.8326263427734, "epoch": 0.7363154357404227, "grad_norm": 0.47428059577941895, "kl": 0.344482421875, "learning_rate": 2.773624726045054e-07, "loss": 0.0138, "reward": 1.072544664144516, "reward_std": 0.07793218083679676, "rewards/accuracy_reward": 0.0803571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875149011612, "step": 2465 }, { "completion_length": 671.8973388671875, "epoch": 0.7366141438279441, "grad_norm": 0.9054591655731201, "kl": 0.7685546875, "learning_rate": 2.76989210901589e-07, "loss": 0.0307, "reward": 1.0954241454601288, "reward_std": 0.18996408209204674, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 2466 }, { "completion_length": 739.7321624755859, "epoch": 0.7369128519154656, "grad_norm": 0.8506317734718323, "kl": 0.568603515625, "learning_rate": 2.7661624620765324e-07, "loss": 0.0227, "reward": 1.1768973767757416, "reward_std": 0.06669938750565052, "rewards/accuracy_reward": 0.1897321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2467 }, { "completion_length": 675.3281555175781, "epoch": 0.737211560002987, "grad_norm": 0.4030800759792328, "kl": 0.70263671875, "learning_rate": 2.7624357892844705e-07, "loss": 0.0281, "reward": 1.0937500447034836, "reward_std": 0.18890333361923695, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 2468 }, { "completion_length": 601.169677734375, "epoch": 0.7375102680905086, "grad_norm": 0.5132657289505005, "kl": 0.712646484375, "learning_rate": 2.7587120946939595e-07, "loss": 0.0285, "reward": 1.2237723767757416, "reward_std": 0.21010714024305344, "rewards/accuracy_reward": 0.23437500861473382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 2469 }, { "completion_length": 647.4955749511719, "epoch": 0.73780897617803, "grad_norm": 0.7767412662506104, "kl": 0.48583984375, "learning_rate": 2.7549913823560163e-07, "loss": 0.0194, "reward": 1.1785714626312256, "reward_std": 0.1261272244155407, "rewards/accuracy_reward": 0.20312500605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464626312256, "step": 2470 }, { "completion_length": 596.2344207763672, "epoch": 0.7381076842655515, "grad_norm": 0.36288192868232727, "kl": 0.4786376953125, "learning_rate": 2.751273656318408e-07, "loss": 0.0192, "reward": 1.1389509439468384, "reward_std": 0.1383173782378435, "rewards/accuracy_reward": 0.1495535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2471 }, { "completion_length": 626.9375381469727, "epoch": 0.7384063923530729, "grad_norm": 0.38470083475112915, "kl": 0.6158447265625, "learning_rate": 2.7475589206256565e-07, "loss": 0.0247, "reward": 1.285714328289032, "reward_std": 0.18818224966526031, "rewards/accuracy_reward": 0.29910716228187084, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071939468384, "step": 2472 }, { "completion_length": 675.2433242797852, "epoch": 0.7387051004405945, "grad_norm": 0.32927989959716797, "kl": 0.40576171875, "learning_rate": 2.743847179319034e-07, "loss": 0.0162, "reward": 1.227120578289032, "reward_std": 0.17821387201547623, "rewards/accuracy_reward": 0.2366071604192257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 2473 }, { "completion_length": 687.7254638671875, "epoch": 0.7390038085281159, "grad_norm": 0.31651434302330017, "kl": 0.397216796875, "learning_rate": 2.7401384364365453e-07, "loss": 0.0159, "reward": 1.1768973469734192, "reward_std": 0.09877932630479336, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2474 }, { "completion_length": 672.747802734375, "epoch": 0.7393025166156374, "grad_norm": 0.5766432285308838, "kl": 0.772705078125, "learning_rate": 2.7364326960129435e-07, "loss": 0.0309, "reward": 1.1071428954601288, "reward_std": 0.15725211426615715, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 2475 }, { "completion_length": 784.3571929931641, "epoch": 0.7396012247031588, "grad_norm": 1.1642906665802002, "kl": 1.015625, "learning_rate": 2.7327299620797107e-07, "loss": 0.0405, "reward": 1.026785746216774, "reward_std": 0.13747178111225367, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9732143133878708, "step": 2476 }, { "completion_length": 627.9576263427734, "epoch": 0.7398999327906803, "grad_norm": 0.8612712025642395, "kl": 0.40478515625, "learning_rate": 2.729030238665056e-07, "loss": 0.0162, "reward": 1.1556920409202576, "reward_std": 0.20020100101828575, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 2477 }, { "completion_length": 701.357177734375, "epoch": 0.7401986408782018, "grad_norm": 0.9902294874191284, "kl": 0.80712890625, "learning_rate": 2.7253335297939175e-07, "loss": 0.0323, "reward": 1.0479911416769028, "reward_std": 0.12339498475193977, "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625447034836, "step": 2478 }, { "completion_length": 646.7879791259766, "epoch": 0.7404973489657233, "grad_norm": 0.5039253830909729, "kl": 0.34326171875, "learning_rate": 2.7216398394879535e-07, "loss": 0.0137, "reward": 1.2399554252624512, "reward_std": 0.18417700193822384, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 2479 }, { "completion_length": 663.3370819091797, "epoch": 0.7407960570532447, "grad_norm": 0.41412413120269775, "kl": 0.2149658203125, "learning_rate": 2.7179491717655345e-07, "loss": 0.0086, "reward": 1.1612723767757416, "reward_std": 0.12155245337635279, "rewards/accuracy_reward": 0.162946441443637, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9983258992433548, "step": 2480 }, { "completion_length": 680.3527221679688, "epoch": 0.7410947651407662, "grad_norm": 0.2167527973651886, "kl": 0.3399658203125, "learning_rate": 2.714261530641747e-07, "loss": 0.0136, "reward": 1.256696492433548, "reward_std": 0.1655062586069107, "rewards/accuracy_reward": 0.26562501210719347, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 2481 }, { "completion_length": 573.8884124755859, "epoch": 0.7413934732282876, "grad_norm": 0.292254775762558, "kl": 0.3232421875, "learning_rate": 2.7105769201283825e-07, "loss": 0.0129, "reward": 1.2667411267757416, "reward_std": 0.18813197873532772, "rewards/accuracy_reward": 0.2790178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2482 }, { "completion_length": 576.6227874755859, "epoch": 0.7416921813158092, "grad_norm": 0.4484267234802246, "kl": 0.2679443359375, "learning_rate": 2.706895344233935e-07, "loss": 0.0107, "reward": 1.1562500596046448, "reward_std": 0.12885727640241385, "rewards/accuracy_reward": 0.1651785857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 2483 }, { "completion_length": 651.1830596923828, "epoch": 0.7419908894033306, "grad_norm": 0.6979923844337463, "kl": 0.3359375, "learning_rate": 2.7032168069636003e-07, "loss": 0.0134, "reward": 1.1562500596046448, "reward_std": 0.13372872257605195, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 2484 }, { "completion_length": 744.029052734375, "epoch": 0.7422895974908521, "grad_norm": 0.7012373805046082, "kl": 0.369140625, "learning_rate": 2.6995413123192647e-07, "loss": 0.0148, "reward": 1.0636161267757416, "reward_std": 0.15243787318468094, "rewards/accuracy_reward": 0.07589286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2485 }, { "completion_length": 667.0156555175781, "epoch": 0.7425883055783735, "grad_norm": 0.7681600451469421, "kl": 0.3204345703125, "learning_rate": 2.6958688642995064e-07, "loss": 0.0128, "reward": 1.1389509290456772, "reward_std": 0.10861445404589176, "rewards/accuracy_reward": 0.15401786309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2486 }, { "completion_length": 740.9777069091797, "epoch": 0.742887013665895, "grad_norm": 0.6146787405014038, "kl": 0.60009765625, "learning_rate": 2.6921994668995904e-07, "loss": 0.024, "reward": 1.0898437798023224, "reward_std": 0.09208167949691415, "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 2487 }, { "completion_length": 688.9152069091797, "epoch": 0.7431857217534165, "grad_norm": 0.3104405105113983, "kl": 0.345458984375, "learning_rate": 2.6885331241114595e-07, "loss": 0.0138, "reward": 1.0680803954601288, "reward_std": 0.0933489054441452, "rewards/accuracy_reward": 0.07812500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 2488 }, { "completion_length": 707.0580749511719, "epoch": 0.743484429840938, "grad_norm": 0.9672223925590515, "kl": 0.412109375, "learning_rate": 2.684869839923737e-07, "loss": 0.0165, "reward": 1.159040242433548, "reward_std": 0.15322706755250692, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045260190964, "step": 2489 }, { "completion_length": 686.2701263427734, "epoch": 0.7437831379284594, "grad_norm": 1.0663185119628906, "kl": 0.400634765625, "learning_rate": 2.681209618321717e-07, "loss": 0.016, "reward": 1.0753348767757416, "reward_std": 0.12278243899345398, "rewards/accuracy_reward": 0.09598214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 2490 }, { "completion_length": 774.8192291259766, "epoch": 0.7440818460159809, "grad_norm": 0.5732888579368591, "kl": 0.7138671875, "learning_rate": 2.677552463287359e-07, "loss": 0.0286, "reward": 1.0931920111179352, "reward_std": 0.14092433266341686, "rewards/accuracy_reward": 0.11830357741564512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884439468384, "step": 2491 }, { "completion_length": 757.372802734375, "epoch": 0.7443805541035023, "grad_norm": 0.8840270042419434, "kl": 0.65625, "learning_rate": 2.6738983787992917e-07, "loss": 0.0263, "reward": 1.1741072237491608, "reward_std": 0.22287635318934917, "rewards/accuracy_reward": 0.19866071920841932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464775323868, "step": 2492 }, { "completion_length": 600.241081237793, "epoch": 0.7446792621910239, "grad_norm": 0.5358966588973999, "kl": 0.4150390625, "learning_rate": 2.670247368832803e-07, "loss": 0.0166, "reward": 1.1462053954601288, "reward_std": 0.1281783115118742, "rewards/accuracy_reward": 0.16517858067527413, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 2493 }, { "completion_length": 610.7968902587891, "epoch": 0.7449779702785453, "grad_norm": 0.589209258556366, "kl": 0.625, "learning_rate": 2.666599437359829e-07, "loss": 0.025, "reward": 1.2488839626312256, "reward_std": 0.14963248744606972, "rewards/accuracy_reward": 0.2678571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 2494 }, { "completion_length": 717.5781555175781, "epoch": 0.7452766783660668, "grad_norm": 0.43820032477378845, "kl": 0.45458984375, "learning_rate": 2.662954588348966e-07, "loss": 0.0182, "reward": 1.1707589626312256, "reward_std": 0.1690003201365471, "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 2495 }, { "completion_length": 632.7522583007812, "epoch": 0.7455753864535882, "grad_norm": 0.5296022295951843, "kl": 0.527099609375, "learning_rate": 2.659312825765448e-07, "loss": 0.0211, "reward": 1.2578125596046448, "reward_std": 0.21370577812194824, "rewards/accuracy_reward": 0.2745535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2496 }, { "completion_length": 649.741096496582, "epoch": 0.7458740945411098, "grad_norm": 0.5464529991149902, "kl": 0.32379150390625, "learning_rate": 2.6556741535711593e-07, "loss": 0.013, "reward": 1.1774553954601288, "reward_std": 0.13936439715325832, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966517984867096, "step": 2497 }, { "completion_length": 717.1540374755859, "epoch": 0.7461728026286312, "grad_norm": 0.5323456525802612, "kl": 0.415771484375, "learning_rate": 2.6520385757246196e-07, "loss": 0.0167, "reward": 1.1238839775323868, "reward_std": 0.18278956972062588, "rewards/accuracy_reward": 0.13839286682195961, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 2498 }, { "completion_length": 599.7924270629883, "epoch": 0.7464715107161526, "grad_norm": 0.29341140389442444, "kl": 0.357177734375, "learning_rate": 2.648406096180977e-07, "loss": 0.0143, "reward": 1.2533482909202576, "reward_std": 0.12751440703868866, "rewards/accuracy_reward": 0.2611607275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 2499 }, { "completion_length": 672.5424423217773, "epoch": 0.7467702188036741, "grad_norm": 0.5189691185951233, "kl": 0.701904296875, "learning_rate": 2.644776718892015e-07, "loss": 0.0281, "reward": 1.1718750447034836, "reward_std": 0.23768755793571472, "rewards/accuracy_reward": 0.20312500488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500298023224, "step": 2500 }, { "completion_length": 681.8527221679688, "epoch": 0.7470689268911955, "grad_norm": 0.6098598837852478, "kl": 0.424560546875, "learning_rate": 2.641150447806143e-07, "loss": 0.017, "reward": 1.137276828289032, "reward_std": 0.16884762607514858, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 2501 }, { "completion_length": 656.0580596923828, "epoch": 0.747367634978717, "grad_norm": 0.8992906212806702, "kl": 0.546875, "learning_rate": 2.637527286868385e-07, "loss": 0.0219, "reward": 1.2103795409202576, "reward_std": 0.13632748369127512, "rewards/accuracy_reward": 0.21651786658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 2502 }, { "completion_length": 604.8683319091797, "epoch": 0.7476663430662385, "grad_norm": 0.6640143394470215, "kl": 0.599609375, "learning_rate": 2.6339072400203866e-07, "loss": 0.024, "reward": 1.273995578289032, "reward_std": 0.19534354284405708, "rewards/accuracy_reward": 0.2901785857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 2503 }, { "completion_length": 695.4777069091797, "epoch": 0.74796505115376, "grad_norm": 0.48381879925727844, "kl": 0.63525390625, "learning_rate": 2.630290311200405e-07, "loss": 0.0254, "reward": 1.0424107611179352, "reward_std": 0.17576707899570465, "rewards/accuracy_reward": 0.06473214644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786267757416, "step": 2504 }, { "completion_length": 666.3281555175781, "epoch": 0.7482637592412814, "grad_norm": 0.5058448910713196, "kl": 0.623046875, "learning_rate": 2.6266765043433013e-07, "loss": 0.0249, "reward": 1.1361607611179352, "reward_std": 0.19877468538470566, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750596046448, "step": 2505 }, { "completion_length": 658.9174346923828, "epoch": 0.7485624673288029, "grad_norm": 0.9587224125862122, "kl": 0.6201171875, "learning_rate": 2.623065823380545e-07, "loss": 0.0248, "reward": 1.1489956080913544, "reward_std": 0.18579282239079475, "rewards/accuracy_reward": 0.16741071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 2506 }, { "completion_length": 675.3169860839844, "epoch": 0.7488611754163244, "grad_norm": 0.5620136260986328, "kl": 0.640625, "learning_rate": 2.6194582722402046e-07, "loss": 0.0256, "reward": 1.0652902126312256, "reward_std": 0.12419993989169598, "rewards/accuracy_reward": 0.08035714598372579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2507 }, { "completion_length": 561.1518249511719, "epoch": 0.7491598835038459, "grad_norm": 0.4424521028995514, "kl": 0.471923828125, "learning_rate": 2.61585385484694e-07, "loss": 0.0189, "reward": 1.248883992433548, "reward_std": 0.19968200102448463, "rewards/accuracy_reward": 0.2611607275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 2508 }, { "completion_length": 589.0870819091797, "epoch": 0.7494585915913673, "grad_norm": 0.6090760827064514, "kl": 0.4853515625, "learning_rate": 2.6122525751220047e-07, "loss": 0.0194, "reward": 1.1908482611179352, "reward_std": 0.16675208695232868, "rewards/accuracy_reward": 0.20089286658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 2509 }, { "completion_length": 697.4330596923828, "epoch": 0.7497572996788888, "grad_norm": 0.6162382364273071, "kl": 0.8125, "learning_rate": 2.6086544369832373e-07, "loss": 0.0325, "reward": 1.0697545111179352, "reward_std": 0.1563388668000698, "rewards/accuracy_reward": 0.08928571944124997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 2510 }, { "completion_length": 630.6339569091797, "epoch": 0.7500560077664102, "grad_norm": 0.6262379288673401, "kl": 0.72021484375, "learning_rate": 2.6050594443450604e-07, "loss": 0.0288, "reward": 1.1969866454601288, "reward_std": 0.18606897816061974, "rewards/accuracy_reward": 0.2187500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 2511 }, { "completion_length": 619.8214645385742, "epoch": 0.7503547158539318, "grad_norm": 0.5058699250221252, "kl": 0.580078125, "learning_rate": 2.6014676011184743e-07, "loss": 0.0232, "reward": 1.2070313096046448, "reward_std": 0.14243726804852486, "rewards/accuracy_reward": 0.227678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 2512 }, { "completion_length": 574.6830596923828, "epoch": 0.7506534239414532, "grad_norm": 0.6106504201889038, "kl": 0.416015625, "learning_rate": 2.5978789112110496e-07, "loss": 0.0167, "reward": 1.2237723767757416, "reward_std": 0.17788911052048206, "rewards/accuracy_reward": 0.23437501210719347, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2513 }, { "completion_length": 609.2790603637695, "epoch": 0.7509521320289747, "grad_norm": 1.1334960460662842, "kl": 0.8277587890625, "learning_rate": 2.5942933785269316e-07, "loss": 0.0331, "reward": 1.0943081080913544, "reward_std": 0.1737553933635354, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366305589676, "step": 2514 }, { "completion_length": 682.3058319091797, "epoch": 0.7512508401164961, "grad_norm": 0.6430526375770569, "kl": 0.638671875, "learning_rate": 2.5907110069668293e-07, "loss": 0.0255, "reward": 1.1116072237491608, "reward_std": 0.19355052150785923, "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 2515 }, { "completion_length": 604.4196624755859, "epoch": 0.7515495482040176, "grad_norm": 0.7148637771606445, "kl": 0.3292236328125, "learning_rate": 2.587131800428009e-07, "loss": 0.0132, "reward": 1.1501116454601288, "reward_std": 0.1778946854174137, "rewards/accuracy_reward": 0.15401786752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937798023224, "step": 2516 }, { "completion_length": 653.4888763427734, "epoch": 0.7518482562915391, "grad_norm": 0.4371843636035919, "kl": 0.5595703125, "learning_rate": 2.5835557628042983e-07, "loss": 0.0224, "reward": 1.2070313096046448, "reward_std": 0.18922762665897608, "rewards/accuracy_reward": 0.2209821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 2517 }, { "completion_length": 662.0937805175781, "epoch": 0.7521469643790606, "grad_norm": 0.43792346119880676, "kl": 0.68896484375, "learning_rate": 2.5799828979860764e-07, "loss": 0.0276, "reward": 1.2064732611179352, "reward_std": 0.1533360257744789, "rewards/accuracy_reward": 0.22098215855658054, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911267757416, "step": 2518 }, { "completion_length": 696.3683319091797, "epoch": 0.752445672466582, "grad_norm": 1.011910080909729, "kl": 0.45068359375, "learning_rate": 2.5764132098602676e-07, "loss": 0.0181, "reward": 1.1049107611179352, "reward_std": 0.13825749419629574, "rewards/accuracy_reward": 0.11383928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 2519 }, { "completion_length": 685.6116409301758, "epoch": 0.7527443805541035, "grad_norm": 0.963358461856842, "kl": 0.69384765625, "learning_rate": 2.5728467023103463e-07, "loss": 0.0277, "reward": 1.0864956080913544, "reward_std": 0.10172754805535078, "rewards/accuracy_reward": 0.10044643376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 2520 }, { "completion_length": 656.3593902587891, "epoch": 0.753043088641625, "grad_norm": 0.33862242102622986, "kl": 0.544921875, "learning_rate": 2.5692833792163195e-07, "loss": 0.0218, "reward": 1.1662946939468384, "reward_std": 0.18831395730376244, "rewards/accuracy_reward": 0.18303572572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2521 }, { "completion_length": 731.2411041259766, "epoch": 0.7533417967291465, "grad_norm": 0.47881966829299927, "kl": 0.6328125, "learning_rate": 2.565723244454734e-07, "loss": 0.0253, "reward": 1.1914063096046448, "reward_std": 0.2137678973376751, "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848767757416, "step": 2522 }, { "completion_length": 622.2567138671875, "epoch": 0.7536405048166679, "grad_norm": 0.47356733679771423, "kl": 0.68359375, "learning_rate": 2.5621663018986705e-07, "loss": 0.0274, "reward": 1.1646206080913544, "reward_std": 0.17797930911183357, "rewards/accuracy_reward": 0.180803582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 2523 }, { "completion_length": 735.810302734375, "epoch": 0.7539392129041894, "grad_norm": 0.711662232875824, "kl": 0.66064453125, "learning_rate": 2.558612555417731e-07, "loss": 0.0264, "reward": 1.0909598469734192, "reward_std": 0.15444066002964973, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884290456772, "step": 2524 }, { "completion_length": 658.0692138671875, "epoch": 0.7542379209917108, "grad_norm": 0.610651969909668, "kl": 0.56005859375, "learning_rate": 2.5550620088780437e-07, "loss": 0.0224, "reward": 1.195870578289032, "reward_std": 0.18611848913133144, "rewards/accuracy_reward": 0.2142857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 2525 }, { "completion_length": 674.6205749511719, "epoch": 0.7545366290792324, "grad_norm": 0.3661974370479584, "kl": 0.32080078125, "learning_rate": 2.551514666142257e-07, "loss": 0.0129, "reward": 1.0954241752624512, "reward_std": 0.11754373926669359, "rewards/accuracy_reward": 0.10267857392318547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.992745578289032, "step": 2526 }, { "completion_length": 558.4062652587891, "epoch": 0.7548353371667538, "grad_norm": 0.5640013217926025, "kl": 0.4462890625, "learning_rate": 2.54797053106953e-07, "loss": 0.0178, "reward": 1.3275670409202576, "reward_std": 0.1655694767832756, "rewards/accuracy_reward": 0.3415178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 2527 }, { "completion_length": 554.5647583007812, "epoch": 0.7551340452542753, "grad_norm": 0.44804027676582336, "kl": 0.413330078125, "learning_rate": 2.5444296075155347e-07, "loss": 0.0165, "reward": 1.1875000298023224, "reward_std": 0.12870780937373638, "rewards/accuracy_reward": 0.2031250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 2528 }, { "completion_length": 688.9241333007812, "epoch": 0.7554327533417967, "grad_norm": 1.2073266506195068, "kl": 0.342041015625, "learning_rate": 2.540891899332451e-07, "loss": 0.0137, "reward": 1.1350446939468384, "reward_std": 0.14390244893729687, "rewards/accuracy_reward": 0.14732143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 2529 }, { "completion_length": 706.7790374755859, "epoch": 0.7557314614293182, "grad_norm": 0.27664104104042053, "kl": 0.360107421875, "learning_rate": 2.5373574103689565e-07, "loss": 0.0144, "reward": 1.168526828289032, "reward_std": 0.137325219810009, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875447034836, "step": 2530 }, { "completion_length": 651.5201263427734, "epoch": 0.7560301695168397, "grad_norm": 0.7574002146720886, "kl": 0.380126953125, "learning_rate": 2.5338261444702287e-07, "loss": 0.0152, "reward": 1.1439732909202576, "reward_std": 0.15367485582828522, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2531 }, { "completion_length": 599.8884124755859, "epoch": 0.7563288776043612, "grad_norm": 0.3411481976509094, "kl": 0.2880859375, "learning_rate": 2.5302981054779403e-07, "loss": 0.0115, "reward": 1.128348246216774, "reward_std": 0.13712910935282707, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966518133878708, "step": 2532 }, { "completion_length": 639.8593978881836, "epoch": 0.7566275856918826, "grad_norm": 0.5409788489341736, "kl": 0.33758544921875, "learning_rate": 2.52677329723025e-07, "loss": 0.0135, "reward": 1.2025670111179352, "reward_std": 0.12578790076076984, "rewards/accuracy_reward": 0.20982143888249993, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 2533 }, { "completion_length": 668.0960235595703, "epoch": 0.7569262937794041, "grad_norm": 0.49556076526641846, "kl": 0.2340087890625, "learning_rate": 2.523251723561807e-07, "loss": 0.0093, "reward": 1.1328125596046448, "reward_std": 0.20079810172319412, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966517984867096, "step": 2534 }, { "completion_length": 612.3058242797852, "epoch": 0.7572250018669255, "grad_norm": 0.3084305226802826, "kl": 0.2100830078125, "learning_rate": 2.519733388303734e-07, "loss": 0.0084, "reward": 1.2220982611179352, "reward_std": 0.14371091965585947, "rewards/accuracy_reward": 0.2299107275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 2535 }, { "completion_length": 670.1540374755859, "epoch": 0.7575237099544471, "grad_norm": 0.42399629950523376, "kl": 0.359619140625, "learning_rate": 2.516218295283637e-07, "loss": 0.0144, "reward": 1.180245578289032, "reward_std": 0.21799056604504585, "rewards/accuracy_reward": 0.1941964440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 2536 }, { "completion_length": 656.3036041259766, "epoch": 0.7578224180419685, "grad_norm": 0.3407740294933319, "kl": 0.46337890625, "learning_rate": 2.512706448325594e-07, "loss": 0.0186, "reward": 1.036272406578064, "reward_std": 0.1364223938435316, "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2537 }, { "completion_length": 656.6741333007812, "epoch": 0.75812112612949, "grad_norm": 0.9407913088798523, "kl": 0.438232421875, "learning_rate": 2.509197851250148e-07, "loss": 0.0175, "reward": 1.262834906578064, "reward_std": 0.17543229088187218, "rewards/accuracy_reward": 0.2723214477300644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 2538 }, { "completion_length": 656.5513763427734, "epoch": 0.7584198342170114, "grad_norm": 0.3639694154262543, "kl": 0.2718505859375, "learning_rate": 2.505692507874309e-07, "loss": 0.0109, "reward": 1.170758992433548, "reward_std": 0.1053079990670085, "rewards/accuracy_reward": 0.17410715040750802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966517984867096, "step": 2539 }, { "completion_length": 672.1518249511719, "epoch": 0.758718542304533, "grad_norm": 0.564014732837677, "kl": 0.4393310546875, "learning_rate": 2.5021904220115496e-07, "loss": 0.0176, "reward": 1.1266741752624512, "reward_std": 0.17157482844777405, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 2540 }, { "completion_length": 740.3348693847656, "epoch": 0.7590172503920544, "grad_norm": 0.6453250050544739, "kl": 0.7802734375, "learning_rate": 2.4986915974717927e-07, "loss": 0.0312, "reward": 1.1422991752624512, "reward_std": 0.1936271581798792, "rewards/accuracy_reward": 0.16517858067527413, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.977120578289032, "step": 2541 }, { "completion_length": 604.9643096923828, "epoch": 0.7593159584795758, "grad_norm": 0.6892363429069519, "kl": 0.43115234375, "learning_rate": 2.495196038061418e-07, "loss": 0.0173, "reward": 1.2561384439468384, "reward_std": 0.1681501567363739, "rewards/accuracy_reward": 0.2656250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134439468384, "step": 2542 }, { "completion_length": 652.0692291259766, "epoch": 0.7596146665670973, "grad_norm": 0.4153762757778168, "kl": 0.1632080078125, "learning_rate": 2.491703747583253e-07, "loss": 0.0065, "reward": 1.0585938096046448, "reward_std": 0.06712112575769424, "rewards/accuracy_reward": 0.060267859138548374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9983258992433548, "step": 2543 }, { "completion_length": 601.2433471679688, "epoch": 0.7599133746546187, "grad_norm": 0.315904825925827, "kl": 0.2774658203125, "learning_rate": 2.4882147298365636e-07, "loss": 0.0111, "reward": 1.1060268580913544, "reward_std": 0.1096867024898529, "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966517984867096, "step": 2544 }, { "completion_length": 754.2701263427734, "epoch": 0.7602120827421402, "grad_norm": 1.1802897453308105, "kl": 1.125, "learning_rate": 2.484728988617063e-07, "loss": 0.0451, "reward": 1.1478795111179352, "reward_std": 0.219047449529171, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.960379496216774, "step": 2545 }, { "completion_length": 707.7991485595703, "epoch": 0.7605107908296617, "grad_norm": 0.5569186210632324, "kl": 0.60205078125, "learning_rate": 2.481246527716895e-07, "loss": 0.0241, "reward": 1.1819196939468384, "reward_std": 0.18264539539813995, "rewards/accuracy_reward": 0.20089287124574184, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 2546 }, { "completion_length": 704.263427734375, "epoch": 0.7608094989171832, "grad_norm": 0.8942164778709412, "kl": 0.638916015625, "learning_rate": 2.477767350924633e-07, "loss": 0.0255, "reward": 1.2126116454601288, "reward_std": 0.14597710222005844, "rewards/accuracy_reward": 0.223214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2547 }, { "completion_length": 661.0736999511719, "epoch": 0.7611082070047046, "grad_norm": 0.6002625226974487, "kl": 0.4716796875, "learning_rate": 2.474291462025285e-07, "loss": 0.0189, "reward": 1.1283482611179352, "reward_std": 0.11250904016196728, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 2548 }, { "completion_length": 642.7031555175781, "epoch": 0.7614069150922261, "grad_norm": 1.1922039985656738, "kl": 0.6329345703125, "learning_rate": 2.4708188648002736e-07, "loss": 0.0253, "reward": 1.3370536267757416, "reward_std": 0.23170649260282516, "rewards/accuracy_reward": 0.3459821604192257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 2549 }, { "completion_length": 624.8661041259766, "epoch": 0.7617056231797475, "grad_norm": 0.2764449417591095, "kl": 0.2657470703125, "learning_rate": 2.467349563027445e-07, "loss": 0.0106, "reward": 1.0786830931901932, "reward_std": 0.09288868959993124, "rewards/accuracy_reward": 0.08482143143191934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 2550 }, { "completion_length": 689.0335235595703, "epoch": 0.7620043312672691, "grad_norm": 0.4306022822856903, "kl": 0.379150390625, "learning_rate": 2.463883560481062e-07, "loss": 0.0152, "reward": 1.1729911267757416, "reward_std": 0.14998980704694986, "rewards/accuracy_reward": 0.1852678693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2551 }, { "completion_length": 658.9978179931641, "epoch": 0.7623030393547905, "grad_norm": 0.4821315109729767, "kl": 0.423095703125, "learning_rate": 2.4604208609317923e-07, "loss": 0.017, "reward": 1.2003348469734192, "reward_std": 0.19026106595993042, "rewards/accuracy_reward": 0.2053571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949776977300644, "step": 2552 }, { "completion_length": 715.560302734375, "epoch": 0.762601747442312, "grad_norm": 0.8362588882446289, "kl": 0.359375, "learning_rate": 2.4569614681467156e-07, "loss": 0.0144, "reward": 1.1300223469734192, "reward_std": 0.18164506554603577, "rewards/accuracy_reward": 0.14508928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 2553 }, { "completion_length": 660.5067291259766, "epoch": 0.7629004555298334, "grad_norm": 0.45417773723602295, "kl": 0.5296630859375, "learning_rate": 2.4535053858893126e-07, "loss": 0.0211, "reward": 1.1344867050647736, "reward_std": 0.21358435600996017, "rewards/accuracy_reward": 0.14732143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2554 }, { "completion_length": 620.5937652587891, "epoch": 0.763199163617355, "grad_norm": 0.43201082944869995, "kl": 0.56298828125, "learning_rate": 2.45005261791946e-07, "loss": 0.0225, "reward": 1.2126116752624512, "reward_std": 0.15139968320727348, "rewards/accuracy_reward": 0.22544643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2555 }, { "completion_length": 607.9977874755859, "epoch": 0.7634978717048764, "grad_norm": 0.2933107316493988, "kl": 0.23779296875, "learning_rate": 2.4466031679934314e-07, "loss": 0.0095, "reward": 1.1875000298023224, "reward_std": 0.18044034019112587, "rewards/accuracy_reward": 0.19642858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 2556 }, { "completion_length": 642.0491180419922, "epoch": 0.7637965797923979, "grad_norm": 0.35583990812301636, "kl": 0.30615234375, "learning_rate": 2.443157039863894e-07, "loss": 0.0123, "reward": 1.1830357909202576, "reward_std": 0.17689681611955166, "rewards/accuracy_reward": 0.1897321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 2557 }, { "completion_length": 620.872802734375, "epoch": 0.7640952878799193, "grad_norm": 0.48688939213752747, "kl": 0.3243408203125, "learning_rate": 2.4397142372798914e-07, "loss": 0.013, "reward": 1.2142857909202576, "reward_std": 0.18156100064516068, "rewards/accuracy_reward": 0.227678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 2558 }, { "completion_length": 642.2946624755859, "epoch": 0.7643939959674408, "grad_norm": 0.679360568523407, "kl": 0.7412109375, "learning_rate": 2.4362747639868594e-07, "loss": 0.0297, "reward": 1.1300223618745804, "reward_std": 0.18170379847288132, "rewards/accuracy_reward": 0.14955357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804688096046448, "step": 2559 }, { "completion_length": 671.4040451049805, "epoch": 0.7646927040549623, "grad_norm": 0.36114972829818726, "kl": 0.4390869140625, "learning_rate": 2.4328386237266075e-07, "loss": 0.0175, "reward": 1.2829241454601288, "reward_std": 0.17514384351670742, "rewards/accuracy_reward": 0.3013393059372902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 2560 }, { "completion_length": 689.0960235595703, "epoch": 0.7649914121424838, "grad_norm": 0.3191191852092743, "kl": 0.41015625, "learning_rate": 2.429405820237318e-07, "loss": 0.0164, "reward": 1.1361607909202576, "reward_std": 0.11406477726995945, "rewards/accuracy_reward": 0.1495535729918629, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 2561 }, { "completion_length": 611.1651992797852, "epoch": 0.7652901202300052, "grad_norm": 0.41290155053138733, "kl": 0.422607421875, "learning_rate": 2.4259763572535466e-07, "loss": 0.0169, "reward": 1.2142857611179352, "reward_std": 0.20542102865874767, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.9888392984867096, "step": 2562 }, { "completion_length": 718.7277069091797, "epoch": 0.7655888283175267, "grad_norm": 0.3295189142227173, "kl": 0.74951171875, "learning_rate": 2.422550238506211e-07, "loss": 0.0299, "reward": 1.1361607611179352, "reward_std": 0.2084176205098629, "rewards/accuracy_reward": 0.165178582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9709821790456772, "step": 2563 }, { "completion_length": 677.5781707763672, "epoch": 0.7658875364050481, "grad_norm": 0.5599618554115295, "kl": 0.59912109375, "learning_rate": 2.4191274677225924e-07, "loss": 0.024, "reward": 1.099888414144516, "reward_std": 0.1652199812233448, "rewards/accuracy_reward": 0.12500000605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884290456772, "step": 2564 }, { "completion_length": 630.1696701049805, "epoch": 0.7661862444925697, "grad_norm": 0.4968911111354828, "kl": 0.308837890625, "learning_rate": 2.415708048626333e-07, "loss": 0.0124, "reward": 1.0976562947034836, "reward_std": 0.13005417119711637, "rewards/accuracy_reward": 0.11830357555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 2565 }, { "completion_length": 663.419677734375, "epoch": 0.7664849525800911, "grad_norm": 0.9280692934989929, "kl": 0.48193359375, "learning_rate": 2.4122919849374223e-07, "loss": 0.0193, "reward": 1.0781250447034836, "reward_std": 0.09982139989733696, "rewards/accuracy_reward": 0.09375000046566129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 2566 }, { "completion_length": 658.5312957763672, "epoch": 0.7667836606676126, "grad_norm": 0.5192353129386902, "kl": 0.48388671875, "learning_rate": 2.4088792803722036e-07, "loss": 0.0193, "reward": 1.2148438096046448, "reward_std": 0.1869187206029892, "rewards/accuracy_reward": 0.2276785857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 2567 }, { "completion_length": 682.6518096923828, "epoch": 0.767082368755134, "grad_norm": 0.41408848762512207, "kl": 0.5679931640625, "learning_rate": 2.4054699386433674e-07, "loss": 0.0227, "reward": 1.0837054252624512, "reward_std": 0.07608592044562101, "rewards/accuracy_reward": 0.10044643213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2568 }, { "completion_length": 616.7879791259766, "epoch": 0.7673810768426556, "grad_norm": 0.46538245677948, "kl": 0.484130859375, "learning_rate": 2.40206396345994e-07, "loss": 0.0194, "reward": 1.2315848767757416, "reward_std": 0.17071275785565376, "rewards/accuracy_reward": 0.24553572619333863, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 2569 }, { "completion_length": 663.7656555175781, "epoch": 0.767679784930177, "grad_norm": 0.3260214030742645, "kl": 0.341796875, "learning_rate": 2.398661358527289e-07, "loss": 0.0137, "reward": 1.1562500596046448, "reward_std": 0.12405448779463768, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933036118745804, "step": 2570 }, { "completion_length": 649.2634124755859, "epoch": 0.7679784930176985, "grad_norm": 0.3300420641899109, "kl": 0.4609375, "learning_rate": 2.3952621275471186e-07, "loss": 0.0184, "reward": 1.1679688096046448, "reward_std": 0.14801781252026558, "rewards/accuracy_reward": 0.17410715110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616156578064, "step": 2571 }, { "completion_length": 710.6942138671875, "epoch": 0.7682772011052199, "grad_norm": 0.6228359341621399, "kl": 0.4176025390625, "learning_rate": 2.391866274217455e-07, "loss": 0.0167, "reward": 1.1143973767757416, "reward_std": 0.17361826077103615, "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 2572 }, { "completion_length": 737.7098388671875, "epoch": 0.7685759091927414, "grad_norm": 0.36631253361701965, "kl": 0.415771484375, "learning_rate": 2.3884738022326547e-07, "loss": 0.0166, "reward": 1.0652902275323868, "reward_std": 0.10298857279121876, "rewards/accuracy_reward": 0.07589285774156451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2573 }, { "completion_length": 665.5960235595703, "epoch": 0.7688746172802629, "grad_norm": 0.48220571875572205, "kl": 0.3251953125, "learning_rate": 2.3850847152833965e-07, "loss": 0.013, "reward": 1.1456473618745804, "reward_std": 0.10037697292864323, "rewards/accuracy_reward": 0.15401786682195961, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916294813156128, "step": 2574 }, { "completion_length": 666.8147583007812, "epoch": 0.7691733253677844, "grad_norm": 0.31342220306396484, "kl": 0.55517578125, "learning_rate": 2.3816990170566723e-07, "loss": 0.0222, "reward": 1.1495536267757416, "reward_std": 0.11553671769797802, "rewards/accuracy_reward": 0.16294643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071939468384, "step": 2575 }, { "completion_length": 685.8125305175781, "epoch": 0.7694720334553058, "grad_norm": 0.5470945239067078, "kl": 0.65185546875, "learning_rate": 2.378316711235793e-07, "loss": 0.0261, "reward": 1.1065848618745804, "reward_std": 0.14723572507500648, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 2576 }, { "completion_length": 596.9018096923828, "epoch": 0.7697707415428273, "grad_norm": 0.5518102645874023, "kl": 0.2681884765625, "learning_rate": 2.3749378015003724e-07, "loss": 0.0107, "reward": 1.223214328289032, "reward_std": 0.15366973727941513, "rewards/accuracy_reward": 0.2299107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 2577 }, { "completion_length": 632.4777069091797, "epoch": 0.7700694496303487, "grad_norm": 0.33397647738456726, "kl": 0.396240234375, "learning_rate": 2.3715622915263348e-07, "loss": 0.0158, "reward": 1.1662946939468384, "reward_std": 0.13889959268271923, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 2578 }, { "completion_length": 783.497802734375, "epoch": 0.7703681577178703, "grad_norm": 0.7417386770248413, "kl": 0.65771484375, "learning_rate": 2.3681901849859052e-07, "loss": 0.0263, "reward": 1.1227678954601288, "reward_std": 0.16763607133179903, "rewards/accuracy_reward": 0.13839286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750149011612, "step": 2579 }, { "completion_length": 648.7254638671875, "epoch": 0.7706668658053917, "grad_norm": 0.46290966868400574, "kl": 0.250732421875, "learning_rate": 2.3648214855476028e-07, "loss": 0.01, "reward": 1.1238839626312256, "reward_std": 0.14364181458950043, "rewards/accuracy_reward": 0.1339285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 2580 }, { "completion_length": 650.3036041259766, "epoch": 0.7709655738929132, "grad_norm": 0.9323096871376038, "kl": 0.701171875, "learning_rate": 2.361456196876244e-07, "loss": 0.0281, "reward": 1.2159598469734192, "reward_std": 0.15028841234743595, "rewards/accuracy_reward": 0.22991072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98604916036129, "step": 2581 }, { "completion_length": 674.8058319091797, "epoch": 0.7712642819804346, "grad_norm": 0.6302791833877563, "kl": 0.39697265625, "learning_rate": 2.3580943226329333e-07, "loss": 0.0159, "reward": 1.1729911267757416, "reward_std": 0.1646871231496334, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 2582 }, { "completion_length": 637.1808319091797, "epoch": 0.7715629900679561, "grad_norm": 0.36747244000434875, "kl": 0.61083984375, "learning_rate": 2.3547358664750588e-07, "loss": 0.0244, "reward": 1.0814732611179352, "reward_std": 0.14870542660355568, "rewards/accuracy_reward": 0.09821428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2583 }, { "completion_length": 636.5045013427734, "epoch": 0.7718616981554776, "grad_norm": 0.43331804871559143, "kl": 0.5076904296875, "learning_rate": 2.3513808320562925e-07, "loss": 0.0203, "reward": 1.1305803954601288, "reward_std": 0.16889068111777306, "rewards/accuracy_reward": 0.14285715203732252, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 2584 }, { "completion_length": 643.5022583007812, "epoch": 0.772160406242999, "grad_norm": 0.3641495704650879, "kl": 0.68994140625, "learning_rate": 2.3480292230265847e-07, "loss": 0.0276, "reward": 1.2393973767757416, "reward_std": 0.21198594942688942, "rewards/accuracy_reward": 0.2566964440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009588479996, "step": 2585 }, { "completion_length": 684.2522735595703, "epoch": 0.7724591143305205, "grad_norm": 0.3804858922958374, "kl": 0.537841796875, "learning_rate": 2.3446810430321544e-07, "loss": 0.0215, "reward": 1.1344866752624512, "reward_std": 0.14471117965877056, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 2586 }, { "completion_length": 708.2678985595703, "epoch": 0.7727578224180419, "grad_norm": 0.3136400580406189, "kl": 0.35205078125, "learning_rate": 2.341336295715494e-07, "loss": 0.014, "reward": 1.0998884290456772, "reward_std": 0.12663058750331402, "rewards/accuracy_reward": 0.1116071455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812649011612, "step": 2587 }, { "completion_length": 652.1116333007812, "epoch": 0.7730565305055634, "grad_norm": 0.42642730474472046, "kl": 0.60205078125, "learning_rate": 2.337994984715364e-07, "loss": 0.0241, "reward": 1.176339328289032, "reward_std": 0.16583390533924103, "rewards/accuracy_reward": 0.19196430034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 2588 }, { "completion_length": 591.0714721679688, "epoch": 0.7733552385930849, "grad_norm": 0.37953636050224304, "kl": 0.46240234375, "learning_rate": 2.334657113666779e-07, "loss": 0.0185, "reward": 1.1875000596046448, "reward_std": 0.12073462270200253, "rewards/accuracy_reward": 0.19866072479635477, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 2589 }, { "completion_length": 650.1897583007812, "epoch": 0.7736539466806064, "grad_norm": 0.768562376499176, "kl": 0.33251953125, "learning_rate": 2.3313226862010188e-07, "loss": 0.0133, "reward": 1.1311384737491608, "reward_std": 0.18465926498174667, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 2590 }, { "completion_length": 683.1451110839844, "epoch": 0.7739526547681278, "grad_norm": 0.6249557733535767, "kl": 0.2880859375, "learning_rate": 2.3279917059456112e-07, "loss": 0.0115, "reward": 1.1551339626312256, "reward_std": 0.12857073545455933, "rewards/accuracy_reward": 0.16517857927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 2591 }, { "completion_length": 597.3861846923828, "epoch": 0.7742513628556493, "grad_norm": 0.3848855197429657, "kl": 0.33477783203125, "learning_rate": 2.3246641765243368e-07, "loss": 0.0134, "reward": 1.2087054252624512, "reward_std": 0.12965513207018375, "rewards/accuracy_reward": 0.2209821566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2592 }, { "completion_length": 634.350471496582, "epoch": 0.7745500709431707, "grad_norm": 0.6140620112419128, "kl": 0.737060546875, "learning_rate": 2.321340101557224e-07, "loss": 0.0294, "reward": 1.199218824505806, "reward_std": 0.13962750509381294, "rewards/accuracy_reward": 0.2187500123400241, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 2593 }, { "completion_length": 724.9464569091797, "epoch": 0.7748487790306923, "grad_norm": 1.6746001243591309, "kl": 0.6982421875, "learning_rate": 2.3180194846605364e-07, "loss": 0.0279, "reward": 1.1657366752624512, "reward_std": 0.20433032512664795, "rewards/accuracy_reward": 0.18750000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.97823666036129, "step": 2594 }, { "completion_length": 683.529052734375, "epoch": 0.7751474871182137, "grad_norm": 0.598176121711731, "kl": 0.47900390625, "learning_rate": 2.314702329446782e-07, "loss": 0.0191, "reward": 1.1127232611179352, "reward_std": 0.15732543915510178, "rewards/accuracy_reward": 0.12276786286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 2595 }, { "completion_length": 650.0446624755859, "epoch": 0.7754461952057352, "grad_norm": 0.7736943960189819, "kl": 0.7705078125, "learning_rate": 2.311388639524702e-07, "loss": 0.0309, "reward": 1.1819196939468384, "reward_std": 0.18162604048848152, "rewards/accuracy_reward": 0.20089286658912897, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 2596 }, { "completion_length": 621.3995819091797, "epoch": 0.7757449032932566, "grad_norm": 0.5138546824455261, "kl": 0.498046875, "learning_rate": 2.3080784184992635e-07, "loss": 0.0199, "reward": 1.1261161416769028, "reward_std": 0.1200354378670454, "rewards/accuracy_reward": 0.13392857951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 2597 }, { "completion_length": 719.7567291259766, "epoch": 0.7760436113807782, "grad_norm": 0.751388430595398, "kl": 0.9560546875, "learning_rate": 2.3047716699716636e-07, "loss": 0.0383, "reward": 1.2047991454601288, "reward_std": 0.20557396486401558, "rewards/accuracy_reward": 0.22767858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.977120578289032, "step": 2598 }, { "completion_length": 711.7522583007812, "epoch": 0.7763423194682996, "grad_norm": 0.6215216517448425, "kl": 0.45166015625, "learning_rate": 2.3014683975393222e-07, "loss": 0.018, "reward": 1.1127232611179352, "reward_std": 0.15998198464512825, "rewards/accuracy_reward": 0.1294642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 2599 }, { "completion_length": 706.763427734375, "epoch": 0.7766410275558211, "grad_norm": 1.1320033073425293, "kl": 0.6669921875, "learning_rate": 2.2981686047958732e-07, "loss": 0.0267, "reward": 1.0496652126312256, "reward_std": 0.14739832654595375, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804688096046448, "step": 2600 }, { "completion_length": 556.9843902587891, "epoch": 0.7769397356433425, "grad_norm": 0.29987362027168274, "kl": 0.1973876953125, "learning_rate": 2.29487229533117e-07, "loss": 0.0079, "reward": 1.3007813096046448, "reward_std": 0.13114353269338608, "rewards/accuracy_reward": 0.3058035895228386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949776977300644, "step": 2601 }, { "completion_length": 679.700927734375, "epoch": 0.777238443730864, "grad_norm": 0.24354112148284912, "kl": 0.304443359375, "learning_rate": 2.2915794727312722e-07, "loss": 0.0122, "reward": 1.213727742433548, "reward_std": 0.13312311843037605, "rewards/accuracy_reward": 0.227678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 2602 }, { "completion_length": 675.325927734375, "epoch": 0.7775371518183855, "grad_norm": 0.35871046781539917, "kl": 0.6424560546875, "learning_rate": 2.2882901405784485e-07, "loss": 0.0257, "reward": 1.0345982760190964, "reward_std": 0.16767553240060806, "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9743303954601288, "step": 2603 }, { "completion_length": 695.5982360839844, "epoch": 0.777835859905907, "grad_norm": 0.34551113843917847, "kl": 0.3984375, "learning_rate": 2.2850043024511724e-07, "loss": 0.016, "reward": 1.1501116752624512, "reward_std": 0.1475303117185831, "rewards/accuracy_reward": 0.15848214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 2604 }, { "completion_length": 674.4576263427734, "epoch": 0.7781345679934284, "grad_norm": 0.49044498801231384, "kl": 0.576171875, "learning_rate": 2.28172196192411e-07, "loss": 0.023, "reward": 1.2148438096046448, "reward_std": 0.13998203724622726, "rewards/accuracy_reward": 0.23214287543669343, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 2605 }, { "completion_length": 644.5357513427734, "epoch": 0.7784332760809499, "grad_norm": 0.7097657322883606, "kl": 0.58984375, "learning_rate": 2.278443122568128e-07, "loss": 0.0236, "reward": 1.1969866752624512, "reward_std": 0.19321400672197342, "rewards/accuracy_reward": 0.2165178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 2606 }, { "completion_length": 733.3437805175781, "epoch": 0.7787319841684713, "grad_norm": 0.4891389012336731, "kl": 0.3902587890625, "learning_rate": 2.2751677879502838e-07, "loss": 0.0156, "reward": 1.1289062798023224, "reward_std": 0.14739393256604671, "rewards/accuracy_reward": 0.14955357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 2607 }, { "completion_length": 639.0714569091797, "epoch": 0.7790306922559929, "grad_norm": 0.2885110080242157, "kl": 0.241455078125, "learning_rate": 2.271895961633817e-07, "loss": 0.0097, "reward": 1.2047991454601288, "reward_std": 0.10424661450088024, "rewards/accuracy_reward": 0.2120535857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 2608 }, { "completion_length": 671.4888763427734, "epoch": 0.7793294003435143, "grad_norm": 0.5013541579246521, "kl": 0.38079833984375, "learning_rate": 2.268627647178156e-07, "loss": 0.0152, "reward": 1.104352742433548, "reward_std": 0.11028365232050419, "rewards/accuracy_reward": 0.1160714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 2609 }, { "completion_length": 550.8080596923828, "epoch": 0.7796281084310358, "grad_norm": 0.5044421553611755, "kl": 0.2188720703125, "learning_rate": 2.265362848138908e-07, "loss": 0.0088, "reward": 1.1651785969734192, "reward_std": 0.1784516666084528, "rewards/accuracy_reward": 0.16964286472648382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.995535746216774, "step": 2610 }, { "completion_length": 687.9509124755859, "epoch": 0.7799268165185572, "grad_norm": 0.605812132358551, "kl": 0.46044921875, "learning_rate": 2.262101568067851e-07, "loss": 0.0184, "reward": 1.096540242433548, "reward_std": 0.20602501928806305, "rewards/accuracy_reward": 0.11383929196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 2611 }, { "completion_length": 724.0268249511719, "epoch": 0.7802255246060787, "grad_norm": 0.32217028737068176, "kl": 0.44873046875, "learning_rate": 2.2588438105129398e-07, "loss": 0.018, "reward": 1.112165242433548, "reward_std": 0.1845688372850418, "rewards/accuracy_reward": 0.12276786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2612 }, { "completion_length": 761.1830749511719, "epoch": 0.7805242326936002, "grad_norm": 0.5633214116096497, "kl": 0.7529296875, "learning_rate": 2.2555895790182967e-07, "loss": 0.0301, "reward": 1.0585938096046448, "reward_std": 0.1357769537717104, "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 2613 }, { "completion_length": 641.4665374755859, "epoch": 0.7808229407811217, "grad_norm": 0.37484270334243774, "kl": 0.3289794921875, "learning_rate": 2.2523388771242036e-07, "loss": 0.0132, "reward": 1.1210937798023224, "reward_std": 0.18768243491649628, "rewards/accuracy_reward": 0.13616071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330931901932, "step": 2614 }, { "completion_length": 717.6562652587891, "epoch": 0.7811216488686431, "grad_norm": 0.4203971028327942, "kl": 0.33740234375, "learning_rate": 2.249091708367109e-07, "loss": 0.0135, "reward": 1.2477679252624512, "reward_std": 0.2237546592950821, "rewards/accuracy_reward": 0.2611607275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 2615 }, { "completion_length": 688.0669860839844, "epoch": 0.7814203569561646, "grad_norm": 0.395820677280426, "kl": 0.411865234375, "learning_rate": 2.245848076279611e-07, "loss": 0.0164, "reward": 1.0965402275323868, "reward_std": 0.10593012347817421, "rewards/accuracy_reward": 0.1071428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2616 }, { "completion_length": 667.3973541259766, "epoch": 0.781719065043686, "grad_norm": 1.0266906023025513, "kl": 0.518310546875, "learning_rate": 2.2426079843904643e-07, "loss": 0.0207, "reward": 1.0998884737491608, "reward_std": 0.19568588957190514, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 2617 }, { "completion_length": 621.4866180419922, "epoch": 0.7820177731312076, "grad_norm": 0.4491753876209259, "kl": 0.2734375, "learning_rate": 2.2393714362245743e-07, "loss": 0.0109, "reward": 1.2672991454601288, "reward_std": 0.1675658393651247, "rewards/accuracy_reward": 0.2745535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 2618 }, { "completion_length": 689.3549499511719, "epoch": 0.782316481218729, "grad_norm": 0.696272611618042, "kl": 0.427734375, "learning_rate": 2.2361384353029834e-07, "loss": 0.0171, "reward": 1.1328125447034836, "reward_std": 0.19569879584014416, "rewards/accuracy_reward": 0.15178572502918541, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 2619 }, { "completion_length": 590.3906402587891, "epoch": 0.7826151893062505, "grad_norm": 0.449121356010437, "kl": 0.20391845703125, "learning_rate": 2.2329089851428824e-07, "loss": 0.0082, "reward": 1.098214328289032, "reward_std": 0.14779711328446865, "rewards/accuracy_reward": 0.10267857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.995535746216774, "step": 2620 }, { "completion_length": 633.9196624755859, "epoch": 0.7829138973937719, "grad_norm": 0.33892419934272766, "kl": 0.59228515625, "learning_rate": 2.2296830892575973e-07, "loss": 0.0237, "reward": 1.1718750894069672, "reward_std": 0.11826423183083534, "rewards/accuracy_reward": 0.18303572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 2621 }, { "completion_length": 637.6785888671875, "epoch": 0.7832126054812935, "grad_norm": 0.34164488315582275, "kl": 0.2991943359375, "learning_rate": 2.2264607511565846e-07, "loss": 0.012, "reward": 1.286272406578064, "reward_std": 0.1421550028026104, "rewards/accuracy_reward": 0.2946428742725402, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916294813156128, "step": 2622 }, { "completion_length": 743.1205749511719, "epoch": 0.7835113135688149, "grad_norm": 0.5823239088058472, "kl": 0.75927734375, "learning_rate": 2.2232419743454333e-07, "loss": 0.0304, "reward": 1.034040242433548, "reward_std": 0.14455169066786766, "rewards/accuracy_reward": 0.05803571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045111179352, "step": 2623 }, { "completion_length": 599.910758972168, "epoch": 0.7838100216563364, "grad_norm": 0.437019020318985, "kl": 0.488525390625, "learning_rate": 2.2200267623258585e-07, "loss": 0.0195, "reward": 1.0797991454601288, "reward_std": 0.11451550014317036, "rewards/accuracy_reward": 0.09598214644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 2624 }, { "completion_length": 605.1339416503906, "epoch": 0.7841087297438578, "grad_norm": 0.3770449161529541, "kl": 0.64306640625, "learning_rate": 2.2168151185956929e-07, "loss": 0.0257, "reward": 1.1361607611179352, "reward_std": 0.10893651191145182, "rewards/accuracy_reward": 0.14955357182770967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 2625 }, { "completion_length": 698.3348541259766, "epoch": 0.7844074378313793, "grad_norm": 0.4956805408000946, "kl": 0.5859375, "learning_rate": 2.2136070466488913e-07, "loss": 0.0234, "reward": 1.1540179550647736, "reward_std": 0.14174152724444866, "rewards/accuracy_reward": 0.16964286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 2626 }, { "completion_length": 683.6651992797852, "epoch": 0.7847061459189008, "grad_norm": 0.8361905813217163, "kl": 0.44775390625, "learning_rate": 2.2104025499755236e-07, "loss": 0.018, "reward": 1.1166295111179352, "reward_std": 0.22035186737775803, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 2627 }, { "completion_length": 700.6361846923828, "epoch": 0.7850048540064222, "grad_norm": 0.5248891711235046, "kl": 0.7762451171875, "learning_rate": 2.207201632061765e-07, "loss": 0.031, "reward": 1.131696492433548, "reward_std": 0.17026782408356667, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 2628 }, { "completion_length": 748.7120971679688, "epoch": 0.7853035620939437, "grad_norm": 1.1761462688446045, "kl": 0.7958984375, "learning_rate": 2.2040042963899028e-07, "loss": 0.0318, "reward": 1.04854916036129, "reward_std": 0.16296618431806564, "rewards/accuracy_reward": 0.07812500349245965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.97042416036129, "step": 2629 }, { "completion_length": 664.3236999511719, "epoch": 0.7856022701814651, "grad_norm": 0.7494000792503357, "kl": 0.724609375, "learning_rate": 2.2008105464383227e-07, "loss": 0.029, "reward": 1.1462054252624512, "reward_std": 0.17743520997464657, "rewards/accuracy_reward": 0.16517857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 2630 }, { "completion_length": 637.4643096923828, "epoch": 0.7859009782689866, "grad_norm": 0.7068055272102356, "kl": 0.381103515625, "learning_rate": 2.1976203856815123e-07, "loss": 0.0152, "reward": 1.2393973469734192, "reward_std": 0.2008935883641243, "rewards/accuracy_reward": 0.2544643022119999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 2631 }, { "completion_length": 670.9196624755859, "epoch": 0.786199686356508, "grad_norm": 0.5272825956344604, "kl": 0.5830078125, "learning_rate": 2.1944338175900562e-07, "loss": 0.0234, "reward": 1.125558078289032, "reward_std": 0.18600949831306934, "rewards/accuracy_reward": 0.14062500232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 2632 }, { "completion_length": 646.2812805175781, "epoch": 0.7864983944440296, "grad_norm": 0.24963319301605225, "kl": 0.414306640625, "learning_rate": 2.191250845630625e-07, "loss": 0.0166, "reward": 1.1914062798023224, "reward_std": 0.24209139868617058, "rewards/accuracy_reward": 0.2008928693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 2633 }, { "completion_length": 600.0960083007812, "epoch": 0.786797102531551, "grad_norm": 0.47736409306526184, "kl": 0.444091796875, "learning_rate": 2.1880714732659805e-07, "loss": 0.0178, "reward": 1.2181920111179352, "reward_std": 0.20929482206702232, "rewards/accuracy_reward": 0.2299107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 2634 }, { "completion_length": 676.1004791259766, "epoch": 0.7870958106190725, "grad_norm": 0.5432873368263245, "kl": 0.9052734375, "learning_rate": 2.1848957039549715e-07, "loss": 0.0362, "reward": 1.102120578289032, "reward_std": 0.14989979192614555, "rewards/accuracy_reward": 0.12053572246804833, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 2635 }, { "completion_length": 660.1339569091797, "epoch": 0.7873945187065939, "grad_norm": 0.3299906253814697, "kl": 0.671875, "learning_rate": 2.18172354115252e-07, "loss": 0.0269, "reward": 1.1635045111179352, "reward_std": 0.1777248978614807, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 2636 }, { "completion_length": 665.6562652587891, "epoch": 0.7876932267941155, "grad_norm": 0.4894717037677765, "kl": 0.657470703125, "learning_rate": 2.1785549883096293e-07, "loss": 0.0264, "reward": 1.2020090073347092, "reward_std": 0.16982745379209518, "rewards/accuracy_reward": 0.21651787031441927, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 2637 }, { "completion_length": 770.5625152587891, "epoch": 0.7879919348816369, "grad_norm": 1.0224024057388306, "kl": 0.755859375, "learning_rate": 2.1753900488733767e-07, "loss": 0.0302, "reward": 1.1049107611179352, "reward_std": 0.18700355663895607, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9665178954601288, "step": 2638 }, { "completion_length": 643.5803985595703, "epoch": 0.7882906429691584, "grad_norm": 0.53382807970047, "kl": 0.451904296875, "learning_rate": 2.1722287262869022e-07, "loss": 0.0181, "reward": 1.2773438096046448, "reward_std": 0.17070218548178673, "rewards/accuracy_reward": 0.2901785857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2639 }, { "completion_length": 686.2544937133789, "epoch": 0.7885893510566798, "grad_norm": 0.5445572137832642, "kl": 0.66064453125, "learning_rate": 2.1690710239894172e-07, "loss": 0.0265, "reward": 1.208147406578064, "reward_std": 0.20326032117009163, "rewards/accuracy_reward": 0.2209821566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2640 }, { "completion_length": 676.9018096923828, "epoch": 0.7888880591442013, "grad_norm": 0.41016915440559387, "kl": 0.4814453125, "learning_rate": 2.1659169454161935e-07, "loss": 0.0193, "reward": 1.042410746216774, "reward_std": 0.10548606421798468, "rewards/accuracy_reward": 0.05803571850992739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 2641 }, { "completion_length": 602.4687805175781, "epoch": 0.7891867672317228, "grad_norm": 0.3674098253250122, "kl": 0.5673828125, "learning_rate": 2.162766493998556e-07, "loss": 0.0227, "reward": 1.1813616752624512, "reward_std": 0.15067432075738907, "rewards/accuracy_reward": 0.1941964402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2642 }, { "completion_length": 682.7098541259766, "epoch": 0.7894854753192443, "grad_norm": 0.4262475371360779, "kl": 0.57080078125, "learning_rate": 2.1596196731638904e-07, "loss": 0.0228, "reward": 1.1378348767757416, "reward_std": 0.12954729422926903, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98604916036129, "step": 2643 }, { "completion_length": 648.7678680419922, "epoch": 0.7897841834067657, "grad_norm": 0.6483501195907593, "kl": 0.80322265625, "learning_rate": 2.156476486335627e-07, "loss": 0.0321, "reward": 1.077566996216774, "reward_std": 0.11692232079803944, "rewards/accuracy_reward": 0.08928572060540318, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 2644 }, { "completion_length": 656.638427734375, "epoch": 0.7900828914942872, "grad_norm": 1.3761675357818604, "kl": 0.62451171875, "learning_rate": 2.1533369369332454e-07, "loss": 0.025, "reward": 1.161272406578064, "reward_std": 0.10476080141961575, "rewards/accuracy_reward": 0.17187501350417733, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 2645 }, { "completion_length": 678.2433319091797, "epoch": 0.7903815995818086, "grad_norm": 0.44865596294403076, "kl": 0.7449951171875, "learning_rate": 2.1502010283722698e-07, "loss": 0.0298, "reward": 1.2321428954601288, "reward_std": 0.1824161484837532, "rewards/accuracy_reward": 0.2477678693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 2646 }, { "completion_length": 789.0558319091797, "epoch": 0.7906803076693302, "grad_norm": 0.7610591053962708, "kl": 0.9814453125, "learning_rate": 2.1470687640642588e-07, "loss": 0.0392, "reward": 1.1506696939468384, "reward_std": 0.16758954897522926, "rewards/accuracy_reward": 0.18080357741564512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9698661118745804, "step": 2647 }, { "completion_length": 670.6607513427734, "epoch": 0.7909790157568516, "grad_norm": 0.33527514338493347, "kl": 0.64501953125, "learning_rate": 2.1439401474168095e-07, "loss": 0.0258, "reward": 1.2014509439468384, "reward_std": 0.16796201840043068, "rewards/accuracy_reward": 0.2165178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2648 }, { "completion_length": 559.7589416503906, "epoch": 0.7912777238443731, "grad_norm": 0.903713583946228, "kl": 0.39453125, "learning_rate": 2.1408151818335518e-07, "loss": 0.0158, "reward": 1.273995578289032, "reward_std": 0.17029524967074394, "rewards/accuracy_reward": 0.2812500111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.992745578289032, "step": 2649 }, { "completion_length": 688.9420013427734, "epoch": 0.7915764319318945, "grad_norm": 0.32964658737182617, "kl": 0.3251953125, "learning_rate": 2.13769387071414e-07, "loss": 0.013, "reward": 1.1439732313156128, "reward_std": 0.18058789521455765, "rewards/accuracy_reward": 0.15401786845177412, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 2650 }, { "completion_length": 645.3080749511719, "epoch": 0.7918751400194161, "grad_norm": 0.3820071518421173, "kl": 0.424072265625, "learning_rate": 2.1345762174542553e-07, "loss": 0.017, "reward": 1.1501116752624512, "reward_std": 0.09315621480345726, "rewards/accuracy_reward": 0.15848214644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 2651 }, { "completion_length": 626.1272583007812, "epoch": 0.7921738481069375, "grad_norm": 0.5529452562332153, "kl": 0.51629638671875, "learning_rate": 2.1314622254456e-07, "loss": 0.0207, "reward": 1.08761166036129, "reward_std": 0.10163322743028402, "rewards/accuracy_reward": 0.09821429080329835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2652 }, { "completion_length": 785.7924346923828, "epoch": 0.792472556194459, "grad_norm": 0.6862122416496277, "kl": 0.7578125, "learning_rate": 2.12835189807589e-07, "loss": 0.0303, "reward": 1.042410746216774, "reward_std": 0.19476928561925888, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9598214775323868, "step": 2653 }, { "completion_length": 687.4442291259766, "epoch": 0.7927712642819804, "grad_norm": 0.5234547853469849, "kl": 0.98291015625, "learning_rate": 2.1252452387288576e-07, "loss": 0.0393, "reward": 1.0558036416769028, "reward_std": 0.1152511527761817, "rewards/accuracy_reward": 0.08482143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9709821790456772, "step": 2654 }, { "completion_length": 648.9531555175781, "epoch": 0.7930699723695019, "grad_norm": 0.34498360753059387, "kl": 0.55419921875, "learning_rate": 2.1221422507842458e-07, "loss": 0.0221, "reward": 1.1171875596046448, "reward_std": 0.1957431472837925, "rewards/accuracy_reward": 0.13392858020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2655 }, { "completion_length": 660.1875152587891, "epoch": 0.7933686804570234, "grad_norm": 0.6255536079406738, "kl": 0.390869140625, "learning_rate": 2.119042937617798e-07, "loss": 0.0156, "reward": 1.1422991454601288, "reward_std": 0.1643734909594059, "rewards/accuracy_reward": 0.15625000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 2656 }, { "completion_length": 644.5223541259766, "epoch": 0.7936673885445449, "grad_norm": 0.47077733278274536, "kl": 0.8173828125, "learning_rate": 2.1159473026012663e-07, "loss": 0.0327, "reward": 1.139508992433548, "reward_std": 0.1879085935652256, "rewards/accuracy_reward": 0.165178582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9743303805589676, "step": 2657 }, { "completion_length": 664.3169860839844, "epoch": 0.7939660966320663, "grad_norm": 0.813553512096405, "kl": 0.5704345703125, "learning_rate": 2.1128553491023948e-07, "loss": 0.0229, "reward": 1.1953125596046448, "reward_std": 0.1690389271825552, "rewards/accuracy_reward": 0.2165178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.978794664144516, "step": 2658 }, { "completion_length": 683.2611999511719, "epoch": 0.7942648047195878, "grad_norm": 0.4393250048160553, "kl": 0.560791015625, "learning_rate": 2.1097670804849274e-07, "loss": 0.0224, "reward": 1.1629464626312256, "reward_std": 0.13981090858578682, "rewards/accuracy_reward": 0.1785714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 2659 }, { "completion_length": 616.7678833007812, "epoch": 0.7945635128071092, "grad_norm": 0.4793045222759247, "kl": 0.48828125, "learning_rate": 2.1066825001086e-07, "loss": 0.0196, "reward": 1.1629464626312256, "reward_std": 0.15633528120815754, "rewards/accuracy_reward": 0.1785714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 2660 }, { "completion_length": 662.1875305175781, "epoch": 0.7948622208946308, "grad_norm": 0.4167242646217346, "kl": 0.4541015625, "learning_rate": 2.1036016113291298e-07, "loss": 0.0182, "reward": 1.2181920111179352, "reward_std": 0.2499294988811016, "rewards/accuracy_reward": 0.2433035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884439468384, "step": 2661 }, { "completion_length": 690.1987152099609, "epoch": 0.7951609289821522, "grad_norm": 0.4048026502132416, "kl": 0.3388671875, "learning_rate": 2.1005244174982236e-07, "loss": 0.0135, "reward": 1.1629464626312256, "reward_std": 0.1841660924255848, "rewards/accuracy_reward": 0.17633929941803217, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 2662 }, { "completion_length": 612.2120819091797, "epoch": 0.7954596370696737, "grad_norm": 0.5713890790939331, "kl": 0.3509521484375, "learning_rate": 2.0974509219635684e-07, "loss": 0.014, "reward": 1.188058078289032, "reward_std": 0.12228767573833466, "rewards/accuracy_reward": 0.20312500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 2663 }, { "completion_length": 707.372802734375, "epoch": 0.7957583451571951, "grad_norm": 0.4141719341278076, "kl": 0.9287109375, "learning_rate": 2.0943811280688224e-07, "loss": 0.0372, "reward": 1.1702009439468384, "reward_std": 0.22137589193880558, "rewards/accuracy_reward": 0.2075892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9626116454601288, "step": 2664 }, { "completion_length": 671.0669860839844, "epoch": 0.7960570532447167, "grad_norm": 0.34893426299095154, "kl": 0.356689453125, "learning_rate": 2.0913150391536232e-07, "loss": 0.0143, "reward": 1.2444196939468384, "reward_std": 0.18573183380067348, "rewards/accuracy_reward": 0.2566964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2665 }, { "completion_length": 690.0580444335938, "epoch": 0.7963557613322381, "grad_norm": 0.5007513165473938, "kl": 0.47125244140625, "learning_rate": 2.088252658553576e-07, "loss": 0.0189, "reward": 1.074776828289032, "reward_std": 0.16465642303228378, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 2666 }, { "completion_length": 700.4844207763672, "epoch": 0.7966544694197596, "grad_norm": 0.552341878414154, "kl": 0.5377197265625, "learning_rate": 2.085193989600247e-07, "loss": 0.0216, "reward": 1.1367188096046448, "reward_std": 0.17410148866474628, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2667 }, { "completion_length": 672.7500228881836, "epoch": 0.796953177507281, "grad_norm": 0.3416934609413147, "kl": 0.3297119140625, "learning_rate": 2.0821390356211706e-07, "loss": 0.0132, "reward": 1.0580357313156128, "reward_std": 0.09261364862322807, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 2668 }, { "completion_length": 693.8125152587891, "epoch": 0.7972518855948025, "grad_norm": 0.41169965267181396, "kl": 0.6171875, "learning_rate": 2.0790877999398385e-07, "loss": 0.0247, "reward": 1.1032366454601288, "reward_std": 0.20083095505833626, "rewards/accuracy_reward": 0.12500000977888703, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 2669 }, { "completion_length": 643.8080596923828, "epoch": 0.797550593682324, "grad_norm": 0.6442674398422241, "kl": 0.343017578125, "learning_rate": 2.0760402858756932e-07, "loss": 0.0137, "reward": 1.184151828289032, "reward_std": 0.19865140318870544, "rewards/accuracy_reward": 0.196428582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 2670 }, { "completion_length": 652.6049346923828, "epoch": 0.7978493017698454, "grad_norm": 0.5758015513420105, "kl": 0.33447265625, "learning_rate": 2.0729964967441344e-07, "loss": 0.0134, "reward": 1.13448666036129, "reward_std": 0.0868633296340704, "rewards/accuracy_reward": 0.14732143771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 2671 }, { "completion_length": 607.5469055175781, "epoch": 0.7981480098573669, "grad_norm": 0.5176047682762146, "kl": 0.27587890625, "learning_rate": 2.0699564358565026e-07, "loss": 0.011, "reward": 1.1741071939468384, "reward_std": 0.10907631972804666, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9955357313156128, "step": 2672 }, { "completion_length": 655.1004791259766, "epoch": 0.7984467179448883, "grad_norm": 0.8419874310493469, "kl": 0.24365234375, "learning_rate": 2.066920106520089e-07, "loss": 0.0097, "reward": 1.1584821939468384, "reward_std": 0.12044744566082954, "rewards/accuracy_reward": 0.17633929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98214291036129, "step": 2673 }, { "completion_length": 597.522331237793, "epoch": 0.7987454260324098, "grad_norm": 0.3223387598991394, "kl": 0.467041015625, "learning_rate": 2.0638875120381228e-07, "loss": 0.0187, "reward": 1.2053572237491608, "reward_std": 0.19346969947218895, "rewards/accuracy_reward": 0.2254464402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107611179352, "step": 2674 }, { "completion_length": 709.2076263427734, "epoch": 0.7990441341199312, "grad_norm": 0.4489079713821411, "kl": 0.73193359375, "learning_rate": 2.0608586557097667e-07, "loss": 0.0293, "reward": 1.1796875894069672, "reward_std": 0.14151110406965017, "rewards/accuracy_reward": 0.1986607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 2675 }, { "completion_length": 741.2924499511719, "epoch": 0.7993428422074528, "grad_norm": 0.5427451729774475, "kl": 0.62060546875, "learning_rate": 2.057833540830121e-07, "loss": 0.0248, "reward": 1.0993303954601288, "reward_std": 0.1868065781891346, "rewards/accuracy_reward": 0.12276786053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625298023224, "step": 2676 }, { "completion_length": 686.4330749511719, "epoch": 0.7996415502949742, "grad_norm": 0.433144211769104, "kl": 0.380126953125, "learning_rate": 2.054812170690216e-07, "loss": 0.0152, "reward": 1.1696428954601288, "reward_std": 0.15515564009547234, "rewards/accuracy_reward": 0.17857143841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714775323868, "step": 2677 }, { "completion_length": 682.904052734375, "epoch": 0.7999402583824957, "grad_norm": 0.3986557126045227, "kl": 0.53369140625, "learning_rate": 2.0517945485770031e-07, "loss": 0.0214, "reward": 1.1389509439468384, "reward_std": 0.12839178927242756, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009588479996, "step": 2678 }, { "completion_length": 713.419677734375, "epoch": 0.8002389664700171, "grad_norm": 0.688186526298523, "kl": 0.520263671875, "learning_rate": 2.0487806777733608e-07, "loss": 0.0208, "reward": 1.1021205484867096, "reward_std": 0.16928992606699467, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9771205633878708, "step": 2679 }, { "completion_length": 672.1986846923828, "epoch": 0.8005376745575387, "grad_norm": 0.5831706523895264, "kl": 0.50390625, "learning_rate": 2.0457705615580862e-07, "loss": 0.0201, "reward": 1.1261161267757416, "reward_std": 0.12296353466808796, "rewards/accuracy_reward": 0.1406250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 2680 }, { "completion_length": 678.7835235595703, "epoch": 0.8008363826450601, "grad_norm": 0.48721522092819214, "kl": 0.4462890625, "learning_rate": 2.042764203205889e-07, "loss": 0.0178, "reward": 1.0987723618745804, "reward_std": 0.1127864457666874, "rewards/accuracy_reward": 0.11383929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2681 }, { "completion_length": 606.0982513427734, "epoch": 0.8011350907325816, "grad_norm": 0.2590917944908142, "kl": 0.2608642578125, "learning_rate": 2.039761605987394e-07, "loss": 0.0104, "reward": 1.1445313096046448, "reward_std": 0.11719039548188448, "rewards/accuracy_reward": 0.14955358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949776828289032, "step": 2682 }, { "completion_length": 631.8571472167969, "epoch": 0.801433798820103, "grad_norm": 0.5730973482131958, "kl": 0.44189453125, "learning_rate": 2.0367627731691312e-07, "loss": 0.0177, "reward": 1.1540179252624512, "reward_std": 0.13579068146646023, "rewards/accuracy_reward": 0.1674107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071939468384, "step": 2683 }, { "completion_length": 651.5156555175781, "epoch": 0.8017325069076245, "grad_norm": 0.6038268804550171, "kl": 0.44873046875, "learning_rate": 2.0337677080135373e-07, "loss": 0.0179, "reward": 1.139508992433548, "reward_std": 0.16713461093604565, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2684 }, { "completion_length": 641.2299346923828, "epoch": 0.802031214995146, "grad_norm": 0.231972798705101, "kl": 0.265869140625, "learning_rate": 2.0307764137789508e-07, "loss": 0.0106, "reward": 1.3320313096046448, "reward_std": 0.17588330805301666, "rewards/accuracy_reward": 0.341517873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 2685 }, { "completion_length": 651.7411041259766, "epoch": 0.8023299230826675, "grad_norm": 0.5714317560195923, "kl": 0.75830078125, "learning_rate": 2.0277888937196042e-07, "loss": 0.0304, "reward": 1.1054687798023224, "reward_std": 0.166710096411407, "rewards/accuracy_reward": 0.1294642873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.976004496216774, "step": 2686 }, { "completion_length": 612.0290374755859, "epoch": 0.8026286311701889, "grad_norm": 0.26894909143447876, "kl": 0.3294677734375, "learning_rate": 2.0248051510856285e-07, "loss": 0.0132, "reward": 1.1735491454601288, "reward_std": 0.1371786817908287, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 2687 }, { "completion_length": 674.2254638671875, "epoch": 0.8029273392577104, "grad_norm": 0.38296300172805786, "kl": 0.35986328125, "learning_rate": 2.0218251891230436e-07, "loss": 0.0144, "reward": 1.2421875298023224, "reward_std": 0.18259600922465324, "rewards/accuracy_reward": 0.2544642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2688 }, { "completion_length": 634.0134201049805, "epoch": 0.8032260473452318, "grad_norm": 0.6680785417556763, "kl": 0.375244140625, "learning_rate": 2.018849011073754e-07, "loss": 0.015, "reward": 1.1506697237491608, "reward_std": 0.1398475617170334, "rewards/accuracy_reward": 0.1629464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2689 }, { "completion_length": 649.8393096923828, "epoch": 0.8035247554327534, "grad_norm": 0.4650825262069702, "kl": 0.2906494140625, "learning_rate": 2.015876620175551e-07, "loss": 0.0116, "reward": 1.2226562798023224, "reward_std": 0.16764390841126442, "rewards/accuracy_reward": 0.2299107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 2690 }, { "completion_length": 733.9777221679688, "epoch": 0.8038234635202748, "grad_norm": 1.1274495124816895, "kl": 0.53759765625, "learning_rate": 2.0129080196621058e-07, "loss": 0.0215, "reward": 1.1316965073347092, "reward_std": 0.1502695530653, "rewards/accuracy_reward": 0.14732143259607255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 2691 }, { "completion_length": 663.1674346923828, "epoch": 0.8041221716077963, "grad_norm": 0.90751713514328, "kl": 0.63134765625, "learning_rate": 2.0099432127629622e-07, "loss": 0.0252, "reward": 1.1668527126312256, "reward_std": 0.158748934045434, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 2692 }, { "completion_length": 612.0223388671875, "epoch": 0.8044208796953177, "grad_norm": 0.4701208174228668, "kl": 0.55859375, "learning_rate": 2.0069822027035406e-07, "loss": 0.0224, "reward": 1.1969866454601288, "reward_std": 0.15194769203662872, "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2693 }, { "completion_length": 665.5312805175781, "epoch": 0.8047195877828393, "grad_norm": 0.498145192861557, "kl": 0.249755859375, "learning_rate": 2.004024992705131e-07, "loss": 0.01, "reward": 1.1997768580913544, "reward_std": 0.12495850399136543, "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 2694 }, { "completion_length": 675.5960083007812, "epoch": 0.8050182958703607, "grad_norm": 0.3940508961677551, "kl": 0.396728515625, "learning_rate": 2.0010715859848865e-07, "loss": 0.0159, "reward": 1.1763393580913544, "reward_std": 0.15139367897063494, "rewards/accuracy_reward": 0.18973215483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 2695 }, { "completion_length": 676.4330596923828, "epoch": 0.8053170039578822, "grad_norm": 0.5000612735748291, "kl": 0.48016357421875, "learning_rate": 1.998121985755825e-07, "loss": 0.0192, "reward": 1.0518973767757416, "reward_std": 0.10649280995130539, "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916294813156128, "step": 2696 }, { "completion_length": 643.4710083007812, "epoch": 0.8056157120454036, "grad_norm": 0.4034111499786377, "kl": 0.57373046875, "learning_rate": 1.9951761952268208e-07, "loss": 0.023, "reward": 1.1796875596046448, "reward_std": 0.16079898551106453, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2697 }, { "completion_length": 673.9575958251953, "epoch": 0.8059144201329251, "grad_norm": 0.4121105372905731, "kl": 0.298828125, "learning_rate": 1.9922342176026072e-07, "loss": 0.012, "reward": 1.1696428954601288, "reward_std": 0.14952881820499897, "rewards/accuracy_reward": 0.17857144121080637, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 2698 }, { "completion_length": 561.3526992797852, "epoch": 0.8062131282204466, "grad_norm": 0.523226797580719, "kl": 0.270263671875, "learning_rate": 1.9892960560837677e-07, "loss": 0.0108, "reward": 1.2020089626312256, "reward_std": 0.15510574914515018, "rewards/accuracy_reward": 0.2075892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 2699 }, { "completion_length": 580.5245819091797, "epoch": 0.8065118363079681, "grad_norm": 0.6796128153800964, "kl": 0.477294921875, "learning_rate": 1.986361713866732e-07, "loss": 0.0191, "reward": 1.256138414144516, "reward_std": 0.21080714091658592, "rewards/accuracy_reward": 0.2678571566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 2700 }, { "completion_length": 675.3772583007812, "epoch": 0.8068105443954895, "grad_norm": 0.5131191611289978, "kl": 0.64306640625, "learning_rate": 1.983431194143778e-07, "loss": 0.0258, "reward": 1.2366071939468384, "reward_std": 0.26865676417946815, "rewards/accuracy_reward": 0.2611607275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464626312256, "step": 2701 }, { "completion_length": 660.8348388671875, "epoch": 0.807109252483011, "grad_norm": 0.4329906105995178, "kl": 0.82958984375, "learning_rate": 1.980504500103025e-07, "loss": 0.0331, "reward": 1.1233259439468384, "reward_std": 0.10605529509484768, "rewards/accuracy_reward": 0.13839286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2702 }, { "completion_length": 712.6986846923828, "epoch": 0.8074079605705324, "grad_norm": 0.4787886142730713, "kl": 0.685546875, "learning_rate": 1.9775816349284276e-07, "loss": 0.0274, "reward": 1.1210938096046448, "reward_std": 0.1688264235854149, "rewards/accuracy_reward": 0.14955357555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9715402126312256, "step": 2703 }, { "completion_length": 695.6473541259766, "epoch": 0.807706668658054, "grad_norm": 0.6224045157432556, "kl": 0.6728515625, "learning_rate": 1.9746626017997769e-07, "loss": 0.0269, "reward": 1.141183078289032, "reward_std": 0.19094597175717354, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 2704 }, { "completion_length": 630.1250457763672, "epoch": 0.8080053767455754, "grad_norm": 0.37351611256599426, "kl": 0.271728515625, "learning_rate": 1.971747403892697e-07, "loss": 0.0109, "reward": 1.0820312798023224, "reward_std": 0.13859844952821732, "rewards/accuracy_reward": 0.09375000419095159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 2705 }, { "completion_length": 623.9219055175781, "epoch": 0.8083040848330969, "grad_norm": 0.49647581577301025, "kl": 0.6695556640625, "learning_rate": 1.9688360443786336e-07, "loss": 0.0268, "reward": 1.1289063096046448, "reward_std": 0.16657604929059744, "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.974888414144516, "step": 2706 }, { "completion_length": 703.6339569091797, "epoch": 0.8086027929206183, "grad_norm": 0.4797825515270233, "kl": 0.501953125, "learning_rate": 1.9659285264248636e-07, "loss": 0.0201, "reward": 1.2042411267757416, "reward_std": 0.16217686980962753, "rewards/accuracy_reward": 0.21651786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2707 }, { "completion_length": 606.7187805175781, "epoch": 0.8089015010081398, "grad_norm": 0.8125372529029846, "kl": 0.60888671875, "learning_rate": 1.9630248531944812e-07, "loss": 0.0243, "reward": 1.2092634439468384, "reward_std": 0.19732351787388325, "rewards/accuracy_reward": 0.22098215389996767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 2708 }, { "completion_length": 647.4241333007812, "epoch": 0.8092002090956613, "grad_norm": 0.3108212649822235, "kl": 0.61669921875, "learning_rate": 1.9601250278463964e-07, "loss": 0.0246, "reward": 1.127790242433548, "reward_std": 0.14002401381731033, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2709 }, { "completion_length": 645.8348388671875, "epoch": 0.8094989171831828, "grad_norm": 0.4013464152812958, "kl": 0.4697265625, "learning_rate": 1.9572290535353383e-07, "loss": 0.0188, "reward": 1.0870536267757416, "reward_std": 0.13756517693400383, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 2710 }, { "completion_length": 726.7745666503906, "epoch": 0.8097976252707042, "grad_norm": 0.5547526478767395, "kl": 0.6484375, "learning_rate": 1.9543369334118392e-07, "loss": 0.0259, "reward": 1.0904018580913544, "reward_std": 0.1794210709631443, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 2711 }, { "completion_length": 645.1942291259766, "epoch": 0.8100963333582257, "grad_norm": 0.44998475909233093, "kl": 0.292724609375, "learning_rate": 1.9514486706222443e-07, "loss": 0.0117, "reward": 1.0792411267757416, "reward_std": 0.179463692009449, "rewards/accuracy_reward": 0.0959821455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 2712 }, { "completion_length": 725.8147583007812, "epoch": 0.8103950414457471, "grad_norm": 0.5786427855491638, "kl": 0.66748046875, "learning_rate": 1.9485642683087017e-07, "loss": 0.0266, "reward": 1.119977742433548, "reward_std": 0.20515954867005348, "rewards/accuracy_reward": 0.149553582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9704241454601288, "step": 2713 }, { "completion_length": 695.7455596923828, "epoch": 0.8106937495332686, "grad_norm": 0.5545377135276794, "kl": 0.5455322265625, "learning_rate": 1.9456837296091557e-07, "loss": 0.0218, "reward": 1.1964286118745804, "reward_std": 0.13322094455361366, "rewards/accuracy_reward": 0.20758930081501603, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 2714 }, { "completion_length": 641.0335083007812, "epoch": 0.8109924576207901, "grad_norm": 0.7322794198989868, "kl": 0.2432861328125, "learning_rate": 1.9428070576573513e-07, "loss": 0.0097, "reward": 1.1992188096046448, "reward_std": 0.16955003142356873, "rewards/accuracy_reward": 0.21428572945296764, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 2715 }, { "completion_length": 612.0692291259766, "epoch": 0.8112911657083115, "grad_norm": 0.5499923825263977, "kl": 0.65185546875, "learning_rate": 1.9399342555828277e-07, "loss": 0.0261, "reward": 1.1875000298023224, "reward_std": 0.0750175304710865, "rewards/accuracy_reward": 0.2075892984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 2716 }, { "completion_length": 768.8638763427734, "epoch": 0.811589873795833, "grad_norm": 0.43239879608154297, "kl": 0.5732421875, "learning_rate": 1.9370653265109083e-07, "loss": 0.0229, "reward": 1.0602678954601288, "reward_std": 0.1274018615949899, "rewards/accuracy_reward": 0.08482143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464775323868, "step": 2717 }, { "completion_length": 654.6986846923828, "epoch": 0.8118885818833544, "grad_norm": 0.4584985673427582, "kl": 0.411376953125, "learning_rate": 1.9342002735627083e-07, "loss": 0.0165, "reward": 1.1657366752624512, "reward_std": 0.17560015060007572, "rewards/accuracy_reward": 0.176339291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 2718 }, { "completion_length": 653.9553833007812, "epoch": 0.812187289970876, "grad_norm": 0.42808884382247925, "kl": 0.49169921875, "learning_rate": 1.9313390998551264e-07, "loss": 0.0197, "reward": 1.1835938096046448, "reward_std": 0.1683831363916397, "rewards/accuracy_reward": 0.191964291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 2719 }, { "completion_length": 634.9710083007812, "epoch": 0.8124859980583974, "grad_norm": 0.355694055557251, "kl": 0.542724609375, "learning_rate": 1.9284818085008361e-07, "loss": 0.0217, "reward": 1.1780134439468384, "reward_std": 0.17906450852751732, "rewards/accuracy_reward": 0.1941964402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 2720 }, { "completion_length": 636.029052734375, "epoch": 0.8127847061459189, "grad_norm": 0.5140246748924255, "kl": 0.4248046875, "learning_rate": 1.925628402608292e-07, "loss": 0.017, "reward": 1.1071428954601288, "reward_std": 0.17596156150102615, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107611179352, "step": 2721 }, { "completion_length": 690.1585235595703, "epoch": 0.8130834142334403, "grad_norm": 0.35616934299468994, "kl": 0.3662109375, "learning_rate": 1.9227788852817212e-07, "loss": 0.0146, "reward": 1.219308078289032, "reward_std": 0.16435791924595833, "rewards/accuracy_reward": 0.227678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916294813156128, "step": 2722 }, { "completion_length": 651.5290374755859, "epoch": 0.8133821223209619, "grad_norm": 0.4744667112827301, "kl": 0.4375, "learning_rate": 1.919933259621116e-07, "loss": 0.0175, "reward": 1.2628348767757416, "reward_std": 0.15905792266130447, "rewards/accuracy_reward": 0.2812500111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 2723 }, { "completion_length": 626.7232360839844, "epoch": 0.8136808304084833, "grad_norm": 0.3208921253681183, "kl": 0.37060546875, "learning_rate": 1.9170915287222417e-07, "loss": 0.0148, "reward": 1.205915242433548, "reward_std": 0.19043539464473724, "rewards/accuracy_reward": 0.216517873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 2724 }, { "completion_length": 687.2522583007812, "epoch": 0.8139795384960048, "grad_norm": 0.2661619186401367, "kl": 0.277099609375, "learning_rate": 1.9142536956766197e-07, "loss": 0.0111, "reward": 1.0602679252624512, "reward_std": 0.12554489634931087, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 2725 }, { "completion_length": 726.2187805175781, "epoch": 0.8142782465835262, "grad_norm": 0.6120608448982239, "kl": 0.47021484375, "learning_rate": 1.911419763571536e-07, "loss": 0.0188, "reward": 1.1908482313156128, "reward_std": 0.16210733726620674, "rewards/accuracy_reward": 0.2053571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 2726 }, { "completion_length": 749.7277069091797, "epoch": 0.8145769546710477, "grad_norm": 0.7549391984939575, "kl": 0.4459228515625, "learning_rate": 1.9085897354900328e-07, "loss": 0.0178, "reward": 1.1640625298023224, "reward_std": 0.10588583629578352, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2727 }, { "completion_length": 684.3326263427734, "epoch": 0.8148756627585692, "grad_norm": 0.5195310711860657, "kl": 0.669921875, "learning_rate": 1.9057636145108997e-07, "loss": 0.0268, "reward": 1.140625074505806, "reward_std": 0.12971904128789902, "rewards/accuracy_reward": 0.16964286169968545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9709821790456772, "step": 2728 }, { "completion_length": 625.7723541259766, "epoch": 0.8151743708460907, "grad_norm": 0.301553875207901, "kl": 0.53515625, "learning_rate": 1.902941403708681e-07, "loss": 0.0214, "reward": 1.1456473767757416, "reward_std": 0.1892897468060255, "rewards/accuracy_reward": 0.16071429662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 2729 }, { "completion_length": 613.2834930419922, "epoch": 0.8154730789336121, "grad_norm": 0.33284589648246765, "kl": 0.3076171875, "learning_rate": 1.9001231061536666e-07, "loss": 0.0123, "reward": 1.2020089626312256, "reward_std": 0.1863083466887474, "rewards/accuracy_reward": 0.2098214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 2730 }, { "completion_length": 719.1875305175781, "epoch": 0.8157717870211336, "grad_norm": 1.0415852069854736, "kl": 0.56103515625, "learning_rate": 1.897308724911885e-07, "loss": 0.0224, "reward": 1.074776828289032, "reward_std": 0.11172140203416348, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 2731 }, { "completion_length": 655.7210083007812, "epoch": 0.816070495108655, "grad_norm": 0.7387549877166748, "kl": 0.275146484375, "learning_rate": 1.89449826304511e-07, "loss": 0.011, "reward": 1.135044664144516, "reward_std": 0.17160254903137684, "rewards/accuracy_reward": 0.14732143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2732 }, { "completion_length": 669.3951263427734, "epoch": 0.8163692031961766, "grad_norm": 0.47161903977394104, "kl": 0.48046875, "learning_rate": 1.891691723610848e-07, "loss": 0.0192, "reward": 1.0602678954601288, "reward_std": 0.07778241205960512, "rewards/accuracy_reward": 0.07812500232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 2733 }, { "completion_length": 559.0134124755859, "epoch": 0.816667911283698, "grad_norm": 0.21497087180614471, "kl": 0.142578125, "learning_rate": 1.8888891096623376e-07, "loss": 0.0057, "reward": 1.2265625596046448, "reward_std": 0.15647070854902267, "rewards/accuracy_reward": 0.2299107201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966517984867096, "step": 2734 }, { "completion_length": 707.0045013427734, "epoch": 0.8169666193712195, "grad_norm": 0.5725014805793762, "kl": 0.67578125, "learning_rate": 1.8860904242485493e-07, "loss": 0.0271, "reward": 1.1662946939468384, "reward_std": 0.20426305755972862, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 2735 }, { "completion_length": 636.8326263427734, "epoch": 0.8172653274587409, "grad_norm": 0.36634552478790283, "kl": 0.16632080078125, "learning_rate": 1.8832956704141794e-07, "loss": 0.0067, "reward": 1.190290242433548, "reward_std": 0.08053656853735447, "rewards/accuracy_reward": 0.1941964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937798023224, "step": 2736 }, { "completion_length": 601.6116333007812, "epoch": 0.8175640355462624, "grad_norm": 0.5276097655296326, "kl": 0.45947265625, "learning_rate": 1.880504851199644e-07, "loss": 0.0184, "reward": 1.1188616752624512, "reward_std": 0.14002463221549988, "rewards/accuracy_reward": 0.12946429220028222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 2737 }, { "completion_length": 662.0000305175781, "epoch": 0.8178627436337839, "grad_norm": 0.5424684882164001, "kl": 0.445556640625, "learning_rate": 1.8777179696410822e-07, "loss": 0.0178, "reward": 1.20089291036129, "reward_std": 0.16038873046636581, "rewards/accuracy_reward": 0.2142857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 2738 }, { "completion_length": 718.8125457763672, "epoch": 0.8181614517213054, "grad_norm": 0.5321694016456604, "kl": 0.673828125, "learning_rate": 1.874935028770347e-07, "loss": 0.027, "reward": 1.1696429252624512, "reward_std": 0.14290585927665234, "rewards/accuracy_reward": 0.19196429592557251, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 2739 }, { "completion_length": 722.0134429931641, "epoch": 0.8184601598088268, "grad_norm": 0.9704922437667847, "kl": 0.33056640625, "learning_rate": 1.872156031615006e-07, "loss": 0.0132, "reward": 1.1657366752624512, "reward_std": 0.17422355338931084, "rewards/accuracy_reward": 0.1875000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.97823666036129, "step": 2740 }, { "completion_length": 577.6027069091797, "epoch": 0.8187588678963483, "grad_norm": 0.8097769021987915, "kl": 0.2291259765625, "learning_rate": 1.8693809811983366e-07, "loss": 0.0092, "reward": 1.1718750596046448, "reward_std": 0.10825586132705212, "rewards/accuracy_reward": 0.18080358393490314, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 2741 }, { "completion_length": 600.4799346923828, "epoch": 0.8190575759838697, "grad_norm": 0.4474243223667145, "kl": 0.2783203125, "learning_rate": 1.8666098805393198e-07, "loss": 0.0111, "reward": 1.1478795111179352, "reward_std": 0.14910656958818436, "rewards/accuracy_reward": 0.15848215110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973767757416, "step": 2742 }, { "completion_length": 685.700927734375, "epoch": 0.8193562840713913, "grad_norm": 1.4269139766693115, "kl": 0.607421875, "learning_rate": 1.8638427326526424e-07, "loss": 0.0243, "reward": 1.141183078289032, "reward_std": 0.20375492796301842, "rewards/accuracy_reward": 0.15401786053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2743 }, { "completion_length": 745.9620819091797, "epoch": 0.8196549921589127, "grad_norm": 0.39170873165130615, "kl": 0.75048828125, "learning_rate": 1.8610795405486913e-07, "loss": 0.03, "reward": 1.0770090073347092, "reward_std": 0.14552962686866522, "rewards/accuracy_reward": 0.10044643376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625298023224, "step": 2744 }, { "completion_length": 658.3795013427734, "epoch": 0.8199537002464342, "grad_norm": 0.3970778286457062, "kl": 0.55859375, "learning_rate": 1.8583203072335462e-07, "loss": 0.0224, "reward": 1.182477742433548, "reward_std": 0.18335289880633354, "rewards/accuracy_reward": 0.20089287124574184, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 2745 }, { "completion_length": 664.4821624755859, "epoch": 0.8202524083339556, "grad_norm": 0.4017375707626343, "kl": 0.389404296875, "learning_rate": 1.855565035708984e-07, "loss": 0.0156, "reward": 1.181919664144516, "reward_std": 0.14354903623461723, "rewards/accuracy_reward": 0.196428582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 2746 }, { "completion_length": 654.4620819091797, "epoch": 0.8205511164214772, "grad_norm": 0.7800727486610413, "kl": 0.5006103515625, "learning_rate": 1.852813728972471e-07, "loss": 0.0201, "reward": 1.0965402126312256, "reward_std": 0.12456469051539898, "rewards/accuracy_reward": 0.11160715040750802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2747 }, { "completion_length": 601.8861694335938, "epoch": 0.8208498245089986, "grad_norm": 0.36367282271385193, "kl": 0.5400390625, "learning_rate": 1.8500663900171574e-07, "loss": 0.0216, "reward": 1.131696492433548, "reward_std": 0.17685137037187815, "rewards/accuracy_reward": 0.14732143213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 2748 }, { "completion_length": 655.1629791259766, "epoch": 0.8211485325965201, "grad_norm": 0.5904465317726135, "kl": 0.52001953125, "learning_rate": 1.8473230218318792e-07, "loss": 0.0208, "reward": 1.1104910969734192, "reward_std": 0.18278372287750244, "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 2749 }, { "completion_length": 672.7143249511719, "epoch": 0.8214472406840415, "grad_norm": 0.5855535268783569, "kl": 0.43505859375, "learning_rate": 1.8445836274011538e-07, "loss": 0.0174, "reward": 1.0976563096046448, "reward_std": 0.1261143572628498, "rewards/accuracy_reward": 0.11383929033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 2750 }, { "completion_length": 629.3504791259766, "epoch": 0.821745948771563, "grad_norm": 0.4949491024017334, "kl": 0.328369140625, "learning_rate": 1.8418482097051713e-07, "loss": 0.0131, "reward": 1.2282366454601288, "reward_std": 0.170924823731184, "rewards/accuracy_reward": 0.2433035857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2751 }, { "completion_length": 614.0669937133789, "epoch": 0.8220446568590845, "grad_norm": 0.37849587202072144, "kl": 0.357421875, "learning_rate": 1.8391167717198004e-07, "loss": 0.0143, "reward": 1.0837053954601288, "reward_std": 0.13288466073572636, "rewards/accuracy_reward": 0.09598214644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2752 }, { "completion_length": 710.8058471679688, "epoch": 0.822343364946606, "grad_norm": 0.662055492401123, "kl": 0.9404296875, "learning_rate": 1.8363893164165756e-07, "loss": 0.0376, "reward": 1.0937500447034836, "reward_std": 0.15706472843885422, "rewards/accuracy_reward": 0.12500000675208867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500447034836, "step": 2753 }, { "completion_length": 586.7366333007812, "epoch": 0.8226420730341274, "grad_norm": 0.25181683897972107, "kl": 0.1605224609375, "learning_rate": 1.833665846762702e-07, "loss": 0.0064, "reward": 1.123883992433548, "reward_std": 0.13824315927922726, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966517984867096, "step": 2754 }, { "completion_length": 642.1317291259766, "epoch": 0.8229407811216489, "grad_norm": 0.6168525815010071, "kl": 0.837646484375, "learning_rate": 1.830946365721049e-07, "loss": 0.0336, "reward": 1.180803656578064, "reward_std": 0.1709682196378708, "rewards/accuracy_reward": 0.207589291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9732143431901932, "step": 2755 }, { "completion_length": 629.2143096923828, "epoch": 0.8232394892091703, "grad_norm": 0.9927549362182617, "kl": 0.440673828125, "learning_rate": 1.8282308762501425e-07, "loss": 0.0176, "reward": 1.1752232611179352, "reward_std": 0.21068204753100872, "rewards/accuracy_reward": 0.19419643771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 2756 }, { "completion_length": 645.1004791259766, "epoch": 0.8235381972966918, "grad_norm": 0.5125824809074402, "kl": 0.4122314453125, "learning_rate": 1.8255193813041707e-07, "loss": 0.0165, "reward": 1.1930803954601288, "reward_std": 0.14553489442914724, "rewards/accuracy_reward": 0.20758929336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 2757 }, { "completion_length": 648.9710083007812, "epoch": 0.8238369053842133, "grad_norm": 0.4532898962497711, "kl": 0.5048828125, "learning_rate": 1.8228118838329759e-07, "loss": 0.0202, "reward": 1.162946492433548, "reward_std": 0.1677693072706461, "rewards/accuracy_reward": 0.1741071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 2758 }, { "completion_length": 725.3817291259766, "epoch": 0.8241356134717347, "grad_norm": 0.5592590570449829, "kl": 0.71728515625, "learning_rate": 1.8201083867820472e-07, "loss": 0.0287, "reward": 1.1573660969734192, "reward_std": 0.17254193499684334, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 2759 }, { "completion_length": 667.7968902587891, "epoch": 0.8244343215592562, "grad_norm": 0.5341364741325378, "kl": 0.63427734375, "learning_rate": 1.8174088930925253e-07, "loss": 0.0254, "reward": 1.2338170409202576, "reward_std": 0.217148095369339, "rewards/accuracy_reward": 0.25000001303851604, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 2760 }, { "completion_length": 688.5089569091797, "epoch": 0.8247330296467776, "grad_norm": 0.5908727049827576, "kl": 0.657958984375, "learning_rate": 1.8147134057011963e-07, "loss": 0.0263, "reward": 1.0987723767757416, "reward_std": 0.16151245683431625, "rewards/accuracy_reward": 0.11383929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 2761 }, { "completion_length": 672.9933166503906, "epoch": 0.8250317377342992, "grad_norm": 1.1837184429168701, "kl": 0.939453125, "learning_rate": 1.8120219275404836e-07, "loss": 0.0376, "reward": 1.0630580931901932, "reward_std": 0.11326408106833696, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045260190964, "step": 2762 }, { "completion_length": 641.1361846923828, "epoch": 0.8253304458218206, "grad_norm": 0.8081303238868713, "kl": 0.301025390625, "learning_rate": 1.809334461538454e-07, "loss": 0.0121, "reward": 1.157366156578064, "reward_std": 0.18608268909156322, "rewards/accuracy_reward": 0.1651785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 2763 }, { "completion_length": 702.7143249511719, "epoch": 0.8256291539093421, "grad_norm": 0.37048983573913574, "kl": 0.73046875, "learning_rate": 1.8066510106188055e-07, "loss": 0.0292, "reward": 1.217075914144516, "reward_std": 0.14609535317867994, "rewards/accuracy_reward": 0.2343750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 2764 }, { "completion_length": 637.2277069091797, "epoch": 0.8259278619968635, "grad_norm": 0.3918754458427429, "kl": 0.45916748046875, "learning_rate": 1.80397157770087e-07, "loss": 0.0183, "reward": 1.1233259439468384, "reward_std": 0.11720455251634121, "rewards/accuracy_reward": 0.13392857927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2765 }, { "completion_length": 600.8638763427734, "epoch": 0.826226570084385, "grad_norm": 0.44675949215888977, "kl": 0.569091796875, "learning_rate": 1.8012961656996095e-07, "loss": 0.0228, "reward": 1.1914063096046448, "reward_std": 0.10911357868462801, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 2766 }, { "completion_length": 785.7031555175781, "epoch": 0.8265252781719065, "grad_norm": 0.8836055397987366, "kl": 0.79541015625, "learning_rate": 1.7986247775256078e-07, "loss": 0.0318, "reward": 1.0937500298023224, "reward_std": 0.21804092451930046, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 2767 }, { "completion_length": 808.9241485595703, "epoch": 0.826823986259428, "grad_norm": 0.6705446243286133, "kl": 0.5986328125, "learning_rate": 1.7959574160850744e-07, "loss": 0.0239, "reward": 1.1010045111179352, "reward_std": 0.1971924677491188, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723618745804, "step": 2768 }, { "completion_length": 622.935302734375, "epoch": 0.8271226943469494, "grad_norm": 0.556118369102478, "kl": 0.4482421875, "learning_rate": 1.793294084279838e-07, "loss": 0.0179, "reward": 1.1713169813156128, "reward_std": 0.09192205965518951, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 2769 }, { "completion_length": 682.2388610839844, "epoch": 0.8274214024344709, "grad_norm": 0.7695711255073547, "kl": 0.828125, "learning_rate": 1.7906347850073404e-07, "loss": 0.0332, "reward": 1.1780134439468384, "reward_std": 0.1885262094438076, "rewards/accuracy_reward": 0.20089286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.977120578289032, "step": 2770 }, { "completion_length": 642.6451263427734, "epoch": 0.8277201105219923, "grad_norm": 0.8440923690795898, "kl": 0.54833984375, "learning_rate": 1.7879795211606402e-07, "loss": 0.022, "reward": 1.1439732611179352, "reward_std": 0.09211418638005853, "rewards/accuracy_reward": 0.15401785937137902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 2771 }, { "completion_length": 644.1919860839844, "epoch": 0.8280188186095139, "grad_norm": 0.5685098767280579, "kl": 0.6748046875, "learning_rate": 1.785328295628405e-07, "loss": 0.027, "reward": 1.1579241305589676, "reward_std": 0.15172410756349564, "rewards/accuracy_reward": 0.18526786426082253, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562947034836, "step": 2772 }, { "completion_length": 680.2500305175781, "epoch": 0.8283175266970353, "grad_norm": 0.9930422306060791, "kl": 0.56494140625, "learning_rate": 1.7826811112949058e-07, "loss": 0.0226, "reward": 1.2578125596046448, "reward_std": 0.16419037990272045, "rewards/accuracy_reward": 0.2745535895228386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 2773 }, { "completion_length": 663.7857513427734, "epoch": 0.8286162347845568, "grad_norm": 1.3212077617645264, "kl": 0.48779296875, "learning_rate": 1.7800379710400213e-07, "loss": 0.0195, "reward": 1.1752232909202576, "reward_std": 0.13075360842049122, "rewards/accuracy_reward": 0.1897321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 2774 }, { "completion_length": 699.2098541259766, "epoch": 0.8289149428720782, "grad_norm": 0.49122974276542664, "kl": 0.603515625, "learning_rate": 1.77739887773923e-07, "loss": 0.0242, "reward": 1.1953125298023224, "reward_std": 0.21671994403004646, "rewards/accuracy_reward": 0.2142857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 2775 }, { "completion_length": 644.1495819091797, "epoch": 0.8292136509595998, "grad_norm": 0.3707715570926666, "kl": 0.459716796875, "learning_rate": 1.7747638342636042e-07, "loss": 0.0184, "reward": 1.2304687798023224, "reward_std": 0.08572806627489626, "rewards/accuracy_reward": 0.2388392984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 2776 }, { "completion_length": 654.4844055175781, "epoch": 0.8295123590471212, "grad_norm": 0.5209150314331055, "kl": 0.531982421875, "learning_rate": 1.772132843479816e-07, "loss": 0.0213, "reward": 1.1155134737491608, "reward_std": 0.14850245974957943, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 2777 }, { "completion_length": 599.8214416503906, "epoch": 0.8298110671346427, "grad_norm": 0.39071574807167053, "kl": 0.2999267578125, "learning_rate": 1.7695059082501224e-07, "loss": 0.012, "reward": 1.3281250596046448, "reward_std": 0.18709978833794594, "rewards/accuracy_reward": 0.3415178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 2778 }, { "completion_length": 701.3013610839844, "epoch": 0.8301097752221641, "grad_norm": 0.3802553713321686, "kl": 0.52490234375, "learning_rate": 1.7668830314323726e-07, "loss": 0.021, "reward": 1.112165242433548, "reward_std": 0.20133781991899014, "rewards/accuracy_reward": 0.12946429592557251, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 2779 }, { "completion_length": 660.5312805175781, "epoch": 0.8304084833096856, "grad_norm": 0.8292340636253357, "kl": 0.473876953125, "learning_rate": 1.7642642158800015e-07, "loss": 0.019, "reward": 1.0675223767757416, "reward_std": 0.1663736905902624, "rewards/accuracy_reward": 0.08482143143191934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 2780 }, { "completion_length": 664.4799346923828, "epoch": 0.8307071913972071, "grad_norm": 0.41842561960220337, "kl": 0.431640625, "learning_rate": 1.7616494644420208e-07, "loss": 0.0172, "reward": 1.190290242433548, "reward_std": 0.1685549896210432, "rewards/accuracy_reward": 0.20312501303851604, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871651977300644, "step": 2781 }, { "completion_length": 714.0067291259766, "epoch": 0.8310058994847286, "grad_norm": 0.34211423993110657, "kl": 0.307861328125, "learning_rate": 1.7590387799630246e-07, "loss": 0.0123, "reward": 1.1517857611179352, "reward_std": 0.1519470177590847, "rewards/accuracy_reward": 0.16294643748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 2782 }, { "completion_length": 673.5960235595703, "epoch": 0.83130460757225, "grad_norm": 0.3449040651321411, "kl": 0.4375, "learning_rate": 1.7564321652831827e-07, "loss": 0.0175, "reward": 1.1891741454601288, "reward_std": 0.1965593546628952, "rewards/accuracy_reward": 0.2008928693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 2783 }, { "completion_length": 674.6964569091797, "epoch": 0.8316033156597715, "grad_norm": 0.5388943552970886, "kl": 0.453857421875, "learning_rate": 1.7538296232382355e-07, "loss": 0.0181, "reward": 1.030133992433548, "reward_std": 0.1364658484235406, "rewards/accuracy_reward": 0.049107145285233855, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 2784 }, { "completion_length": 671.8660888671875, "epoch": 0.8319020237472929, "grad_norm": 0.4683125913143158, "kl": 0.3968505859375, "learning_rate": 1.7512311566594955e-07, "loss": 0.0159, "reward": 1.252790242433548, "reward_std": 0.0955209331586957, "rewards/accuracy_reward": 0.2589285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616454601288, "step": 2785 }, { "completion_length": 634.9866333007812, "epoch": 0.8322007318348145, "grad_norm": 0.42949849367141724, "kl": 0.311279296875, "learning_rate": 1.7486367683738375e-07, "loss": 0.0125, "reward": 1.2187500596046448, "reward_std": 0.19115600548684597, "rewards/accuracy_reward": 0.2366071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 2786 }, { "completion_length": 734.7656555175781, "epoch": 0.8324994399223359, "grad_norm": 0.722129225730896, "kl": 0.65380859375, "learning_rate": 1.746046461203701e-07, "loss": 0.0261, "reward": 1.1244420111179352, "reward_std": 0.18904531747102737, "rewards/accuracy_reward": 0.14955358067527413, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884290456772, "step": 2787 }, { "completion_length": 689.2946624755859, "epoch": 0.8327981480098574, "grad_norm": 1.001439094543457, "kl": 0.466796875, "learning_rate": 1.74346023796709e-07, "loss": 0.0187, "reward": 1.1004464626312256, "reward_std": 0.13103633373975754, "rewards/accuracy_reward": 0.11383929033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 2788 }, { "completion_length": 699.9888610839844, "epoch": 0.8330968560973788, "grad_norm": 0.3727978467941284, "kl": 0.44140625, "learning_rate": 1.740878101477558e-07, "loss": 0.0176, "reward": 1.2031250596046448, "reward_std": 0.14886145666241646, "rewards/accuracy_reward": 0.2187500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 2789 }, { "completion_length": 553.7611999511719, "epoch": 0.8333955641849004, "grad_norm": 0.4086970388889313, "kl": 0.42822265625, "learning_rate": 1.7383000545442188e-07, "loss": 0.0171, "reward": 1.184151828289032, "reward_std": 0.16877024853602052, "rewards/accuracy_reward": 0.1897321455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 2790 }, { "completion_length": 676.4531555175781, "epoch": 0.8336942722724218, "grad_norm": 0.2764447331428528, "kl": 0.2144775390625, "learning_rate": 1.7357260999717343e-07, "loss": 0.0086, "reward": 1.1835938096046448, "reward_std": 0.1325739361345768, "rewards/accuracy_reward": 0.1919642984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916294813156128, "step": 2791 }, { "completion_length": 712.8750457763672, "epoch": 0.8339929803599433, "grad_norm": 0.6269875168800354, "kl": 0.54736328125, "learning_rate": 1.733156240560314e-07, "loss": 0.0219, "reward": 1.1383929252624512, "reward_std": 0.20720381662249565, "rewards/accuracy_reward": 0.16071429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 2792 }, { "completion_length": 698.0781555175781, "epoch": 0.8342916884474647, "grad_norm": 0.3070805072784424, "kl": 0.361328125, "learning_rate": 1.7305904791057135e-07, "loss": 0.0145, "reward": 1.137276828289032, "reward_std": 0.1734618879854679, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 2793 }, { "completion_length": 607.9977874755859, "epoch": 0.8345903965349862, "grad_norm": 0.5596662759780884, "kl": 0.305908203125, "learning_rate": 1.7280288183992307e-07, "loss": 0.0122, "reward": 1.238839328289032, "reward_std": 0.19735226221382618, "rewards/accuracy_reward": 0.24330358300358057, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.995535746216774, "step": 2794 }, { "completion_length": 607.8236846923828, "epoch": 0.8348891046225076, "grad_norm": 0.3337954878807068, "kl": 0.317138671875, "learning_rate": 1.7254712612276998e-07, "loss": 0.0127, "reward": 1.1796875596046448, "reward_std": 0.15831073001027107, "rewards/accuracy_reward": 0.1852678693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 2795 }, { "completion_length": 756.1205749511719, "epoch": 0.8351878127100292, "grad_norm": 0.8505925536155701, "kl": 0.67578125, "learning_rate": 1.7229178103734943e-07, "loss": 0.0271, "reward": 1.1579241752624512, "reward_std": 0.1743631362915039, "rewards/accuracy_reward": 0.1897321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.968191996216774, "step": 2796 }, { "completion_length": 621.2120819091797, "epoch": 0.8354865207975506, "grad_norm": 0.32720932364463806, "kl": 0.36083984375, "learning_rate": 1.7203684686145156e-07, "loss": 0.0145, "reward": 1.1813616454601288, "reward_std": 0.19175775721669197, "rewards/accuracy_reward": 0.19419643376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2797 }, { "completion_length": 659.9129791259766, "epoch": 0.8357852288850721, "grad_norm": 0.41984498500823975, "kl": 0.49267578125, "learning_rate": 1.7178232387241998e-07, "loss": 0.0197, "reward": 1.2170759439468384, "reward_std": 0.19714482501149178, "rewards/accuracy_reward": 0.2388392984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 2798 }, { "completion_length": 639.8169860839844, "epoch": 0.8360839369725935, "grad_norm": 0.35981640219688416, "kl": 0.2779541015625, "learning_rate": 1.715282123471508e-07, "loss": 0.0111, "reward": 1.2812500298023224, "reward_std": 0.10979060363024473, "rewards/accuracy_reward": 0.2879464440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 2799 }, { "completion_length": 653.529052734375, "epoch": 0.836382645060115, "grad_norm": 0.553580105304718, "kl": 0.56884765625, "learning_rate": 1.7127451256209226e-07, "loss": 0.0227, "reward": 1.2003348767757416, "reward_std": 0.20391948893666267, "rewards/accuracy_reward": 0.2209821566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793526977300644, "step": 2800 }, { "completion_length": 621.5491333007812, "epoch": 0.8366813531476365, "grad_norm": 0.4273908734321594, "kl": 0.3369140625, "learning_rate": 1.7102122479324495e-07, "loss": 0.0135, "reward": 1.338727742433548, "reward_std": 0.15380230732262135, "rewards/accuracy_reward": 0.3571428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848767757416, "step": 2801 }, { "completion_length": 699.5223541259766, "epoch": 0.8369800612351579, "grad_norm": 0.5617794394493103, "kl": 0.48779296875, "learning_rate": 1.707683493161613e-07, "loss": 0.0196, "reward": 1.190290242433548, "reward_std": 0.1525091491639614, "rewards/accuracy_reward": 0.207589291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 2802 }, { "completion_length": 679.8571929931641, "epoch": 0.8372787693226794, "grad_norm": 1.3859893083572388, "kl": 0.45849609375, "learning_rate": 1.7051588640594477e-07, "loss": 0.0183, "reward": 1.1155134588479996, "reward_std": 0.1650756634771824, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9771205931901932, "step": 2803 }, { "completion_length": 689.8884124755859, "epoch": 0.8375774774102008, "grad_norm": 0.39676588773727417, "kl": 0.492919921875, "learning_rate": 1.7026383633725039e-07, "loss": 0.0197, "reward": 1.0686384439468384, "reward_std": 0.11439747363328934, "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 2804 }, { "completion_length": 711.1562957763672, "epoch": 0.8378761854977224, "grad_norm": 0.82964026927948, "kl": 0.646240234375, "learning_rate": 1.70012199384284e-07, "loss": 0.0259, "reward": 1.2064732611179352, "reward_std": 0.1687062680721283, "rewards/accuracy_reward": 0.2388393022119999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9676339626312256, "step": 2805 }, { "completion_length": 685.4152069091797, "epoch": 0.8381748935852438, "grad_norm": 0.4083618223667145, "kl": 0.667236328125, "learning_rate": 1.6976097582080184e-07, "loss": 0.0266, "reward": 1.2472098767757416, "reward_std": 0.2260492406785488, "rewards/accuracy_reward": 0.2700892984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9771205633878708, "step": 2806 }, { "completion_length": 612.4732360839844, "epoch": 0.8384736016727653, "grad_norm": 0.29089292883872986, "kl": 0.298828125, "learning_rate": 1.6951016592011053e-07, "loss": 0.012, "reward": 1.186383992433548, "reward_std": 0.14333034493029118, "rewards/accuracy_reward": 0.19419643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 2807 }, { "completion_length": 663.3594055175781, "epoch": 0.8387723097602867, "grad_norm": 1.1431249380111694, "kl": 1.046875, "learning_rate": 1.6925976995506674e-07, "loss": 0.0418, "reward": 1.1484375298023224, "reward_std": 0.20678482949733734, "rewards/accuracy_reward": 0.17633929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.972098246216774, "step": 2808 }, { "completion_length": 652.6451263427734, "epoch": 0.8390710178478082, "grad_norm": 0.3453381359577179, "kl": 0.398193359375, "learning_rate": 1.6900978819807664e-07, "loss": 0.0159, "reward": 1.1774553954601288, "reward_std": 0.1698911488056183, "rewards/accuracy_reward": 0.1897321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232313156128, "step": 2809 }, { "completion_length": 618.7411193847656, "epoch": 0.8393697259353297, "grad_norm": 0.6036295294761658, "kl": 0.544677734375, "learning_rate": 1.6876022092109604e-07, "loss": 0.0218, "reward": 1.1914063394069672, "reward_std": 0.10712501034140587, "rewards/accuracy_reward": 0.2008928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 2810 }, { "completion_length": 643.6629791259766, "epoch": 0.8396684340228512, "grad_norm": 0.44148367643356323, "kl": 0.35797119140625, "learning_rate": 1.685110683956294e-07, "loss": 0.0143, "reward": 1.2075893580913544, "reward_std": 0.21422108821570873, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 2811 }, { "completion_length": 621.2768249511719, "epoch": 0.8399671421103726, "grad_norm": 0.5176348090171814, "kl": 0.4521484375, "learning_rate": 1.6826233089273046e-07, "loss": 0.0181, "reward": 1.159040242433548, "reward_std": 0.14297404326498508, "rewards/accuracy_reward": 0.1741071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 2812 }, { "completion_length": 711.3839569091797, "epoch": 0.8402658501978941, "grad_norm": 0.7344284057617188, "kl": 0.65234375, "learning_rate": 1.680140086830013e-07, "loss": 0.0261, "reward": 1.0803571939468384, "reward_std": 0.10455002635717392, "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 2813 }, { "completion_length": 700.6674575805664, "epoch": 0.8405645582854155, "grad_norm": 0.42229729890823364, "kl": 0.558349609375, "learning_rate": 1.6776610203659192e-07, "loss": 0.0223, "reward": 1.0954241752624512, "reward_std": 0.12632214277982712, "rewards/accuracy_reward": 0.11383928824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 2814 }, { "completion_length": 657.1584930419922, "epoch": 0.8408632663729371, "grad_norm": 0.530796229839325, "kl": 0.48095703125, "learning_rate": 1.675186112232006e-07, "loss": 0.0193, "reward": 1.0753348767757416, "reward_std": 0.11663144640624523, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 2815 }, { "completion_length": 629.9196624755859, "epoch": 0.8411619744604585, "grad_norm": 0.5414686799049377, "kl": 0.82080078125, "learning_rate": 1.6727153651207313e-07, "loss": 0.0328, "reward": 1.2639509737491608, "reward_std": 0.18502621352672577, "rewards/accuracy_reward": 0.2879464402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045111179352, "step": 2816 }, { "completion_length": 715.5536041259766, "epoch": 0.84146068254798, "grad_norm": 0.4847090542316437, "kl": 0.53173828125, "learning_rate": 1.6702487817200238e-07, "loss": 0.0213, "reward": 1.1651786267757416, "reward_std": 0.15773737244307995, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 2817 }, { "completion_length": 640.4553833007812, "epoch": 0.8417593906355014, "grad_norm": 1.0882247686386108, "kl": 0.4873046875, "learning_rate": 1.6677863647132867e-07, "loss": 0.0195, "reward": 1.1344866752624512, "reward_std": 0.17392852902412415, "rewards/accuracy_reward": 0.15401786053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 2818 }, { "completion_length": 641.9241485595703, "epoch": 0.842058098723023, "grad_norm": 0.5981587171554565, "kl": 0.501220703125, "learning_rate": 1.665328116779388e-07, "loss": 0.02, "reward": 1.1227679252624512, "reward_std": 0.12941140122711658, "rewards/accuracy_reward": 0.1339285767171532, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 2819 }, { "completion_length": 688.7165374755859, "epoch": 0.8423568068105444, "grad_norm": 0.5817649364471436, "kl": 0.48828125, "learning_rate": 1.6628740405926594e-07, "loss": 0.0195, "reward": 1.0825893431901932, "reward_std": 0.11852791532874107, "rewards/accuracy_reward": 0.09821429033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 2820 }, { "completion_length": 743.2857360839844, "epoch": 0.8426555148980659, "grad_norm": 0.455055296421051, "kl": 0.5673828125, "learning_rate": 1.6604241388228954e-07, "loss": 0.0227, "reward": 1.2170759439468384, "reward_std": 0.1365559622645378, "rewards/accuracy_reward": 0.22767857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2821 }, { "completion_length": 686.2254791259766, "epoch": 0.8429542229855873, "grad_norm": 0.35032832622528076, "kl": 0.60791015625, "learning_rate": 1.6579784141353508e-07, "loss": 0.0244, "reward": 1.1785714626312256, "reward_std": 0.24568448588252068, "rewards/accuracy_reward": 0.20089286682195961, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786267757416, "step": 2822 }, { "completion_length": 584.5446624755859, "epoch": 0.8432529310731088, "grad_norm": 0.618946373462677, "kl": 0.55615234375, "learning_rate": 1.6555368691907318e-07, "loss": 0.0223, "reward": 1.2946429252624512, "reward_std": 0.2000466138124466, "rewards/accuracy_reward": 0.3125000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428805589676, "step": 2823 }, { "completion_length": 591.4553833007812, "epoch": 0.8435516391606303, "grad_norm": 0.5945966243743896, "kl": 0.279052734375, "learning_rate": 1.6530995066452024e-07, "loss": 0.0112, "reward": 1.1768973767757416, "reward_std": 0.11603988334536552, "rewards/accuracy_reward": 0.191964291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 2824 }, { "completion_length": 601.4107513427734, "epoch": 0.8438503472481518, "grad_norm": 0.6822266578674316, "kl": 0.410888671875, "learning_rate": 1.650666329150372e-07, "loss": 0.0164, "reward": 1.2488840222358704, "reward_std": 0.10876931250095367, "rewards/accuracy_reward": 0.2611607275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2825 }, { "completion_length": 662.2522583007812, "epoch": 0.8441490553356732, "grad_norm": 1.0449304580688477, "kl": 0.82763671875, "learning_rate": 1.6482373393533e-07, "loss": 0.033, "reward": 1.0870536267757416, "reward_std": 0.1103678671643138, "rewards/accuracy_reward": 0.1026785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 2826 }, { "completion_length": 597.9308242797852, "epoch": 0.8444477634231947, "grad_norm": 0.32517746090888977, "kl": 0.205078125, "learning_rate": 1.6458125398964908e-07, "loss": 0.0082, "reward": 1.3264510035514832, "reward_std": 0.18649015575647354, "rewards/accuracy_reward": 0.3325893059372902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.99386166036129, "step": 2827 }, { "completion_length": 630.4977951049805, "epoch": 0.8447464715107161, "grad_norm": 0.7471104264259338, "kl": 0.46875, "learning_rate": 1.643391933417886e-07, "loss": 0.0187, "reward": 1.1210937798023224, "reward_std": 0.11352422833442688, "rewards/accuracy_reward": 0.13616072246804833, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330931901932, "step": 2828 }, { "completion_length": 702.2589416503906, "epoch": 0.8450451795982377, "grad_norm": 1.111552357673645, "kl": 0.4462890625, "learning_rate": 1.6409755225508697e-07, "loss": 0.0179, "reward": 1.1205357611179352, "reward_std": 0.11303865350782871, "rewards/accuracy_reward": 0.12946429592557251, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 2829 }, { "completion_length": 614.1652145385742, "epoch": 0.8453438876857591, "grad_norm": 0.9215474724769592, "kl": 0.411865234375, "learning_rate": 1.63856330992426e-07, "loss": 0.0165, "reward": 1.2617188096046448, "reward_std": 0.14784442447125912, "rewards/accuracy_reward": 0.2700893059372902, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 2830 }, { "completion_length": 775.3326263427734, "epoch": 0.8456425957732806, "grad_norm": 0.3926113545894623, "kl": 0.69482421875, "learning_rate": 1.636155298162308e-07, "loss": 0.0278, "reward": 1.074776828289032, "reward_std": 0.13372153230011463, "rewards/accuracy_reward": 0.0892857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 2831 }, { "completion_length": 610.7366333007812, "epoch": 0.845941303860802, "grad_norm": 0.47592219710350037, "kl": 0.46826171875, "learning_rate": 1.6337514898846932e-07, "loss": 0.0187, "reward": 1.1635045111179352, "reward_std": 0.155773532576859, "rewards/accuracy_reward": 0.1830357238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 2832 }, { "completion_length": 688.7567367553711, "epoch": 0.8462400119483235, "grad_norm": 0.9929035902023315, "kl": 0.6580810546875, "learning_rate": 1.6313518877065255e-07, "loss": 0.0264, "reward": 1.2477678954601288, "reward_std": 0.13993108738213778, "rewards/accuracy_reward": 0.26116071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 2833 }, { "completion_length": 657.2969055175781, "epoch": 0.846538720035845, "grad_norm": 0.3857732117176056, "kl": 0.6396484375, "learning_rate": 1.628956494238335e-07, "loss": 0.0256, "reward": 1.1540179252624512, "reward_std": 0.20423222333192825, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 2834 }, { "completion_length": 737.5647583007812, "epoch": 0.8468374281233665, "grad_norm": 0.4380774199962616, "kl": 0.767578125, "learning_rate": 1.626565312086075e-07, "loss": 0.0307, "reward": 1.1244420111179352, "reward_std": 0.12586592324078083, "rewards/accuracy_reward": 0.1473214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.977120578289032, "step": 2835 }, { "completion_length": 672.0446929931641, "epoch": 0.8471361362108879, "grad_norm": 0.6958975791931152, "kl": 0.60986328125, "learning_rate": 1.6241783438511197e-07, "loss": 0.0244, "reward": 1.0357143431901932, "reward_std": 0.12893520295619965, "rewards/accuracy_reward": 0.05133928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 2836 }, { "completion_length": 684.6897583007812, "epoch": 0.8474348442984094, "grad_norm": 0.3488898277282715, "kl": 0.388427734375, "learning_rate": 1.6217955921302537e-07, "loss": 0.0155, "reward": 1.1529018580913544, "reward_std": 0.11697725020349026, "rewards/accuracy_reward": 0.16071429569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 2837 }, { "completion_length": 663.0714569091797, "epoch": 0.8477335523859308, "grad_norm": 0.26981601119041443, "kl": 0.2332763671875, "learning_rate": 1.6194170595156798e-07, "loss": 0.0093, "reward": 1.279575914144516, "reward_std": 0.09106355113908648, "rewards/accuracy_reward": 0.2857143022119999, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 2838 }, { "completion_length": 685.5669937133789, "epoch": 0.8480322604734524, "grad_norm": 0.2502106726169586, "kl": 0.34393310546875, "learning_rate": 1.6170427485950055e-07, "loss": 0.0138, "reward": 1.2901786267757416, "reward_std": 0.17262696847319603, "rewards/accuracy_reward": 0.305803582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 2839 }, { "completion_length": 627.8526916503906, "epoch": 0.8483309685609738, "grad_norm": 0.45000818371772766, "kl": 0.27685546875, "learning_rate": 1.6146726619512504e-07, "loss": 0.0111, "reward": 1.102120578289032, "reward_std": 0.1578240804374218, "rewards/accuracy_reward": 0.11607143376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 2840 }, { "completion_length": 678.3683319091797, "epoch": 0.8486296766484953, "grad_norm": 0.5310889482498169, "kl": 0.44903564453125, "learning_rate": 1.6123068021628375e-07, "loss": 0.0179, "reward": 1.2008929252624512, "reward_std": 0.15525192581117153, "rewards/accuracy_reward": 0.21205357927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 2841 }, { "completion_length": 610.5357284545898, "epoch": 0.8489283847360167, "grad_norm": 0.19578805565834045, "kl": 0.2506103515625, "learning_rate": 1.6099451718035874e-07, "loss": 0.01, "reward": 1.1914063096046448, "reward_std": 0.09427392343059182, "rewards/accuracy_reward": 0.1941964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.997209832072258, "step": 2842 }, { "completion_length": 692.5357513427734, "epoch": 0.8492270928235381, "grad_norm": 0.42478522658348083, "kl": 0.59228515625, "learning_rate": 1.6075877734427247e-07, "loss": 0.0237, "reward": 1.2589286267757416, "reward_std": 0.1637651459313929, "rewards/accuracy_reward": 0.2767857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428805589676, "step": 2843 }, { "completion_length": 632.2143249511719, "epoch": 0.8495258009110597, "grad_norm": 0.7314503788948059, "kl": 0.39794921875, "learning_rate": 1.6052346096448673e-07, "loss": 0.0159, "reward": 1.2399554252624512, "reward_std": 0.12524473294615746, "rewards/accuracy_reward": 0.2455357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 2844 }, { "completion_length": 634.4129791259766, "epoch": 0.8498245089985811, "grad_norm": 0.3803550899028778, "kl": 0.289306640625, "learning_rate": 1.6028856829700258e-07, "loss": 0.0116, "reward": 1.1495536416769028, "reward_std": 0.1445192713290453, "rewards/accuracy_reward": 0.15848214668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 2845 }, { "completion_length": 684.5915679931641, "epoch": 0.8501232170861026, "grad_norm": 0.31084996461868286, "kl": 0.14892578125, "learning_rate": 1.6005409959736035e-07, "loss": 0.006, "reward": 1.1183035969734192, "reward_std": 0.13489623367786407, "rewards/accuracy_reward": 0.13169643376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 2846 }, { "completion_length": 544.8415374755859, "epoch": 0.850421925173624, "grad_norm": 0.3837234675884247, "kl": 0.19580078125, "learning_rate": 1.59820055120639e-07, "loss": 0.0078, "reward": 1.2728795111179352, "reward_std": 0.17963088303804398, "rewards/accuracy_reward": 0.2790178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 2847 }, { "completion_length": 644.6919860839844, "epoch": 0.8507206332611456, "grad_norm": 0.6017561554908752, "kl": 0.5615234375, "learning_rate": 1.5958643512145584e-07, "loss": 0.0225, "reward": 1.1914063096046448, "reward_std": 0.18156187236309052, "rewards/accuracy_reward": 0.20758930034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 2848 }, { "completion_length": 598.1942443847656, "epoch": 0.851019341348667, "grad_norm": 0.33263862133026123, "kl": 0.25506591796875, "learning_rate": 1.5935323985396674e-07, "loss": 0.0102, "reward": 1.1835938096046448, "reward_std": 0.13060456048697233, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937649011612, "step": 2849 }, { "completion_length": 665.3192291259766, "epoch": 0.8513180494361885, "grad_norm": 0.4823359251022339, "kl": 0.3988037109375, "learning_rate": 1.5912046957186507e-07, "loss": 0.0159, "reward": 1.2427456080913544, "reward_std": 0.18593433126807213, "rewards/accuracy_reward": 0.2589285895228386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 2850 }, { "completion_length": 581.0625305175781, "epoch": 0.8516167575237099, "grad_norm": 0.31249693036079407, "kl": 0.52490234375, "learning_rate": 1.588881245283822e-07, "loss": 0.021, "reward": 1.244977742433548, "reward_std": 0.17395310010761023, "rewards/accuracy_reward": 0.267857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9771205633878708, "step": 2851 }, { "completion_length": 622.3593978881836, "epoch": 0.8519154656112314, "grad_norm": 0.621161937713623, "kl": 0.78759765625, "learning_rate": 1.5865620497628683e-07, "loss": 0.0315, "reward": 1.1501116454601288, "reward_std": 0.16292607970535755, "rewards/accuracy_reward": 0.18080357951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.969308078289032, "step": 2852 }, { "completion_length": 666.3080749511719, "epoch": 0.8522141736987529, "grad_norm": 0.6218953728675842, "kl": 0.38818359375, "learning_rate": 1.5842471116788458e-07, "loss": 0.0155, "reward": 1.1953125298023224, "reward_std": 0.1748484969139099, "rewards/accuracy_reward": 0.2053571604192257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 2853 }, { "completion_length": 647.6451034545898, "epoch": 0.8525128817862744, "grad_norm": 0.4943658709526062, "kl": 0.55517578125, "learning_rate": 1.5819364335501805e-07, "loss": 0.0222, "reward": 1.2137277126312256, "reward_std": 0.2074771337211132, "rewards/accuracy_reward": 0.2254464402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812649011612, "step": 2854 }, { "completion_length": 681.5156555175781, "epoch": 0.8528115898737958, "grad_norm": 0.4086543917655945, "kl": 0.6697998046875, "learning_rate": 1.579630017890665e-07, "loss": 0.0268, "reward": 1.1484375596046448, "reward_std": 0.16491133347153664, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 2855 }, { "completion_length": 650.4308319091797, "epoch": 0.8531102979613173, "grad_norm": 0.2874155044555664, "kl": 0.2880859375, "learning_rate": 1.5773278672094515e-07, "loss": 0.0115, "reward": 1.233258992433548, "reward_std": 0.13327880576252937, "rewards/accuracy_reward": 0.23883929569274187, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9944196939468384, "step": 2856 }, { "completion_length": 698.466552734375, "epoch": 0.8534090060488387, "grad_norm": 0.8092401027679443, "kl": 0.6591796875, "learning_rate": 1.5750299840110554e-07, "loss": 0.0264, "reward": 1.112165242433548, "reward_std": 0.195165092125535, "rewards/accuracy_reward": 0.14062500861473382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9715402126312256, "step": 2857 }, { "completion_length": 682.2924346923828, "epoch": 0.8537077141363603, "grad_norm": 0.5493013262748718, "kl": 0.47998046875, "learning_rate": 1.5727363707953495e-07, "loss": 0.0192, "reward": 1.1188616454601288, "reward_std": 0.12655216455459595, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2858 }, { "completion_length": 601.122802734375, "epoch": 0.8540064222238817, "grad_norm": 0.9219896793365479, "kl": 0.171142578125, "learning_rate": 1.5704470300575572e-07, "loss": 0.0069, "reward": 1.1545759439468384, "reward_std": 0.1294299541041255, "rewards/accuracy_reward": 0.16071429708972573, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 2859 }, { "completion_length": 615.6607360839844, "epoch": 0.8543051303114032, "grad_norm": 0.5932513475418091, "kl": 0.312255859375, "learning_rate": 1.5681619642882593e-07, "loss": 0.0125, "reward": 1.1110491454601288, "reward_std": 0.10527444258332253, "rewards/accuracy_reward": 0.11830358067527413, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455931901932, "step": 2860 }, { "completion_length": 626.3504791259766, "epoch": 0.8546038383989246, "grad_norm": 0.8390100002288818, "kl": 0.3160400390625, "learning_rate": 1.5658811759733833e-07, "loss": 0.0127, "reward": 1.1010045111179352, "reward_std": 0.1262840572744608, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2861 }, { "completion_length": 633.8393096923828, "epoch": 0.8549025464864461, "grad_norm": 0.4528694152832031, "kl": 0.552734375, "learning_rate": 1.563604667594202e-07, "loss": 0.0221, "reward": 1.1456473767757416, "reward_std": 0.11591734457761049, "rewards/accuracy_reward": 0.15625000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2862 }, { "completion_length": 673.1094055175781, "epoch": 0.8552012545739676, "grad_norm": 0.5749574303627014, "kl": 0.40673828125, "learning_rate": 1.5613324416273353e-07, "loss": 0.0162, "reward": 1.2008928954601288, "reward_std": 0.1371840424835682, "rewards/accuracy_reward": 0.2165178693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 2863 }, { "completion_length": 705.4397735595703, "epoch": 0.8554999626614891, "grad_norm": 0.5530177354812622, "kl": 0.7763671875, "learning_rate": 1.5590645005447397e-07, "loss": 0.031, "reward": 1.117745578289032, "reward_std": 0.14543554559350014, "rewards/accuracy_reward": 0.1428571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884439468384, "step": 2864 }, { "completion_length": 639.966552734375, "epoch": 0.8557986707490105, "grad_norm": 0.2894631028175354, "kl": 0.38818359375, "learning_rate": 1.5568008468137148e-07, "loss": 0.0155, "reward": 1.1400670111179352, "reward_std": 0.11332071386277676, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 2865 }, { "completion_length": 690.2366333007812, "epoch": 0.856097378836532, "grad_norm": 0.7610611915588379, "kl": 0.6265869140625, "learning_rate": 1.5545414828968944e-07, "loss": 0.0251, "reward": 1.0920759439468384, "reward_std": 0.13893298245966434, "rewards/accuracy_reward": 0.11383929336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 2866 }, { "completion_length": 706.685302734375, "epoch": 0.8563960869240534, "grad_norm": 0.47658687829971313, "kl": 0.44873046875, "learning_rate": 1.5522864112522433e-07, "loss": 0.0179, "reward": 1.1272321939468384, "reward_std": 0.1719027068465948, "rewards/accuracy_reward": 0.14285715040750802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 2867 }, { "completion_length": 665.9754791259766, "epoch": 0.856694795011575, "grad_norm": 0.6249805092811584, "kl": 0.54443359375, "learning_rate": 1.550035634333059e-07, "loss": 0.0218, "reward": 1.2433036267757416, "reward_std": 0.13585799559950829, "rewards/accuracy_reward": 0.26116072572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98214291036129, "step": 2868 }, { "completion_length": 696.3750305175781, "epoch": 0.8569935030990964, "grad_norm": 0.5844272375106812, "kl": 0.37646484375, "learning_rate": 1.5477891545879674e-07, "loss": 0.0151, "reward": 1.1099330484867096, "reward_std": 0.10396041348576546, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 2869 }, { "completion_length": 623.9910888671875, "epoch": 0.8572922111866179, "grad_norm": 0.7756633162498474, "kl": 0.46533203125, "learning_rate": 1.5455469744609163e-07, "loss": 0.0186, "reward": 1.1780134439468384, "reward_std": 0.2278890460729599, "rewards/accuracy_reward": 0.191964291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 2870 }, { "completion_length": 651.3772735595703, "epoch": 0.8575909192741393, "grad_norm": 0.9869959950447083, "kl": 0.56787109375, "learning_rate": 1.5433090963911788e-07, "loss": 0.0227, "reward": 1.1707589775323868, "reward_std": 0.1890007834881544, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 2871 }, { "completion_length": 738.1607360839844, "epoch": 0.8578896273616609, "grad_norm": 0.9918161630630493, "kl": 0.8505859375, "learning_rate": 1.5410755228133483e-07, "loss": 0.0341, "reward": 1.194196492433548, "reward_std": 0.1657557487487793, "rewards/accuracy_reward": 0.2209821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.973214328289032, "step": 2872 }, { "completion_length": 714.7879791259766, "epoch": 0.8581883354491823, "grad_norm": 0.39134615659713745, "kl": 0.38525390625, "learning_rate": 1.5388462561573315e-07, "loss": 0.0154, "reward": 1.1400670111179352, "reward_std": 0.14576230570673943, "rewards/accuracy_reward": 0.15848215389996767, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 2873 }, { "completion_length": 661.8125152587891, "epoch": 0.8584870435367038, "grad_norm": 0.6944700479507446, "kl": 0.63623046875, "learning_rate": 1.5366212988483532e-07, "loss": 0.0254, "reward": 1.1914063096046448, "reward_std": 0.14023633766919374, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 2874 }, { "completion_length": 643.4933166503906, "epoch": 0.8587857516242252, "grad_norm": 0.3254837989807129, "kl": 0.2889404296875, "learning_rate": 1.5344006533069503e-07, "loss": 0.0116, "reward": 1.1065848469734192, "reward_std": 0.136468262411654, "rewards/accuracy_reward": 0.12723214784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 2875 }, { "completion_length": 639.8393249511719, "epoch": 0.8590844597117467, "grad_norm": 0.5661990642547607, "kl": 0.443115234375, "learning_rate": 1.5321843219489645e-07, "loss": 0.0177, "reward": 1.1316964626312256, "reward_std": 0.16683066869154572, "rewards/accuracy_reward": 0.1361607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9955357313156128, "step": 2876 }, { "completion_length": 677.7924499511719, "epoch": 0.8593831677992682, "grad_norm": 0.44440895318984985, "kl": 0.690185546875, "learning_rate": 1.5299723071855498e-07, "loss": 0.0276, "reward": 1.327008992433548, "reward_std": 0.13662214204669, "rewards/accuracy_reward": 0.3459821678698063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 2877 }, { "completion_length": 688.1986846923828, "epoch": 0.8596818758867897, "grad_norm": 0.48217320442199707, "kl": 0.434326171875, "learning_rate": 1.5277646114231596e-07, "loss": 0.0174, "reward": 1.2472098767757416, "reward_std": 0.15702622383832932, "rewards/accuracy_reward": 0.2656250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 2878 }, { "completion_length": 611.4776916503906, "epoch": 0.8599805839743111, "grad_norm": 0.438822478055954, "kl": 0.2425537109375, "learning_rate": 1.5255612370635515e-07, "loss": 0.0097, "reward": 1.0837053954601288, "reward_std": 0.14511243719607592, "rewards/accuracy_reward": 0.1004464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2879 }, { "completion_length": 614.9643249511719, "epoch": 0.8602792920618326, "grad_norm": 0.4506005346775055, "kl": 0.3466796875, "learning_rate": 1.523362186503781e-07, "loss": 0.0139, "reward": 1.1289062798023224, "reward_std": 0.15908582881093025, "rewards/accuracy_reward": 0.13839286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 2880 }, { "completion_length": 610.678596496582, "epoch": 0.860578000149354, "grad_norm": 0.3583524823188782, "kl": 0.396484375, "learning_rate": 1.5211674621361985e-07, "loss": 0.0159, "reward": 1.0775670111179352, "reward_std": 0.1403959933668375, "rewards/accuracy_reward": 0.09151786123402417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 2881 }, { "completion_length": 752.4397888183594, "epoch": 0.8608767082368756, "grad_norm": 0.4356541037559509, "kl": 0.388916015625, "learning_rate": 1.51897706634845e-07, "loss": 0.0156, "reward": 1.1010045111179352, "reward_std": 0.1584057155996561, "rewards/accuracy_reward": 0.11607143213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2882 }, { "completion_length": 574.6897735595703, "epoch": 0.861175416324397, "grad_norm": 0.616520345211029, "kl": 0.1627197265625, "learning_rate": 1.5167910015234714e-07, "loss": 0.0065, "reward": 1.3275670111179352, "reward_std": 0.17060068622231483, "rewards/accuracy_reward": 0.3303571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9972098469734192, "step": 2883 }, { "completion_length": 623.5245819091797, "epoch": 0.8614741244119185, "grad_norm": 0.6150420308113098, "kl": 0.37677001953125, "learning_rate": 1.5146092700394864e-07, "loss": 0.0151, "reward": 1.381696492433548, "reward_std": 0.23626302182674408, "rewards/accuracy_reward": 0.3906250149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 2884 }, { "completion_length": 619.1049346923828, "epoch": 0.8617728324994399, "grad_norm": 0.9722714424133301, "kl": 0.56884765625, "learning_rate": 1.512431874270005e-07, "loss": 0.0227, "reward": 1.2427456080913544, "reward_std": 0.22624455392360687, "rewards/accuracy_reward": 0.2566964402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 2885 }, { "completion_length": 689.622802734375, "epoch": 0.8620715405869613, "grad_norm": 0.6196969151496887, "kl": 0.409912109375, "learning_rate": 1.510258816583822e-07, "loss": 0.0164, "reward": 1.0580357909202576, "reward_std": 0.138849257491529, "rewards/accuracy_reward": 0.06919643213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888392984867096, "step": 2886 }, { "completion_length": 647.8705596923828, "epoch": 0.8623702486744829, "grad_norm": 0.9157299995422363, "kl": 0.30810546875, "learning_rate": 1.5080900993450084e-07, "loss": 0.0123, "reward": 1.1685268580913544, "reward_std": 0.16222808323800564, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 2887 }, { "completion_length": 551.4732360839844, "epoch": 0.8626689567620043, "grad_norm": 0.3734751045703888, "kl": 0.2674560546875, "learning_rate": 1.5059257249129177e-07, "loss": 0.0107, "reward": 1.1819196939468384, "reward_std": 0.09151391428895295, "rewards/accuracy_reward": 0.1897321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875447034836, "step": 2888 }, { "completion_length": 690.5469131469727, "epoch": 0.8629676648495258, "grad_norm": 0.4016299545764923, "kl": 0.85205078125, "learning_rate": 1.503765695642178e-07, "loss": 0.034, "reward": 1.1456473767757416, "reward_std": 0.14085112512111664, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.969308078289032, "step": 2889 }, { "completion_length": 696.1250305175781, "epoch": 0.8632663729370472, "grad_norm": 1.194558024406433, "kl": 0.67919921875, "learning_rate": 1.5016100138826873e-07, "loss": 0.0272, "reward": 1.088169664144516, "reward_std": 0.08048598747700453, "rewards/accuracy_reward": 0.09375000232830644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 2890 }, { "completion_length": 691.5714569091797, "epoch": 0.8635650810245687, "grad_norm": 0.7213060259819031, "kl": 0.5908203125, "learning_rate": 1.4994586819796185e-07, "loss": 0.0236, "reward": 1.208147406578064, "reward_std": 0.16358369030058384, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2891 }, { "completion_length": 595.1138763427734, "epoch": 0.8638637891120902, "grad_norm": 0.6535106897354126, "kl": 0.3974609375, "learning_rate": 1.497311702273407e-07, "loss": 0.0159, "reward": 1.117745578289032, "reward_std": 0.16917128581553698, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 2892 }, { "completion_length": 620.6719055175781, "epoch": 0.8641624971996117, "grad_norm": 0.4295296370983124, "kl": 0.5498046875, "learning_rate": 1.4951690770997582e-07, "loss": 0.022, "reward": 1.1702009737491608, "reward_std": 0.160204716026783, "rewards/accuracy_reward": 0.1852678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2893 }, { "completion_length": 686.5915374755859, "epoch": 0.8644612052871331, "grad_norm": 0.47259849309921265, "kl": 0.3486328125, "learning_rate": 1.4930308087896386e-07, "loss": 0.014, "reward": 1.0892857611179352, "reward_std": 0.13254591869190335, "rewards/accuracy_reward": 0.10044643143191934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 2894 }, { "completion_length": 623.9419860839844, "epoch": 0.8647599133746546, "grad_norm": 0.37840819358825684, "kl": 0.41204833984375, "learning_rate": 1.490896899669273e-07, "loss": 0.0165, "reward": 1.231026828289032, "reward_std": 0.18030431121587753, "rewards/accuracy_reward": 0.2477678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2895 }, { "completion_length": 667.2165374755859, "epoch": 0.865058621462176, "grad_norm": 0.36461183428764343, "kl": 0.2806396484375, "learning_rate": 1.4887673520601462e-07, "loss": 0.0112, "reward": 1.1171875596046448, "reward_std": 0.08977623144164681, "rewards/accuracy_reward": 0.12276786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 2896 }, { "completion_length": 682.8102874755859, "epoch": 0.8653573295496976, "grad_norm": 0.4990701675415039, "kl": 0.443603515625, "learning_rate": 1.4866421682789985e-07, "loss": 0.0178, "reward": 1.1529018580913544, "reward_std": 0.15277290157973766, "rewards/accuracy_reward": 0.17187500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 2897 }, { "completion_length": 661.1919860839844, "epoch": 0.865656037637219, "grad_norm": 0.9073061943054199, "kl": 0.5535888671875, "learning_rate": 1.4845213506378192e-07, "loss": 0.0221, "reward": 1.0842634737491608, "reward_std": 0.09785871393978596, "rewards/accuracy_reward": 0.10044643585570157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 2898 }, { "completion_length": 665.6138610839844, "epoch": 0.8659547457247405, "grad_norm": 0.3190471827983856, "kl": 0.242431640625, "learning_rate": 1.4824049014438512e-07, "loss": 0.0097, "reward": 1.1830357313156128, "reward_std": 0.11972535960376263, "rewards/accuracy_reward": 0.18973215110599995, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933036118745804, "step": 2899 }, { "completion_length": 589.4643096923828, "epoch": 0.8662534538122619, "grad_norm": 0.6106679439544678, "kl": 0.25732421875, "learning_rate": 1.4802928229995845e-07, "loss": 0.0103, "reward": 1.1612723767757416, "reward_std": 0.16232445277273655, "rewards/accuracy_reward": 0.17633928847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2900 }, { "completion_length": 588.0335006713867, "epoch": 0.8665521618997835, "grad_norm": 0.3457079827785492, "kl": 0.337890625, "learning_rate": 1.478185117602752e-07, "loss": 0.0135, "reward": 1.2516742050647736, "reward_std": 0.1203959034755826, "rewards/accuracy_reward": 0.2544642996508628, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.997209832072258, "step": 2901 }, { "completion_length": 635.6183471679688, "epoch": 0.8668508699873049, "grad_norm": 0.3582288324832916, "kl": 0.2130126953125, "learning_rate": 1.4760817875463318e-07, "loss": 0.0085, "reward": 1.1456473469734192, "reward_std": 0.18484435975551605, "rewards/accuracy_reward": 0.15178572572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 2902 }, { "completion_length": 601.7187805175781, "epoch": 0.8671495780748264, "grad_norm": 0.5240271687507629, "kl": 0.1990966796875, "learning_rate": 1.4739828351185407e-07, "loss": 0.008, "reward": 1.1674107909202576, "reward_std": 0.14409079030156136, "rewards/accuracy_reward": 0.1741071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 2903 }, { "completion_length": 620.5156555175781, "epoch": 0.8674482861623478, "grad_norm": 0.4323769211769104, "kl": 0.3228759765625, "learning_rate": 1.4718882626028323e-07, "loss": 0.0129, "reward": 1.1674107909202576, "reward_std": 0.125728121958673, "rewards/accuracy_reward": 0.18750000977888703, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107313156128, "step": 2904 }, { "completion_length": 686.1986846923828, "epoch": 0.8677469942498693, "grad_norm": 0.46259385347366333, "kl": 0.48876953125, "learning_rate": 1.4697980722778976e-07, "loss": 0.0196, "reward": 1.1032366752624512, "reward_std": 0.13990302570164204, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 2905 }, { "completion_length": 659.1964569091797, "epoch": 0.8680457023373908, "grad_norm": 0.6586019992828369, "kl": 0.67333984375, "learning_rate": 1.4677122664176572e-07, "loss": 0.027, "reward": 1.1997768580913544, "reward_std": 0.18498924374580383, "rewards/accuracy_reward": 0.22991073224693537, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9698661118745804, "step": 2906 }, { "completion_length": 690.6317291259766, "epoch": 0.8683444104249123, "grad_norm": 0.5097805261611938, "kl": 0.390380859375, "learning_rate": 1.465630847291264e-07, "loss": 0.0156, "reward": 1.1378348469734192, "reward_std": 0.1085559562779963, "rewards/accuracy_reward": 0.1473214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 2907 }, { "completion_length": 725.6339721679688, "epoch": 0.8686431185124337, "grad_norm": 0.4606878161430359, "kl": 0.6143798828125, "learning_rate": 1.4635538171630992e-07, "loss": 0.0245, "reward": 1.0703125596046448, "reward_std": 0.20594296418130398, "rewards/accuracy_reward": 0.09598214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.97433041036129, "step": 2908 }, { "completion_length": 728.7120971679688, "epoch": 0.8689418265999552, "grad_norm": 0.4498690068721771, "kl": 0.65869140625, "learning_rate": 1.4614811782927667e-07, "loss": 0.0264, "reward": 1.125558078289032, "reward_std": 0.19443678110837936, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 2909 }, { "completion_length": 713.0848388671875, "epoch": 0.8692405346874766, "grad_norm": 0.6259542107582092, "kl": 0.31982421875, "learning_rate": 1.4594129329350944e-07, "loss": 0.0128, "reward": 1.117745578289032, "reward_std": 0.19387666508555412, "rewards/accuracy_reward": 0.13839286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 2910 }, { "completion_length": 707.1853179931641, "epoch": 0.8695392427749982, "grad_norm": 0.4871130883693695, "kl": 0.46630859375, "learning_rate": 1.4573490833401316e-07, "loss": 0.0187, "reward": 1.0948661267757416, "reward_std": 0.14752702228724957, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589477300644, "step": 2911 }, { "completion_length": 678.6942291259766, "epoch": 0.8698379508625196, "grad_norm": 0.4378572404384613, "kl": 0.78125, "learning_rate": 1.4552896317531436e-07, "loss": 0.0313, "reward": 1.1104911267757416, "reward_std": 0.1556637454777956, "rewards/accuracy_reward": 0.13839286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9720982611179352, "step": 2912 }, { "completion_length": 708.5312957763672, "epoch": 0.8701366589500411, "grad_norm": 0.8312204480171204, "kl": 0.6375732421875, "learning_rate": 1.4532345804146113e-07, "loss": 0.0255, "reward": 1.2103795111179352, "reward_std": 0.19985386356711388, "rewards/accuracy_reward": 0.2254464402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 2913 }, { "completion_length": 712.6986999511719, "epoch": 0.8704353670375625, "grad_norm": 0.5690721869468689, "kl": 0.476318359375, "learning_rate": 1.4511839315602308e-07, "loss": 0.0191, "reward": 1.1668527126312256, "reward_std": 0.17831147834658623, "rewards/accuracy_reward": 0.18303572572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 2914 }, { "completion_length": 671.5201110839844, "epoch": 0.870734075125084, "grad_norm": 0.6246144771575928, "kl": 0.67822265625, "learning_rate": 1.449137687420906e-07, "loss": 0.0271, "reward": 1.2438616752624512, "reward_std": 0.22081388533115387, "rewards/accuracy_reward": 0.2723214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9715402126312256, "step": 2915 }, { "completion_length": 622.6517944335938, "epoch": 0.8710327832126055, "grad_norm": 0.49702590703964233, "kl": 0.29541015625, "learning_rate": 1.4470958502227496e-07, "loss": 0.0118, "reward": 1.1529018580913544, "reward_std": 0.13315697945654392, "rewards/accuracy_reward": 0.1562500111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966518133878708, "step": 2916 }, { "completion_length": 642.5937652587891, "epoch": 0.871331491300127, "grad_norm": 0.502903401851654, "kl": 0.306396484375, "learning_rate": 1.445058422187082e-07, "loss": 0.0123, "reward": 1.231584906578064, "reward_std": 0.15422787703573704, "rewards/accuracy_reward": 0.2433035857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 2917 }, { "completion_length": 748.2767944335938, "epoch": 0.8716301993876484, "grad_norm": 0.3773409128189087, "kl": 0.86572265625, "learning_rate": 1.4430254055304225e-07, "loss": 0.0347, "reward": 1.0502232909202576, "reward_std": 0.12355335429310799, "rewards/accuracy_reward": 0.0758928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.97433041036129, "step": 2918 }, { "completion_length": 634.2879791259766, "epoch": 0.8719289074751699, "grad_norm": 0.3841671645641327, "kl": 0.36083984375, "learning_rate": 1.440996802464497e-07, "loss": 0.0144, "reward": 1.1635045111179352, "reward_std": 0.14531788602471352, "rewards/accuracy_reward": 0.1741071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973767757416, "step": 2919 }, { "completion_length": 791.9576263427734, "epoch": 0.8722276155626913, "grad_norm": 0.8498659133911133, "kl": 0.3955078125, "learning_rate": 1.4389726151962242e-07, "loss": 0.0158, "reward": 1.0976563096046448, "reward_std": 0.1693921573460102, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 2920 }, { "completion_length": 588.0245819091797, "epoch": 0.8725263236502129, "grad_norm": 0.46196743845939636, "kl": 0.25341796875, "learning_rate": 1.4369528459277228e-07, "loss": 0.0101, "reward": 1.1735491454601288, "reward_std": 0.18612751923501492, "rewards/accuracy_reward": 0.18080358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 2921 }, { "completion_length": 758.2254638671875, "epoch": 0.8728250317377343, "grad_norm": 1.2931181192398071, "kl": 0.873046875, "learning_rate": 1.4349374968563044e-07, "loss": 0.035, "reward": 1.1344866752624512, "reward_std": 0.19537978619337082, "rewards/accuracy_reward": 0.15848215017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045111179352, "step": 2922 }, { "completion_length": 599.8415451049805, "epoch": 0.8731237398252558, "grad_norm": 0.322186678647995, "kl": 0.43310546875, "learning_rate": 1.43292657017447e-07, "loss": 0.0173, "reward": 1.1891741752624512, "reward_std": 0.14596006460487843, "rewards/accuracy_reward": 0.19866072200238705, "rewards/format_reward": 0.0022321429569274187, "rewards/tag_count_reward": 0.9882812798023224, "step": 2923 }, { "completion_length": 655.6719055175781, "epoch": 0.8734224479127772, "grad_norm": 0.4198478162288666, "kl": 0.412109375, "learning_rate": 1.4309200680699104e-07, "loss": 0.0165, "reward": 1.1568081080913544, "reward_std": 0.17972798272967339, "rewards/accuracy_reward": 0.16294643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 2924 }, { "completion_length": 674.2589569091797, "epoch": 0.8737211560002988, "grad_norm": 0.6131441593170166, "kl": 0.5224609375, "learning_rate": 1.4289179927255058e-07, "loss": 0.0209, "reward": 1.1071428954601288, "reward_std": 0.1438286453485489, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 2925 }, { "completion_length": 615.2968902587891, "epoch": 0.8740198640878202, "grad_norm": 0.3839798867702484, "kl": 0.351318359375, "learning_rate": 1.4269203463193148e-07, "loss": 0.014, "reward": 1.1763393580913544, "reward_std": 0.11701468657702208, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 2926 }, { "completion_length": 678.3504791259766, "epoch": 0.8743185721753417, "grad_norm": 0.41484397649765015, "kl": 0.53173828125, "learning_rate": 1.424927131024582e-07, "loss": 0.0213, "reward": 1.1623884737491608, "reward_std": 0.14112013019621372, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 2927 }, { "completion_length": 650.5892944335938, "epoch": 0.8746172802628631, "grad_norm": 0.673841118812561, "kl": 0.557861328125, "learning_rate": 1.4229383490097325e-07, "loss": 0.0223, "reward": 1.1612723767757416, "reward_std": 0.19474537670612335, "rewards/accuracy_reward": 0.16741072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 2928 }, { "completion_length": 582.9531402587891, "epoch": 0.8749159883503845, "grad_norm": 0.42695748805999756, "kl": 0.546630859375, "learning_rate": 1.4209540024383627e-07, "loss": 0.0219, "reward": 1.2187500596046448, "reward_std": 0.1958288112655282, "rewards/accuracy_reward": 0.23883930034935474, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107611179352, "step": 2929 }, { "completion_length": 706.3661041259766, "epoch": 0.8752146964379061, "grad_norm": 0.6913446187973022, "kl": 0.4677734375, "learning_rate": 1.4189740934692497e-07, "loss": 0.0187, "reward": 1.1696429252624512, "reward_std": 0.19634848460555077, "rewards/accuracy_reward": 0.18750000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 2930 }, { "completion_length": 683.5290374755859, "epoch": 0.8755134045254275, "grad_norm": 0.33475005626678467, "kl": 0.36981201171875, "learning_rate": 1.4169986242563388e-07, "loss": 0.0148, "reward": 1.125558078289032, "reward_std": 0.19180633500218391, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616156578064, "step": 2931 }, { "completion_length": 590.4285888671875, "epoch": 0.875812112612949, "grad_norm": 0.2655635178089142, "kl": 0.21728515625, "learning_rate": 1.4150275969487472e-07, "loss": 0.0087, "reward": 1.3431920111179352, "reward_std": 0.16791347600519657, "rewards/accuracy_reward": 0.3459821678698063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.997209832072258, "step": 2932 }, { "completion_length": 673.4420013427734, "epoch": 0.8761108207004704, "grad_norm": 0.6164353489875793, "kl": 0.4268798828125, "learning_rate": 1.4130610136907606e-07, "loss": 0.0171, "reward": 1.150669664144516, "reward_std": 0.19656335189938545, "rewards/accuracy_reward": 0.16741072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 2933 }, { "completion_length": 717.2611999511719, "epoch": 0.8764095287879919, "grad_norm": 0.4627428650856018, "kl": 0.5650634765625, "learning_rate": 1.4110988766218273e-07, "loss": 0.0226, "reward": 1.1372768431901932, "reward_std": 0.10137405525892973, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.985491082072258, "step": 2934 }, { "completion_length": 660.013427734375, "epoch": 0.8767082368755134, "grad_norm": 0.6824290752410889, "kl": 0.265625, "learning_rate": 1.4091411878765611e-07, "loss": 0.0106, "reward": 1.145089328289032, "reward_std": 0.13837520219385624, "rewards/accuracy_reward": 0.14955357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.995535746216774, "step": 2935 }, { "completion_length": 700.3281555175781, "epoch": 0.8770069449630349, "grad_norm": 1.0041354894638062, "kl": 0.71435546875, "learning_rate": 1.407187949584736e-07, "loss": 0.0285, "reward": 1.2500000894069672, "reward_std": 0.16177122853696346, "rewards/accuracy_reward": 0.2767857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9732143133878708, "step": 2936 }, { "completion_length": 726.2366333007812, "epoch": 0.8773056530505563, "grad_norm": 0.3628709018230438, "kl": 0.47314453125, "learning_rate": 1.405239163871282e-07, "loss": 0.019, "reward": 1.1255581080913544, "reward_std": 0.15130606666207314, "rewards/accuracy_reward": 0.13616071990691125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2937 }, { "completion_length": 765.060302734375, "epoch": 0.8776043611380778, "grad_norm": 0.48349758982658386, "kl": 0.59375, "learning_rate": 1.4032948328562882e-07, "loss": 0.0238, "reward": 1.0931920111179352, "reward_std": 0.12619975674897432, "rewards/accuracy_reward": 0.10491072130389512, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 2938 }, { "completion_length": 679.9531402587891, "epoch": 0.8779030692255992, "grad_norm": 1.312831997871399, "kl": 0.46044921875, "learning_rate": 1.4013549586549972e-07, "loss": 0.0184, "reward": 1.09542416036129, "reward_std": 0.08574641402810812, "rewards/accuracy_reward": 0.10714286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 2939 }, { "completion_length": 704.9910888671875, "epoch": 0.8782017773131208, "grad_norm": 0.43641284108161926, "kl": 0.6539306640625, "learning_rate": 1.3994195433777992e-07, "loss": 0.0262, "reward": 1.0775670111179352, "reward_std": 0.13201839290559292, "rewards/accuracy_reward": 0.09598215040750802, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 2940 }, { "completion_length": 626.7277069091797, "epoch": 0.8785004854006422, "grad_norm": 0.2605220675468445, "kl": 0.2647705078125, "learning_rate": 1.3974885891302386e-07, "loss": 0.0106, "reward": 1.2991071939468384, "reward_std": 0.13420716114342213, "rewards/accuracy_reward": 0.3080357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 2941 }, { "completion_length": 667.8393096923828, "epoch": 0.8787991934881637, "grad_norm": 1.3194628953933716, "kl": 0.32275390625, "learning_rate": 1.3955620980130042e-07, "loss": 0.0129, "reward": 1.1322545260190964, "reward_std": 0.13213700987398624, "rewards/accuracy_reward": 0.14285714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2942 }, { "completion_length": 688.6875152587891, "epoch": 0.8790979015756851, "grad_norm": 0.4247817397117615, "kl": 0.277099609375, "learning_rate": 1.3936400721219282e-07, "loss": 0.0111, "reward": 1.127790242433548, "reward_std": 0.1438201144337654, "rewards/accuracy_reward": 0.13616071874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 2943 }, { "completion_length": 683.0848541259766, "epoch": 0.8793966096632067, "grad_norm": 0.5434719324111938, "kl": 0.5609130859375, "learning_rate": 1.3917225135479882e-07, "loss": 0.0224, "reward": 1.2187500596046448, "reward_std": 0.15790273621678352, "rewards/accuracy_reward": 0.2343750149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 2944 }, { "completion_length": 681.6361846923828, "epoch": 0.8796953177507281, "grad_norm": 0.7113544940948486, "kl": 0.52197265625, "learning_rate": 1.3898094243772979e-07, "loss": 0.0209, "reward": 1.1049107611179352, "reward_std": 0.09873681887984276, "rewards/accuracy_reward": 0.1183035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071939468384, "step": 2945 }, { "completion_length": 680.8214721679688, "epoch": 0.8799940258382496, "grad_norm": 0.8933870196342468, "kl": 0.650390625, "learning_rate": 1.3879008066911115e-07, "loss": 0.026, "reward": 1.1344866752624512, "reward_std": 0.15481163002550602, "rewards/accuracy_reward": 0.15178572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 2946 }, { "completion_length": 657.8772583007812, "epoch": 0.880292733925771, "grad_norm": 0.3402969241142273, "kl": 0.40625, "learning_rate": 1.3859966625658205e-07, "loss": 0.0163, "reward": 1.1300223767757416, "reward_std": 0.10119741153903306, "rewards/accuracy_reward": 0.14062500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2947 }, { "completion_length": 690.3973541259766, "epoch": 0.8805914420132925, "grad_norm": 0.9117463827133179, "kl": 0.44970703125, "learning_rate": 1.384096994072943e-07, "loss": 0.018, "reward": 1.1244420260190964, "reward_std": 0.1829302255064249, "rewards/accuracy_reward": 0.14062500861473382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 2948 }, { "completion_length": 622.9531555175781, "epoch": 0.880890150100814, "grad_norm": 0.6363980770111084, "kl": 0.2491455078125, "learning_rate": 1.3822018032791345e-07, "loss": 0.01, "reward": 1.2181920111179352, "reward_std": 0.16503675654530525, "rewards/accuracy_reward": 0.22544644214212894, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 2949 }, { "completion_length": 654.4286041259766, "epoch": 0.8811888581883355, "grad_norm": 0.6378912925720215, "kl": 0.388427734375, "learning_rate": 1.380311092246177e-07, "loss": 0.0156, "reward": 1.199776828289032, "reward_std": 0.18430953472852707, "rewards/accuracy_reward": 0.2165178693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 2950 }, { "completion_length": 635.7678756713867, "epoch": 0.8814875662758569, "grad_norm": 0.7929391860961914, "kl": 0.38525390625, "learning_rate": 1.378424863030978e-07, "loss": 0.0154, "reward": 1.1406250596046448, "reward_std": 0.15680080838501453, "rewards/accuracy_reward": 0.15625000419095159, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 2951 }, { "completion_length": 670.4152069091797, "epoch": 0.8817862743633784, "grad_norm": 0.3257909119129181, "kl": 0.2886962890625, "learning_rate": 1.3765431176855697e-07, "loss": 0.0115, "reward": 1.1305803954601288, "reward_std": 0.17076869308948517, "rewards/accuracy_reward": 0.13839286379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 2952 }, { "completion_length": 680.6964569091797, "epoch": 0.8820849824508998, "grad_norm": 1.5751053094863892, "kl": 0.5556640625, "learning_rate": 1.374665858257108e-07, "loss": 0.0223, "reward": 1.3247768580913544, "reward_std": 0.22497891075909138, "rewards/accuracy_reward": 0.3437500223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 2953 }, { "completion_length": 650.1763610839844, "epoch": 0.8823836905384214, "grad_norm": 0.7070935368537903, "kl": 0.197265625, "learning_rate": 1.3727930867878655e-07, "loss": 0.0079, "reward": 1.1104911416769028, "reward_std": 0.1423233337700367, "rewards/accuracy_reward": 0.11383928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966517984867096, "step": 2954 }, { "completion_length": 677.9152069091797, "epoch": 0.8826823986259428, "grad_norm": 0.8219942450523376, "kl": 0.644775390625, "learning_rate": 1.370924805315235e-07, "loss": 0.0258, "reward": 1.2103795111179352, "reward_std": 0.20949320495128632, "rewards/accuracy_reward": 0.2232142984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 2955 }, { "completion_length": 706.5491333007812, "epoch": 0.8829811067134643, "grad_norm": 0.7086924910545349, "kl": 0.544677734375, "learning_rate": 1.3690610158717244e-07, "loss": 0.0218, "reward": 1.097098246216774, "reward_std": 0.09439410734921694, "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 2956 }, { "completion_length": 681.3080596923828, "epoch": 0.8832798148009857, "grad_norm": 0.47921204566955566, "kl": 0.556640625, "learning_rate": 1.3672017204849521e-07, "loss": 0.0222, "reward": 1.1746652126312256, "reward_std": 0.15927874390035868, "rewards/accuracy_reward": 0.19866072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045111179352, "step": 2957 }, { "completion_length": 680.6495666503906, "epoch": 0.8835785228885072, "grad_norm": 0.47685685753822327, "kl": 0.6396484375, "learning_rate": 1.3653469211776507e-07, "loss": 0.0256, "reward": 1.0931920111179352, "reward_std": 0.16582289338111877, "rewards/accuracy_reward": 0.1116071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 2958 }, { "completion_length": 677.3906555175781, "epoch": 0.8838772309760287, "grad_norm": 0.9264742732048035, "kl": 0.46533203125, "learning_rate": 1.3634966199676586e-07, "loss": 0.0186, "reward": 1.2070313096046448, "reward_std": 0.19578429125249386, "rewards/accuracy_reward": 0.2254464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 2959 }, { "completion_length": 641.6227874755859, "epoch": 0.8841759390635502, "grad_norm": 0.6411563754081726, "kl": 0.399658203125, "learning_rate": 1.361650818867924e-07, "loss": 0.016, "reward": 1.1188616454601288, "reward_std": 0.13296953891403973, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 2960 }, { "completion_length": 713.5424346923828, "epoch": 0.8844746471510716, "grad_norm": 0.5142354369163513, "kl": 0.6669921875, "learning_rate": 1.3598095198864967e-07, "loss": 0.0267, "reward": 1.1635045111179352, "reward_std": 0.1813509836792946, "rewards/accuracy_reward": 0.1785714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2961 }, { "completion_length": 658.8036117553711, "epoch": 0.8847733552385931, "grad_norm": 0.8342605829238892, "kl": 0.885986328125, "learning_rate": 1.3579727250265285e-07, "loss": 0.0355, "reward": 1.2315848767757416, "reward_std": 0.15863304026424885, "rewards/accuracy_reward": 0.24776786682195961, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 2962 }, { "completion_length": 762.044677734375, "epoch": 0.8850720633261145, "grad_norm": 1.1848440170288086, "kl": 0.91796875, "learning_rate": 1.3561404362862736e-07, "loss": 0.0367, "reward": 1.0691964775323868, "reward_std": 0.18310396932065487, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500447034836, "step": 2963 }, { "completion_length": 638.4531402587891, "epoch": 0.8853707714136361, "grad_norm": 1.0345089435577393, "kl": 0.83642578125, "learning_rate": 1.3543126556590827e-07, "loss": 0.0334, "reward": 1.3030134737491608, "reward_std": 0.20295713562518358, "rewards/accuracy_reward": 0.3236607313156128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 2964 }, { "completion_length": 747.5268096923828, "epoch": 0.8856694795011575, "grad_norm": 1.2199174165725708, "kl": 1.1083984375, "learning_rate": 1.352489385133401e-07, "loss": 0.0444, "reward": 1.120535746216774, "reward_std": 0.20096835121512413, "rewards/accuracy_reward": 0.14285714854486287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 2965 }, { "completion_length": 661.4218902587891, "epoch": 0.885968187588679, "grad_norm": 0.5811540484428406, "kl": 0.633544921875, "learning_rate": 1.3506706266927677e-07, "loss": 0.0254, "reward": 1.1322545111179352, "reward_std": 0.1445147164631635, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 2966 }, { "completion_length": 727.747802734375, "epoch": 0.8862668956762004, "grad_norm": 1.00664484500885, "kl": 0.763671875, "learning_rate": 1.348856382315816e-07, "loss": 0.0305, "reward": 1.107700914144516, "reward_std": 0.1870480291545391, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045111179352, "step": 2967 }, { "completion_length": 629.1875152587891, "epoch": 0.886565603763722, "grad_norm": 1.453392505645752, "kl": 0.444091796875, "learning_rate": 1.3470466539762637e-07, "loss": 0.0178, "reward": 1.074776828289032, "reward_std": 0.14287113212049007, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 2968 }, { "completion_length": 693.7656555175781, "epoch": 0.8868643118512434, "grad_norm": 1.0316131114959717, "kl": 0.71240234375, "learning_rate": 1.345241443642919e-07, "loss": 0.0285, "reward": 1.0848214775323868, "reward_std": 0.07316020969301462, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 2969 }, { "completion_length": 681.1361999511719, "epoch": 0.8871630199387649, "grad_norm": 1.0764268636703491, "kl": 0.7431640625, "learning_rate": 1.3434407532796738e-07, "loss": 0.0297, "reward": 1.141741156578064, "reward_std": 0.15075691137462854, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 2970 }, { "completion_length": 645.2857513427734, "epoch": 0.8874617280262863, "grad_norm": 0.6531702280044556, "kl": 0.46673583984375, "learning_rate": 1.3416445848455015e-07, "loss": 0.0187, "reward": 1.1238839626312256, "reward_std": 0.12322123348712921, "rewards/accuracy_reward": 0.12946428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 2971 }, { "completion_length": 787.8348541259766, "epoch": 0.8877604361138077, "grad_norm": 0.6183040738105774, "kl": 0.879150390625, "learning_rate": 1.3398529402944596e-07, "loss": 0.0352, "reward": 1.0379464775323868, "reward_std": 0.1503842007368803, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9642857760190964, "step": 2972 }, { "completion_length": 712.3080596923828, "epoch": 0.8880591442013293, "grad_norm": 0.47269541025161743, "kl": 0.84423828125, "learning_rate": 1.3380658215756795e-07, "loss": 0.0338, "reward": 1.255022406578064, "reward_std": 0.18580826558172703, "rewards/accuracy_reward": 0.2745535895228386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 2973 }, { "completion_length": 616.044677734375, "epoch": 0.8883578522888507, "grad_norm": 0.812614917755127, "kl": 0.303466796875, "learning_rate": 1.3362832306333722e-07, "loss": 0.0121, "reward": 1.3856027722358704, "reward_std": 0.22523489594459534, "rewards/accuracy_reward": 0.3973214440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 2974 }, { "completion_length": 669.2076263427734, "epoch": 0.8886565603763722, "grad_norm": 0.7316824197769165, "kl": 0.6630859375, "learning_rate": 1.3345051694068222e-07, "loss": 0.0265, "reward": 1.117745578289032, "reward_std": 0.14945696108043194, "rewards/accuracy_reward": 0.13616072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848767757416, "step": 2975 }, { "completion_length": 679.9977951049805, "epoch": 0.8889552684638936, "grad_norm": 0.36228296160697937, "kl": 0.44482421875, "learning_rate": 1.332731639830385e-07, "loss": 0.0178, "reward": 1.219866156578064, "reward_std": 0.20757145062088966, "rewards/accuracy_reward": 0.2321428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 2976 }, { "completion_length": 624.3036041259766, "epoch": 0.8892539765514151, "grad_norm": 0.47508570551872253, "kl": 0.475830078125, "learning_rate": 1.3309626438334876e-07, "loss": 0.019, "reward": 1.2695313394069672, "reward_std": 0.1882698368281126, "rewards/accuracy_reward": 0.2857142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 2977 }, { "completion_length": 606.7277069091797, "epoch": 0.8895526846389366, "grad_norm": 0.7028127908706665, "kl": 0.4608154296875, "learning_rate": 1.329198183340625e-07, "loss": 0.0185, "reward": 1.3621652126312256, "reward_std": 0.20596757903695107, "rewards/accuracy_reward": 0.3750000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871651977300644, "step": 2978 }, { "completion_length": 681.5803833007812, "epoch": 0.8898513927264581, "grad_norm": 0.47391438484191895, "kl": 0.40966796875, "learning_rate": 1.327438260271355e-07, "loss": 0.0164, "reward": 1.1941964626312256, "reward_std": 0.17576156929135323, "rewards/accuracy_reward": 0.20535715157166123, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 2979 }, { "completion_length": 703.1339721679688, "epoch": 0.8901501008139795, "grad_norm": 0.4704282879829407, "kl": 0.60791015625, "learning_rate": 1.3256828765403038e-07, "loss": 0.0243, "reward": 1.0797991454601288, "reward_std": 0.14738191291689873, "rewards/accuracy_reward": 0.08928572060540318, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 2980 }, { "completion_length": 662.1518249511719, "epoch": 0.890448808901501, "grad_norm": 0.7461461424827576, "kl": 0.3994140625, "learning_rate": 1.323932034057156e-07, "loss": 0.016, "reward": 1.0781250596046448, "reward_std": 0.10763329640030861, "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 2981 }, { "completion_length": 668.1808166503906, "epoch": 0.8907475169890224, "grad_norm": 0.8389469385147095, "kl": 0.508544921875, "learning_rate": 1.322185734726656e-07, "loss": 0.0204, "reward": 1.340959906578064, "reward_std": 0.2157229259610176, "rewards/accuracy_reward": 0.3549107313156128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 2982 }, { "completion_length": 759.7790374755859, "epoch": 0.891046225076544, "grad_norm": 0.45382794737815857, "kl": 0.5517578125, "learning_rate": 1.3204439804486061e-07, "loss": 0.0221, "reward": 1.1171875596046448, "reward_std": 0.16856100596487522, "rewards/accuracy_reward": 0.13392857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 2983 }, { "completion_length": 683.7678833007812, "epoch": 0.8913449331640654, "grad_norm": 0.7013095617294312, "kl": 0.27783203125, "learning_rate": 1.3187067731178666e-07, "loss": 0.0111, "reward": 1.1060268580913544, "reward_std": 0.12396281026303768, "rewards/accuracy_reward": 0.11607143026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 2984 }, { "completion_length": 651.5089569091797, "epoch": 0.8916436412515869, "grad_norm": 0.524689793586731, "kl": 0.348876953125, "learning_rate": 1.316974114624346e-07, "loss": 0.014, "reward": 1.1456473767757416, "reward_std": 0.15787860192358494, "rewards/accuracy_reward": 0.16294643771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 2985 }, { "completion_length": 689.3750305175781, "epoch": 0.8919423493391083, "grad_norm": 0.6013699173927307, "kl": 0.42919921875, "learning_rate": 1.3152460068530089e-07, "loss": 0.0172, "reward": 1.1501116305589676, "reward_std": 0.20876407623291016, "rewards/accuracy_reward": 0.1651785832364112, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 2986 }, { "completion_length": 675.8147583007812, "epoch": 0.8922410574266298, "grad_norm": 0.5989595055580139, "kl": 0.5576171875, "learning_rate": 1.3135224516838656e-07, "loss": 0.0223, "reward": 1.1378348767757416, "reward_std": 0.1965160369873047, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 2987 }, { "completion_length": 706.232177734375, "epoch": 0.8925397655141513, "grad_norm": 1.4479360580444336, "kl": 0.473876953125, "learning_rate": 1.3118034509919759e-07, "loss": 0.0189, "reward": 1.2092634737491608, "reward_std": 0.17946698516607285, "rewards/accuracy_reward": 0.23437500931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.974888414144516, "step": 2988 }, { "completion_length": 722.5937805175781, "epoch": 0.8928384736016728, "grad_norm": 0.5766008496284485, "kl": 0.650390625, "learning_rate": 1.3100890066474454e-07, "loss": 0.026, "reward": 1.1367187798023224, "reward_std": 0.10448270663619041, "rewards/accuracy_reward": 0.1540178619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 2989 }, { "completion_length": 782.6317291259766, "epoch": 0.8931371816891942, "grad_norm": 0.6959267258644104, "kl": 0.5888671875, "learning_rate": 1.3083791205154187e-07, "loss": 0.0235, "reward": 1.1629464626312256, "reward_std": 0.21527323126792908, "rewards/accuracy_reward": 0.196428582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.96651791036129, "step": 2990 }, { "completion_length": 636.8259124755859, "epoch": 0.8934358897767157, "grad_norm": 0.6181212067604065, "kl": 0.401123046875, "learning_rate": 1.3066737944560867e-07, "loss": 0.0161, "reward": 1.1088170111179352, "reward_std": 0.12079563364386559, "rewards/accuracy_reward": 0.11830357927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 2991 }, { "completion_length": 646.7567291259766, "epoch": 0.8937345978642371, "grad_norm": 0.8203044533729553, "kl": 0.426025390625, "learning_rate": 1.3049730303246761e-07, "loss": 0.017, "reward": 1.164620578289032, "reward_std": 0.11930804932489991, "rewards/accuracy_reward": 0.1718750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 2992 }, { "completion_length": 684.4955749511719, "epoch": 0.8940333059517587, "grad_norm": 0.6313097476959229, "kl": 0.533203125, "learning_rate": 1.3032768299714517e-07, "loss": 0.0213, "reward": 1.0870536118745804, "reward_std": 0.15873809717595577, "rewards/accuracy_reward": 0.10044643329456449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 2993 }, { "completion_length": 708.607177734375, "epoch": 0.8943320140392801, "grad_norm": 0.4030945301055908, "kl": 0.57861328125, "learning_rate": 1.3015851952417125e-07, "loss": 0.0232, "reward": 1.1607143580913544, "reward_std": 0.16144823841750622, "rewards/accuracy_reward": 0.1741071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071939468384, "step": 2994 }, { "completion_length": 645.3348541259766, "epoch": 0.8946307221268016, "grad_norm": 0.4304269552230835, "kl": 0.2882080078125, "learning_rate": 1.2998981279757932e-07, "loss": 0.0115, "reward": 1.1322545111179352, "reward_std": 0.16848171316087246, "rewards/accuracy_reward": 0.14508929336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 2995 }, { "completion_length": 738.2344055175781, "epoch": 0.894929430214323, "grad_norm": 0.7178320288658142, "kl": 0.578369140625, "learning_rate": 1.2982156300090557e-07, "loss": 0.0231, "reward": 1.1584822237491608, "reward_std": 0.19647134095430374, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464626312256, "step": 2996 }, { "completion_length": 737.8393249511719, "epoch": 0.8952281383018446, "grad_norm": 0.5610615015029907, "kl": 0.57275390625, "learning_rate": 1.2965377031718934e-07, "loss": 0.0229, "reward": 1.1077009439468384, "reward_std": 0.17155111581087112, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 2997 }, { "completion_length": 778.6652221679688, "epoch": 0.895526846389366, "grad_norm": 0.42020657658576965, "kl": 0.5322265625, "learning_rate": 1.2948643492897276e-07, "loss": 0.0213, "reward": 1.0518973469734192, "reward_std": 0.14172220043838024, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 2998 }, { "completion_length": 600.4040374755859, "epoch": 0.8958255544768875, "grad_norm": 0.6362956762313843, "kl": 0.37890625, "learning_rate": 1.293195570183001e-07, "loss": 0.0152, "reward": 1.1618303954601288, "reward_std": 0.18661121651530266, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 2999 }, { "completion_length": 631.6495819091797, "epoch": 0.8961242625644089, "grad_norm": 0.2994630038738251, "kl": 0.34967041015625, "learning_rate": 1.2915313676671838e-07, "loss": 0.014, "reward": 1.2617188096046448, "reward_std": 0.12948805280029774, "rewards/accuracy_reward": 0.2723214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 3000 }, { "completion_length": 731.185302734375, "epoch": 0.8964229706519304, "grad_norm": 0.4017874598503113, "kl": 0.57080078125, "learning_rate": 1.2898717435527636e-07, "loss": 0.0228, "reward": 1.1947545111179352, "reward_std": 0.20245779864490032, "rewards/accuracy_reward": 0.2165178693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 3001 }, { "completion_length": 659.4933319091797, "epoch": 0.8967216787394519, "grad_norm": 0.5836895108222961, "kl": 0.3299560546875, "learning_rate": 1.2882166996452497e-07, "loss": 0.0132, "reward": 1.1356027722358704, "reward_std": 0.13186628185212612, "rewards/accuracy_reward": 0.1428571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455484867096, "step": 3002 }, { "completion_length": 686.8839569091797, "epoch": 0.8970203868269734, "grad_norm": 0.4955191910266876, "kl": 0.47900390625, "learning_rate": 1.2865662377451678e-07, "loss": 0.0192, "reward": 1.1930803954601288, "reward_std": 0.21675420552492142, "rewards/accuracy_reward": 0.20758929289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 3003 }, { "completion_length": 698.2232360839844, "epoch": 0.8973190949144948, "grad_norm": 0.6087026000022888, "kl": 0.6015625, "learning_rate": 1.284920359648058e-07, "loss": 0.0241, "reward": 1.1143973767757416, "reward_std": 0.14518166612833738, "rewards/accuracy_reward": 0.12946429289877415, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 3004 }, { "completion_length": 559.6674346923828, "epoch": 0.8976178030020163, "grad_norm": 0.5471465587615967, "kl": 0.35919189453125, "learning_rate": 1.2832790671444745e-07, "loss": 0.0144, "reward": 1.1015625596046448, "reward_std": 0.09620670508593321, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 3005 }, { "completion_length": 587.7991409301758, "epoch": 0.8979165110895377, "grad_norm": 0.4946304261684418, "kl": 0.5146484375, "learning_rate": 1.2816423620199837e-07, "loss": 0.0206, "reward": 1.284040242433548, "reward_std": 0.19882894679903984, "rewards/accuracy_reward": 0.2968750149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 3006 }, { "completion_length": 692.2701110839844, "epoch": 0.8982152191770593, "grad_norm": 0.3682897686958313, "kl": 0.490234375, "learning_rate": 1.2800102460551587e-07, "loss": 0.0196, "reward": 1.1344866454601288, "reward_std": 0.1340857371687889, "rewards/accuracy_reward": 0.14732143888249993, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 3007 }, { "completion_length": 692.3348541259766, "epoch": 0.8985139272645807, "grad_norm": 0.44947656989097595, "kl": 0.41583251953125, "learning_rate": 1.2783827210255826e-07, "loss": 0.0166, "reward": 1.1378348767757416, "reward_std": 0.09986926428973675, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 3008 }, { "completion_length": 617.7611846923828, "epoch": 0.8988126353521022, "grad_norm": 0.439343124628067, "kl": 0.232177734375, "learning_rate": 1.2767597887018435e-07, "loss": 0.0093, "reward": 1.1462053805589676, "reward_std": 0.09054065495729446, "rewards/accuracy_reward": 0.15178572619333863, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9944196790456772, "step": 3009 }, { "completion_length": 731.8236999511719, "epoch": 0.8991113434396236, "grad_norm": 0.6168426871299744, "kl": 0.841796875, "learning_rate": 1.275141450849532e-07, "loss": 0.0336, "reward": 1.0864955633878708, "reward_std": 0.19511952623724937, "rewards/accuracy_reward": 0.11830357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9681920111179352, "step": 3010 }, { "completion_length": 663.6674346923828, "epoch": 0.8994100515271451, "grad_norm": 0.7737127542495728, "kl": 1.05908203125, "learning_rate": 1.2735277092292406e-07, "loss": 0.0424, "reward": 1.092075914144516, "reward_std": 0.11192755401134491, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.96261166036129, "step": 3011 }, { "completion_length": 729.6339569091797, "epoch": 0.8997087596146666, "grad_norm": 0.726513683795929, "kl": 0.42138671875, "learning_rate": 1.2719185655965643e-07, "loss": 0.0169, "reward": 1.2081473767757416, "reward_std": 0.16664831154048443, "rewards/accuracy_reward": 0.2209821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 3012 }, { "completion_length": 613.2076110839844, "epoch": 0.9000074677021881, "grad_norm": 0.34596896171569824, "kl": 0.3956298828125, "learning_rate": 1.270314021702091e-07, "loss": 0.0159, "reward": 1.2806920111179352, "reward_std": 0.15775438770651817, "rewards/accuracy_reward": 0.2968750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 3013 }, { "completion_length": 626.8102951049805, "epoch": 0.9003061757897095, "grad_norm": 0.4542238414287567, "kl": 0.62158203125, "learning_rate": 1.2687140792914095e-07, "loss": 0.0249, "reward": 1.135602742433548, "reward_std": 0.13716360554099083, "rewards/accuracy_reward": 0.15401786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 3014 }, { "completion_length": 723.4196929931641, "epoch": 0.9006048838772309, "grad_norm": 0.5313445925712585, "kl": 0.463134765625, "learning_rate": 1.267118740105098e-07, "loss": 0.0185, "reward": 1.0764509290456772, "reward_std": 0.15130847692489624, "rewards/accuracy_reward": 0.10491072200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9715402275323868, "step": 3015 }, { "completion_length": 672.060302734375, "epoch": 0.9009035919647524, "grad_norm": 0.6547677516937256, "kl": 0.76123046875, "learning_rate": 1.2655280058787305e-07, "loss": 0.0304, "reward": 1.2388393580913544, "reward_std": 0.1733754761517048, "rewards/accuracy_reward": 0.2566964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 3016 }, { "completion_length": 671.6361846923828, "epoch": 0.9012023000522739, "grad_norm": 0.4520454406738281, "kl": 0.62353515625, "learning_rate": 1.26394187834287e-07, "loss": 0.0249, "reward": 1.2014509737491608, "reward_std": 0.20988616719841957, "rewards/accuracy_reward": 0.227678582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723618745804, "step": 3017 }, { "completion_length": 639.3504867553711, "epoch": 0.9015010081397954, "grad_norm": 0.45796751976013184, "kl": 0.5592041015625, "learning_rate": 1.262360359223067e-07, "loss": 0.0224, "reward": 1.2779018431901932, "reward_std": 0.13823727518320084, "rewards/accuracy_reward": 0.29464287031441927, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589477300644, "step": 3018 }, { "completion_length": 624.3973388671875, "epoch": 0.9017997162273168, "grad_norm": 0.5012449026107788, "kl": 0.349365234375, "learning_rate": 1.2607834502398602e-07, "loss": 0.014, "reward": 1.1244420111179352, "reward_std": 0.13278574589639902, "rewards/accuracy_reward": 0.14062500558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 3019 }, { "completion_length": 672.966552734375, "epoch": 0.9020984243148383, "grad_norm": 0.3656917214393616, "kl": 0.339599609375, "learning_rate": 1.259211153108772e-07, "loss": 0.0136, "reward": 1.280133992433548, "reward_std": 0.2218577302992344, "rewards/accuracy_reward": 0.2924107350409031, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 3020 }, { "completion_length": 710.9643096923828, "epoch": 0.9023971324023597, "grad_norm": 0.5561825633049011, "kl": 0.642578125, "learning_rate": 1.2576434695403068e-07, "loss": 0.0257, "reward": 1.1417411267757416, "reward_std": 0.13798647560179234, "rewards/accuracy_reward": 0.16071428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810267984867096, "step": 3021 }, { "completion_length": 653.9977951049805, "epoch": 0.9026958404898813, "grad_norm": 0.3889049291610718, "kl": 0.62744140625, "learning_rate": 1.2560804012399512e-07, "loss": 0.0251, "reward": 1.113839328289032, "reward_std": 0.16315895318984985, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 3022 }, { "completion_length": 660.7544937133789, "epoch": 0.9029945485774027, "grad_norm": 0.56829833984375, "kl": 0.2655029296875, "learning_rate": 1.2545219499081707e-07, "loss": 0.0106, "reward": 1.2282366752624512, "reward_std": 0.17655246332287788, "rewards/accuracy_reward": 0.2343750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616454601288, "step": 3023 }, { "completion_length": 684.2031402587891, "epoch": 0.9032932566649242, "grad_norm": 1.1381456851959229, "kl": 0.480712890625, "learning_rate": 1.2529681172404063e-07, "loss": 0.0193, "reward": 1.1512277126312256, "reward_std": 0.15468637645244598, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884290456772, "step": 3024 }, { "completion_length": 597.3906555175781, "epoch": 0.9035919647524456, "grad_norm": 0.262423038482666, "kl": 0.2933349609375, "learning_rate": 1.2514189049270776e-07, "loss": 0.0117, "reward": 1.2661831080913544, "reward_std": 0.15142047638073564, "rewards/accuracy_reward": 0.2700892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937649011612, "step": 3025 }, { "completion_length": 694.0067291259766, "epoch": 0.9038906728399672, "grad_norm": 0.6735776662826538, "kl": 0.4833984375, "learning_rate": 1.2498743146535737e-07, "loss": 0.0193, "reward": 1.05245541036129, "reward_std": 0.1368760596960783, "rewards/accuracy_reward": 0.06696428707800806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 3026 }, { "completion_length": 705.6473541259766, "epoch": 0.9041893809274886, "grad_norm": 0.8301098346710205, "kl": 0.42578125, "learning_rate": 1.248334348100258e-07, "loss": 0.017, "reward": 1.0965402126312256, "reward_std": 0.17157716490328312, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 3027 }, { "completion_length": 674.654052734375, "epoch": 0.9044880890150101, "grad_norm": 0.8553975820541382, "kl": 0.70703125, "learning_rate": 1.246799006942465e-07, "loss": 0.0283, "reward": 1.109933078289032, "reward_std": 0.09677408216521144, "rewards/accuracy_reward": 0.1316964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 3028 }, { "completion_length": 667.6406402587891, "epoch": 0.9047867971025315, "grad_norm": 0.389417439699173, "kl": 0.54205322265625, "learning_rate": 1.245268292850493e-07, "loss": 0.0217, "reward": 1.1908482611179352, "reward_std": 0.18039154261350632, "rewards/accuracy_reward": 0.2008928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 3029 }, { "completion_length": 619.7991180419922, "epoch": 0.905085505190053, "grad_norm": 0.8644967079162598, "kl": 0.51904296875, "learning_rate": 1.2437422074896093e-07, "loss": 0.0208, "reward": 1.12276791036129, "reward_std": 0.12791011296212673, "rewards/accuracy_reward": 0.13839286682195961, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 3030 }, { "completion_length": 661.2812805175781, "epoch": 0.9053842132775745, "grad_norm": 0.36728209257125854, "kl": 0.4501953125, "learning_rate": 1.2422207525200468e-07, "loss": 0.018, "reward": 1.2388393580913544, "reward_std": 0.1726561076939106, "rewards/accuracy_reward": 0.2522321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 3031 }, { "completion_length": 698.888427734375, "epoch": 0.905682921365096, "grad_norm": 0.48453769087791443, "kl": 0.42333984375, "learning_rate": 1.2407039295969978e-07, "loss": 0.0169, "reward": 1.1361607611179352, "reward_std": 0.20564750209450722, "rewards/accuracy_reward": 0.14732143841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 3032 }, { "completion_length": 660.7500152587891, "epoch": 0.9059816294526174, "grad_norm": 0.6835778951644897, "kl": 0.394287109375, "learning_rate": 1.2391917403706172e-07, "loss": 0.0157, "reward": 1.1579241752624512, "reward_std": 0.19999788329005241, "rewards/accuracy_reward": 0.165178582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 3033 }, { "completion_length": 670.9397583007812, "epoch": 0.9062803375401389, "grad_norm": 0.3545636534690857, "kl": 0.41845703125, "learning_rate": 1.2376841864860201e-07, "loss": 0.0167, "reward": 1.1171875298023224, "reward_std": 0.1655492102727294, "rewards/accuracy_reward": 0.1361607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 3034 }, { "completion_length": 664.779052734375, "epoch": 0.9065790456276603, "grad_norm": 0.7145299315452576, "kl": 0.496337890625, "learning_rate": 1.2361812695832754e-07, "loss": 0.0199, "reward": 1.212053656578064, "reward_std": 0.16443153470754623, "rewards/accuracy_reward": 0.2254464440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 3035 }, { "completion_length": 759.8259429931641, "epoch": 0.9068777537151819, "grad_norm": 0.7329167723655701, "kl": 0.6953125, "learning_rate": 1.2346829912974105e-07, "loss": 0.0278, "reward": 1.1439732611179352, "reward_std": 0.14396532624959946, "rewards/accuracy_reward": 0.1718750111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9720982611179352, "step": 3036 }, { "completion_length": 633.5670013427734, "epoch": 0.9071764618027033, "grad_norm": 0.5973883271217346, "kl": 0.3902587890625, "learning_rate": 1.233189353258405e-07, "loss": 0.0156, "reward": 1.2075893580913544, "reward_std": 0.17376151122152805, "rewards/accuracy_reward": 0.21651787124574184, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 3037 }, { "completion_length": 767.1718902587891, "epoch": 0.9074751698902248, "grad_norm": 0.39310625195503235, "kl": 0.4951171875, "learning_rate": 1.2317003570911907e-07, "loss": 0.0198, "reward": 1.238839328289032, "reward_std": 0.13826417457312346, "rewards/accuracy_reward": 0.2477678693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714775323868, "step": 3038 }, { "completion_length": 666.622802734375, "epoch": 0.9077738779777462, "grad_norm": 0.41814813017845154, "kl": 0.422119140625, "learning_rate": 1.23021600441565e-07, "loss": 0.0169, "reward": 1.1311384439468384, "reward_std": 0.11688503436744213, "rewards/accuracy_reward": 0.1406250074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 3039 }, { "completion_length": 614.8281555175781, "epoch": 0.9080725860652678, "grad_norm": 0.42094099521636963, "kl": 0.41314697265625, "learning_rate": 1.2287362968466123e-07, "loss": 0.0165, "reward": 1.082589328289032, "reward_std": 0.12230224348604679, "rewards/accuracy_reward": 0.09375000325962901, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 3040 }, { "completion_length": 632.9754943847656, "epoch": 0.9083712941527892, "grad_norm": 0.8625864386558533, "kl": 0.42919921875, "learning_rate": 1.2272612359938548e-07, "loss": 0.0172, "reward": 1.111607164144516, "reward_std": 0.11902694404125214, "rewards/accuracy_reward": 0.12276786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 3041 }, { "completion_length": 684.6808166503906, "epoch": 0.9086700022403107, "grad_norm": 0.48537635803222656, "kl": 0.4884033203125, "learning_rate": 1.2257908234620993e-07, "loss": 0.0195, "reward": 1.2717634439468384, "reward_std": 0.23364892601966858, "rewards/accuracy_reward": 0.2879464440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 3042 }, { "completion_length": 669.1205749511719, "epoch": 0.9089687103278321, "grad_norm": 0.4131069779396057, "kl": 0.333984375, "learning_rate": 1.22432506085101e-07, "loss": 0.0134, "reward": 1.1222098767757416, "reward_std": 0.09560307580977678, "rewards/accuracy_reward": 0.12946428847499192, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 3043 }, { "completion_length": 644.3460159301758, "epoch": 0.9092674184153536, "grad_norm": 1.5105149745941162, "kl": 0.734130859375, "learning_rate": 1.2228639497551936e-07, "loss": 0.0294, "reward": 1.162946492433548, "reward_std": 0.17165011912584305, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 3044 }, { "completion_length": 556.5290451049805, "epoch": 0.909566126502875, "grad_norm": 0.4320223331451416, "kl": 0.40283203125, "learning_rate": 1.2214074917641957e-07, "loss": 0.0161, "reward": 1.145089328289032, "reward_std": 0.14299194887280464, "rewards/accuracy_reward": 0.15401786286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 3045 }, { "completion_length": 672.4777069091797, "epoch": 0.9098648345903966, "grad_norm": 0.3964952528476715, "kl": 0.609619140625, "learning_rate": 1.2199556884624992e-07, "loss": 0.0244, "reward": 1.0150670111179352, "reward_std": 0.11261536926031113, "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 3046 }, { "completion_length": 711.450927734375, "epoch": 0.910163542677918, "grad_norm": 0.279836505651474, "kl": 0.431884765625, "learning_rate": 1.2185085414295242e-07, "loss": 0.0172, "reward": 1.062500074505806, "reward_std": 0.12046027556061745, "rewards/accuracy_reward": 0.06919643119908869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 3047 }, { "completion_length": 650.5580673217773, "epoch": 0.9104622507654395, "grad_norm": 0.4966926872730255, "kl": 0.525146484375, "learning_rate": 1.2170660522396251e-07, "loss": 0.021, "reward": 1.2773438096046448, "reward_std": 0.16804364696145058, "rewards/accuracy_reward": 0.2924107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 3048 }, { "completion_length": 711.5647583007812, "epoch": 0.9107609588529609, "grad_norm": 0.8676764369010925, "kl": 0.57568359375, "learning_rate": 1.2156282224620884e-07, "loss": 0.023, "reward": 1.1155134439468384, "reward_std": 0.14329075999557972, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9771205931901932, "step": 3049 }, { "completion_length": 718.5223388671875, "epoch": 0.9110596669404825, "grad_norm": 0.9865602850914001, "kl": 0.7470703125, "learning_rate": 1.214195053661132e-07, "loss": 0.0299, "reward": 1.2042411267757416, "reward_std": 0.16305270791053772, "rewards/accuracy_reward": 0.2209821566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983258992433548, "step": 3050 }, { "completion_length": 680.1830749511719, "epoch": 0.9113583750280039, "grad_norm": 0.45884597301483154, "kl": 0.61083984375, "learning_rate": 1.212766547395904e-07, "loss": 0.0244, "reward": 1.113839328289032, "reward_std": 0.1436854489147663, "rewards/accuracy_reward": 0.13392857694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 3051 }, { "completion_length": 702.9107360839844, "epoch": 0.9116570831155254, "grad_norm": 0.5479555130004883, "kl": 0.615478515625, "learning_rate": 1.2113427052204772e-07, "loss": 0.0246, "reward": 1.073102742433548, "reward_std": 0.17045602202415466, "rewards/accuracy_reward": 0.09151786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848618745804, "step": 3052 }, { "completion_length": 696.3638763427734, "epoch": 0.9119557912030468, "grad_norm": 0.41134533286094666, "kl": 0.4677734375, "learning_rate": 1.2099235286838544e-07, "loss": 0.0187, "reward": 1.1891741454601288, "reward_std": 0.2043823003768921, "rewards/accuracy_reward": 0.20982143748551607, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979352742433548, "step": 3053 }, { "completion_length": 669.7745971679688, "epoch": 0.9122544992905683, "grad_norm": 0.5467419028282166, "kl": 0.466796875, "learning_rate": 1.2085090193299593e-07, "loss": 0.0187, "reward": 1.1049107313156128, "reward_std": 0.13108185539022088, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 3054 }, { "completion_length": 734.8861846923828, "epoch": 0.9125532073780898, "grad_norm": 0.4150436222553253, "kl": 0.65380859375, "learning_rate": 1.2070991786976397e-07, "loss": 0.0261, "reward": 1.1819196939468384, "reward_std": 0.16105080023407936, "rewards/accuracy_reward": 0.1986607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 3055 }, { "completion_length": 567.1830596923828, "epoch": 0.9128519154656113, "grad_norm": 0.8702700734138489, "kl": 0.4852294921875, "learning_rate": 1.205694008320665e-07, "loss": 0.0194, "reward": 1.234933078289032, "reward_std": 0.17496038228273392, "rewards/accuracy_reward": 0.2477678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 3056 }, { "completion_length": 660.9018249511719, "epoch": 0.9131506235531327, "grad_norm": 0.6274390816688538, "kl": 0.39697265625, "learning_rate": 1.2042935097277207e-07, "loss": 0.0159, "reward": 1.2851563096046448, "reward_std": 0.11027231439948082, "rewards/accuracy_reward": 0.2924107313156128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455484867096, "step": 3057 }, { "completion_length": 719.497802734375, "epoch": 0.9134493316406541, "grad_norm": 0.6072760224342346, "kl": 0.4716796875, "learning_rate": 1.2028976844424142e-07, "loss": 0.0188, "reward": 1.106026828289032, "reward_std": 0.14155223406851292, "rewards/accuracy_reward": 0.12500000302679837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 3058 }, { "completion_length": 726.5424499511719, "epoch": 0.9137480397281756, "grad_norm": 0.9436495304107666, "kl": 0.326904296875, "learning_rate": 1.201506533983266e-07, "loss": 0.0131, "reward": 1.1512277275323868, "reward_std": 0.16617874428629875, "rewards/accuracy_reward": 0.18080357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9704241305589676, "step": 3059 }, { "completion_length": 587.935302734375, "epoch": 0.9140467478156971, "grad_norm": 0.34538084268569946, "kl": 0.503173828125, "learning_rate": 1.2001200598637105e-07, "loss": 0.0201, "reward": 1.2421875298023224, "reward_std": 0.12596536614000797, "rewards/accuracy_reward": 0.2566964402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854911118745804, "step": 3060 }, { "completion_length": 626.9553833007812, "epoch": 0.9143454559032186, "grad_norm": 0.6821528673171997, "kl": 0.4794921875, "learning_rate": 1.1987382635920966e-07, "loss": 0.0192, "reward": 1.2299107313156128, "reward_std": 0.2232122588902712, "rewards/accuracy_reward": 0.2455357287544757, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 3061 }, { "completion_length": 680.5469055175781, "epoch": 0.91464416399074, "grad_norm": 0.7728957533836365, "kl": 0.581787109375, "learning_rate": 1.197361146671683e-07, "loss": 0.0233, "reward": 1.0351562798023224, "reward_std": 0.1331697627902031, "rewards/accuracy_reward": 0.05580357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 3062 }, { "completion_length": 642.9598617553711, "epoch": 0.9149428720782615, "grad_norm": 0.4249725639820099, "kl": 0.58544921875, "learning_rate": 1.195988710600638e-07, "loss": 0.0234, "reward": 1.1149553805589676, "reward_std": 0.15835372544825077, "rewards/accuracy_reward": 0.13169643026776612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 3063 }, { "completion_length": 658.6830749511719, "epoch": 0.9152415801657829, "grad_norm": 0.4240906834602356, "kl": 0.2894287109375, "learning_rate": 1.1946209568720367e-07, "loss": 0.0116, "reward": 1.1947545260190964, "reward_std": 0.16607406549155712, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 3064 }, { "completion_length": 778.8683319091797, "epoch": 0.9155402882533045, "grad_norm": 0.8779489398002625, "kl": 0.74609375, "learning_rate": 1.193257886973863e-07, "loss": 0.0298, "reward": 1.1662946939468384, "reward_std": 0.18835855275392532, "rewards/accuracy_reward": 0.19419643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9720982611179352, "step": 3065 }, { "completion_length": 622.6428833007812, "epoch": 0.9158389963408259, "grad_norm": 0.3144164979457855, "kl": 0.2330322265625, "learning_rate": 1.1918995023890014e-07, "loss": 0.0093, "reward": 1.246651828289032, "reward_std": 0.14433816447854042, "rewards/accuracy_reward": 0.25446430081501603, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 3066 }, { "completion_length": 611.732177734375, "epoch": 0.9161377044283474, "grad_norm": 0.7263466119766235, "kl": 0.4921875, "learning_rate": 1.1905458045952423e-07, "loss": 0.0197, "reward": 1.2377232611179352, "reward_std": 0.14509940147399902, "rewards/accuracy_reward": 0.25892857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.978794664144516, "step": 3067 }, { "completion_length": 669.0535888671875, "epoch": 0.9164364125158688, "grad_norm": 0.5697614550590515, "kl": 0.40771484375, "learning_rate": 1.1891967950652759e-07, "loss": 0.0163, "reward": 1.1618304252624512, "reward_std": 0.19818625040352345, "rewards/accuracy_reward": 0.1718750074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 3068 }, { "completion_length": 667.8236999511719, "epoch": 0.9167351206033904, "grad_norm": 0.5792361497879028, "kl": 0.422607421875, "learning_rate": 1.1878524752666918e-07, "loss": 0.0169, "reward": 1.164620578289032, "reward_std": 0.1751420646905899, "rewards/accuracy_reward": 0.176339291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 3069 }, { "completion_length": 755.5803833007812, "epoch": 0.9170338286909118, "grad_norm": 0.7140329480171204, "kl": 1.017578125, "learning_rate": 1.1865128466619794e-07, "loss": 0.0407, "reward": 1.1400670111179352, "reward_std": 0.23623988777399063, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.97042416036129, "step": 3070 }, { "completion_length": 717.9821624755859, "epoch": 0.9173325367784333, "grad_norm": 0.39042508602142334, "kl": 0.486328125, "learning_rate": 1.1851779107085223e-07, "loss": 0.0195, "reward": 1.1556919813156128, "reward_std": 0.15919862873852253, "rewards/accuracy_reward": 0.16741071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 3071 }, { "completion_length": 775.5402069091797, "epoch": 0.9176312448659547, "grad_norm": 0.45054611563682556, "kl": 0.4677734375, "learning_rate": 1.1838476688586005e-07, "loss": 0.0187, "reward": 1.069196492433548, "reward_std": 0.0716794803738594, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071939468384, "step": 3072 }, { "completion_length": 739.1741485595703, "epoch": 0.9179299529534762, "grad_norm": 0.706217885017395, "kl": 0.357421875, "learning_rate": 1.1825221225593865e-07, "loss": 0.0143, "reward": 1.2287946939468384, "reward_std": 0.14768622443079948, "rewards/accuracy_reward": 0.2366071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 3073 }, { "completion_length": 730.7656707763672, "epoch": 0.9182286610409977, "grad_norm": 0.5632269978523254, "kl": 0.80078125, "learning_rate": 1.1812012732529445e-07, "loss": 0.032, "reward": 1.1037947237491608, "reward_std": 0.18125269562005997, "rewards/accuracy_reward": 0.13169643562287092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9720982611179352, "step": 3074 }, { "completion_length": 735.0692291259766, "epoch": 0.9185273691285192, "grad_norm": 0.3545987606048584, "kl": 0.567138671875, "learning_rate": 1.17988512237623e-07, "loss": 0.0227, "reward": 1.2148438096046448, "reward_std": 0.15026693418622017, "rewards/accuracy_reward": 0.2299107275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 3075 }, { "completion_length": 751.5647583007812, "epoch": 0.9188260772160406, "grad_norm": 0.6574994921684265, "kl": 0.91650390625, "learning_rate": 1.1785736713610854e-07, "loss": 0.0367, "reward": 1.1897321939468384, "reward_std": 0.16115934774279594, "rewards/accuracy_reward": 0.2232142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9665178805589676, "step": 3076 }, { "completion_length": 710.0602874755859, "epoch": 0.9191247853035621, "grad_norm": 0.7535514235496521, "kl": 0.73876953125, "learning_rate": 1.177266921634241e-07, "loss": 0.0295, "reward": 1.1941964626312256, "reward_std": 0.18980183079838753, "rewards/accuracy_reward": 0.2120535895228386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98214291036129, "step": 3077 }, { "completion_length": 680.5379943847656, "epoch": 0.9194234933910835, "grad_norm": 11.688318252563477, "kl": 1.037353515625, "learning_rate": 1.1759648746173125e-07, "loss": 0.0416, "reward": 1.1077009439468384, "reward_std": 0.2065442092716694, "rewards/accuracy_reward": 0.12946429662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366305589676, "step": 3078 }, { "completion_length": 690.2254638671875, "epoch": 0.9197222014786051, "grad_norm": 0.5204823017120361, "kl": 0.6845703125, "learning_rate": 1.1746675317267998e-07, "loss": 0.0274, "reward": 1.1992188096046448, "reward_std": 0.18237748183310032, "rewards/accuracy_reward": 0.21875000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 3079 }, { "completion_length": 648.3750305175781, "epoch": 0.9200209095661265, "grad_norm": 0.3622267544269562, "kl": 0.6875, "learning_rate": 1.1733748943740837e-07, "loss": 0.0275, "reward": 1.2248884439468384, "reward_std": 0.18360432609915733, "rewards/accuracy_reward": 0.238839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 3080 }, { "completion_length": 715.4799499511719, "epoch": 0.920319617653648, "grad_norm": 0.6871004104614258, "kl": 0.7060546875, "learning_rate": 1.1720869639654285e-07, "loss": 0.0282, "reward": 1.0658482611179352, "reward_std": 0.12793912831693888, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.972098246216774, "step": 3081 }, { "completion_length": 653.3080596923828, "epoch": 0.9206183257411694, "grad_norm": 0.6681851148605347, "kl": 0.38427734375, "learning_rate": 1.1708037419019751e-07, "loss": 0.0154, "reward": 1.1361607611179352, "reward_std": 0.1501724235713482, "rewards/accuracy_reward": 0.14955357951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 3082 }, { "completion_length": 689.9152069091797, "epoch": 0.920917033828691, "grad_norm": 1.167242169380188, "kl": 0.8037109375, "learning_rate": 1.1695252295797432e-07, "loss": 0.0321, "reward": 1.1104911267757416, "reward_std": 0.15128926560282707, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 3083 }, { "completion_length": 720.0893249511719, "epoch": 0.9212157419162124, "grad_norm": 0.3142904043197632, "kl": 0.6904296875, "learning_rate": 1.1682514283896303e-07, "loss": 0.0276, "reward": 1.2656250298023224, "reward_std": 0.19550558552145958, "rewards/accuracy_reward": 0.2790178693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071939468384, "step": 3084 }, { "completion_length": 663.8727874755859, "epoch": 0.9215144500037339, "grad_norm": 0.4311908483505249, "kl": 0.54150390625, "learning_rate": 1.1669823397174055e-07, "loss": 0.0216, "reward": 1.217633992433548, "reward_std": 0.18187001906335354, "rewards/accuracy_reward": 0.2343750149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 3085 }, { "completion_length": 646.122802734375, "epoch": 0.9218131580912553, "grad_norm": 0.4462989866733551, "kl": 0.467041015625, "learning_rate": 1.1657179649437134e-07, "loss": 0.0187, "reward": 1.141183078289032, "reward_std": 0.08997970074415207, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871651977300644, "step": 3086 }, { "completion_length": 724.8415679931641, "epoch": 0.9221118661787768, "grad_norm": 0.6334261298179626, "kl": 0.57275390625, "learning_rate": 1.1644583054440712e-07, "loss": 0.0229, "reward": 1.1395089626312256, "reward_std": 0.17940212599933147, "rewards/accuracy_reward": 0.15848214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 3087 }, { "completion_length": 629.381721496582, "epoch": 0.9224105742662982, "grad_norm": 0.5050138831138611, "kl": 0.3289794921875, "learning_rate": 1.1632033625888633e-07, "loss": 0.0132, "reward": 1.1104911267757416, "reward_std": 0.14667676389217377, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 3088 }, { "completion_length": 712.1920166015625, "epoch": 0.9227092823538198, "grad_norm": 0.5028433799743652, "kl": 0.63427734375, "learning_rate": 1.1619531377433454e-07, "loss": 0.0254, "reward": 1.117745578289032, "reward_std": 0.1822486873716116, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 3089 }, { "completion_length": 650.0647583007812, "epoch": 0.9230079904413412, "grad_norm": 0.49457111954689026, "kl": 0.58740234375, "learning_rate": 1.1607076322676396e-07, "loss": 0.0235, "reward": 1.1116071939468384, "reward_std": 0.13868842646479607, "rewards/accuracy_reward": 0.1227678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 3090 }, { "completion_length": 661.1942443847656, "epoch": 0.9233066985288627, "grad_norm": 0.9663305878639221, "kl": 0.6033935546875, "learning_rate": 1.1594668475167331e-07, "loss": 0.0242, "reward": 1.1875000596046448, "reward_std": 0.15385469421744347, "rewards/accuracy_reward": 0.1986607275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393431901932, "step": 3091 }, { "completion_length": 724.5044860839844, "epoch": 0.9236054066163841, "grad_norm": 1.049139142036438, "kl": 1.076171875, "learning_rate": 1.1582307848404785e-07, "loss": 0.043, "reward": 1.092633992433548, "reward_std": 0.24338599294424057, "rewards/accuracy_reward": 0.12276786472648382, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9698661118745804, "step": 3092 }, { "completion_length": 669.7098388671875, "epoch": 0.9239041147039057, "grad_norm": 0.5849632024765015, "kl": 0.51123046875, "learning_rate": 1.1569994455835911e-07, "loss": 0.0204, "reward": 1.1690848767757416, "reward_std": 0.19076810404658318, "rewards/accuracy_reward": 0.196428582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562798023224, "step": 3093 }, { "completion_length": 576.2611770629883, "epoch": 0.9242028227914271, "grad_norm": 0.2801240384578705, "kl": 0.2864990234375, "learning_rate": 1.1557728310856467e-07, "loss": 0.0115, "reward": 1.1914063394069672, "reward_std": 0.1328593883663416, "rewards/accuracy_reward": 0.1986607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 3094 }, { "completion_length": 646.8727874755859, "epoch": 0.9245015308789486, "grad_norm": 0.9404751658439636, "kl": 0.58154296875, "learning_rate": 1.154550942681083e-07, "loss": 0.0233, "reward": 1.0764509439468384, "reward_std": 0.12437792611308396, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 3095 }, { "completion_length": 747.779052734375, "epoch": 0.92480023896647, "grad_norm": 0.6712432503700256, "kl": 0.7177734375, "learning_rate": 1.1533337816991931e-07, "loss": 0.0287, "reward": 1.0926339626312256, "reward_std": 0.09923806972801685, "rewards/accuracy_reward": 0.1049107164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 3096 }, { "completion_length": 732.0357513427734, "epoch": 0.9250989470539915, "grad_norm": 0.4004213511943817, "kl": 0.486572265625, "learning_rate": 1.1521213494641294e-07, "loss": 0.0195, "reward": 1.107700914144516, "reward_std": 0.12570742890238762, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 3097 }, { "completion_length": 689.8326416015625, "epoch": 0.925397655141513, "grad_norm": 0.6702901721000671, "kl": 0.4833984375, "learning_rate": 1.1509136472949004e-07, "loss": 0.0193, "reward": 1.1026786416769028, "reward_std": 0.14781937934458256, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 3098 }, { "completion_length": 609.2143096923828, "epoch": 0.9256963632290345, "grad_norm": 0.5152801275253296, "kl": 0.568359375, "learning_rate": 1.1497106765053663e-07, "loss": 0.0227, "reward": 1.2756696939468384, "reward_std": 0.13369571045041084, "rewards/accuracy_reward": 0.2924107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 3099 }, { "completion_length": 705.966552734375, "epoch": 0.9259950713165559, "grad_norm": 0.37503883242607117, "kl": 0.822265625, "learning_rate": 1.1485124384042418e-07, "loss": 0.0329, "reward": 1.2310268580913544, "reward_std": 0.11687940172851086, "rewards/accuracy_reward": 0.2544642984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625447034836, "step": 3100 }, { "completion_length": 690.7991333007812, "epoch": 0.9262937794040773, "grad_norm": 0.5014563202857971, "kl": 0.6171875, "learning_rate": 1.1473189342950936e-07, "loss": 0.0247, "reward": 1.1651786267757416, "reward_std": 0.20753737352788448, "rewards/accuracy_reward": 0.18303572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98214291036129, "step": 3101 }, { "completion_length": 755.4576416015625, "epoch": 0.9265924874915988, "grad_norm": 0.8820852041244507, "kl": 0.71142578125, "learning_rate": 1.1461301654763352e-07, "loss": 0.0285, "reward": 1.1333705931901932, "reward_std": 0.14301186241209507, "rewards/accuracy_reward": 0.1495535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 3102 }, { "completion_length": 779.1808319091797, "epoch": 0.9268911955791203, "grad_norm": 0.6255428194999695, "kl": 1.0693359375, "learning_rate": 1.1449461332412321e-07, "loss": 0.0428, "reward": 1.1088170111179352, "reward_std": 0.204122893512249, "rewards/accuracy_reward": 0.14285715157166123, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9659598767757416, "step": 3103 }, { "completion_length": 679.7723693847656, "epoch": 0.9271899036666418, "grad_norm": 0.37961098551750183, "kl": 0.4326171875, "learning_rate": 1.1437668388778955e-07, "loss": 0.0173, "reward": 1.2053571939468384, "reward_std": 0.1478840783238411, "rewards/accuracy_reward": 0.21875000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 3104 }, { "completion_length": 642.3616333007812, "epoch": 0.9274886117541632, "grad_norm": 0.7025569081306458, "kl": 0.4222412109375, "learning_rate": 1.1425922836692805e-07, "loss": 0.0169, "reward": 1.1422991752624512, "reward_std": 0.18780025839805603, "rewards/accuracy_reward": 0.15848214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 3105 }, { "completion_length": 660.216552734375, "epoch": 0.9277873198416847, "grad_norm": 0.4399455487728119, "kl": 0.55908203125, "learning_rate": 1.1414224688931896e-07, "loss": 0.0223, "reward": 1.2438616454601288, "reward_std": 0.1678520254790783, "rewards/accuracy_reward": 0.2544642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 3106 }, { "completion_length": 696.0000305175781, "epoch": 0.9280860279292061, "grad_norm": 0.3733115792274475, "kl": 0.7978515625, "learning_rate": 1.1402573958222661e-07, "loss": 0.0319, "reward": 1.135044664144516, "reward_std": 0.11097768321633339, "rewards/accuracy_reward": 0.1540178656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 3107 }, { "completion_length": 672.8192443847656, "epoch": 0.9283847360167277, "grad_norm": 0.42732542753219604, "kl": 0.44873046875, "learning_rate": 1.1390970657239948e-07, "loss": 0.018, "reward": 1.135602742433548, "reward_std": 0.21500062383711338, "rewards/accuracy_reward": 0.149553582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 3108 }, { "completion_length": 644.5045013427734, "epoch": 0.9286834441042491, "grad_norm": 0.48824068903923035, "kl": 0.869140625, "learning_rate": 1.1379414798607019e-07, "loss": 0.0348, "reward": 1.2204241752624512, "reward_std": 0.2096901498734951, "rewards/accuracy_reward": 0.2410714440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 3109 }, { "completion_length": 737.8884124755859, "epoch": 0.9289821521917706, "grad_norm": 0.35772714018821716, "kl": 0.410888671875, "learning_rate": 1.1367906394895511e-07, "loss": 0.0164, "reward": 1.1595982611179352, "reward_std": 0.10078086704015732, "rewards/accuracy_reward": 0.1674107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875447034836, "step": 3110 }, { "completion_length": 604.4263610839844, "epoch": 0.929280860279292, "grad_norm": 0.587128221988678, "kl": 0.38525390625, "learning_rate": 1.1356445458625436e-07, "loss": 0.0154, "reward": 1.1417410969734192, "reward_std": 0.11921847052872181, "rewards/accuracy_reward": 0.15401785913854837, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232313156128, "step": 3111 }, { "completion_length": 678.1004791259766, "epoch": 0.9295795683668135, "grad_norm": 0.46768224239349365, "kl": 0.8095703125, "learning_rate": 1.1345032002265181e-07, "loss": 0.0324, "reward": 1.1411831080913544, "reward_std": 0.16674076206982136, "rewards/accuracy_reward": 0.1674107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723767757416, "step": 3112 }, { "completion_length": 662.8058471679688, "epoch": 0.929878276454335, "grad_norm": 0.5947797298431396, "kl": 0.5439453125, "learning_rate": 1.1333666038231457e-07, "loss": 0.0217, "reward": 1.1640625596046448, "reward_std": 0.1545518646016717, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 3113 }, { "completion_length": 624.5781555175781, "epoch": 0.9301769845418565, "grad_norm": 0.4811592698097229, "kl": 0.279052734375, "learning_rate": 1.1322347578889322e-07, "loss": 0.0112, "reward": 1.2187500298023224, "reward_std": 0.1594620645046234, "rewards/accuracy_reward": 0.2254464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 3114 }, { "completion_length": 692.5201263427734, "epoch": 0.9304756926293779, "grad_norm": 0.8712565302848816, "kl": 0.568359375, "learning_rate": 1.131107663655216e-07, "loss": 0.0228, "reward": 1.0602679252624512, "reward_std": 0.09928044117987156, "rewards/accuracy_reward": 0.06919643376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 3115 }, { "completion_length": 701.5960083007812, "epoch": 0.9307744007168994, "grad_norm": 0.6701471209526062, "kl": 0.439697265625, "learning_rate": 1.1299853223481634e-07, "loss": 0.0176, "reward": 1.2862723767757416, "reward_std": 0.1923791766166687, "rewards/accuracy_reward": 0.3035714477300644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 3116 }, { "completion_length": 656.7366333007812, "epoch": 0.9310731088044208, "grad_norm": 0.37473106384277344, "kl": 0.508544921875, "learning_rate": 1.1288677351887724e-07, "loss": 0.0204, "reward": 1.2126116752624512, "reward_std": 0.15599639201536775, "rewards/accuracy_reward": 0.2254464440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 3117 }, { "completion_length": 670.0937652587891, "epoch": 0.9313718168919424, "grad_norm": 0.9158337116241455, "kl": 0.584716796875, "learning_rate": 1.1277549033928693e-07, "loss": 0.0233, "reward": 1.1473215073347092, "reward_std": 0.10973669216036797, "rewards/accuracy_reward": 0.16071429592557251, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 3118 }, { "completion_length": 661.9464721679688, "epoch": 0.9316705249794638, "grad_norm": 0.6000661849975586, "kl": 0.451904296875, "learning_rate": 1.1266468281711048e-07, "loss": 0.0181, "reward": 1.2399554550647736, "reward_std": 0.17297777021303773, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 3119 }, { "completion_length": 630.8817138671875, "epoch": 0.9319692330669853, "grad_norm": 0.548588216304779, "kl": 0.367431640625, "learning_rate": 1.1255435107289571e-07, "loss": 0.0147, "reward": 1.129464328289032, "reward_std": 0.11323589459061623, "rewards/accuracy_reward": 0.13839286309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714775323868, "step": 3120 }, { "completion_length": 676.404052734375, "epoch": 0.9322679411545067, "grad_norm": 0.47905242443084717, "kl": 0.31982421875, "learning_rate": 1.1244449522667261e-07, "loss": 0.0128, "reward": 1.1562500596046448, "reward_std": 0.151689812541008, "rewards/accuracy_reward": 0.165178582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 3121 }, { "completion_length": 689.0803833007812, "epoch": 0.9325666492420283, "grad_norm": 0.81438809633255, "kl": 0.5924072265625, "learning_rate": 1.1233511539795367e-07, "loss": 0.0237, "reward": 1.2416295111179352, "reward_std": 0.2225109338760376, "rewards/accuracy_reward": 0.258928582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 3122 }, { "completion_length": 712.5692291259766, "epoch": 0.9328653573295497, "grad_norm": 0.40711745619773865, "kl": 0.445068359375, "learning_rate": 1.1222621170573346e-07, "loss": 0.0178, "reward": 1.1835937798023224, "reward_std": 0.19079768657684326, "rewards/accuracy_reward": 0.19419644214212894, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 3123 }, { "completion_length": 674.1719055175781, "epoch": 0.9331640654170712, "grad_norm": 0.4117521047592163, "kl": 0.59814453125, "learning_rate": 1.121177842684884e-07, "loss": 0.0239, "reward": 1.170758992433548, "reward_std": 0.15325059927999973, "rewards/accuracy_reward": 0.1830357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 3124 }, { "completion_length": 768.2411041259766, "epoch": 0.9334627735045926, "grad_norm": 0.8178843855857849, "kl": 1.0927734375, "learning_rate": 1.1200983320417704e-07, "loss": 0.0436, "reward": 1.2126116752624512, "reward_std": 0.19834491610527039, "rewards/accuracy_reward": 0.2388392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723767757416, "step": 3125 }, { "completion_length": 696.1116333007812, "epoch": 0.9337614815921141, "grad_norm": 0.48960405588150024, "kl": 0.43603515625, "learning_rate": 1.1190235863023949e-07, "loss": 0.0175, "reward": 1.172433078289032, "reward_std": 0.1195405013859272, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 3126 }, { "completion_length": 704.0513763427734, "epoch": 0.9340601896796356, "grad_norm": 0.4733785092830658, "kl": 0.526123046875, "learning_rate": 1.1179536066359757e-07, "loss": 0.021, "reward": 1.0998884439468384, "reward_std": 0.08021948579698801, "rewards/accuracy_reward": 0.10937500488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 3127 }, { "completion_length": 700.1027221679688, "epoch": 0.9343588977671571, "grad_norm": 0.37412750720977783, "kl": 0.4814453125, "learning_rate": 1.1168883942065457e-07, "loss": 0.0193, "reward": 1.3437500596046448, "reward_std": 0.18406065180897713, "rewards/accuracy_reward": 0.3549107313156128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 3128 }, { "completion_length": 709.8393249511719, "epoch": 0.9346576058546785, "grad_norm": 0.7288710474967957, "kl": 0.6573486328125, "learning_rate": 1.1158279501729518e-07, "loss": 0.0263, "reward": 1.129464328289032, "reward_std": 0.21061231940984726, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 3129 }, { "completion_length": 684.4665374755859, "epoch": 0.9349563139422, "grad_norm": 0.6482433080673218, "kl": 0.943359375, "learning_rate": 1.1147722756888528e-07, "loss": 0.0378, "reward": 1.1361607611179352, "reward_std": 0.20225651562213898, "rewards/accuracy_reward": 0.15178571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 3130 }, { "completion_length": 642.8013610839844, "epoch": 0.9352550220297214, "grad_norm": 0.6473246216773987, "kl": 0.37841796875, "learning_rate": 1.1137213719027196e-07, "loss": 0.0151, "reward": 1.0736607760190964, "reward_std": 0.10418099630624056, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 3131 }, { "completion_length": 754.6317291259766, "epoch": 0.935553730117243, "grad_norm": 0.42892658710479736, "kl": 1.056640625, "learning_rate": 1.1126752399578324e-07, "loss": 0.0423, "reward": 1.0758928805589676, "reward_std": 0.16378143802285194, "rewards/accuracy_reward": 0.10044643050059676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464626312256, "step": 3132 }, { "completion_length": 639.5424499511719, "epoch": 0.9358524382047644, "grad_norm": 0.7167297005653381, "kl": 0.433837890625, "learning_rate": 1.11163388099228e-07, "loss": 0.0174, "reward": 1.2700893580913544, "reward_std": 0.13134286180138588, "rewards/accuracy_reward": 0.27678572572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 3133 }, { "completion_length": 632.216552734375, "epoch": 0.9361511462922859, "grad_norm": 0.35837745666503906, "kl": 0.49951171875, "learning_rate": 1.1105972961389592e-07, "loss": 0.02, "reward": 1.1478795409202576, "reward_std": 0.15508459880948067, "rewards/accuracy_reward": 0.1584821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 3134 }, { "completion_length": 627.6920013427734, "epoch": 0.9364498543798073, "grad_norm": 0.7933830618858337, "kl": 0.382568359375, "learning_rate": 1.1095654865255717e-07, "loss": 0.0153, "reward": 1.1713170111179352, "reward_std": 0.18596765771508217, "rewards/accuracy_reward": 0.18303572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 3135 }, { "completion_length": 659.3750305175781, "epoch": 0.9367485624673288, "grad_norm": 0.4743572473526001, "kl": 0.434326171875, "learning_rate": 1.1085384532746265e-07, "loss": 0.0174, "reward": 1.2477679252624512, "reward_std": 0.17381612583994865, "rewards/accuracy_reward": 0.258928582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 3136 }, { "completion_length": 661.4397430419922, "epoch": 0.9370472705548503, "grad_norm": 1.0406718254089355, "kl": 0.5087890625, "learning_rate": 1.1075161975034348e-07, "loss": 0.0204, "reward": 1.2433036267757416, "reward_std": 0.17336925771087408, "rewards/accuracy_reward": 0.2544643071014434, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 3137 }, { "completion_length": 684.0893096923828, "epoch": 0.9373459786423718, "grad_norm": 0.6542009711265564, "kl": 0.81884765625, "learning_rate": 1.10649872032411e-07, "loss": 0.0328, "reward": 1.1540178954601288, "reward_std": 0.21037576720118523, "rewards/accuracy_reward": 0.1852678693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500447034836, "step": 3138 }, { "completion_length": 686.529052734375, "epoch": 0.9376446867298932, "grad_norm": 0.8303489089012146, "kl": 0.53076171875, "learning_rate": 1.1054860228435685e-07, "loss": 0.0212, "reward": 1.1662946939468384, "reward_std": 0.15289798565208912, "rewards/accuracy_reward": 0.178571441443637, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 3139 }, { "completion_length": 687.9442291259766, "epoch": 0.9379433948174147, "grad_norm": 0.34794121980667114, "kl": 0.62939453125, "learning_rate": 1.1044781061635259e-07, "loss": 0.0252, "reward": 1.0541295260190964, "reward_std": 0.10779797285795212, "rewards/accuracy_reward": 0.06919643376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 3140 }, { "completion_length": 715.9308319091797, "epoch": 0.9382421029049361, "grad_norm": 0.6502049565315247, "kl": 0.693359375, "learning_rate": 1.1034749713804957e-07, "loss": 0.0278, "reward": 1.1316964775323868, "reward_std": 0.14749068580567837, "rewards/accuracy_reward": 0.15178571874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107611179352, "step": 3141 }, { "completion_length": 643.4330596923828, "epoch": 0.9385408109924577, "grad_norm": 0.3949260711669922, "kl": 0.375244140625, "learning_rate": 1.1024766195857908e-07, "loss": 0.015, "reward": 1.2187500596046448, "reward_std": 0.12616600282490253, "rewards/accuracy_reward": 0.2276785783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714477300644, "step": 3142 }, { "completion_length": 787.8504791259766, "epoch": 0.9388395190799791, "grad_norm": 1.0779622793197632, "kl": 0.849609375, "learning_rate": 1.1014830518655207e-07, "loss": 0.034, "reward": 1.0998884439468384, "reward_std": 0.21059847995638847, "rewards/accuracy_reward": 0.13169643399305642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9681920111179352, "step": 3143 }, { "completion_length": 660.4687957763672, "epoch": 0.9391382271675005, "grad_norm": 0.648353636264801, "kl": 0.6005859375, "learning_rate": 1.100494269300589e-07, "loss": 0.024, "reward": 1.1674107909202576, "reward_std": 0.15823505818843842, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 3144 }, { "completion_length": 772.6674499511719, "epoch": 0.939436935255022, "grad_norm": 0.40951359272003174, "kl": 0.751953125, "learning_rate": 1.0995102729666937e-07, "loss": 0.0301, "reward": 1.088727742433548, "reward_std": 0.2157306969165802, "rewards/accuracy_reward": 0.11607143213041127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726562798023224, "step": 3145 }, { "completion_length": 569.9442138671875, "epoch": 0.9397356433425434, "grad_norm": 1.0307424068450928, "kl": 0.484375, "learning_rate": 1.0985310639343281e-07, "loss": 0.0193, "reward": 1.1936384439468384, "reward_std": 0.1342666670680046, "rewards/accuracy_reward": 0.2098214440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 3146 }, { "completion_length": 737.3460235595703, "epoch": 0.940034351430065, "grad_norm": 0.6314874291419983, "kl": 0.50439453125, "learning_rate": 1.0975566432687742e-07, "loss": 0.0202, "reward": 1.1908482313156128, "reward_std": 0.2104518860578537, "rewards/accuracy_reward": 0.2075892947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 3147 }, { "completion_length": 584.2210083007812, "epoch": 0.9403330595175864, "grad_norm": 0.8867826461791992, "kl": 0.358154296875, "learning_rate": 1.0965870120301068e-07, "loss": 0.0143, "reward": 1.1261161267757416, "reward_std": 0.11435570940375328, "rewards/accuracy_reward": 0.1316964365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.994419664144516, "step": 3148 }, { "completion_length": 736.9107513427734, "epoch": 0.9406317676051079, "grad_norm": 1.0167094469070435, "kl": 0.65625, "learning_rate": 1.0956221712731892e-07, "loss": 0.0263, "reward": 1.100446492433548, "reward_std": 0.21804413944482803, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464626312256, "step": 3149 }, { "completion_length": 613.2232360839844, "epoch": 0.9409304756926293, "grad_norm": 0.6505309343338013, "kl": 0.6123046875, "learning_rate": 1.0946621220476737e-07, "loss": 0.0245, "reward": 1.2873884439468384, "reward_std": 0.1921856291592121, "rewards/accuracy_reward": 0.3013392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 3150 }, { "completion_length": 664.7701263427734, "epoch": 0.9412291837801509, "grad_norm": 0.6773907542228699, "kl": 0.493408203125, "learning_rate": 1.0937068653980005e-07, "loss": 0.0198, "reward": 1.1333706080913544, "reward_std": 0.16976536810398102, "rewards/accuracy_reward": 0.14955357694998384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 3151 }, { "completion_length": 648.8549423217773, "epoch": 0.9415278918676723, "grad_norm": 0.5828885436058044, "kl": 0.352783203125, "learning_rate": 1.0927564023633935e-07, "loss": 0.0141, "reward": 1.199776828289032, "reward_std": 0.10741953738033772, "rewards/accuracy_reward": 0.2098214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 3152 }, { "completion_length": 741.9241485595703, "epoch": 0.9418265999551938, "grad_norm": 0.7922242879867554, "kl": 0.7880859375, "learning_rate": 1.0918107339778654e-07, "loss": 0.0316, "reward": 1.0619420111179352, "reward_std": 0.17545674927532673, "rewards/accuracy_reward": 0.09375000605359674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9681920111179352, "step": 3153 }, { "completion_length": 653.1361999511719, "epoch": 0.9421253080427152, "grad_norm": 0.5569612383842468, "kl": 0.270751953125, "learning_rate": 1.0908698612702097e-07, "loss": 0.0108, "reward": 1.1434152126312256, "reward_std": 0.09795294888317585, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937649011612, "step": 3154 }, { "completion_length": 718.6741333007812, "epoch": 0.9424240161302367, "grad_norm": 0.7662002444267273, "kl": 0.6318359375, "learning_rate": 1.0899337852640033e-07, "loss": 0.0252, "reward": 1.1947545111179352, "reward_std": 0.20111023262143135, "rewards/accuracy_reward": 0.2075892984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 3155 }, { "completion_length": 706.3795013427734, "epoch": 0.9427227242177582, "grad_norm": 0.6865957379341125, "kl": 0.7880859375, "learning_rate": 1.0890025069776055e-07, "loss": 0.0316, "reward": 1.207589328289032, "reward_std": 0.21150094084441662, "rewards/accuracy_reward": 0.22767858440056443, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107611179352, "step": 3156 }, { "completion_length": 659.1272583007812, "epoch": 0.9430214323052797, "grad_norm": 0.6408907771110535, "kl": 0.5615234375, "learning_rate": 1.0880760274241567e-07, "loss": 0.0225, "reward": 1.2310268580913544, "reward_std": 0.2145904004573822, "rewards/accuracy_reward": 0.243303582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 3157 }, { "completion_length": 715.4375305175781, "epoch": 0.9433201403928011, "grad_norm": 0.5028565526008606, "kl": 0.85205078125, "learning_rate": 1.0871543476115742e-07, "loss": 0.034, "reward": 1.0982143729925156, "reward_std": 0.16705871745944023, "rewards/accuracy_reward": 0.113839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 3158 }, { "completion_length": 648.3973541259766, "epoch": 0.9436188484803226, "grad_norm": 0.4346008598804474, "kl": 0.5615234375, "learning_rate": 1.0862374685425562e-07, "loss": 0.0224, "reward": 1.2282366752624512, "reward_std": 0.14689340814948082, "rewards/accuracy_reward": 0.24107144586741924, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871651977300644, "step": 3159 }, { "completion_length": 620.3884124755859, "epoch": 0.943917556567844, "grad_norm": 0.6751021146774292, "kl": 0.5673828125, "learning_rate": 1.0853253912145777e-07, "loss": 0.0227, "reward": 1.1160714775323868, "reward_std": 0.08020392758771777, "rewards/accuracy_reward": 0.1383928619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776786118745804, "step": 3160 }, { "completion_length": 623.7232513427734, "epoch": 0.9442162646553656, "grad_norm": 0.35282808542251587, "kl": 0.38916015625, "learning_rate": 1.0844181166198886e-07, "loss": 0.0156, "reward": 1.1696428954601288, "reward_std": 0.17848438769578934, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 3161 }, { "completion_length": 676.1495819091797, "epoch": 0.944514972742887, "grad_norm": 0.48619481921195984, "kl": 0.408203125, "learning_rate": 1.0835156457455151e-07, "loss": 0.0163, "reward": 1.1635045111179352, "reward_std": 0.20715971663594246, "rewards/accuracy_reward": 0.1785714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 3162 }, { "completion_length": 635.0312652587891, "epoch": 0.9448136808304085, "grad_norm": 0.287328839302063, "kl": 0.3226318359375, "learning_rate": 1.0826179795732574e-07, "loss": 0.0129, "reward": 1.1406250596046448, "reward_std": 0.09919667989015579, "rewards/accuracy_reward": 0.1473214402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 3163 }, { "completion_length": 629.3817138671875, "epoch": 0.9451123889179299, "grad_norm": 0.2709958553314209, "kl": 0.2933349609375, "learning_rate": 1.0817251190796875e-07, "loss": 0.0117, "reward": 1.1808036267757416, "reward_std": 0.1796521469950676, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 3164 }, { "completion_length": 666.0848541259766, "epoch": 0.9454110970054515, "grad_norm": 1.0727159976959229, "kl": 0.81396484375, "learning_rate": 1.080837065236151e-07, "loss": 0.0326, "reward": 1.0764509439468384, "reward_std": 0.1635999558493495, "rewards/accuracy_reward": 0.09598214761354029, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 3165 }, { "completion_length": 722.1451110839844, "epoch": 0.9457098050929729, "grad_norm": 0.632955014705658, "kl": 0.5947265625, "learning_rate": 1.0799538190087624e-07, "loss": 0.0238, "reward": 1.0959821939468384, "reward_std": 0.16381793841719627, "rewards/accuracy_reward": 0.11383929522708058, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98214291036129, "step": 3166 }, { "completion_length": 784.4241333007812, "epoch": 0.9460085131804944, "grad_norm": 0.8859069347381592, "kl": 0.82421875, "learning_rate": 1.0790753813584083e-07, "loss": 0.033, "reward": 1.086495578289032, "reward_std": 0.16917935386300087, "rewards/accuracy_reward": 0.11383929336443543, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9726563096046448, "step": 3167 }, { "completion_length": 720.310302734375, "epoch": 0.9463072212680158, "grad_norm": 0.360749214887619, "kl": 0.5546875, "learning_rate": 1.0782017532407418e-07, "loss": 0.0222, "reward": 1.1344866752624512, "reward_std": 0.17634711787104607, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 3168 }, { "completion_length": 578.7857437133789, "epoch": 0.9466059293555373, "grad_norm": 1.193648099899292, "kl": 0.255859375, "learning_rate": 1.0773329356061848e-07, "loss": 0.0102, "reward": 1.2912947237491608, "reward_std": 0.17395037971436977, "rewards/accuracy_reward": 0.2968750149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9944196790456772, "step": 3169 }, { "completion_length": 654.2589569091797, "epoch": 0.9469046374430587, "grad_norm": 0.7584988474845886, "kl": 0.5537109375, "learning_rate": 1.0764689293999263e-07, "loss": 0.0221, "reward": 1.0898438096046448, "reward_std": 0.11135431379079819, "rewards/accuracy_reward": 0.10491072107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.984933078289032, "step": 3170 }, { "completion_length": 686.372802734375, "epoch": 0.9472033455305803, "grad_norm": 0.5261210203170776, "kl": 0.40234375, "learning_rate": 1.0756097355619198e-07, "loss": 0.0161, "reward": 1.1132813096046448, "reward_std": 0.15937412530183792, "rewards/accuracy_reward": 0.12946429522708058, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 3171 }, { "completion_length": 683.4375305175781, "epoch": 0.9475020536181017, "grad_norm": 0.3585151731967926, "kl": 0.5810546875, "learning_rate": 1.074755355026884e-07, "loss": 0.0232, "reward": 1.1707589626312256, "reward_std": 0.1394311562180519, "rewards/accuracy_reward": 0.1941964328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625298023224, "step": 3172 }, { "completion_length": 597.0379791259766, "epoch": 0.9478007617056232, "grad_norm": 1.1543201208114624, "kl": 0.6162109375, "learning_rate": 1.0739057887243013e-07, "loss": 0.0247, "reward": 1.1590402126312256, "reward_std": 0.19468492455780506, "rewards/accuracy_reward": 0.1897321529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.969308078289032, "step": 3173 }, { "completion_length": 675.6094055175781, "epoch": 0.9480994697931446, "grad_norm": 1.0749974250793457, "kl": 0.525390625, "learning_rate": 1.0730610375784167e-07, "loss": 0.021, "reward": 1.1646205484867096, "reward_std": 0.18746401742100716, "rewards/accuracy_reward": 0.1830357275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 3174 }, { "completion_length": 611.9129791259766, "epoch": 0.9483981778806662, "grad_norm": 0.302261620759964, "kl": 0.36767578125, "learning_rate": 1.0722211025082367e-07, "loss": 0.0147, "reward": 1.272321492433548, "reward_std": 0.17426962032914162, "rewards/accuracy_reward": 0.28125001303851604, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 3175 }, { "completion_length": 632.825927734375, "epoch": 0.9486968859681876, "grad_norm": 0.6967043280601501, "kl": 0.342041015625, "learning_rate": 1.0713859844275286e-07, "loss": 0.0137, "reward": 1.1674107611179352, "reward_std": 0.1739404145628214, "rewards/accuracy_reward": 0.17410714738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 3176 }, { "completion_length": 656.3437805175781, "epoch": 0.9489955940557091, "grad_norm": 0.5791724324226379, "kl": 0.622314453125, "learning_rate": 1.070555684244818e-07, "loss": 0.0249, "reward": 1.2087054252624512, "reward_std": 0.15655269660055637, "rewards/accuracy_reward": 0.22767858137376606, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981026828289032, "step": 3177 }, { "completion_length": 682.8437805175781, "epoch": 0.9492943021432305, "grad_norm": 0.571182131767273, "kl": 0.42236328125, "learning_rate": 1.0697302028633907e-07, "loss": 0.0169, "reward": 1.1556920111179352, "reward_std": 0.2128986045718193, "rewards/accuracy_reward": 0.1651785746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 3178 }, { "completion_length": 656.3616333007812, "epoch": 0.949593010230752, "grad_norm": 0.4129875600337982, "kl": 0.42431640625, "learning_rate": 1.0689095411812898e-07, "loss": 0.017, "reward": 1.101004496216774, "reward_std": 0.0849489588290453, "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 3179 }, { "completion_length": 611.9107437133789, "epoch": 0.9498917183182735, "grad_norm": 0.47303125262260437, "kl": 0.28515625, "learning_rate": 1.0680937000913143e-07, "loss": 0.0114, "reward": 1.2008928954601288, "reward_std": 0.14710816484875977, "rewards/accuracy_reward": 0.2098214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714775323868, "step": 3180 }, { "completion_length": 673.6138763427734, "epoch": 0.950190426405795, "grad_norm": 0.743916928768158, "kl": 0.389404296875, "learning_rate": 1.0672826804810203e-07, "loss": 0.0156, "reward": 1.2555804252624512, "reward_std": 0.19558951631188393, "rewards/accuracy_reward": 0.2656250037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 3181 }, { "completion_length": 667.5692291259766, "epoch": 0.9504891344933164, "grad_norm": 0.547925591468811, "kl": 0.50048828125, "learning_rate": 1.0664764832327159e-07, "loss": 0.02, "reward": 1.2008928954601288, "reward_std": 0.11048132181167603, "rewards/accuracy_reward": 0.21428572572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 3182 }, { "completion_length": 697.6897659301758, "epoch": 0.9507878425808379, "grad_norm": 0.8634325265884399, "kl": 0.6275634765625, "learning_rate": 1.0656751092234664e-07, "loss": 0.0251, "reward": 1.148437574505806, "reward_std": 0.13294157851487398, "rewards/accuracy_reward": 0.17410715529695153, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9743303805589676, "step": 3183 }, { "completion_length": 628.8638763427734, "epoch": 0.9510865506683593, "grad_norm": 1.5148426294326782, "kl": 0.44921875, "learning_rate": 1.0648785593250875e-07, "loss": 0.018, "reward": 1.2628348767757416, "reward_std": 0.23454301059246063, "rewards/accuracy_reward": 0.2767857313156128, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491305589676, "step": 3184 }, { "completion_length": 727.3817291259766, "epoch": 0.9513852587558809, "grad_norm": 0.7190628051757812, "kl": 0.93994140625, "learning_rate": 1.0640868344041473e-07, "loss": 0.0376, "reward": 1.152901828289032, "reward_std": 0.2222097720950842, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.965401828289032, "step": 3185 }, { "completion_length": 718.4174346923828, "epoch": 0.9516839668434023, "grad_norm": 1.2513360977172852, "kl": 1.240234375, "learning_rate": 1.0632999353219652e-07, "loss": 0.0496, "reward": 1.0931919813156128, "reward_std": 0.23885181546211243, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9659598618745804, "step": 3186 }, { "completion_length": 699.5044860839844, "epoch": 0.9519826749309237, "grad_norm": 0.6219037771224976, "kl": 0.69677734375, "learning_rate": 1.0625178629346103e-07, "loss": 0.0279, "reward": 1.1049107760190964, "reward_std": 0.16873496770858765, "rewards/accuracy_reward": 0.1250000095460564, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107611179352, "step": 3187 }, { "completion_length": 621.6786041259766, "epoch": 0.9522813830184452, "grad_norm": 1.1051868200302124, "kl": 0.625, "learning_rate": 1.0617406180929002e-07, "loss": 0.025, "reward": 1.1852679252624512, "reward_std": 0.1952145230025053, "rewards/accuracy_reward": 0.2053571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9799107760190964, "step": 3188 }, { "completion_length": 772.0803985595703, "epoch": 0.9525800911059666, "grad_norm": 0.5549286007881165, "kl": 0.74365234375, "learning_rate": 1.0609682016424013e-07, "loss": 0.0297, "reward": 1.2204241752624512, "reward_std": 0.13687167316675186, "rewards/accuracy_reward": 0.2410714402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 3189 }, { "completion_length": 661.5580749511719, "epoch": 0.9528787991934882, "grad_norm": 0.43482837080955505, "kl": 0.478515625, "learning_rate": 1.0602006144234274e-07, "loss": 0.0192, "reward": 1.2120536267757416, "reward_std": 0.11949009913951159, "rewards/accuracy_reward": 0.2232142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 3190 }, { "completion_length": 666.2991333007812, "epoch": 0.9531775072810096, "grad_norm": 0.42658302187919617, "kl": 0.54541015625, "learning_rate": 1.059437857271038e-07, "loss": 0.0218, "reward": 1.18136166036129, "reward_std": 0.13437055051326752, "rewards/accuracy_reward": 0.20089285750873387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687649011612, "step": 3191 }, { "completion_length": 672.8839721679688, "epoch": 0.9534762153685311, "grad_norm": 1.211075782775879, "kl": 0.487060546875, "learning_rate": 1.0586799310150379e-07, "loss": 0.0195, "reward": 1.2087054252624512, "reward_std": 0.1836131103336811, "rewards/accuracy_reward": 0.2209821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 3192 }, { "completion_length": 655.9486923217773, "epoch": 0.9537749234560525, "grad_norm": 0.46373265981674194, "kl": 0.42138671875, "learning_rate": 1.0579268364799769e-07, "loss": 0.0168, "reward": 1.170758992433548, "reward_std": 0.12951808609068394, "rewards/accuracy_reward": 0.1830357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987723246216774, "step": 3193 }, { "completion_length": 637.7544860839844, "epoch": 0.954073631543574, "grad_norm": 0.407168447971344, "kl": 0.51708984375, "learning_rate": 1.0571785744851472e-07, "loss": 0.0207, "reward": 1.2064732611179352, "reward_std": 0.17893297597765923, "rewards/accuracy_reward": 0.2187500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 3194 }, { "completion_length": 651.1562652587891, "epoch": 0.9543723396310955, "grad_norm": 0.4831772744655609, "kl": 0.506103515625, "learning_rate": 1.056435145844586e-07, "loss": 0.0203, "reward": 1.2773438096046448, "reward_std": 0.21890776604413986, "rewards/accuracy_reward": 0.2946428693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.982700914144516, "step": 3195 }, { "completion_length": 755.4553833007812, "epoch": 0.954671047718617, "grad_norm": 0.47727277874946594, "kl": 0.68701171875, "learning_rate": 1.0556965513670694e-07, "loss": 0.0275, "reward": 1.1065848916769028, "reward_std": 0.19582416955381632, "rewards/accuracy_reward": 0.1361607201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9704241454601288, "step": 3196 }, { "completion_length": 689.1830749511719, "epoch": 0.9549697558061384, "grad_norm": 0.9274694323539734, "kl": 0.36083984375, "learning_rate": 1.0549627918561161e-07, "loss": 0.0144, "reward": 1.2600446939468384, "reward_std": 0.16075196489691734, "rewards/accuracy_reward": 0.2700892984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 3197 }, { "completion_length": 700.1049346923828, "epoch": 0.9552684638936599, "grad_norm": 0.4859018623828888, "kl": 0.5615234375, "learning_rate": 1.0542338681099859e-07, "loss": 0.0225, "reward": 1.1501116454601288, "reward_std": 0.12020776234567165, "rewards/accuracy_reward": 0.1763392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723618745804, "step": 3198 }, { "completion_length": 631.2053680419922, "epoch": 0.9555671719811814, "grad_norm": 0.46504461765289307, "kl": 0.58837890625, "learning_rate": 1.0535097809216743e-07, "loss": 0.0236, "reward": 1.2081473767757416, "reward_std": 0.1993250697851181, "rewards/accuracy_reward": 0.2254464402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 3199 }, { "completion_length": 719.3884429931641, "epoch": 0.9558658800687029, "grad_norm": 0.32614845037460327, "kl": 0.51953125, "learning_rate": 1.0527905310789185e-07, "loss": 0.0208, "reward": 1.0870536267757416, "reward_std": 0.11503270268440247, "rewards/accuracy_reward": 0.10267857578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750596046448, "step": 3200 }, { "completion_length": 740.1138763427734, "epoch": 0.9561645881562243, "grad_norm": 0.4056580066680908, "kl": 0.56005859375, "learning_rate": 1.0520761193641912e-07, "loss": 0.0224, "reward": 1.0703125596046448, "reward_std": 0.12495550606399775, "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 3201 }, { "completion_length": 749.7098541259766, "epoch": 0.9564632962437458, "grad_norm": 0.6471943259239197, "kl": 0.849609375, "learning_rate": 1.051366546554703e-07, "loss": 0.034, "reward": 1.147321492433548, "reward_std": 0.1664286870509386, "rewards/accuracy_reward": 0.1741071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.973214328289032, "step": 3202 }, { "completion_length": 598.8482360839844, "epoch": 0.9567620043312672, "grad_norm": 0.3566691279411316, "kl": 0.466796875, "learning_rate": 1.050661813422399e-07, "loss": 0.0187, "reward": 1.272321492433548, "reward_std": 0.19026357121765614, "rewards/accuracy_reward": 0.283482164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393431901932, "step": 3203 }, { "completion_length": 690.388427734375, "epoch": 0.9570607124187888, "grad_norm": 0.40255773067474365, "kl": 0.5654296875, "learning_rate": 1.0499619207339604e-07, "loss": 0.0227, "reward": 1.1428572237491608, "reward_std": 0.15674144215881824, "rewards/accuracy_reward": 0.1584821529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 3204 }, { "completion_length": 659.950927734375, "epoch": 0.9573594205063102, "grad_norm": 1.0024425983428955, "kl": 0.7451171875, "learning_rate": 1.0492668692508011e-07, "loss": 0.0298, "reward": 1.264508992433548, "reward_std": 0.2516275458037853, "rewards/accuracy_reward": 0.2857142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946790456772, "step": 3205 }, { "completion_length": 648.0089569091797, "epoch": 0.9576581285938317, "grad_norm": 0.7600319385528564, "kl": 0.415283203125, "learning_rate": 1.0485766597290697e-07, "loss": 0.0166, "reward": 1.0825893431901932, "reward_std": 0.12377467285841703, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9955357313156128, "step": 3206 }, { "completion_length": 727.1652221679688, "epoch": 0.9579568366813531, "grad_norm": 0.45284733176231384, "kl": 0.591064453125, "learning_rate": 1.0478912929196455e-07, "loss": 0.0237, "reward": 1.1210937798023224, "reward_std": 0.18760379031300545, "rewards/accuracy_reward": 0.13169643771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973469734192, "step": 3207 }, { "completion_length": 672.4665374755859, "epoch": 0.9582555447688746, "grad_norm": 1.0453475713729858, "kl": 0.50439453125, "learning_rate": 1.0472107695681412e-07, "loss": 0.0202, "reward": 1.1495536267757416, "reward_std": 0.10801056772470474, "rewards/accuracy_reward": 0.15848214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 3208 }, { "completion_length": 654.3281707763672, "epoch": 0.9585542528563961, "grad_norm": 0.38242971897125244, "kl": 0.427978515625, "learning_rate": 1.0465350904148996e-07, "loss": 0.0171, "reward": 1.2204241454601288, "reward_std": 0.14963609166443348, "rewards/accuracy_reward": 0.22991072246804833, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134439468384, "step": 3209 }, { "completion_length": 675.3460235595703, "epoch": 0.9588529609439176, "grad_norm": 0.3467016816139221, "kl": 0.59423828125, "learning_rate": 1.0458642561949932e-07, "loss": 0.0237, "reward": 1.1819196939468384, "reward_std": 0.13828497380018234, "rewards/accuracy_reward": 0.1986607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 3210 }, { "completion_length": 710.4687957763672, "epoch": 0.959151669031439, "grad_norm": 0.38649407029151917, "kl": 0.3824462890625, "learning_rate": 1.0451982676382239e-07, "loss": 0.0153, "reward": 1.1378348469734192, "reward_std": 0.16011914797127247, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 3211 }, { "completion_length": 704.0067291259766, "epoch": 0.9594503771189605, "grad_norm": 0.7885622978210449, "kl": 0.512939453125, "learning_rate": 1.0445371254691217e-07, "loss": 0.0205, "reward": 1.212053656578064, "reward_std": 0.21688787266612053, "rewards/accuracy_reward": 0.2209821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 3212 }, { "completion_length": 755.654052734375, "epoch": 0.9597490852064819, "grad_norm": 0.45789915323257446, "kl": 0.46044921875, "learning_rate": 1.0438808304069443e-07, "loss": 0.0184, "reward": 1.1824777126312256, "reward_std": 0.1993030197918415, "rewards/accuracy_reward": 0.2008928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848767757416, "step": 3213 }, { "completion_length": 705.5156555175781, "epoch": 0.9600477932940035, "grad_norm": 0.9934782385826111, "kl": 0.755859375, "learning_rate": 1.0432293831656773e-07, "loss": 0.0302, "reward": 1.0931919813156128, "reward_std": 0.1352531798183918, "rewards/accuracy_reward": 0.11607143515720963, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9771205931901932, "step": 3214 }, { "completion_length": 658.2902069091797, "epoch": 0.9603465013815249, "grad_norm": 1.1205226182937622, "kl": 0.724609375, "learning_rate": 1.0425827844540311e-07, "loss": 0.029, "reward": 1.1210937798023224, "reward_std": 0.14551956579089165, "rewards/accuracy_reward": 0.14285715157166123, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 3215 }, { "completion_length": 660.5893096923828, "epoch": 0.9606452094690464, "grad_norm": 0.5372928380966187, "kl": 0.51708984375, "learning_rate": 1.0419410349754414e-07, "loss": 0.0207, "reward": 1.1277902275323868, "reward_std": 0.08046511746942997, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 3216 }, { "completion_length": 665.1585235595703, "epoch": 0.9609439175565678, "grad_norm": 0.27558332681655884, "kl": 0.42138671875, "learning_rate": 1.0413041354280689e-07, "loss": 0.0168, "reward": 1.0982143580913544, "reward_std": 0.1202563215047121, "rewards/accuracy_reward": 0.1071428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 3217 }, { "completion_length": 616.5468978881836, "epoch": 0.9612426256440894, "grad_norm": 0.5998833775520325, "kl": 0.399658203125, "learning_rate": 1.0406720865047981e-07, "loss": 0.016, "reward": 1.2036831080913544, "reward_std": 0.18708272092044353, "rewards/accuracy_reward": 0.21428572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973767757416, "step": 3218 }, { "completion_length": 674.5893249511719, "epoch": 0.9615413337316108, "grad_norm": 0.8248613476753235, "kl": 0.63427734375, "learning_rate": 1.0400448888932357e-07, "loss": 0.0254, "reward": 1.0675223469734192, "reward_std": 0.13125747814774513, "rewards/accuracy_reward": 0.08258928824216127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9849330633878708, "step": 3219 }, { "completion_length": 636.7545013427734, "epoch": 0.9618400418191323, "grad_norm": 1.2454633712768555, "kl": 0.42041015625, "learning_rate": 1.039422543275712e-07, "loss": 0.0168, "reward": 1.1601563096046448, "reward_std": 0.16912538558244705, "rewards/accuracy_reward": 0.1808035783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527275323868, "step": 3220 }, { "completion_length": 705.7522735595703, "epoch": 0.9621387499066537, "grad_norm": 0.39938676357269287, "kl": 0.575439453125, "learning_rate": 1.0388050503292772e-07, "loss": 0.023, "reward": 1.1289063096046448, "reward_std": 0.1677062101662159, "rewards/accuracy_reward": 0.14508929033763707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838170111179352, "step": 3221 }, { "completion_length": 627.0893173217773, "epoch": 0.9624374579941752, "grad_norm": 0.4370039105415344, "kl": 0.60986328125, "learning_rate": 1.0381924107257034e-07, "loss": 0.0244, "reward": 1.176339328289032, "reward_std": 0.12315678223967552, "rewards/accuracy_reward": 0.18750000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.988839328289032, "step": 3222 }, { "completion_length": 617.3727874755859, "epoch": 0.9627361660816967, "grad_norm": 0.6096247434616089, "kl": 0.389892578125, "learning_rate": 1.0375846251314833e-07, "loss": 0.0156, "reward": 1.1104911267757416, "reward_std": 0.14759712666273117, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 3223 }, { "completion_length": 622.3571624755859, "epoch": 0.9630348741692182, "grad_norm": 0.4180809557437897, "kl": 0.360595703125, "learning_rate": 1.036981694207827e-07, "loss": 0.0145, "reward": 1.2260045409202576, "reward_std": 0.1531863547861576, "rewards/accuracy_reward": 0.2388392947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 3224 }, { "completion_length": 674.6518096923828, "epoch": 0.9633335822567396, "grad_norm": 0.9052215814590454, "kl": 0.2088623046875, "learning_rate": 1.0363836186106642e-07, "loss": 0.0083, "reward": 1.2851563096046448, "reward_std": 0.15869668498635292, "rewards/accuracy_reward": 0.2924107275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455484867096, "step": 3225 }, { "completion_length": 680.0067291259766, "epoch": 0.9636322903442611, "grad_norm": 0.37923356890678406, "kl": 0.339111328125, "learning_rate": 1.0357903989906437e-07, "loss": 0.0136, "reward": 1.1077009439468384, "reward_std": 0.10567060951143503, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 3226 }, { "completion_length": 681.810302734375, "epoch": 0.9639309984317825, "grad_norm": 0.5494709610939026, "kl": 0.4404296875, "learning_rate": 1.0352020359931289e-07, "loss": 0.0176, "reward": 1.1662947237491608, "reward_std": 0.15434022806584835, "rewards/accuracy_reward": 0.1808035857975483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 3227 }, { "completion_length": 666.4620971679688, "epoch": 0.9642297065193041, "grad_norm": 0.41557833552360535, "kl": 0.5859375, "learning_rate": 1.0346185302582017e-07, "loss": 0.0234, "reward": 1.1702009737491608, "reward_std": 0.1851570587605238, "rewards/accuracy_reward": 0.1897321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 3228 }, { "completion_length": 596.966552734375, "epoch": 0.9645284146068255, "grad_norm": 0.6144517064094543, "kl": 0.3876953125, "learning_rate": 1.0340398824206595e-07, "loss": 0.0155, "reward": 1.1852678954601288, "reward_std": 0.10481588169932365, "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9933035969734192, "step": 3229 }, { "completion_length": 645.6584930419922, "epoch": 0.9648271226943469, "grad_norm": 0.5839906334877014, "kl": 0.5400390625, "learning_rate": 1.033466093110014e-07, "loss": 0.0216, "reward": 1.1529018580913544, "reward_std": 0.16267375275492668, "rewards/accuracy_reward": 0.1696428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589626312256, "step": 3230 }, { "completion_length": 653.5848388671875, "epoch": 0.9651258307818684, "grad_norm": 0.3780272305011749, "kl": 0.83349609375, "learning_rate": 1.0328971629504919e-07, "loss": 0.0334, "reward": 1.1411831080913544, "reward_std": 0.17548107542097569, "rewards/accuracy_reward": 0.1607142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687798023224, "step": 3231 }, { "completion_length": 586.5602874755859, "epoch": 0.9654245388693898, "grad_norm": 0.7862274050712585, "kl": 0.382080078125, "learning_rate": 1.0323330925610333e-07, "loss": 0.0153, "reward": 1.0803571939468384, "reward_std": 0.17610958218574524, "rewards/accuracy_reward": 0.09598214644938707, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 3232 }, { "completion_length": 618.9062805175781, "epoch": 0.9657232469569114, "grad_norm": 0.5406938195228577, "kl": 0.4139404296875, "learning_rate": 1.0317738825552916e-07, "loss": 0.0166, "reward": 1.1584821939468384, "reward_std": 0.14797671884298325, "rewards/accuracy_reward": 0.17410715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 3233 }, { "completion_length": 662.1942138671875, "epoch": 0.9660219550444328, "grad_norm": 0.6231076121330261, "kl": 0.358642578125, "learning_rate": 1.0312195335416322e-07, "loss": 0.0144, "reward": 1.2053572237491608, "reward_std": 0.17452652752399445, "rewards/accuracy_reward": 0.2232142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 3234 }, { "completion_length": 670.7924346923828, "epoch": 0.9663206631319543, "grad_norm": 0.569398045539856, "kl": 0.49951171875, "learning_rate": 1.030670046123133e-07, "loss": 0.02, "reward": 1.215401828289032, "reward_std": 0.14826036989688873, "rewards/accuracy_reward": 0.22991072945296764, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854910969734192, "step": 3235 }, { "completion_length": 634.1897659301758, "epoch": 0.9666193712194757, "grad_norm": 0.461025595664978, "kl": 0.4600830078125, "learning_rate": 1.0301254208975823e-07, "loss": 0.0184, "reward": 1.18917416036129, "reward_std": 0.17400346882641315, "rewards/accuracy_reward": 0.20312501350417733, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 3236 }, { "completion_length": 577.2656555175781, "epoch": 0.9669180793069972, "grad_norm": 0.8644385933876038, "kl": 0.25628662109375, "learning_rate": 1.0295856584574785e-07, "loss": 0.0103, "reward": 1.1941964775323868, "reward_std": 0.15205716900527477, "rewards/accuracy_reward": 0.20758929592557251, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.986607164144516, "step": 3237 }, { "completion_length": 672.1964569091797, "epoch": 0.9672167873945187, "grad_norm": 0.44494929909706116, "kl": 0.3919677734375, "learning_rate": 1.0290507593900307e-07, "loss": 0.0157, "reward": 1.063058078289032, "reward_std": 0.14072944037616253, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 3238 }, { "completion_length": 717.9553985595703, "epoch": 0.9675154954820402, "grad_norm": 0.45605263113975525, "kl": 0.533203125, "learning_rate": 1.0285207242771568e-07, "loss": 0.0214, "reward": 1.1741071939468384, "reward_std": 0.16121534258127213, "rewards/accuracy_reward": 0.18973215762525797, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750447034836, "step": 3239 }, { "completion_length": 645.5424499511719, "epoch": 0.9678142035695616, "grad_norm": 0.44702982902526855, "kl": 0.409423828125, "learning_rate": 1.027995553695483e-07, "loss": 0.0163, "reward": 1.0920759439468384, "reward_std": 0.13676964864134789, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 3240 }, { "completion_length": 714.4353179931641, "epoch": 0.9681129116570831, "grad_norm": 0.5820563435554504, "kl": 0.503662109375, "learning_rate": 1.0274752482163426e-07, "loss": 0.0201, "reward": 1.164620578289032, "reward_std": 0.17436447367072105, "rewards/accuracy_reward": 0.17633929010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 3241 }, { "completion_length": 706.6741180419922, "epoch": 0.9684116197446045, "grad_norm": 0.33019694685935974, "kl": 0.419189453125, "learning_rate": 1.0269598084057783e-07, "loss": 0.0168, "reward": 1.1902902126312256, "reward_std": 0.18732066825032234, "rewards/accuracy_reward": 0.19866072572767735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.991629496216774, "step": 3242 }, { "completion_length": 701.9933319091797, "epoch": 0.9687103278321261, "grad_norm": 0.39456430077552795, "kl": 0.31640625, "learning_rate": 1.0264492348245369e-07, "loss": 0.0127, "reward": 1.2047991454601288, "reward_std": 0.16079656779766083, "rewards/accuracy_reward": 0.21205358766019344, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 3243 }, { "completion_length": 784.0580749511719, "epoch": 0.9690090359196475, "grad_norm": 1.0452932119369507, "kl": 0.83837890625, "learning_rate": 1.0259435280280732e-07, "loss": 0.0335, "reward": 1.1612723618745804, "reward_std": 0.16328326985239983, "rewards/accuracy_reward": 0.19419643958099186, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9670759439468384, "step": 3244 }, { "completion_length": 782.2455749511719, "epoch": 0.969307744007169, "grad_norm": 0.5202950239181519, "kl": 0.8740234375, "learning_rate": 1.0254426885665462e-07, "loss": 0.035, "reward": 1.0931920111179352, "reward_std": 0.14833449572324753, "rewards/accuracy_reward": 0.1183035746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884588479996, "step": 3245 }, { "completion_length": 665.2946624755859, "epoch": 0.9696064520946904, "grad_norm": 0.4445199966430664, "kl": 0.4130859375, "learning_rate": 1.0249467169848205e-07, "loss": 0.0165, "reward": 1.1110491454601288, "reward_std": 0.15180160105228424, "rewards/accuracy_reward": 0.12053572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134439468384, "step": 3246 }, { "completion_length": 660.1785888671875, "epoch": 0.969905160182212, "grad_norm": 0.29062503576278687, "kl": 0.228271484375, "learning_rate": 1.0244556138224637e-07, "loss": 0.0091, "reward": 1.147321492433548, "reward_std": 0.1369994804263115, "rewards/accuracy_reward": 0.1495535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9977678656578064, "step": 3247 }, { "completion_length": 670.4620971679688, "epoch": 0.9702038682697334, "grad_norm": 0.5948361158370972, "kl": 0.5498046875, "learning_rate": 1.0239693796137493e-07, "loss": 0.022, "reward": 1.1171875596046448, "reward_std": 0.15949221327900887, "rewards/accuracy_reward": 0.12946428963914514, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 3248 }, { "completion_length": 638.6228179931641, "epoch": 0.9705025763572549, "grad_norm": 0.3991919159889221, "kl": 0.26904296875, "learning_rate": 1.0234880148876515e-07, "loss": 0.0108, "reward": 1.1395089626312256, "reward_std": 0.1126195676624775, "rewards/accuracy_reward": 0.14285715017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9966518133878708, "step": 3249 }, { "completion_length": 656.9888763427734, "epoch": 0.9708012844447763, "grad_norm": 0.5438184142112732, "kl": 0.41796875, "learning_rate": 1.023011520167848e-07, "loss": 0.0167, "reward": 1.1088170111179352, "reward_std": 0.1587293092161417, "rewards/accuracy_reward": 0.12053571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812649011612, "step": 3250 }, { "completion_length": 726.7455749511719, "epoch": 0.9710999925322978, "grad_norm": 0.5675935745239258, "kl": 0.669921875, "learning_rate": 1.0225398959727186e-07, "loss": 0.0268, "reward": 1.1780134439468384, "reward_std": 0.1599173741415143, "rewards/accuracy_reward": 0.19419644214212894, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 3251 }, { "completion_length": 637.7924346923828, "epoch": 0.9713987006198193, "grad_norm": 0.8720200657844543, "kl": 0.65625, "learning_rate": 1.0220731428153443e-07, "loss": 0.0263, "reward": 1.1155134439468384, "reward_std": 0.07508560223504901, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 3252 }, { "completion_length": 594.7053756713867, "epoch": 0.9716974087073408, "grad_norm": 0.5325044989585876, "kl": 0.535400390625, "learning_rate": 1.0216112612035063e-07, "loss": 0.0214, "reward": 1.225446492433548, "reward_std": 0.19851650297641754, "rewards/accuracy_reward": 0.238839291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 3253 }, { "completion_length": 655.3236999511719, "epoch": 0.9719961167948622, "grad_norm": 0.8245524764060974, "kl": 0.32958984375, "learning_rate": 1.0211542516396875e-07, "loss": 0.0132, "reward": 1.2198661267757416, "reward_std": 0.1193203404545784, "rewards/accuracy_reward": 0.2299107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 3254 }, { "completion_length": 723.029052734375, "epoch": 0.9722948248823837, "grad_norm": 0.6228832602500916, "kl": 0.8125, "learning_rate": 1.020702114621068e-07, "loss": 0.0325, "reward": 1.1534598767757416, "reward_std": 0.1628323793411255, "rewards/accuracy_reward": 0.1696428619325161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 3255 }, { "completion_length": 757.8906555175781, "epoch": 0.9725935329699051, "grad_norm": 0.540744960308075, "kl": 0.71173095703125, "learning_rate": 1.0202548506395297e-07, "loss": 0.0285, "reward": 1.119977742433548, "reward_std": 0.16531260684132576, "rewards/accuracy_reward": 0.15178572502918541, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.968191996216774, "step": 3256 }, { "completion_length": 643.1652069091797, "epoch": 0.9728922410574267, "grad_norm": 0.2892691493034363, "kl": 0.3759765625, "learning_rate": 1.0198124601816523e-07, "loss": 0.015, "reward": 1.215959906578064, "reward_std": 0.14878679905086756, "rewards/accuracy_reward": 0.2209821566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949776977300644, "step": 3257 }, { "completion_length": 726.1183242797852, "epoch": 0.9731909491449481, "grad_norm": 0.34769266843795776, "kl": 0.3524169921875, "learning_rate": 1.019374943728712e-07, "loss": 0.0141, "reward": 1.0931920111179352, "reward_std": 0.11849102284759283, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9748884290456772, "step": 3258 }, { "completion_length": 719.0937805175781, "epoch": 0.9734896572324696, "grad_norm": 0.487944632768631, "kl": 0.7314453125, "learning_rate": 1.0189423017566845e-07, "loss": 0.0293, "reward": 1.098772406578064, "reward_std": 0.14740374498069286, "rewards/accuracy_reward": 0.12053572200238705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366454601288, "step": 3259 }, { "completion_length": 620.5000228881836, "epoch": 0.973788365319991, "grad_norm": 0.7368671894073486, "kl": 0.3115234375, "learning_rate": 1.0185145347362418e-07, "loss": 0.0125, "reward": 1.1975446939468384, "reward_std": 0.15133884362876415, "rewards/accuracy_reward": 0.2053571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875447034836, "step": 3260 }, { "completion_length": 677.4263763427734, "epoch": 0.9740870734075125, "grad_norm": 0.7778902649879456, "kl": 0.55029296875, "learning_rate": 1.018091643132753e-07, "loss": 0.022, "reward": 1.2031250298023224, "reward_std": 0.1602900642901659, "rewards/accuracy_reward": 0.2209821492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428805589676, "step": 3261 }, { "completion_length": 664.8080673217773, "epoch": 0.974385781495034, "grad_norm": 0.6883546710014343, "kl": 0.5810546875, "learning_rate": 1.0176736274062817e-07, "loss": 0.0232, "reward": 1.1969866752624512, "reward_std": 0.21540624648332596, "rewards/accuracy_reward": 0.2142857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 3262 }, { "completion_length": 603.9286117553711, "epoch": 0.9746844895825555, "grad_norm": 0.3041197657585144, "kl": 0.2978515625, "learning_rate": 1.0172604880115888e-07, "loss": 0.0119, "reward": 1.1445312798023224, "reward_std": 0.14382916316390038, "rewards/accuracy_reward": 0.14955357555299997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949777126312256, "step": 3263 }, { "completion_length": 610.1964645385742, "epoch": 0.9749831976700769, "grad_norm": 0.39614546298980713, "kl": 0.59375, "learning_rate": 1.0168522253981293e-07, "loss": 0.0238, "reward": 1.1919643580913544, "reward_std": 0.15154967084527016, "rewards/accuracy_reward": 0.2053571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 3264 }, { "completion_length": 604.154052734375, "epoch": 0.9752819057575984, "grad_norm": 0.43901926279067993, "kl": 0.4478759765625, "learning_rate": 1.0164488400100528e-07, "loss": 0.0179, "reward": 1.1914063096046448, "reward_std": 0.1370820477604866, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 3265 }, { "completion_length": 726.2232360839844, "epoch": 0.9755806138451198, "grad_norm": 0.5588077306747437, "kl": 0.9462890625, "learning_rate": 1.0160503322862032e-07, "loss": 0.0379, "reward": 1.1344866454601288, "reward_std": 0.21177229657769203, "rewards/accuracy_reward": 0.15848214831203222, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760045111179352, "step": 3266 }, { "completion_length": 630.959846496582, "epoch": 0.9758793219326414, "grad_norm": 0.9727680087089539, "kl": 0.44805908203125, "learning_rate": 1.0156567026601176e-07, "loss": 0.018, "reward": 1.1651785969734192, "reward_std": 0.16091279685497284, "rewards/accuracy_reward": 0.18303572107106447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428805589676, "step": 3267 }, { "completion_length": 621.1361846923828, "epoch": 0.9761780300201628, "grad_norm": 0.7934719324111938, "kl": 0.2574462890625, "learning_rate": 1.015267951560027e-07, "loss": 0.0103, "reward": 1.0982143580913544, "reward_std": 0.11545385047793388, "rewards/accuracy_reward": 0.10044643376022577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9977678656578064, "step": 3268 }, { "completion_length": 590.4576263427734, "epoch": 0.9764767381076843, "grad_norm": 0.47710251808166504, "kl": 0.375244140625, "learning_rate": 1.0148840794088538e-07, "loss": 0.015, "reward": 1.2087054252624512, "reward_std": 0.16571853403002024, "rewards/accuracy_reward": 0.2165178705472499, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 3269 }, { "completion_length": 614.6562805175781, "epoch": 0.9767754461952057, "grad_norm": 0.7712462544441223, "kl": 0.34130859375, "learning_rate": 1.0145050866242139e-07, "loss": 0.0136, "reward": 1.168526828289032, "reward_std": 0.11193169839680195, "rewards/accuracy_reward": 0.18080358020961285, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9877232611179352, "step": 3270 }, { "completion_length": 644.9107513427734, "epoch": 0.9770741542827273, "grad_norm": 0.5821932554244995, "kl": 0.4327392578125, "learning_rate": 1.0141309736184135e-07, "loss": 0.0173, "reward": 1.1914062798023224, "reward_std": 0.13313812296837568, "rewards/accuracy_reward": 0.2031250111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 3271 }, { "completion_length": 678.8281555175781, "epoch": 0.9773728623702487, "grad_norm": 0.6627193093299866, "kl": 0.408935546875, "learning_rate": 1.0137617407984517e-07, "loss": 0.0164, "reward": 1.151227742433548, "reward_std": 0.18700362369418144, "rewards/accuracy_reward": 0.1674107168801129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838169813156128, "step": 3272 }, { "completion_length": 675.5469055175781, "epoch": 0.9776715704577701, "grad_norm": 0.776020348072052, "kl": 0.444091796875, "learning_rate": 1.0133973885660173e-07, "loss": 0.0177, "reward": 1.1813616454601288, "reward_std": 0.1483735740184784, "rewards/accuracy_reward": 0.19196429662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 3273 }, { "completion_length": 720.732177734375, "epoch": 0.9779702785452916, "grad_norm": 0.6790978908538818, "kl": 0.47021484375, "learning_rate": 1.0130379173174901e-07, "loss": 0.0188, "reward": 1.2243303805589676, "reward_std": 0.1916919518262148, "rewards/accuracy_reward": 0.243303582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 3274 }, { "completion_length": 630.0245819091797, "epoch": 0.978268986632813, "grad_norm": 0.7561123967170715, "kl": 0.28466796875, "learning_rate": 1.0126833274439385e-07, "loss": 0.0114, "reward": 1.205915242433548, "reward_std": 0.18197422102093697, "rewards/accuracy_reward": 0.21651786379516125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 3275 }, { "completion_length": 734.8460083007812, "epoch": 0.9785676947203346, "grad_norm": 0.5583508014678955, "kl": 0.831298828125, "learning_rate": 1.0123336193311232e-07, "loss": 0.0333, "reward": 1.1054687947034836, "reward_std": 0.1587888840585947, "rewards/accuracy_reward": 0.12276786682195961, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009439468384, "step": 3276 }, { "completion_length": 731.4866485595703, "epoch": 0.978866402807856, "grad_norm": 0.462930291891098, "kl": 0.63134765625, "learning_rate": 1.0119887933594911e-07, "loss": 0.0253, "reward": 1.2617188096046448, "reward_std": 0.19835212640464306, "rewards/accuracy_reward": 0.2790178693830967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 3277 }, { "completion_length": 695.0044860839844, "epoch": 0.9791651108953775, "grad_norm": 0.4104577302932739, "kl": 0.362548828125, "learning_rate": 1.0116488499041794e-07, "loss": 0.0145, "reward": 1.2170759439468384, "reward_std": 0.21179074048995972, "rewards/accuracy_reward": 0.2299107238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 3278 }, { "completion_length": 647.2567291259766, "epoch": 0.9794638189828989, "grad_norm": 0.7614005208015442, "kl": 0.2694091796875, "learning_rate": 1.0113137893350135e-07, "loss": 0.0108, "reward": 1.1222098767757416, "reward_std": 0.13426960073411465, "rewards/accuracy_reward": 0.12723214738070965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949777275323868, "step": 3279 }, { "completion_length": 679.6205596923828, "epoch": 0.9797625270704204, "grad_norm": 0.5755032300949097, "kl": 0.5078125, "learning_rate": 1.0109836120165059e-07, "loss": 0.0203, "reward": 1.1266741156578064, "reward_std": 0.15422235429286957, "rewards/accuracy_reward": 0.1383928614668548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 3280 }, { "completion_length": 642.2857513427734, "epoch": 0.9800612351579419, "grad_norm": 0.49366652965545654, "kl": 0.530517578125, "learning_rate": 1.0106583183078579e-07, "loss": 0.0213, "reward": 1.0558036267757416, "reward_std": 0.1130686504766345, "rewards/accuracy_reward": 0.06473214528523386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9910714626312256, "step": 3281 }, { "completion_length": 670.0513610839844, "epoch": 0.9803599432454634, "grad_norm": 0.7003733515739441, "kl": 0.63916015625, "learning_rate": 1.0103379085629569e-07, "loss": 0.0256, "reward": 1.1238839626312256, "reward_std": 0.14105781447142363, "rewards/accuracy_reward": 0.145089291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946790456772, "step": 3282 }, { "completion_length": 616.4933319091797, "epoch": 0.9806586513329848, "grad_norm": 0.5217887759208679, "kl": 0.71240234375, "learning_rate": 1.0100223831303767e-07, "loss": 0.0285, "reward": 1.2427456080913544, "reward_std": 0.09448814112693071, "rewards/accuracy_reward": 0.2544642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 3283 }, { "completion_length": 642.7611999511719, "epoch": 0.9809573594205063, "grad_norm": 0.3513982594013214, "kl": 0.450439453125, "learning_rate": 1.0097117423533792e-07, "loss": 0.018, "reward": 1.1188616454601288, "reward_std": 0.16339432634413242, "rewards/accuracy_reward": 0.13169643841683865, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 3284 }, { "completion_length": 724.247802734375, "epoch": 0.9812560675080277, "grad_norm": 1.1250015497207642, "kl": 1.3203125, "learning_rate": 1.00940598656991e-07, "loss": 0.0528, "reward": 1.14620541036129, "reward_std": 0.16834592446684837, "rewards/accuracy_reward": 0.17857143771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9676339626312256, "step": 3285 }, { "completion_length": 698.3482360839844, "epoch": 0.9815547755955493, "grad_norm": 0.5890236496925354, "kl": 0.74658203125, "learning_rate": 1.0091051161126022e-07, "loss": 0.0299, "reward": 1.1796875596046448, "reward_std": 0.21121226623654366, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9832589775323868, "step": 3286 }, { "completion_length": 678.7143249511719, "epoch": 0.9818534836830707, "grad_norm": 0.4306873679161072, "kl": 0.3759765625, "learning_rate": 1.0088091313087727e-07, "loss": 0.015, "reward": 1.1389509439468384, "reward_std": 0.14563260599970818, "rewards/accuracy_reward": 0.14732143469154835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916294813156128, "step": 3287 }, { "completion_length": 661.2611999511719, "epoch": 0.9821521917705922, "grad_norm": 0.5469516515731812, "kl": 0.4931640625, "learning_rate": 1.0085180324804246e-07, "loss": 0.0197, "reward": 1.221540242433548, "reward_std": 0.1833806335926056, "rewards/accuracy_reward": 0.2299107275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916294813156128, "step": 3288 }, { "completion_length": 680.966552734375, "epoch": 0.9824508998581136, "grad_norm": 0.3573077619075775, "kl": 0.56884765625, "learning_rate": 1.0082318199442449e-07, "loss": 0.0228, "reward": 1.063058078289032, "reward_std": 0.11638538166880608, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 3289 }, { "completion_length": 595.1897506713867, "epoch": 0.9827496079456352, "grad_norm": 1.3676965236663818, "kl": 0.314697265625, "learning_rate": 1.0079504940116038e-07, "loss": 0.0126, "reward": 1.2120536267757416, "reward_std": 0.16361346282064915, "rewards/accuracy_reward": 0.21205358393490314, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 3290 }, { "completion_length": 734.5469055175781, "epoch": 0.9830483160331566, "grad_norm": 0.6959149837493896, "kl": 0.97509765625, "learning_rate": 1.0076740549885572e-07, "loss": 0.0389, "reward": 1.1741071939468384, "reward_std": 0.16499744169414043, "rewards/accuracy_reward": 0.191964291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9821428954601288, "step": 3291 }, { "completion_length": 639.8170013427734, "epoch": 0.9833470241206781, "grad_norm": 0.6134700179100037, "kl": 0.3685302734375, "learning_rate": 1.0074025031758441e-07, "loss": 0.0148, "reward": 1.2042411267757416, "reward_std": 0.1547272503376007, "rewards/accuracy_reward": 0.2142857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553954601288, "step": 3292 }, { "completion_length": 659.5826110839844, "epoch": 0.9836457322081995, "grad_norm": 0.7768020033836365, "kl": 0.677490234375, "learning_rate": 1.0071358388688851e-07, "loss": 0.0271, "reward": 1.2020089626312256, "reward_std": 0.15358664840459824, "rewards/accuracy_reward": 0.2209821455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9810268133878708, "step": 3293 }, { "completion_length": 669.2433319091797, "epoch": 0.983944440295721, "grad_norm": 0.43759390711784363, "kl": 0.416015625, "learning_rate": 1.0068740623577857e-07, "loss": 0.0166, "reward": 1.184151828289032, "reward_std": 0.1707003153860569, "rewards/accuracy_reward": 0.191964291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 3294 }, { "completion_length": 650.2120819091797, "epoch": 0.9842431483832424, "grad_norm": 0.6432676911354065, "kl": 0.36865234375, "learning_rate": 1.0066171739273326e-07, "loss": 0.0148, "reward": 1.1835938096046448, "reward_std": 0.18679575249552727, "rewards/accuracy_reward": 0.1941964402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 3295 }, { "completion_length": 728.6004791259766, "epoch": 0.984541856470764, "grad_norm": 0.6779612302780151, "kl": 0.57080078125, "learning_rate": 1.0063651738569956e-07, "loss": 0.0228, "reward": 1.0837054252624512, "reward_std": 0.12138383463025093, "rewards/accuracy_reward": 0.10491071734577417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946790456772, "step": 3296 }, { "completion_length": 702.7857360839844, "epoch": 0.9848405645582854, "grad_norm": 0.5144370794296265, "kl": 0.709716796875, "learning_rate": 1.0061180624209255e-07, "loss": 0.0284, "reward": 1.2181920111179352, "reward_std": 0.17482155933976173, "rewards/accuracy_reward": 0.2388392984867096, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9793527126312256, "step": 3297 }, { "completion_length": 694.3928833007812, "epoch": 0.9851392726458069, "grad_norm": 0.9910531640052795, "kl": 0.83447265625, "learning_rate": 1.0058758398879562e-07, "loss": 0.0334, "reward": 1.143415242433548, "reward_std": 0.188483702018857, "rewards/accuracy_reward": 0.16071429010480642, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 3298 }, { "completion_length": 714.4598693847656, "epoch": 0.9854379807333283, "grad_norm": 0.48715341091156006, "kl": 0.9267578125, "learning_rate": 1.0056385065216011e-07, "loss": 0.037, "reward": 1.0479911118745804, "reward_std": 0.12356107961386442, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9743303954601288, "step": 3299 }, { "completion_length": 666.8437957763672, "epoch": 0.9857366888208499, "grad_norm": 0.9971656799316406, "kl": 0.406005859375, "learning_rate": 1.0054060625800564e-07, "loss": 0.0162, "reward": 1.1484375596046448, "reward_std": 0.10554770566523075, "rewards/accuracy_reward": 0.1495535783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9988839328289032, "step": 3300 }, { "completion_length": 599.6272735595703, "epoch": 0.9860353969083713, "grad_norm": 0.5332712531089783, "kl": 0.423828125, "learning_rate": 1.0051785083161985e-07, "loss": 0.0169, "reward": 1.2126116752624512, "reward_std": 0.10335674695670605, "rewards/accuracy_reward": 0.2299107275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.982700914144516, "step": 3301 }, { "completion_length": 645.6294860839844, "epoch": 0.9863341049958928, "grad_norm": 0.428698867559433, "kl": 0.34423828125, "learning_rate": 1.0049558439775828e-07, "loss": 0.0138, "reward": 1.1188616454601288, "reward_std": 0.14554532431066036, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 3302 }, { "completion_length": 677.0491333007812, "epoch": 0.9866328130834142, "grad_norm": 0.7347283959388733, "kl": 0.3985595703125, "learning_rate": 1.0047380698064481e-07, "loss": 0.0159, "reward": 1.1300223767757416, "reward_std": 0.146842448040843, "rewards/accuracy_reward": 0.14285714784637094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 3303 }, { "completion_length": 656.9375305175781, "epoch": 0.9869315211709357, "grad_norm": 0.8836311101913452, "kl": 0.264892578125, "learning_rate": 1.0045251860397098e-07, "loss": 0.0106, "reward": 1.2047991752624512, "reward_std": 0.12200374901294708, "rewards/accuracy_reward": 0.21205358393490314, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.992745578289032, "step": 3304 }, { "completion_length": 713.622802734375, "epoch": 0.9872302292584572, "grad_norm": 0.3346060514450073, "kl": 0.4075927734375, "learning_rate": 1.0043171929089653e-07, "loss": 0.0163, "reward": 1.1199777126312256, "reward_std": 0.1409262027591467, "rewards/accuracy_reward": 0.133928582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 3305 }, { "completion_length": 758.3951110839844, "epoch": 0.9875289373459787, "grad_norm": 0.5169845819473267, "kl": 0.752197265625, "learning_rate": 1.0041140906404907e-07, "loss": 0.0301, "reward": 1.0837054252624512, "reward_std": 0.15273666009306908, "rewards/accuracy_reward": 0.11383929196745157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9698660969734192, "step": 3306 }, { "completion_length": 656.7210235595703, "epoch": 0.9878276454335001, "grad_norm": 0.6688109636306763, "kl": 0.66748046875, "learning_rate": 1.0039158794552413e-07, "loss": 0.0267, "reward": 1.186383992433548, "reward_std": 0.18001675233244896, "rewards/accuracy_reward": 0.19642857927829027, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 3307 }, { "completion_length": 617.4643096923828, "epoch": 0.9881263535210216, "grad_norm": 0.5680887699127197, "kl": 0.42919921875, "learning_rate": 1.0037225595688517e-07, "loss": 0.0172, "reward": 1.174665242433548, "reward_std": 0.139063386246562, "rewards/accuracy_reward": 0.18526786682195961, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9893973618745804, "step": 3308 }, { "completion_length": 631.3147583007812, "epoch": 0.988425061608543, "grad_norm": 0.8874287009239197, "kl": 0.92138671875, "learning_rate": 1.0035341311916344e-07, "loss": 0.0368, "reward": 1.2243304252624512, "reward_std": 0.16021187417209148, "rewards/accuracy_reward": 0.2522321604192257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9720982611179352, "step": 3309 }, { "completion_length": 631.5536041259766, "epoch": 0.9887237696960646, "grad_norm": 0.4649779200553894, "kl": 0.4130859375, "learning_rate": 1.0033505945285818e-07, "loss": 0.0165, "reward": 1.2114955484867096, "reward_std": 0.22298120893537998, "rewards/accuracy_reward": 0.2254464440047741, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491752624512, "step": 3310 }, { "completion_length": 644.2210083007812, "epoch": 0.989022477783586, "grad_norm": 0.4670065641403198, "kl": 0.38232421875, "learning_rate": 1.0031719497793628e-07, "loss": 0.0153, "reward": 1.0970982611179352, "reward_std": 0.15991885773837566, "rewards/accuracy_reward": 0.10491071757860482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875298023224, "step": 3311 }, { "completion_length": 751.4911193847656, "epoch": 0.9893211858711075, "grad_norm": 0.40840163826942444, "kl": 0.5498046875, "learning_rate": 1.0029981971383263e-07, "loss": 0.022, "reward": 0.9916294813156128, "reward_std": 0.08641275390982628, "rewards/accuracy_reward": 0.0133928582072258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9782366305589676, "step": 3312 }, { "completion_length": 727.1763763427734, "epoch": 0.9896198939586289, "grad_norm": 0.6642764210700989, "kl": 0.4693603515625, "learning_rate": 1.0028293367944976e-07, "loss": 0.0188, "reward": 1.1406250298023224, "reward_std": 0.1525068636983633, "rewards/accuracy_reward": 0.15401786006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 3313 }, { "completion_length": 693.716552734375, "epoch": 0.9899186020461505, "grad_norm": 0.4347578287124634, "kl": 0.6513671875, "learning_rate": 1.0026653689315804e-07, "loss": 0.0261, "reward": 1.1395089775323868, "reward_std": 0.21665614284574986, "rewards/accuracy_reward": 0.1629464365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625447034836, "step": 3314 }, { "completion_length": 757.0826263427734, "epoch": 0.9902173101336719, "grad_norm": 1.0417391061782837, "kl": 0.7080078125, "learning_rate": 1.0025062937279558e-07, "loss": 0.0283, "reward": 1.0965402573347092, "reward_std": 0.17732898332178593, "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9737723618745804, "step": 3315 }, { "completion_length": 708.0111999511719, "epoch": 0.9905160182211933, "grad_norm": 0.6856890916824341, "kl": 0.461669921875, "learning_rate": 1.0023521113566814e-07, "loss": 0.0185, "reward": 1.1378348469734192, "reward_std": 0.12528714817017317, "rewards/accuracy_reward": 0.1517857201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9860491454601288, "step": 3316 }, { "completion_length": 712.9531555175781, "epoch": 0.9908147263087148, "grad_norm": 0.7797399759292603, "kl": 0.481689453125, "learning_rate": 1.0022028219854932e-07, "loss": 0.0192, "reward": 1.1054688096046448, "reward_std": 0.14027962647378445, "rewards/accuracy_reward": 0.12723215017467737, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.97823666036129, "step": 3317 }, { "completion_length": 673.3504791259766, "epoch": 0.9911134343962362, "grad_norm": 0.3293960988521576, "kl": 0.2509765625, "learning_rate": 1.0020584257768032e-07, "loss": 0.01, "reward": 1.1623884439468384, "reward_std": 0.13047154620289803, "rewards/accuracy_reward": 0.1696428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 3318 }, { "completion_length": 633.091552734375, "epoch": 0.9914121424837578, "grad_norm": 0.33658546209335327, "kl": 0.6395263671875, "learning_rate": 1.0019189228877002e-07, "loss": 0.0255, "reward": 1.07979916036129, "reward_std": 0.12282002158463001, "rewards/accuracy_reward": 0.09821428754366934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9815848469734192, "step": 3319 }, { "completion_length": 654.2701110839844, "epoch": 0.9917108505712792, "grad_norm": 0.5524163842201233, "kl": 0.379150390625, "learning_rate": 1.00178431346995e-07, "loss": 0.0152, "reward": 1.2126116454601288, "reward_std": 0.18105307780206203, "rewards/accuracy_reward": 0.2254464402794838, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652275323868, "step": 3320 }, { "completion_length": 613.2812652587891, "epoch": 0.9920095586588007, "grad_norm": 0.2552233636379242, "kl": 0.255859375, "learning_rate": 1.001654597669994e-07, "loss": 0.0102, "reward": 1.190290242433548, "reward_std": 0.15414704009890556, "rewards/accuracy_reward": 0.19642858300358057, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9938616305589676, "step": 3321 }, { "completion_length": 591.3951263427734, "epoch": 0.9923082667463221, "grad_norm": 0.4325034022331238, "kl": 0.35498046875, "learning_rate": 1.0015297756289508e-07, "loss": 0.0142, "reward": 1.1467634439468384, "reward_std": 0.14267104025930166, "rewards/accuracy_reward": 0.16294643096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 3322 }, { "completion_length": 628.7120971679688, "epoch": 0.9926069748338436, "grad_norm": 0.6687702536582947, "kl": 0.427490234375, "learning_rate": 1.001409847482614e-07, "loss": 0.0171, "reward": 1.2494420111179352, "reward_std": 0.18250001966953278, "rewards/accuracy_reward": 0.26116072945296764, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812947034836, "step": 3323 }, { "completion_length": 664.0268096923828, "epoch": 0.992905682921365, "grad_norm": 0.3318959176540375, "kl": 0.2010498046875, "learning_rate": 1.0012948133614543e-07, "loss": 0.008, "reward": 1.1434152126312256, "reward_std": 0.2318539321422577, "rewards/accuracy_reward": 0.1473214328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9960937649011612, "step": 3324 }, { "completion_length": 645.9464416503906, "epoch": 0.9932043910088866, "grad_norm": 0.5603206753730774, "kl": 0.4716796875, "learning_rate": 1.0011846733906167e-07, "loss": 0.0189, "reward": 1.1540179252624512, "reward_std": 0.18023455515503883, "rewards/accuracy_reward": 0.1741071492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979910746216774, "step": 3325 }, { "completion_length": 638.7455596923828, "epoch": 0.993503099096408, "grad_norm": 0.4582241475582123, "kl": 0.57861328125, "learning_rate": 1.0010794276899233e-07, "loss": 0.0232, "reward": 1.131696492433548, "reward_std": 0.12400488741695881, "rewards/accuracy_reward": 0.15625000931322575, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9754464775323868, "step": 3326 }, { "completion_length": 707.7120971679688, "epoch": 0.9938018071839295, "grad_norm": 0.8007803559303284, "kl": 0.509033203125, "learning_rate": 1.0009790763738709e-07, "loss": 0.0204, "reward": 1.1093750298023224, "reward_std": 0.12550322711467743, "rewards/accuracy_reward": 0.1227678656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9866071790456772, "step": 3327 }, { "completion_length": 587.6629638671875, "epoch": 0.9941005152714509, "grad_norm": 0.5923862457275391, "kl": 0.31396484375, "learning_rate": 1.0008836195516322e-07, "loss": 0.0125, "reward": 1.1713170111179352, "reward_std": 0.1623186282813549, "rewards/accuracy_reward": 0.17633929662406445, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949776977300644, "step": 3328 }, { "completion_length": 702.3125305175781, "epoch": 0.9943992233589725, "grad_norm": 0.7529349327087402, "kl": 0.61083984375, "learning_rate": 1.0007930573270547e-07, "loss": 0.0244, "reward": 1.2114956080913544, "reward_std": 0.14818248245865107, "rewards/accuracy_reward": 0.2209821566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990513414144516, "step": 3329 }, { "completion_length": 717.9844207763672, "epoch": 0.9946979314464939, "grad_norm": 0.34972694516181946, "kl": 0.4609375, "learning_rate": 1.0007073897986607e-07, "loss": 0.0184, "reward": 1.1210937798023224, "reward_std": 0.1617631521075964, "rewards/accuracy_reward": 0.1383928656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.982700914144516, "step": 3330 }, { "completion_length": 660.0335083007812, "epoch": 0.9949966395340154, "grad_norm": 1.5100420713424683, "kl": 0.34423828125, "learning_rate": 1.0006266170596488e-07, "loss": 0.0138, "reward": 1.1914063096046448, "reward_std": 0.1990860216319561, "rewards/accuracy_reward": 0.20535715483129025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.98604916036129, "step": 3331 }, { "completion_length": 767.6473541259766, "epoch": 0.9952953476215368, "grad_norm": 0.5911885499954224, "kl": 0.6826171875, "learning_rate": 1.0005507391978915e-07, "loss": 0.0273, "reward": 1.1177456080913544, "reward_std": 0.15343384072184563, "rewards/accuracy_reward": 0.13392857951112092, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.983816996216774, "step": 3332 }, { "completion_length": 695.0335235595703, "epoch": 0.9955940557090583, "grad_norm": 0.7627778053283691, "kl": 0.443115234375, "learning_rate": 1.0004797562959367e-07, "loss": 0.0177, "reward": 1.1166295111179352, "reward_std": 0.10453727096319199, "rewards/accuracy_reward": 0.13616071874275804, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9804687947034836, "step": 3333 }, { "completion_length": 659.3415374755859, "epoch": 0.9958927637965798, "grad_norm": 1.2612519264221191, "kl": 0.4224853515625, "learning_rate": 1.0004136684310066e-07, "loss": 0.0169, "reward": 1.1305804252624512, "reward_std": 0.12400553468614817, "rewards/accuracy_reward": 0.14062500488944352, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 3334 }, { "completion_length": 667.122802734375, "epoch": 0.9961914718841013, "grad_norm": 0.6253750920295715, "kl": 0.283447265625, "learning_rate": 1.0003524756749982e-07, "loss": 0.0113, "reward": 1.1529018431901932, "reward_std": 0.19292791932821274, "rewards/accuracy_reward": 0.16294643771834671, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9899553805589676, "step": 3335 }, { "completion_length": 709.3616333007812, "epoch": 0.9964901799716227, "grad_norm": 0.7989068627357483, "kl": 0.50299072265625, "learning_rate": 1.0002961780944834e-07, "loss": 0.0201, "reward": 1.2343750596046448, "reward_std": 0.15034284070134163, "rewards/accuracy_reward": 0.2455357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9888393133878708, "step": 3336 }, { "completion_length": 720.7120971679688, "epoch": 0.9967888880591442, "grad_norm": 0.6870476007461548, "kl": 0.583984375, "learning_rate": 1.0002447757507084e-07, "loss": 0.0234, "reward": 1.123883992433548, "reward_std": 0.15438011288642883, "rewards/accuracy_reward": 0.14508929522708058, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9787946790456772, "step": 3337 }, { "completion_length": 685.9866333007812, "epoch": 0.9970875961466656, "grad_norm": 0.5313966274261475, "kl": 0.62847900390625, "learning_rate": 1.0001982686995942e-07, "loss": 0.0251, "reward": 1.1316964626312256, "reward_std": 0.12905516475439072, "rewards/accuracy_reward": 0.1473214365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 3338 }, { "completion_length": 677.7656555175781, "epoch": 0.9973863042341872, "grad_norm": 0.4901483356952667, "kl": 0.4373779296875, "learning_rate": 1.0001566569917358e-07, "loss": 0.0175, "reward": 1.1400670111179352, "reward_std": 0.09590120986104012, "rewards/accuracy_reward": 0.1517857238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 3339 }, { "completion_length": 576.9218978881836, "epoch": 0.9976850123217086, "grad_norm": 0.49290525913238525, "kl": 0.436279296875, "learning_rate": 1.0001199406724024e-07, "loss": 0.0175, "reward": 1.2315848767757416, "reward_std": 0.127636490855366, "rewards/accuracy_reward": 0.2366071529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9949776977300644, "step": 3340 }, { "completion_length": 724.9219055175781, "epoch": 0.9979837204092301, "grad_norm": 0.9241158962249756, "kl": 0.8779296875, "learning_rate": 1.0000881197815381e-07, "loss": 0.0352, "reward": 1.0892857611179352, "reward_std": 0.16726269386708736, "rewards/accuracy_reward": 0.1205357201397419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500447034836, "step": 3341 }, { "completion_length": 667.1964569091797, "epoch": 0.9982824284967515, "grad_norm": 0.4140980839729309, "kl": 0.4326171875, "learning_rate": 1.0000611943537603e-07, "loss": 0.0173, "reward": 1.3136161267757416, "reward_std": 0.25041962042450905, "rewards/accuracy_reward": 0.3214285895228386, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875149011612, "step": 3342 }, { "completion_length": 665.9620971679688, "epoch": 0.998581136584273, "grad_norm": 0.7098034620285034, "kl": 0.364990234375, "learning_rate": 1.0000391644183618e-07, "loss": 0.0146, "reward": 1.1891741454601288, "reward_std": 0.17488258332014084, "rewards/accuracy_reward": 0.1986607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9905134290456772, "step": 3343 }, { "completion_length": 622.4241333007812, "epoch": 0.9988798446717945, "grad_norm": 0.3061888813972473, "kl": 0.3590087890625, "learning_rate": 1.0000220299993092e-07, "loss": 0.0143, "reward": 1.1266741752624512, "reward_std": 0.1601984966546297, "rewards/accuracy_reward": 0.13839286286383867, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9882812798023224, "step": 3344 }, { "completion_length": 712.3460235595703, "epoch": 0.999178552759316, "grad_norm": 0.6061682105064392, "kl": 0.52001953125, "learning_rate": 1.0000097911152421e-07, "loss": 0.0208, "reward": 1.1289063096046448, "reward_std": 0.10364154353737831, "rewards/accuracy_reward": 0.1361607238650322, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927455633878708, "step": 3345 }, { "completion_length": 641.6384124755859, "epoch": 0.9994772608468374, "grad_norm": 1.4464887380599976, "kl": 0.69775390625, "learning_rate": 1.0000024477794761e-07, "loss": 0.0279, "reward": 1.2014509439468384, "reward_std": 0.1279119774699211, "rewards/accuracy_reward": 0.2142857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9871652126312256, "step": 3346 }, { "completion_length": 730.6250305175781, "epoch": 0.9997759689343589, "grad_norm": 0.42990559339523315, "kl": 0.665283203125, "learning_rate": 1e-07, "loss": 0.0266, "reward": 1.174665242433548, "reward_std": 0.1816239207983017, "rewards/accuracy_reward": 0.1919642947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9827009290456772, "step": 3347 }, { "epoch": 0.9997759689343589, "step": 3347, "total_flos": 0.0, "train_loss": 0.020902950335663653, "train_runtime": 254122.1544, "train_samples_per_second": 0.369, "train_steps_per_second": 0.013 } ], "logging_steps": 1, "max_steps": 3347, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }