{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9893390191897654, "eval_steps": 100, "global_step": 58, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 365.3112087249756, "epoch": 0.017057569296375266, "grad_norm": 1.04221985968125, "kl": 0.0, "learning_rate": 3.3333333333333333e-06, "loss": 0.0, "reward": 0.6432291865348816, "reward_std": 0.4505743272602558, "rewards/accuracy_reward": 0.11718750232830644, "rewards/format_reward": 0.5260416772216558, "step": 1 }, { "completion_length": 367.3958435058594, "epoch": 0.03411513859275053, "grad_norm": 1.1434252212815785, "kl": 0.0, "learning_rate": 6.666666666666667e-06, "loss": 0.0, "reward": 0.6783854328095913, "reward_std": 0.46897680312395096, "rewards/accuracy_reward": 0.12630208570044488, "rewards/format_reward": 0.5520833525806665, "step": 2 }, { "completion_length": 339.8724060058594, "epoch": 0.0511727078891258, "grad_norm": 0.8635963812006352, "kl": 0.0010623931884765625, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.6835937723517418, "reward_std": 0.43934670090675354, "rewards/accuracy_reward": 0.09895833639893681, "rewards/format_reward": 0.5846354383975267, "step": 3 }, { "completion_length": 149.22526454925537, "epoch": 0.06823027718550106, "grad_norm": 307.2682075780713, "kl": 2.980987548828125, "learning_rate": 1.3333333333333333e-05, "loss": 0.1193, "reward": 1.010416690260172, "reward_std": 0.2095302422530949, "rewards/accuracy_reward": 0.06640625139698386, "rewards/format_reward": 0.9440104439854622, "step": 4 }, { "completion_length": 112.50521183013916, "epoch": 0.08528784648187633, "grad_norm": 507.14990688734866, "kl": 9.401611328125, "learning_rate": 1.6666666666666667e-05, "loss": 0.3775, "reward": 1.007812526077032, "reward_std": 0.15946260537020862, "rewards/accuracy_reward": 0.04427083441987634, "rewards/format_reward": 0.9635416828095913, "step": 5 }, { "completion_length": 100.34375333786011, "epoch": 0.1023454157782516, "grad_norm": 5.424616910263027, "kl": 0.33245849609375, "learning_rate": 2e-05, "loss": 0.0133, "reward": 1.0052083618938923, "reward_std": 0.15602844418026507, "rewards/accuracy_reward": 0.05338541930541396, "rewards/format_reward": 0.9518229402601719, "step": 6 }, { "completion_length": 133.2343783378601, "epoch": 0.11940298507462686, "grad_norm": 1.7843437966022808, "kl": 0.23712158203125, "learning_rate": 1.9981755542233175e-05, "loss": 0.0095, "reward": 0.9570312686264515, "reward_std": 0.2223757691681385, "rewards/accuracy_reward": 0.04166666802484542, "rewards/format_reward": 0.9153646044433117, "step": 7 }, { "completion_length": 114.69271230697632, "epoch": 0.13646055437100213, "grad_norm": 0.623961957547949, "kl": 0.1710205078125, "learning_rate": 1.992708874098054e-05, "loss": 0.0068, "reward": 0.9895833544433117, "reward_std": 0.22396012931130826, "rewards/accuracy_reward": 0.054687500931322575, "rewards/format_reward": 0.934895858168602, "step": 8 }, { "completion_length": 104.41406488418579, "epoch": 0.1535181236673774, "grad_norm": 1.5259640643959018, "kl": 0.224365234375, "learning_rate": 1.983619906947144e-05, "loss": 0.009, "reward": 0.9765625260770321, "reward_std": 0.2754378484096378, "rewards/accuracy_reward": 0.0664062516298145, "rewards/format_reward": 0.9101562611758709, "step": 9 }, { "completion_length": 231.64844512939453, "epoch": 0.17057569296375266, "grad_norm": 40.31352492325995, "kl": 0.45977783203125, "learning_rate": 1.9709418174260523e-05, "loss": 0.0184, "reward": 0.9257812723517418, "reward_std": 0.39183398708701134, "rewards/accuracy_reward": 0.0976562516298145, "rewards/format_reward": 0.8281250186264515, "step": 10 }, { "completion_length": 613.7070560455322, "epoch": 0.18763326226012794, "grad_norm": 840.2414358802456, "kl": 3.21197509765625, "learning_rate": 1.954720866508546e-05, "loss": 0.1284, "reward": 0.5429687686264515, "reward_std": 0.4636515509337187, "rewards/accuracy_reward": 0.076822918956168, "rewards/format_reward": 0.46614584140479565, "step": 11 }, { "completion_length": 762.7265853881836, "epoch": 0.2046908315565032, "grad_norm": 10.528180991568039, "kl": 0.7625732421875, "learning_rate": 1.9350162426854152e-05, "loss": 0.0305, "reward": 0.38411459513008595, "reward_std": 0.3724043210968375, "rewards/accuracy_reward": 0.09505208616610616, "rewards/format_reward": 0.2890625102445483, "step": 12 }, { "completion_length": 382.18230056762695, "epoch": 0.22174840085287847, "grad_norm": 2.761605849015544, "kl": 0.384521484375, "learning_rate": 1.91189984599209e-05, "loss": 0.0154, "reward": 0.6484375223517418, "reward_std": 0.5331083796918392, "rewards/accuracy_reward": 0.06901041930541396, "rewards/format_reward": 0.5794271007180214, "step": 13 }, { "completion_length": 198.7330780029297, "epoch": 0.23880597014925373, "grad_norm": 0.6078465083236722, "kl": 0.23638916015625, "learning_rate": 1.8854560256532098e-05, "loss": 0.0095, "reward": 0.912760429084301, "reward_std": 0.4155522510409355, "rewards/accuracy_reward": 0.10937500302679837, "rewards/format_reward": 0.8033854328095913, "step": 14 }, { "completion_length": 136.53516006469727, "epoch": 0.255863539445629, "grad_norm": 0.8617440561534008, "kl": 0.21514892578125, "learning_rate": 1.8557812723014476e-05, "loss": 0.0086, "reward": 0.9492187723517418, "reward_std": 0.38575689122080803, "rewards/accuracy_reward": 0.09895833593327552, "rewards/format_reward": 0.8502604328095913, "step": 15 }, { "completion_length": 98.51042032241821, "epoch": 0.27292110874200426, "grad_norm": 0.6595763666425144, "kl": 0.2208251953125, "learning_rate": 1.8229838658936566e-05, "loss": 0.0088, "reward": 0.9726562686264515, "reward_std": 0.3207697505131364, "rewards/accuracy_reward": 0.08854167000390589, "rewards/format_reward": 0.8841146044433117, "step": 16 }, { "completion_length": 76.04166841506958, "epoch": 0.2899786780383795, "grad_norm": 3.048254714245794, "kl": 0.44549560546875, "learning_rate": 1.7871834806090502e-05, "loss": 0.0178, "reward": 1.0468750298023224, "reward_std": 0.15973232360556722, "rewards/accuracy_reward": 0.07552083500195295, "rewards/format_reward": 0.9713541865348816, "step": 17 }, { "completion_length": 81.40234661102295, "epoch": 0.3070362473347548, "grad_norm": 0.4832905802799025, "kl": 0.24029541015625, "learning_rate": 1.7485107481711014e-05, "loss": 0.0096, "reward": 1.0742187798023224, "reward_std": 0.20187148824334145, "rewards/accuracy_reward": 0.10026042012032121, "rewards/format_reward": 0.9739583507180214, "step": 18 }, { "completion_length": 126.81640911102295, "epoch": 0.32409381663113007, "grad_norm": 0.43834266665765215, "kl": 0.18353271484375, "learning_rate": 1.7071067811865477e-05, "loss": 0.0073, "reward": 1.0924479514360428, "reward_std": 0.2407330577261746, "rewards/accuracy_reward": 0.13151042140088975, "rewards/format_reward": 0.9609375186264515, "step": 19 }, { "completion_length": 174.13021278381348, "epoch": 0.3411513859275053, "grad_norm": 0.3691172026913083, "kl": 0.170654296875, "learning_rate": 1.6631226582407954e-05, "loss": 0.0068, "reward": 1.0807292088866234, "reward_std": 0.17400484485551715, "rewards/accuracy_reward": 0.10546875465661287, "rewards/format_reward": 0.9752604365348816, "step": 20 }, { "completion_length": 226.26042366027832, "epoch": 0.3582089552238806, "grad_norm": 0.35348063732954105, "kl": 0.162994384765625, "learning_rate": 1.6167188726285433e-05, "loss": 0.0065, "reward": 1.0768229514360428, "reward_std": 0.26323840813711286, "rewards/accuracy_reward": 0.12239583872724324, "rewards/format_reward": 0.9544271044433117, "step": 21 }, { "completion_length": 226.471360206604, "epoch": 0.3752665245202559, "grad_norm": 0.2548959416361629, "kl": 0.137939453125, "learning_rate": 1.568064746731156e-05, "loss": 0.0055, "reward": 1.0937500447034836, "reward_std": 0.19759728573262691, "rewards/accuracy_reward": 0.11588542012032121, "rewards/format_reward": 0.9778646007180214, "step": 22 }, { "completion_length": 222.7395896911621, "epoch": 0.39232409381663114, "grad_norm": 0.29372855664077896, "kl": 0.154876708984375, "learning_rate": 1.5173378141776569e-05, "loss": 0.0062, "reward": 1.1015625409781933, "reward_std": 0.21707096393220127, "rewards/accuracy_reward": 0.12890625349245965, "rewards/format_reward": 0.9726562686264515, "step": 23 }, { "completion_length": 202.705735206604, "epoch": 0.4093816631130064, "grad_norm": 0.2969744538892161, "kl": 0.155120849609375, "learning_rate": 1.4647231720437687e-05, "loss": 0.0062, "reward": 1.1184896230697632, "reward_std": 0.20930432621389627, "rewards/accuracy_reward": 0.1328125053551048, "rewards/format_reward": 0.9856770932674408, "step": 24 }, { "completion_length": 208.4622449874878, "epoch": 0.42643923240938164, "grad_norm": 0.2551264579415379, "kl": 0.145782470703125, "learning_rate": 1.410412805452757e-05, "loss": 0.0058, "reward": 1.1341146230697632, "reward_std": 0.20376197341829538, "rewards/accuracy_reward": 0.1458333374466747, "rewards/format_reward": 0.9882812611758709, "step": 25 }, { "completion_length": 194.07292079925537, "epoch": 0.44349680170575695, "grad_norm": 0.29985133201161923, "kl": 0.147552490234375, "learning_rate": 1.3546048870425356e-05, "loss": 0.0059, "reward": 1.1875000298023224, "reward_std": 0.22688957839272916, "rewards/accuracy_reward": 0.19531250500585884, "rewards/format_reward": 0.9921875074505806, "step": 26 }, { "completion_length": 199.3333396911621, "epoch": 0.4605543710021322, "grad_norm": 0.2944306259052974, "kl": 0.144775390625, "learning_rate": 1.297503053855203e-05, "loss": 0.0058, "reward": 1.1458333656191826, "reward_std": 0.19133792025968432, "rewards/accuracy_reward": 0.15494792093522847, "rewards/format_reward": 0.9908854253590107, "step": 27 }, { "completion_length": 172.79427576065063, "epoch": 0.47761194029850745, "grad_norm": 0.3362991815387882, "kl": 0.148773193359375, "learning_rate": 1.2393156642875579e-05, "loss": 0.006, "reward": 1.178385466337204, "reward_std": 0.22494715498760343, "rewards/accuracy_reward": 0.18489583837799728, "rewards/format_reward": 0.9934895895421505, "step": 28 }, { "completion_length": 176.18490028381348, "epoch": 0.4946695095948827, "grad_norm": 0.3044368690957198, "kl": 0.146514892578125, "learning_rate": 1.180255037813906e-05, "loss": 0.0059, "reward": 1.1614583730697632, "reward_std": 0.19968353700824082, "rewards/accuracy_reward": 0.16796875244472176, "rewards/format_reward": 0.9934895895421505, "step": 29 }, { "completion_length": 169.21615076065063, "epoch": 0.511727078891258, "grad_norm": 0.3050078730361449, "kl": 0.1395263671875, "learning_rate": 1.1205366802553231e-05, "loss": 0.0056, "reward": 1.1601563021540642, "reward_std": 0.19885429926216602, "rewards/accuracy_reward": 0.167968753259629, "rewards/format_reward": 0.9921875074505806, "step": 30 }, { "completion_length": 191.99479484558105, "epoch": 0.5287846481876333, "grad_norm": 0.28939261315708026, "kl": 0.145721435546875, "learning_rate": 1.0603784974222862e-05, "loss": 0.0058, "reward": 1.1354167088866234, "reward_std": 0.2171460180543363, "rewards/accuracy_reward": 0.15104167070239782, "rewards/format_reward": 0.9843750074505806, "step": 31 }, { "completion_length": 233.89974689483643, "epoch": 0.5458422174840085, "grad_norm": 0.28487537328396517, "kl": 0.13800048828125, "learning_rate": 1e-05, "loss": 0.0055, "reward": 1.123697966337204, "reward_std": 0.2813015836291015, "rewards/accuracy_reward": 0.15885417093522847, "rewards/format_reward": 0.9648437723517418, "step": 32 }, { "completion_length": 253.55078887939453, "epoch": 0.5628997867803838, "grad_norm": 0.2737431545513293, "kl": 0.137664794921875, "learning_rate": 9.39621502577714e-06, "loss": 0.0055, "reward": 1.128906287252903, "reward_std": 0.19965026015415788, "rewards/accuracy_reward": 0.1497395880287513, "rewards/format_reward": 0.9791666828095913, "step": 33 }, { "completion_length": 239.842453956604, "epoch": 0.579957356076759, "grad_norm": 0.6254452308530076, "kl": 0.14483642578125, "learning_rate": 8.79463319744677e-06, "loss": 0.0058, "reward": 1.1757812798023224, "reward_std": 0.26264199148863554, "rewards/accuracy_reward": 0.195312503259629, "rewards/format_reward": 0.9804687574505806, "step": 34 }, { "completion_length": 241.17187976837158, "epoch": 0.5970149253731343, "grad_norm": 0.3224088206959575, "kl": 0.14569091796875, "learning_rate": 8.197449621860944e-06, "loss": 0.0058, "reward": 1.1757812947034836, "reward_std": 0.27850970113649964, "rewards/accuracy_reward": 0.199218753259629, "rewards/format_reward": 0.9765625186264515, "step": 35 }, { "completion_length": 231.39062976837158, "epoch": 0.6140724946695096, "grad_norm": 0.2936768596993624, "kl": 0.142852783203125, "learning_rate": 7.606843357124426e-06, "loss": 0.0057, "reward": 1.1510417088866234, "reward_std": 0.26834863936528563, "rewards/accuracy_reward": 0.17578125349245965, "rewards/format_reward": 0.9752604328095913, "step": 36 }, { "completion_length": 200.52344417572021, "epoch": 0.6311300639658849, "grad_norm": 0.3398881371255209, "kl": 0.14337158203125, "learning_rate": 7.024969461447973e-06, "loss": 0.0057, "reward": 1.140625037252903, "reward_std": 0.30551271699368954, "rewards/accuracy_reward": 0.20312500465661287, "rewards/format_reward": 0.9375000223517418, "step": 37 }, { "completion_length": 199.080735206604, "epoch": 0.6481876332622601, "grad_norm": 0.3950051985318622, "kl": 0.15716552734375, "learning_rate": 6.453951129574644e-06, "loss": 0.0063, "reward": 1.0898437723517418, "reward_std": 0.3697906183078885, "rewards/accuracy_reward": 0.1953125053551048, "rewards/format_reward": 0.8945312686264515, "step": 38 }, { "completion_length": 176.2083396911621, "epoch": 0.6652452025586354, "grad_norm": 0.37189719351321165, "kl": 0.1624755859375, "learning_rate": 5.895871945472434e-06, "loss": 0.0065, "reward": 1.1380208805203438, "reward_std": 0.34118577465415, "rewards/accuracy_reward": 0.20703125558793545, "rewards/format_reward": 0.9309896044433117, "step": 39 }, { "completion_length": 171.10677528381348, "epoch": 0.6823027718550106, "grad_norm": 0.3867912490480322, "kl": 0.15789794921875, "learning_rate": 5.352768279562315e-06, "loss": 0.0063, "reward": 1.1210937947034836, "reward_std": 0.28729582112282515, "rewards/accuracy_reward": 0.1640625053551048, "rewards/format_reward": 0.9570312686264515, "step": 40 }, { "completion_length": 159.16016101837158, "epoch": 0.6993603411513859, "grad_norm": 0.3463972010603223, "kl": 0.162689208984375, "learning_rate": 4.826621858223431e-06, "loss": 0.0065, "reward": 1.1770833656191826, "reward_std": 0.2748530600219965, "rewards/accuracy_reward": 0.22135417209938169, "rewards/format_reward": 0.9557291865348816, "step": 41 }, { "completion_length": 155.342453956604, "epoch": 0.7164179104477612, "grad_norm": 0.31066796112570005, "kl": 0.16387939453125, "learning_rate": 4.319352532688444e-06, "loss": 0.0066, "reward": 1.1692708805203438, "reward_std": 0.24965191585943103, "rewards/accuracy_reward": 0.19010417256504297, "rewards/format_reward": 0.9791666828095913, "step": 42 }, { "completion_length": 147.61849355697632, "epoch": 0.7334754797441365, "grad_norm": 0.3883785701764709, "kl": 0.15966796875, "learning_rate": 3.832811273714569e-06, "loss": 0.0064, "reward": 1.194010466337204, "reward_std": 0.26059679966419935, "rewards/accuracy_reward": 0.20442709000781178, "rewards/format_reward": 0.9895833432674408, "step": 43 }, { "completion_length": 160.03385829925537, "epoch": 0.7505330490405118, "grad_norm": 5788.508271348362, "kl": 278.14984130859375, "learning_rate": 3.3687734175920505e-06, "loss": 11.1208, "reward": 1.1757812947034836, "reward_std": 0.23907144693657756, "rewards/accuracy_reward": 0.18880208989139646, "rewards/format_reward": 0.986979179084301, "step": 44 }, { "completion_length": 149.94141149520874, "epoch": 0.767590618336887, "grad_norm": 0.36297405142580474, "kl": 0.164794921875, "learning_rate": 2.9289321881345257e-06, "loss": 0.0066, "reward": 1.199218787252903, "reward_std": 0.2734901886433363, "rewards/accuracy_reward": 0.21744792349636555, "rewards/format_reward": 0.9817708469927311, "step": 45 }, { "completion_length": 157.68620204925537, "epoch": 0.7846481876332623, "grad_norm": 0.3108491951827798, "kl": 0.16436767578125, "learning_rate": 2.514892518288988e-06, "loss": 0.0066, "reward": 1.1744792014360428, "reward_std": 0.21565480902791023, "rewards/accuracy_reward": 0.19010417233221233, "rewards/format_reward": 0.9843750149011612, "step": 46 }, { "completion_length": 168.7890682220459, "epoch": 0.8017057569296375, "grad_norm": 1.04029324716777, "kl": 0.16632080078125, "learning_rate": 2.1281651939094996e-06, "loss": 0.0067, "reward": 1.1197917014360428, "reward_std": 0.20990665443241596, "rewards/accuracy_reward": 0.13281250500585884, "rewards/format_reward": 0.9869791753590107, "step": 47 }, { "completion_length": 167.15625381469727, "epoch": 0.8187633262260128, "grad_norm": 0.32966812871466905, "kl": 0.15936279296875, "learning_rate": 1.7701613410634367e-06, "loss": 0.0064, "reward": 1.2018229588866234, "reward_std": 0.2573004774749279, "rewards/accuracy_reward": 0.22005208767950535, "rewards/format_reward": 0.9817708469927311, "step": 48 }, { "completion_length": 171.07682704925537, "epoch": 0.835820895522388, "grad_norm": 0.32208043175249607, "kl": 0.151123046875, "learning_rate": 1.4421872769855262e-06, "loss": 0.006, "reward": 1.203125037252903, "reward_std": 0.26521802693605423, "rewards/accuracy_reward": 0.22135417209938169, "rewards/format_reward": 0.9817708469927311, "step": 49 }, { "completion_length": 166.2864637374878, "epoch": 0.8528784648187633, "grad_norm": 0.34577456114094934, "kl": 0.16162109375, "learning_rate": 1.1454397434679022e-06, "loss": 0.0065, "reward": 1.2174479588866234, "reward_std": 0.2847161674872041, "rewards/accuracy_reward": 0.23307292023673654, "rewards/format_reward": 0.9843750111758709, "step": 50 }, { "completion_length": 176.26693153381348, "epoch": 0.8699360341151386, "grad_norm": 0.32710109344570104, "kl": 0.151611328125, "learning_rate": 8.810015400790994e-07, "loss": 0.0061, "reward": 1.1367187835276127, "reward_std": 0.2029442568309605, "rewards/accuracy_reward": 0.14713542093522847, "rewards/format_reward": 0.9895833395421505, "step": 51 }, { "completion_length": 181.84505653381348, "epoch": 0.8869936034115139, "grad_norm": 0.3181109150342723, "kl": 0.177215576171875, "learning_rate": 6.498375731458529e-07, "loss": 0.0071, "reward": 1.1692708618938923, "reward_std": 0.21676216088235378, "rewards/accuracy_reward": 0.18229167233221233, "rewards/format_reward": 0.9869791753590107, "step": 52 }, { "completion_length": 171.81901359558105, "epoch": 0.9040511727078892, "grad_norm": 0.3007348356231504, "kl": 0.15460205078125, "learning_rate": 4.5279133491454406e-07, "loss": 0.0062, "reward": 1.169270858168602, "reward_std": 0.26831901678815484, "rewards/accuracy_reward": 0.18880208872724324, "rewards/format_reward": 0.9804687686264515, "step": 53 }, { "completion_length": 165.6862015724182, "epoch": 0.9211087420042644, "grad_norm": 0.3465232635146208, "kl": 0.160491943359375, "learning_rate": 2.905818257394799e-07, "loss": 0.0064, "reward": 1.2343750223517418, "reward_std": 0.2902528368867934, "rewards/accuracy_reward": 0.2513020889600739, "rewards/format_reward": 0.983072929084301, "step": 54 }, { "completion_length": 185.47396278381348, "epoch": 0.9381663113006397, "grad_norm": 0.339986245997186, "kl": 0.151763916015625, "learning_rate": 1.6380093052856482e-07, "loss": 0.0061, "reward": 1.1757812909781933, "reward_std": 0.2616687403060496, "rewards/accuracy_reward": 0.1953125058207661, "rewards/format_reward": 0.9804687611758709, "step": 55 }, { "completion_length": 168.7773494720459, "epoch": 0.9552238805970149, "grad_norm": 0.33393014556768225, "kl": 0.152862548828125, "learning_rate": 7.291125901946027e-08, "loss": 0.0061, "reward": 1.2343750447034836, "reward_std": 0.2642471818253398, "rewards/accuracy_reward": 0.24479167396202683, "rewards/format_reward": 0.9895833432674408, "step": 56 }, { "completion_length": 175.51172637939453, "epoch": 0.9722814498933902, "grad_norm": 0.30356714080443054, "kl": 0.1541748046875, "learning_rate": 1.824445776682504e-08, "loss": 0.0062, "reward": 1.2239583656191826, "reward_std": 0.2539575123228133, "rewards/accuracy_reward": 0.24739584140479565, "rewards/format_reward": 0.9765625149011612, "step": 57 }, { "completion_length": 187.67057609558105, "epoch": 0.9893390191897654, "grad_norm": 0.3138605458712969, "kl": 0.152740478515625, "learning_rate": 0.0, "loss": 0.0061, "reward": 1.208333358168602, "reward_std": 0.24711204366758466, "rewards/accuracy_reward": 0.22786458861082792, "rewards/format_reward": 0.9804687649011612, "step": 58 }, { "epoch": 0.9893390191897654, "step": 58, "total_flos": 0.0, "train_loss": 0.20938398721150356, "train_runtime": 4954.7093, "train_samples_per_second": 1.514, "train_steps_per_second": 0.012 } ], "logging_steps": 1, "max_steps": 58, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }