{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9979779125836056, "eval_steps": 100, "global_step": 401, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 445.60470695495604, "epoch": 0.02488722974023954, "grad_norm": 0.4118676483631134, "kl": 0.010468816757202149, "learning_rate": 4.8780487804878055e-06, "loss": 0.0004, "reward": 0.6772321731783449, "reward_std": 0.40852332254871726, "rewards/equation_reward_func": 0.07633928939467297, "rewards/format_reward_func": 0.6008928816765546, "step": 10 }, { "completion_length": 277.97166414260863, "epoch": 0.04977445948047908, "grad_norm": 0.3340870440006256, "kl": 0.08794097900390625, "learning_rate": 9.756097560975611e-06, "loss": 0.0035, "reward": 1.1737723745405675, "reward_std": 0.2244849470909685, "rewards/equation_reward_func": 0.18337054466828703, "rewards/format_reward_func": 0.9904018051922321, "step": 20 }, { "completion_length": 358.0676491737366, "epoch": 0.07466168922071861, "grad_norm": 403145293824.0, "kl": 251659428.747583, "learning_rate": 1.4634146341463415e-05, "loss": 10058519.2, "reward": 1.1559152286499739, "reward_std": 0.3308718525338918, "rewards/equation_reward_func": 0.20848215216537938, "rewards/format_reward_func": 0.9474330730736256, "step": 30 }, { "completion_length": 173.60179430246353, "epoch": 0.09954891896095816, "grad_norm": 33.38664627075195, "kl": 1592.3597778320313, "learning_rate": 1.9512195121951222e-05, "loss": 63.6938, "reward": 0.9162946855649352, "reward_std": 0.4809370403178036, "rewards/equation_reward_func": 0.1517857214785181, "rewards/format_reward_func": 0.7645089654251933, "step": 40 }, { "completion_length": 105.79074110984803, "epoch": 0.1244361487011977, "grad_norm": 669.1779174804688, "kl": 19.69697265625, "learning_rate": 1.9969173337331283e-05, "loss": 0.7874, "reward": 0.621205386845395, "reward_std": 0.49133867099881173, "rewards/equation_reward_func": 0.14754464910365642, "rewards/format_reward_func": 0.4736607360187918, "step": 50 }, { "completion_length": 179.40748615264891, "epoch": 0.14932337844143723, "grad_norm": 0.7297883033752441, "kl": 3413.1394775390627, "learning_rate": 1.9862856015372315e-05, "loss": 136.4553, "reward": 0.7706473573809489, "reward_std": 0.3035780142992735, "rewards/equation_reward_func": 0.1546875070547685, "rewards/format_reward_func": 0.6159598465193994, "step": 60 }, { "completion_length": 74.70770426988602, "epoch": 0.17421060818167677, "grad_norm": 7532.27197265625, "kl": 47.843603515625, "learning_rate": 1.968147640378108e-05, "loss": 1.9131, "reward": 0.9377232604194432, "reward_std": 0.23382538077421486, "rewards/equation_reward_func": 0.17332590045407414, "rewards/format_reward_func": 0.7643973479513079, "step": 70 }, { "completion_length": 107.71473670005798, "epoch": 0.19909783792191632, "grad_norm": 155.04930114746094, "kl": 0.835009765625, "learning_rate": 1.9426414910921785e-05, "loss": 0.0334, "reward": 1.1287946943193674, "reward_std": 0.2256626692134887, "rewards/equation_reward_func": 0.1746651871711947, "rewards/format_reward_func": 0.9541295003145933, "step": 80 }, { "completion_length": 149.29431443214418, "epoch": 0.22398506766215587, "grad_norm": 0.2778171896934509, "kl": 0.97703857421875, "learning_rate": 1.9099612708765432e-05, "loss": 0.039, "reward": 1.1264509385451675, "reward_std": 0.24302657530643046, "rewards/equation_reward_func": 0.1992187604191713, "rewards/format_reward_func": 0.9272321715950966, "step": 90 }, { "completion_length": 127.52388949394226, "epoch": 0.2488722974023954, "grad_norm": 0.2528281807899475, "kl": 0.5005126953125, "learning_rate": 1.8703556959398998e-05, "loss": 0.02, "reward": 1.1936384476721287, "reward_std": 0.21707446863874794, "rewards/equation_reward_func": 0.21194197433069348, "rewards/format_reward_func": 0.9816964585334063, "step": 100 }, { "epoch": 0.2488722974023954, "eval_completion_length": 159.91052135275729, "eval_kl": 0.4822254713687151, "eval_loss": 0.019292756915092468, "eval_reward": 1.2420940272634922, "eval_reward_std": 0.21879247122303733, "eval_rewards/equation_reward_func": 0.2618465799442883, "eval_rewards/format_reward_func": 0.980247448609528, "eval_runtime": 2096.9421, "eval_samples_per_second": 2.384, "eval_steps_per_second": 0.085, "step": 100 }, { "completion_length": 197.92512073516846, "epoch": 0.27375952714263496, "grad_norm": 0.1797320544719696, "kl": 0.44224853515625, "learning_rate": 1.8241261886220155e-05, "loss": 0.0177, "reward": 1.290513451397419, "reward_std": 0.20567998164333404, "rewards/equation_reward_func": 0.3102678721304983, "rewards/format_reward_func": 0.9802455704659223, "step": 110 }, { "completion_length": 362.7595027923584, "epoch": 0.29864675688287445, "grad_norm": 0.15794633328914642, "kl": 2280653.147998047, "learning_rate": 1.7716245833877202e-05, "loss": 91078.7812, "reward": 1.2590402327477932, "reward_std": 0.1803759267553687, "rewards/equation_reward_func": 0.2713169776368886, "rewards/format_reward_func": 0.9877232406288385, "step": 120 }, { "completion_length": 328.3310419082642, "epoch": 0.323533986623114, "grad_norm": 7.756823539733887, "kl": 0.4082275390625, "learning_rate": 1.713250449154182e-05, "loss": 0.0163, "reward": 1.27165184058249, "reward_std": 0.20182186937890947, "rewards/equation_reward_func": 0.2963169774971902, "rewards/format_reward_func": 0.9753348492085934, "step": 130 }, { "completion_length": 211.79688444137574, "epoch": 0.34842121636335355, "grad_norm": 0.22612209618091583, "kl": 0.353564453125, "learning_rate": 1.6494480483301836e-05, "loss": 0.0141, "reward": 1.2380580876022578, "reward_std": 0.24032357237301766, "rewards/equation_reward_func": 0.29475447952281686, "rewards/format_reward_func": 0.943303607776761, "step": 140 }, { "completion_length": 154.25938205718995, "epoch": 0.3733084461035931, "grad_norm": 0.3829492926597595, "kl": 0.32027587890625, "learning_rate": 1.5807029557109398e-05, "loss": 0.0128, "reward": 1.2484375584870577, "reward_std": 0.17919168020598591, "rewards/equation_reward_func": 0.2755580496159382, "rewards/format_reward_func": 0.9728794828057289, "step": 150 }, { "completion_length": 235.15280055999756, "epoch": 0.39819567584383264, "grad_norm": 0.23761987686157227, "kl": 0.42236328125, "learning_rate": 1.5075383629607043e-05, "loss": 0.0169, "reward": 1.2582589872181416, "reward_std": 0.19487255290150643, "rewards/equation_reward_func": 0.30011162179289386, "rewards/format_reward_func": 0.9581473525613546, "step": 160 }, { "completion_length": 428.79577960968015, "epoch": 0.4230829055840722, "grad_norm": 0.22013823688030243, "kl": 0.33797607421875, "learning_rate": 1.4305110968082953e-05, "loss": 0.0135, "reward": 1.2656250596046448, "reward_std": 0.22840066901408135, "rewards/equation_reward_func": 0.311383943259716, "rewards/format_reward_func": 0.9542411014437675, "step": 170 }, { "completion_length": 447.583056640625, "epoch": 0.44797013532431174, "grad_norm": 0.18443113565444946, "kl": 0.34371337890625, "learning_rate": 1.3502073812594677e-05, "loss": 0.0137, "reward": 1.192633979767561, "reward_std": 0.258369813952595, "rewards/equation_reward_func": 0.26785715589066966, "rewards/format_reward_func": 0.9247768245637417, "step": 180 }, { "completion_length": 468.4525882720947, "epoch": 0.4728573650645512, "grad_norm": 0.18475499749183655, "kl": 0.336151123046875, "learning_rate": 1.267238376078257e-05, "loss": 0.0134, "reward": 1.150781299173832, "reward_std": 0.31902912934310734, "rewards/equation_reward_func": 0.26283483426086607, "rewards/format_reward_func": 0.887946467846632, "step": 190 }, { "completion_length": 205.72612552642823, "epoch": 0.4977445948047908, "grad_norm": 0.177314892411232, "kl": 0.29666748046875, "learning_rate": 1.1822355254921478e-05, "loss": 0.0119, "reward": 1.2328125555068254, "reward_std": 0.21277215834707022, "rewards/equation_reward_func": 0.2745535850757733, "rewards/format_reward_func": 0.9582589630037546, "step": 200 }, { "epoch": 0.4977445948047908, "eval_completion_length": 204.1021135212989, "eval_kl": 0.29685317737430167, "eval_loss": 0.011871281079947948, "eval_reward": 1.2749651410726195, "eval_reward_std": 0.18870500346135827, "eval_rewards/equation_reward_func": 0.29678772335778403, "eval_rewards/format_reward_func": 0.9781774192548996, "eval_runtime": 3019.4392, "eval_samples_per_second": 1.656, "eval_steps_per_second": 0.059, "step": 200 }, { "completion_length": 227.66161708831788, "epoch": 0.5226318245450303, "grad_norm": 0.21695467829704285, "kl": 0.288677978515625, "learning_rate": 1.0958457525202241e-05, "loss": 0.0115, "reward": 1.2738839823752641, "reward_std": 0.2082868866622448, "rewards/equation_reward_func": 0.3056919798487797, "rewards/format_reward_func": 0.968191996216774, "step": 210 }, { "completion_length": 206.59152727127076, "epoch": 0.5475190542852699, "grad_norm": 0.17778100073337555, "kl": 0.22813720703125, "learning_rate": 1.008726535498374e-05, "loss": 0.0091, "reward": 1.2793527320027351, "reward_std": 0.19789782017469407, "rewards/equation_reward_func": 0.30334822946460915, "rewards/format_reward_func": 0.976004496589303, "step": 220 }, { "completion_length": 189.20558891296386, "epoch": 0.5724062840255094, "grad_norm": 0.2292833924293518, "kl": 0.21630859375, "learning_rate": 9.215409042721553e-06, "loss": 0.0087, "reward": 1.2931920245289803, "reward_std": 0.1636760616209358, "rewards/equation_reward_func": 0.3075892997556366, "rewards/format_reward_func": 0.985602705553174, "step": 230 }, { "completion_length": 207.87523260116578, "epoch": 0.5972935137657489, "grad_norm": 0.24823932349681854, "kl": 0.255029296875, "learning_rate": 8.349523941393224e-06, "loss": 0.0102, "reward": 1.2991071995347738, "reward_std": 0.1962086133658886, "rewards/equation_reward_func": 0.32656251625157895, "rewards/format_reward_func": 0.9725446760654449, "step": 240 }, { "completion_length": 190.70279846191406, "epoch": 0.6221807435059885, "grad_norm": 0.2746059000492096, "kl": 0.279962158203125, "learning_rate": 7.496199959455584e-06, "loss": 0.0112, "reward": 1.2677455935627222, "reward_std": 0.22690184200182556, "rewards/equation_reward_func": 0.30736608678707855, "rewards/format_reward_func": 0.9603795003145933, "step": 250 }, { "completion_length": 163.25904750823975, "epoch": 0.647067973246228, "grad_norm": 0.6814990639686584, "kl": 0.43385009765625, "learning_rate": 6.661931407662292e-06, "loss": 0.0174, "reward": 1.2481027361005546, "reward_std": 0.2705018125008792, "rewards/equation_reward_func": 0.3234375156229362, "rewards/format_reward_func": 0.9246652159839869, "step": 260 }, { "completion_length": 120.41016173362732, "epoch": 0.6719552029864676, "grad_norm": 0.49509918689727783, "kl": 0.59742431640625, "learning_rate": 5.853067573437612e-06, "loss": 0.0239, "reward": 1.2579241640865804, "reward_std": 0.2596528060734272, "rewards/equation_reward_func": 0.3313616224215366, "rewards/format_reward_func": 0.9265625372529029, "step": 270 }, { "completion_length": 245.69275612831115, "epoch": 0.6968424327267071, "grad_norm": 1.4125251770019531, "kl": 1.049755859375, "learning_rate": 5.075764398965331e-06, "loss": 0.042, "reward": 1.2562500558793546, "reward_std": 0.2570431599859148, "rewards/equation_reward_func": 0.3255580512690358, "rewards/format_reward_func": 0.9306920021772385, "step": 280 }, { "completion_length": 229.22902803421022, "epoch": 0.7217296624669467, "grad_norm": 0.2942808270454407, "kl": 0.5860107421875, "learning_rate": 4.335937630751675e-06, "loss": 0.0234, "reward": 1.2649554163217545, "reward_std": 0.27522460916079583, "rewards/equation_reward_func": 0.3424107326194644, "rewards/format_reward_func": 0.9225446809083223, "step": 290 }, { "completion_length": 196.14632596969605, "epoch": 0.7466168922071862, "grad_norm": 0.22581090033054352, "kl": 0.246856689453125, "learning_rate": 3.6392177972223596e-06, "loss": 0.0099, "reward": 1.2797991633415222, "reward_std": 0.27192874858155847, "rewards/equation_reward_func": 0.34486608650768175, "rewards/format_reward_func": 0.9349330712109805, "step": 300 }, { "epoch": 0.7466168922071862, "eval_completion_length": 176.43523820418886, "eval_kl": 0.21617493016759776, "eval_loss": 0.00864693708717823, "eval_reward": 1.308285176421011, "eval_reward_std": 0.24197927808295416, "eval_rewards/equation_reward_func": 0.36153233883767155, "eval_rewards/format_reward_func": 0.9467528382493131, "eval_runtime": 4445.2454, "eval_samples_per_second": 1.125, "eval_steps_per_second": 0.04, "step": 300 }, { "completion_length": 182.55291066169738, "epoch": 0.7715041219474257, "grad_norm": 0.3403286635875702, "kl": 0.21319580078125, "learning_rate": 2.990907357001491e-06, "loss": 0.0085, "reward": 1.281919700279832, "reward_std": 0.25035269833169876, "rewards/equation_reward_func": 0.34084823006996884, "rewards/format_reward_func": 0.9410714626312255, "step": 310 }, { "completion_length": 180.25915956497192, "epoch": 0.7963913516876653, "grad_norm": 0.1979561448097229, "kl": 0.225823974609375, "learning_rate": 2.395940343999691e-06, "loss": 0.009, "reward": 1.3169643417000771, "reward_std": 0.23920484688133, "rewards/equation_reward_func": 0.363392874202691, "rewards/format_reward_func": 0.9535714633762836, "step": 320 }, { "completion_length": 163.8195384979248, "epoch": 0.8212785814279048, "grad_norm": 0.2086048573255539, "kl": 0.3494384765625, "learning_rate": 1.858844816436809e-06, "loss": 0.014, "reward": 1.2976563081145287, "reward_std": 0.20689995544962586, "rewards/equation_reward_func": 0.3239955520257354, "rewards/format_reward_func": 0.9736607488244772, "step": 330 }, { "completion_length": 173.9462127685547, "epoch": 0.8461658111681444, "grad_norm": 0.21505990624427795, "kl": 0.2422607421875, "learning_rate": 1.3837083955847418e-06, "loss": 0.0097, "reward": 1.3164063077419996, "reward_std": 0.21665938571095467, "rewards/equation_reward_func": 0.34776787337614223, "rewards/format_reward_func": 0.968638426810503, "step": 340 }, { "completion_length": 188.86429414749145, "epoch": 0.8710530409083839, "grad_norm": 0.19320468604564667, "kl": 0.267999267578125, "learning_rate": 9.74147156501396e-07, "loss": 0.0107, "reward": 1.2994420282542705, "reward_std": 0.23502404149621725, "rewards/equation_reward_func": 0.34453126683365554, "rewards/format_reward_func": 0.954910746589303, "step": 350 }, { "completion_length": 190.60637016296386, "epoch": 0.8959402706486235, "grad_norm": 0.22971031069755554, "kl": 0.2662841796875, "learning_rate": 6.332781075160244e-07, "loss": 0.0106, "reward": 1.3205357737839223, "reward_std": 0.2396992813795805, "rewards/equation_reward_func": 0.3675223397091031, "rewards/format_reward_func": 0.9530134283006191, "step": 360 }, { "completion_length": 175.11931610107422, "epoch": 0.920827500388863, "grad_norm": 0.5891013741493225, "kl": 0.50406494140625, "learning_rate": 3.6369546791377054e-07, "loss": 0.0202, "reward": 1.3478795260190963, "reward_std": 0.22201797575689852, "rewards/equation_reward_func": 0.38828126864973456, "rewards/format_reward_func": 0.9595982495695352, "step": 370 }, { "completion_length": 164.60090026855468, "epoch": 0.9457147301291025, "grad_norm": 0.27209916710853577, "kl": 0.2961669921875, "learning_rate": 1.6745092436045495e-07, "loss": 0.0118, "reward": 1.3289063088595867, "reward_std": 0.229065563948825, "rewards/equation_reward_func": 0.36986609068699183, "rewards/format_reward_func": 0.9590402111411095, "step": 380 }, { "completion_length": 161.4010117530823, "epoch": 0.9706019598693421, "grad_norm": 0.316535085439682, "kl": 0.27200927734375, "learning_rate": 4.603801632821148e-08, "loss": 0.0109, "reward": 1.3257813084870578, "reward_std": 0.22753856042400003, "rewards/equation_reward_func": 0.36473215925507246, "rewards/format_reward_func": 0.9610491368919611, "step": 390 }, { "completion_length": 162.9387348175049, "epoch": 0.9954891896095815, "grad_norm": 0.37567567825317383, "kl": 0.29910888671875, "learning_rate": 3.807693582869032e-10, "loss": 0.012, "reward": 1.3266741678118705, "reward_std": 0.23514977092854678, "rewards/equation_reward_func": 0.3705357322003692, "rewards/format_reward_func": 0.956138427183032, "step": 400 }, { "epoch": 0.9954891896095815, "eval_completion_length": 161.6374764895306, "eval_kl": 0.3053203561452514, "eval_loss": 0.012214409187436104, "eval_reward": 1.3301078014533614, "eval_reward_std": 0.23096491629517943, "eval_rewards/equation_reward_func": 0.37320432741215775, "eval_rewards/format_reward_func": 0.9569034722930226, "eval_runtime": 3844.8591, "eval_samples_per_second": 1.3, "eval_steps_per_second": 0.047, "step": 400 }, { "completion_length": 163.43081092834473, "epoch": 0.9979779125836056, "kl": 0.3106689453125, "reward": 1.323660783469677, "reward_std": 0.228781514801085, "rewards/equation_reward_func": 0.3627232350409031, "rewards/format_reward_func": 0.9609375260770321, "step": 401, "total_flos": 0.0, "train_loss": 253112.25296123006, "train_runtime": 92260.7677, "train_samples_per_second": 0.488, "train_steps_per_second": 0.004 } ], "logging_steps": 10, "max_steps": 401, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }