qwen-2.5-3b-r1-countdown-offline_query_gen_solvable_only__train_query_gen-ckpt_175/trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.6606716828775923,
"eval_steps": 500,
"global_step": 450,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 425.11497688293457,
"epoch": 0.0029363185905670764,
"grad_norm": 0.10666760082017611,
"kl": 0.0,
"learning_rate": 7.142857142857142e-08,
"loss": -0.0,
"reward": 0.2834821557626128,
"reward_std": 0.4256630390882492,
"rewards/equation_reward_func": 0.005580357392318547,
"rewards/format_reward_func": 0.27790179941803217,
"step": 2
},
{
"completion_length": 397.3091697692871,
"epoch": 0.005872637181134153,
"grad_norm": 0.11172384920713097,
"kl": 0.0004100799560546875,
"learning_rate": 1.4285714285714285e-07,
"loss": 0.0,
"reward": 0.31696429941803217,
"reward_std": 0.4561923108994961,
"rewards/equation_reward_func": 0.004464285913854837,
"rewards/format_reward_func": 0.3125000139698386,
"step": 4
},
{
"completion_length": 387.9654178619385,
"epoch": 0.00880895577170123,
"grad_norm": 0.12273474882696524,
"kl": 0.00041091442108154297,
"learning_rate": 2.1428571428571426e-07,
"loss": 0.0,
"reward": 0.3236607266589999,
"reward_std": 0.4472240339964628,
"rewards/equation_reward_func": 0.006696428870782256,
"rewards/format_reward_func": 0.3169643012806773,
"step": 6
},
{
"completion_length": 397.4364013671875,
"epoch": 0.011745274362268306,
"grad_norm": 0.11874955075675916,
"kl": 0.0004132986068725586,
"learning_rate": 2.857142857142857e-07,
"loss": 0.0,
"reward": 0.3448660895228386,
"reward_std": 0.4638795666396618,
"rewards/equation_reward_func": 0.008928571827709675,
"rewards/format_reward_func": 0.33593751676380634,
"step": 8
},
{
"completion_length": 408.18640327453613,
"epoch": 0.014681592952835382,
"grad_norm": 0.1291957962223834,
"kl": 0.0004401206970214844,
"learning_rate": 3.5714285714285716e-07,
"loss": 0.0,
"reward": 0.3448660895228386,
"reward_std": 0.47657241858541965,
"rewards/equation_reward_func": 0.015625000814907253,
"rewards/format_reward_func": 0.3292410857975483,
"step": 10
},
{
"completion_length": 383.9297065734863,
"epoch": 0.01761791154340246,
"grad_norm": 0.11615535346233327,
"kl": 0.0005271434783935547,
"learning_rate": 4.285714285714285e-07,
"loss": 0.0,
"reward": 0.4218750223517418,
"reward_std": 0.4933844096958637,
"rewards/equation_reward_func": 0.012276786263100803,
"rewards/format_reward_func": 0.40959823317825794,
"step": 12
},
{
"completion_length": 395.9866237640381,
"epoch": 0.020554230133969537,
"grad_norm": 0.11975361330667507,
"kl": 0.0011701583862304688,
"learning_rate": 5e-07,
"loss": 0.0,
"reward": 0.4676339440047741,
"reward_std": 0.5037149954587221,
"rewards/equation_reward_func": 0.007812500349245965,
"rewards/format_reward_func": 0.45982144586741924,
"step": 14
},
{
"completion_length": 354.48550033569336,
"epoch": 0.02349054872453661,
"grad_norm": 0.11506849184873436,
"kl": 0.0016536712646484375,
"learning_rate": 4.999740409224932e-07,
"loss": 0.0,
"reward": 0.6116071678698063,
"reward_std": 0.4877460356801748,
"rewards/equation_reward_func": 0.011160714784637094,
"rewards/format_reward_func": 0.6004464514553547,
"step": 16
},
{
"completion_length": 358.6852779388428,
"epoch": 0.02642686731510369,
"grad_norm": 0.09667949769991004,
"kl": 0.00521087646484375,
"learning_rate": 4.998961690809627e-07,
"loss": 0.0,
"reward": 0.7633928954601288,
"reward_std": 0.40501935593783855,
"rewards/equation_reward_func": 0.006696428870782256,
"rewards/format_reward_func": 0.7566964663565159,
"step": 18
},
{
"completion_length": 372.5792598724365,
"epoch": 0.029363185905670764,
"grad_norm": 0.07439885917757044,
"kl": 0.0065975189208984375,
"learning_rate": 4.997664006472578e-07,
"loss": 0.0,
"reward": 0.8404018245637417,
"reward_std": 0.34173065423965454,
"rewards/equation_reward_func": 0.010044643306173384,
"rewards/format_reward_func": 0.8303571790456772,
"step": 20
},
{
"completion_length": 371.32590675354004,
"epoch": 0.03229950449623784,
"grad_norm": 0.06516039437584277,
"kl": 0.0086212158203125,
"learning_rate": 4.995847625707292e-07,
"loss": 0.0,
"reward": 0.8906250484287739,
"reward_std": 0.29503875970840454,
"rewards/equation_reward_func": 0.013392857741564512,
"rewards/format_reward_func": 0.8772321939468384,
"step": 22
},
{
"completion_length": 361.12055587768555,
"epoch": 0.03523582308680492,
"grad_norm": 0.06376645250487896,
"kl": 0.008119583129882812,
"learning_rate": 4.993512925726318e-07,
"loss": 0.0,
"reward": 0.9162946864962578,
"reward_std": 0.2622959101572633,
"rewards/equation_reward_func": 0.01785714377183467,
"rewards/format_reward_func": 0.8984375447034836,
"step": 24
},
{
"completion_length": 371.7355079650879,
"epoch": 0.03817214167737199,
"grad_norm": 0.05836390189597255,
"kl": 0.010738372802734375,
"learning_rate": 4.990660391382923e-07,
"loss": 0.0,
"reward": 0.9341518208384514,
"reward_std": 0.2500613871961832,
"rewards/equation_reward_func": 0.026785715483129025,
"rewards/format_reward_func": 0.9073661155998707,
"step": 26
},
{
"completion_length": 365.8471145629883,
"epoch": 0.04110846026793907,
"grad_norm": 0.04136502377843216,
"kl": 0.0107421875,
"learning_rate": 4.987290615070384e-07,
"loss": 0.0,
"reward": 0.9642857499420643,
"reward_std": 0.15522026224061847,
"rewards/equation_reward_func": 0.014508929336443543,
"rewards/format_reward_func": 0.9497768245637417,
"step": 28
},
{
"completion_length": 361.58260917663574,
"epoch": 0.04404477885850615,
"grad_norm": 0.03434405791900483,
"kl": 0.012409210205078125,
"learning_rate": 4.983404296598978e-07,
"loss": 0.0,
"reward": 0.9776786081492901,
"reward_std": 0.11330996686592698,
"rewards/equation_reward_func": 0.012276786379516125,
"rewards/format_reward_func": 0.9654018245637417,
"step": 30
},
{
"completion_length": 346.245548248291,
"epoch": 0.04698109744907322,
"grad_norm": 0.030663261177108603,
"kl": 0.015239715576171875,
"learning_rate": 4.979002243050646e-07,
"loss": 0.0,
"reward": 0.9955357573926449,
"reward_std": 0.1037649204954505,
"rewards/equation_reward_func": 0.018973215483129025,
"rewards/format_reward_func": 0.9765625335276127,
"step": 32
},
{
"completion_length": 334.4799270629883,
"epoch": 0.049917416039640304,
"grad_norm": 0.030854717221714126,
"kl": 0.0161285400390625,
"learning_rate": 4.974085368611381e-07,
"loss": 0.0,
"reward": 1.0156250596046448,
"reward_std": 0.09721619635820389,
"rewards/equation_reward_func": 0.0290178582072258,
"rewards/format_reward_func": 0.9866071790456772,
"step": 34
},
{
"completion_length": 336.73773765563965,
"epoch": 0.05285373463020738,
"grad_norm": 0.03327381233463755,
"kl": 0.014263153076171875,
"learning_rate": 4.968654694381379e-07,
"loss": 0.0,
"reward": 1.0122768357396126,
"reward_std": 0.08523162081837654,
"rewards/equation_reward_func": 0.021205358090810478,
"rewards/format_reward_func": 0.991071455180645,
"step": 36
},
{
"completion_length": 332.9218864440918,
"epoch": 0.05579005322077445,
"grad_norm": 0.02236423630507305,
"kl": 0.0166168212890625,
"learning_rate": 4.962711348162987e-07,
"loss": 0.0,
"reward": 0.9966518208384514,
"reward_std": 0.05468874936923385,
"rewards/equation_reward_func": 0.008928571827709675,
"rewards/format_reward_func": 0.9877232536673546,
"step": 38
},
{
"completion_length": 331.6495666503906,
"epoch": 0.05872637181134153,
"grad_norm": 0.02270153959336241,
"kl": 0.016017913818359375,
"learning_rate": 4.956256564226487e-07,
"loss": 0.0,
"reward": 1.0089286044239998,
"reward_std": 0.07463237782940269,
"rewards/equation_reward_func": 0.020089286845177412,
"rewards/format_reward_func": 0.9888393022119999,
"step": 40
},
{
"completion_length": 331.0167541503906,
"epoch": 0.06166269040190861,
"grad_norm": 0.025458258107277202,
"kl": 0.0180816650390625,
"learning_rate": 4.949291683053768e-07,
"loss": 0.0,
"reward": 1.0033482536673546,
"reward_std": 0.0625700019299984,
"rewards/equation_reward_func": 0.013392857741564512,
"rewards/format_reward_func": 0.9899553880095482,
"step": 42
},
{
"completion_length": 322.00559425354004,
"epoch": 0.06459900899247568,
"grad_norm": 0.03083960172379516,
"kl": 0.016857147216796875,
"learning_rate": 4.941818151059955e-07,
"loss": 0.0,
"reward": 1.0022321827709675,
"reward_std": 0.056821079924702644,
"rewards/equation_reward_func": 0.011160714784637094,
"rewards/format_reward_func": 0.9910714514553547,
"step": 44
},
{
"completion_length": 336.2935447692871,
"epoch": 0.06753532758304276,
"grad_norm": 0.03395926937254199,
"kl": 0.01757049560546875,
"learning_rate": 4.933837520293017e-07,
"loss": 0.0,
"reward": 1.0044643357396126,
"reward_std": 0.10183899104595184,
"rewards/equation_reward_func": 0.02232142968568951,
"rewards/format_reward_func": 0.9821428880095482,
"step": 46
},
{
"completion_length": 315.8538112640381,
"epoch": 0.07047164617360983,
"grad_norm": 0.03156746554424257,
"kl": 0.01998138427734375,
"learning_rate": 4.925351448111454e-07,
"loss": 0.0,
"reward": 1.0111607536673546,
"reward_std": 0.06636612536385655,
"rewards/equation_reward_func": 0.01785714365541935,
"rewards/format_reward_func": 0.9933035895228386,
"step": 48
},
{
"completion_length": 306.9486770629883,
"epoch": 0.07340796476417691,
"grad_norm": 0.02962714783243483,
"kl": 0.019634246826171875,
"learning_rate": 4.91636169684011e-07,
"loss": 0.0,
"reward": 1.0100446864962578,
"reward_std": 0.06970830773934722,
"rewards/equation_reward_func": 0.018973215366713703,
"rewards/format_reward_func": 0.9910714402794838,
"step": 50
},
{
"completion_length": 314.1015815734863,
"epoch": 0.07634428335474398,
"grad_norm": 0.029626162068749,
"kl": 0.01953125,
"learning_rate": 4.906870133404186e-07,
"loss": 0.0,
"reward": 1.0078125298023224,
"reward_std": 0.0697083086706698,
"rewards/equation_reward_func": 0.018973215483129025,
"rewards/format_reward_func": 0.9888393059372902,
"step": 52
},
{
"completion_length": 324.9698791503906,
"epoch": 0.07928060194531107,
"grad_norm": 0.02581923255114933,
"kl": 0.03170013427734375,
"learning_rate": 4.896878728941531e-07,
"loss": 0.0,
"reward": 1.0055804029107094,
"reward_std": 0.061442055739462376,
"rewards/equation_reward_func": 0.014508929220028222,
"rewards/format_reward_func": 0.9910714477300644,
"step": 54
},
{
"completion_length": 313.5446529388428,
"epoch": 0.08221692053587815,
"grad_norm": 0.0268506054426233,
"kl": 0.0242767333984375,
"learning_rate": 4.886389558393284e-07,
"loss": 0.0,
"reward": 1.0100446939468384,
"reward_std": 0.0723005011677742,
"rewards/equation_reward_func": 0.02008928672876209,
"rewards/format_reward_func": 0.989955373108387,
"step": 56
},
{
"completion_length": 311.1138515472412,
"epoch": 0.08515323912644522,
"grad_norm": 0.026590096596338642,
"kl": 0.0207061767578125,
"learning_rate": 4.875404800072976e-07,
"loss": 0.0,
"reward": 1.0044643208384514,
"reward_std": 0.05610500229522586,
"rewards/equation_reward_func": 0.01339285762514919,
"rewards/format_reward_func": 0.9910714477300644,
"step": 58
},
{
"completion_length": 299.32813835144043,
"epoch": 0.0880895577170123,
"grad_norm": 0.029592636449534623,
"kl": 0.022125244140625,
"learning_rate": 4.86392673521415e-07,
"loss": 0.0,
"reward": 1.020089328289032,
"reward_std": 0.06914377678185701,
"rewards/equation_reward_func": 0.024553572526201606,
"rewards/format_reward_func": 0.9955357313156128,
"step": 60
},
{
"completion_length": 297.50894355773926,
"epoch": 0.09102587630757937,
"grad_norm": 0.0345348188639915,
"kl": 0.02149200439453125,
"learning_rate": 4.851957747496606e-07,
"loss": 0.0,
"reward": 1.0078125447034836,
"reward_std": 0.07301658112555742,
"rewards/equation_reward_func": 0.01785714365541935,
"rewards/format_reward_func": 0.9899553805589676,
"step": 62
},
{
"completion_length": 301.83595275878906,
"epoch": 0.09396219489814644,
"grad_norm": 0.024731850879510856,
"kl": 0.0228271484375,
"learning_rate": 4.839500322551386e-07,
"loss": 0.0,
"reward": 1.0156250298023224,
"reward_std": 0.05877388082444668,
"rewards/equation_reward_func": 0.01897321513388306,
"rewards/format_reward_func": 0.996651791036129,
"step": 64
},
{
"completion_length": 304.755597114563,
"epoch": 0.09689851348871352,
"grad_norm": 0.022582520854879787,
"kl": 0.022369384765625,
"learning_rate": 4.826557047444563e-07,
"loss": 0.0,
"reward": 1.0111607611179352,
"reward_std": 0.06549919955432415,
"rewards/equation_reward_func": 0.01897321525029838,
"rewards/format_reward_func": 0.9921875223517418,
"step": 66
},
{
"completion_length": 299.7221088409424,
"epoch": 0.09983483207928061,
"grad_norm": 0.02378726780795042,
"kl": 0.02381134033203125,
"learning_rate": 4.813130610139993e-07,
"loss": 0.0,
"reward": 1.0133928954601288,
"reward_std": 0.05174434743821621,
"rewards/equation_reward_func": 0.017857143888249993,
"rewards/format_reward_func": 0.9955357238650322,
"step": 68
},
{
"completion_length": 295.27344703674316,
"epoch": 0.10277115066984768,
"grad_norm": 0.03710159597518268,
"kl": 0.02559661865234375,
"learning_rate": 4.799223798941089e-07,
"loss": 0.0,
"reward": 1.0234375521540642,
"reward_std": 0.08428801316767931,
"rewards/equation_reward_func": 0.02901785832364112,
"rewards/format_reward_func": 0.994419664144516,
"step": 70
},
{
"completion_length": 293.92858505249023,
"epoch": 0.10570746926041476,
"grad_norm": 0.030422368236480923,
"kl": 0.0235748291015625,
"learning_rate": 4.78483950191177e-07,
"loss": 0.0,
"reward": 1.016741119325161,
"reward_std": 0.06475032959133387,
"rewards/equation_reward_func": 0.02120535832364112,
"rewards/format_reward_func": 0.9955357238650322,
"step": 72
},
{
"completion_length": 301.22099113464355,
"epoch": 0.10864378785098183,
"grad_norm": 0.027016440417082194,
"kl": 0.02487945556640625,
"learning_rate": 4.769980706276687e-07,
"loss": 0.0,
"reward": 1.0089285969734192,
"reward_std": 0.06354640237987041,
"rewards/equation_reward_func": 0.016741072293370962,
"rewards/format_reward_func": 0.9921875149011612,
"step": 74
},
{
"completion_length": 306.91072845458984,
"epoch": 0.1115801064415489,
"grad_norm": 0.03756361261599655,
"kl": 0.03522491455078125,
"learning_rate": 4.7546504978008595e-07,
"loss": 0.0,
"reward": 1.0133929029107094,
"reward_std": 0.06290700566023588,
"rewards/equation_reward_func": 0.018973215483129025,
"rewards/format_reward_func": 0.994419664144516,
"step": 76
},
{
"completion_length": 309.30805015563965,
"epoch": 0.11451642503211598,
"grad_norm": 0.03409602166374047,
"kl": 0.02864837646484375,
"learning_rate": 4.738852060148848e-07,
"loss": 0.0,
"reward": 1.004464328289032,
"reward_std": 0.07286503352224827,
"rewards/equation_reward_func": 0.016741072293370962,
"rewards/format_reward_func": 0.9877232313156128,
"step": 78
},
{
"completion_length": 295.35604095458984,
"epoch": 0.11745274362268306,
"grad_norm": 0.016373157837020508,
"kl": 0.0261993408203125,
"learning_rate": 4.722588674223593e-07,
"loss": 0.0,
"reward": 1.0044643133878708,
"reward_std": 0.04678636882454157,
"rewards/equation_reward_func": 0.011160714784637094,
"rewards/format_reward_func": 0.9933035969734192,
"step": 80
},
{
"completion_length": 294.49331283569336,
"epoch": 0.12038906221325013,
"grad_norm": 0.028656622522124857,
"kl": 0.02646636962890625,
"learning_rate": 4.70586371748506e-07,
"loss": 0.0,
"reward": 1.0145089775323868,
"reward_std": 0.05407622084021568,
"rewards/equation_reward_func": 0.01897321513388306,
"rewards/format_reward_func": 0.9955357313156128,
"step": 82
},
{
"completion_length": 295.83595085144043,
"epoch": 0.12332538080381722,
"grad_norm": 0.042234987959512645,
"kl": 0.027313232421875,
"learning_rate": 4.6886806632488363e-07,
"loss": 0.0,
"reward": 1.022321481257677,
"reward_std": 0.09619954135268927,
"rewards/equation_reward_func": 0.03125000174622983,
"rewards/format_reward_func": 0.9910714440047741,
"step": 84
},
{
"completion_length": 291.2533645629883,
"epoch": 0.12626169939438428,
"grad_norm": 0.023816703176273206,
"kl": 0.0269775390625,
"learning_rate": 4.6710430799648143e-07,
"loss": 0.0,
"reward": 1.0212053954601288,
"reward_std": 0.060498448088765144,
"rewards/equation_reward_func": 0.024553572409786284,
"rewards/format_reward_func": 0.9966517984867096,
"step": 86
},
{
"completion_length": 280.7332754135132,
"epoch": 0.12919801798495137,
"grad_norm": 0.0349694305046122,
"kl": 0.02933502197265625,
"learning_rate": 4.652954630476127e-07,
"loss": 0.0,
"reward": 1.0279018357396126,
"reward_std": 0.08335960982367396,
"rewards/equation_reward_func": 0.03348214493598789,
"rewards/format_reward_func": 0.994419664144516,
"step": 88
},
{
"completion_length": 282.4788055419922,
"epoch": 0.13213433657551846,
"grad_norm": 0.02737283392710255,
"kl": 0.03136444091796875,
"learning_rate": 4.6344190712584713e-07,
"loss": 0.0,
"reward": 1.0145089626312256,
"reward_std": 0.06711499718949199,
"rewards/equation_reward_func": 0.021205358440056443,
"rewards/format_reward_func": 0.9933035969734192,
"step": 90
},
{
"completion_length": 286.83037185668945,
"epoch": 0.13507065516608552,
"grad_norm": 0.03769576508641649,
"kl": 0.03195953369140625,
"learning_rate": 4.615440251639995e-07,
"loss": 0.0,
"reward": 1.0279018431901932,
"reward_std": 0.08161913510411978,
"rewards/equation_reward_func": 0.031250001629814506,
"rewards/format_reward_func": 0.9966517984867096,
"step": 92
},
{
"completion_length": 284.62947845458984,
"epoch": 0.1380069737566526,
"grad_norm": 0.026738706985139863,
"kl": 0.0301971435546875,
"learning_rate": 4.596022113001894e-07,
"loss": 0.0,
"reward": 1.0167411118745804,
"reward_std": 0.0534368259832263,
"rewards/equation_reward_func": 0.018973215366713703,
"rewards/format_reward_func": 0.9977678656578064,
"step": 94
},
{
"completion_length": 273.97992610931396,
"epoch": 0.14094329234721967,
"grad_norm": 0.03717968288620107,
"kl": 0.03003692626953125,
"learning_rate": 4.576168687959895e-07,
"loss": 0.0,
"reward": 1.02901791036129,
"reward_std": 0.07455569412559271,
"rewards/equation_reward_func": 0.033482144703157246,
"rewards/format_reward_func": 0.9955357313156128,
"step": 96
},
{
"completion_length": 283.43974590301514,
"epoch": 0.14387961093778676,
"grad_norm": 0.03419476151952092,
"kl": 0.032470703125,
"learning_rate": 4.555884099526793e-07,
"loss": 0.0,
"reward": 1.018973246216774,
"reward_std": 0.05700471764430404,
"rewards/equation_reward_func": 0.022321429918520153,
"rewards/format_reward_func": 0.9966517984867096,
"step": 98
},
{
"completion_length": 278.54130840301514,
"epoch": 0.14681592952835382,
"grad_norm": 0.03878824475168632,
"kl": 0.03301239013671875,
"learning_rate": 4.5351725602562174e-07,
"loss": 0.0,
"reward": 1.025669701397419,
"reward_std": 0.07384143397212029,
"rewards/equation_reward_func": 0.0279017873108387,
"rewards/format_reward_func": 0.9977678656578064,
"step": 100
},
{
"completion_length": 280.62055110931396,
"epoch": 0.1497522481189209,
"grad_norm": 0.02811906331283331,
"kl": 0.033050537109375,
"learning_rate": 4.514038371367791e-07,
"loss": 0.0,
"reward": 1.015625037252903,
"reward_std": 0.05238374415785074,
"rewards/equation_reward_func": 0.01897321525029838,
"rewards/format_reward_func": 0.9966517984867096,
"step": 102
},
{
"completion_length": 271.188627243042,
"epoch": 0.15268856670948797,
"grad_norm": 0.03367420492297424,
"kl": 0.03389739990234375,
"learning_rate": 4.4924859218538936e-07,
"loss": 0.0,
"reward": 1.020089328289032,
"reward_std": 0.0539246741682291,
"rewards/equation_reward_func": 0.02120535832364112,
"rewards/format_reward_func": 0.9988839328289032,
"step": 104
},
{
"completion_length": 274.41965198516846,
"epoch": 0.15562488530005505,
"grad_norm": 0.032704500738241556,
"kl": 0.0342559814453125,
"learning_rate": 4.470519687568185e-07,
"loss": 0.0,
"reward": 1.0156250298023224,
"reward_std": 0.06005267146974802,
"rewards/equation_reward_func": 0.021205357974395156,
"rewards/format_reward_func": 0.994419664144516,
"step": 106
},
{
"completion_length": 267.667423248291,
"epoch": 0.15856120389062214,
"grad_norm": 0.022983000288007625,
"kl": 0.03482818603515625,
"learning_rate": 4.4481442302960923e-07,
"loss": 0.0,
"reward": 1.0145089700818062,
"reward_std": 0.05625654757022858,
"rewards/equation_reward_func": 0.01785714377183467,
"rewards/format_reward_func": 0.9966517984867096,
"step": 108
},
{
"completion_length": 276.2734498977661,
"epoch": 0.1614975224811892,
"grad_norm": 0.03293451132142616,
"kl": 0.03582000732421875,
"learning_rate": 4.4253641968074505e-07,
"loss": 0.0,
"reward": 1.0212053954601288,
"reward_std": 0.051668363623321056,
"rewards/equation_reward_func": 0.02343750116415322,
"rewards/format_reward_func": 0.9977678582072258,
"step": 110
},
{
"completion_length": 259.26229190826416,
"epoch": 0.1644338410717563,
"grad_norm": 0.030451551813034692,
"kl": 0.0374908447265625,
"learning_rate": 4.402184317891501e-07,
"loss": 0.0,
"reward": 1.0279018357396126,
"reward_std": 0.07869063876569271,
"rewards/equation_reward_func": 0.029017858556471765,
"rewards/format_reward_func": 0.9988839328289032,
"step": 112
},
{
"completion_length": 263.8515748977661,
"epoch": 0.16737015966232335,
"grad_norm": 0.03266847523674648,
"kl": 0.03509521484375,
"learning_rate": 4.37860940737443e-07,
"loss": 0.0,
"reward": 1.024553619325161,
"reward_std": 0.06911690765991807,
"rewards/equation_reward_func": 0.026785715715959668,
"rewards/format_reward_func": 0.9977678656578064,
"step": 114
},
{
"completion_length": 270.2511262893677,
"epoch": 0.17030647825289044,
"grad_norm": 0.029203115735684546,
"kl": 0.035980224609375,
"learning_rate": 4.354644361119671e-07,
"loss": 0.0,
"reward": 1.0145089700818062,
"reward_std": 0.0429902458563447,
"rewards/equation_reward_func": 0.015625000814907253,
"rewards/format_reward_func": 0.9988839328289032,
"step": 116
},
{
"completion_length": 265.80135345458984,
"epoch": 0.1732427968434575,
"grad_norm": 0.03659231709237539,
"kl": 0.0381317138671875,
"learning_rate": 4.3302941560111716e-07,
"loss": 0.0,
"reward": 1.0189732685685158,
"reward_std": 0.0753056826069951,
"rewards/equation_reward_func": 0.025669644121080637,
"rewards/format_reward_func": 0.9933035895228386,
"step": 118
},
{
"completion_length": 261.29130935668945,
"epoch": 0.1761791154340246,
"grad_norm": 0.01985992912461338,
"kl": 0.03851318359375,
"learning_rate": 4.3055638489198236e-07,
"loss": 0.0,
"reward": 1.007812537252903,
"reward_std": 0.04881514888256788,
"rewards/equation_reward_func": 0.013392857974395156,
"rewards/format_reward_func": 0.994419664144516,
"step": 120
},
{
"completion_length": 258.831485748291,
"epoch": 0.17911543402459168,
"grad_norm": 0.028601661901634927,
"kl": 0.0393218994140625,
"learning_rate": 4.280458575653296e-07,
"loss": 0.0,
"reward": 1.0178571864962578,
"reward_std": 0.06534695206210017,
"rewards/equation_reward_func": 0.023437501513399184,
"rewards/format_reward_func": 0.994419664144516,
"step": 122
},
{
"completion_length": 257.6707715988159,
"epoch": 0.18205175261515874,
"grad_norm": 0.020698279128715157,
"kl": 0.04107666015625,
"learning_rate": 4.2549835498894665e-07,
"loss": 0.0,
"reward": 1.018973246216774,
"reward_std": 0.057493268977850676,
"rewards/equation_reward_func": 0.023437501047737896,
"rewards/format_reward_func": 0.9955357238650322,
"step": 124
},
{
"completion_length": 251.8906373977661,
"epoch": 0.18498807120572583,
"grad_norm": 0.022202862136424105,
"kl": 0.041351318359375,
"learning_rate": 4.229144062093679e-07,
"loss": 0.0,
"reward": 1.0234375447034836,
"reward_std": 0.0553896245546639,
"rewards/equation_reward_func": 0.023437501047737896,
"rewards/format_reward_func": 1.0,
"step": 126
},
{
"completion_length": 251.66853618621826,
"epoch": 0.1879243897962929,
"grad_norm": 0.032026257478678814,
"kl": 0.0420074462890625,
"learning_rate": 4.2029454784200675e-07,
"loss": 0.0,
"reward": 1.0245536267757416,
"reward_std": 0.057569249998778105,
"rewards/equation_reward_func": 0.024553572642616928,
"rewards/format_reward_func": 1.0,
"step": 128
},
{
"completion_length": 257.19309520721436,
"epoch": 0.19086070838685998,
"grad_norm": 0.035245007765867475,
"kl": 0.0410308837890625,
"learning_rate": 4.1763932395971433e-07,
"loss": 0.0,
"reward": 1.020089328289032,
"reward_std": 0.057341722305864096,
"rewards/equation_reward_func": 0.02343750128056854,
"rewards/format_reward_func": 0.9966517984867096,
"step": 130
},
{
"completion_length": 263.37054920196533,
"epoch": 0.19379702697742704,
"grad_norm": 0.036559422829769254,
"kl": 0.04827880859375,
"learning_rate": 4.1494928597979117e-07,
"loss": 0.0,
"reward": 1.024553619325161,
"reward_std": 0.07117325672879815,
"rewards/equation_reward_func": 0.026785715483129025,
"rewards/format_reward_func": 0.9977678582072258,
"step": 132
},
{
"completion_length": 250.35491847991943,
"epoch": 0.19673334556799413,
"grad_norm": 0.0355452816503214,
"kl": 0.04229736328125,
"learning_rate": 4.122249925494726e-07,
"loss": 0.0,
"reward": 1.023437537252903,
"reward_std": 0.07714970735833049,
"rewards/equation_reward_func": 0.026785715599544346,
"rewards/format_reward_func": 0.996651791036129,
"step": 134
},
{
"completion_length": 249.38951969146729,
"epoch": 0.19966966415856122,
"grad_norm": 0.04139312520171231,
"kl": 0.0432281494140625,
"learning_rate": 4.094670094299131e-07,
"loss": 0.0,
"reward": 1.032366119325161,
"reward_std": 0.07797456067055464,
"rewards/equation_reward_func": 0.03236607275903225,
"rewards/format_reward_func": 1.0,
"step": 136
},
{
"completion_length": 258.31921100616455,
"epoch": 0.20260598274912828,
"grad_norm": 0.04587399823733233,
"kl": 0.04254150390625,
"learning_rate": 4.066759093786931e-07,
"loss": 0.0,
"reward": 1.033482201397419,
"reward_std": 0.09353066422045231,
"rewards/equation_reward_func": 0.03794643084984273,
"rewards/format_reward_func": 0.9955357238650322,
"step": 138
},
{
"completion_length": 257.14510345458984,
"epoch": 0.20554230133969537,
"grad_norm": 0.03418838476363922,
"kl": 0.04620361328125,
"learning_rate": 4.038522720308732e-07,
"loss": 0.0,
"reward": 1.0223214775323868,
"reward_std": 0.061702375300228596,
"rewards/equation_reward_func": 0.02455357275903225,
"rewards/format_reward_func": 0.9977678656578064,
"step": 140
},
{
"completion_length": 257.2689838409424,
"epoch": 0.20847861993026243,
"grad_norm": 0.04383610634106021,
"kl": 0.0451202392578125,
"learning_rate": 4.009966837786194e-07,
"loss": 0.0,
"reward": 1.03683041036129,
"reward_std": 0.10074383299797773,
"rewards/equation_reward_func": 0.04241071583237499,
"rewards/format_reward_func": 0.9944196566939354,
"step": 142
},
{
"completion_length": 257.14287090301514,
"epoch": 0.21141493852082952,
"grad_norm": 0.03707968717170861,
"kl": 0.046173095703125,
"learning_rate": 3.981097376494259e-07,
"loss": 0.0,
"reward": 1.0200893208384514,
"reward_std": 0.0691437772475183,
"rewards/equation_reward_func": 0.02455357275903225,
"rewards/format_reward_func": 0.9955357313156128,
"step": 144
},
{
"completion_length": 256.9732303619385,
"epoch": 0.21435125711139658,
"grad_norm": 0.03163136571449981,
"kl": 0.0549163818359375,
"learning_rate": 3.951920331829592e-07,
"loss": 0.0001,
"reward": 1.0279018208384514,
"reward_std": 0.07846311014145613,
"rewards/equation_reward_func": 0.030133930034935474,
"rewards/format_reward_func": 0.9977678656578064,
"step": 146
},
{
"completion_length": 258.79242038726807,
"epoch": 0.21728757570196366,
"grad_norm": 0.03734197816068146,
"kl": 0.0498809814453125,
"learning_rate": 3.922441763065506e-07,
"loss": 0.0,
"reward": 1.0200893357396126,
"reward_std": 0.05246042646467686,
"rewards/equation_reward_func": 0.02008928661234677,
"rewards/format_reward_func": 1.0,
"step": 148
},
{
"completion_length": 265.50113010406494,
"epoch": 0.22022389429253075,
"grad_norm": 0.032988418690538694,
"kl": 0.0509033203125,
"learning_rate": 3.8926677920936093e-07,
"loss": 0.0001,
"reward": 1.0200893357396126,
"reward_std": 0.07917848788201809,
"rewards/equation_reward_func": 0.025669644004665315,
"rewards/format_reward_func": 0.9944196566939354,
"step": 150
},
{
"completion_length": 262.2790298461914,
"epoch": 0.2231602128830978,
"grad_norm": 0.03947725857640478,
"kl": 0.05499267578125,
"learning_rate": 3.862604602152464e-07,
"loss": 0.0001,
"reward": 1.0412946864962578,
"reward_std": 0.10787475202232599,
"rewards/equation_reward_func": 0.04687500232830644,
"rewards/format_reward_func": 0.994419664144516,
"step": 152
},
{
"completion_length": 270.6116189956665,
"epoch": 0.2260965314736649,
"grad_norm": 0.032374211940469313,
"kl": 0.0520782470703125,
"learning_rate": 3.8322584365434934e-07,
"loss": 0.0001,
"reward": 1.03683041036129,
"reward_std": 0.07872272981330752,
"rewards/equation_reward_func": 0.03683035902213305,
"rewards/format_reward_func": 1.0,
"step": 154
},
{
"completion_length": 261.8973340988159,
"epoch": 0.22903285006423196,
"grad_norm": 0.03908993116978629,
"kl": 0.053985595703125,
"learning_rate": 3.8016355973344173e-07,
"loss": 0.0001,
"reward": 1.0267857611179352,
"reward_std": 0.07741002831608057,
"rewards/equation_reward_func": 0.030133930151350796,
"rewards/format_reward_func": 0.9966517984867096,
"step": 156
},
{
"completion_length": 271.08595180511475,
"epoch": 0.23196916865479905,
"grad_norm": 0.043196101404256254,
"kl": 0.054473876953125,
"learning_rate": 3.7707424440504863e-07,
"loss": 0.0001,
"reward": 1.032366119325161,
"reward_std": 0.09447245439514518,
"rewards/equation_reward_func": 0.037946430034935474,
"rewards/format_reward_func": 0.9944196566939354,
"step": 158
},
{
"completion_length": 265.0502338409424,
"epoch": 0.2349054872453661,
"grad_norm": 0.03806362103891916,
"kl": 0.0577392578125,
"learning_rate": 3.739585392353787e-07,
"loss": 0.0001,
"reward": 1.0212054029107094,
"reward_std": 0.07966633653268218,
"rewards/equation_reward_func": 0.026785715599544346,
"rewards/format_reward_func": 0.994419664144516,
"step": 160
},
{
"completion_length": 268.6663064956665,
"epoch": 0.2378418058359332,
"grad_norm": 0.029412731757100335,
"kl": 0.0699005126953125,
"learning_rate": 3.7081709127108767e-07,
"loss": 0.0001,
"reward": 1.0256696827709675,
"reward_std": 0.08432010188698769,
"rewards/equation_reward_func": 0.03459821583237499,
"rewards/format_reward_func": 0.9910714440047741,
"step": 162
},
{
"completion_length": 286.89733505249023,
"epoch": 0.24077812442650026,
"grad_norm": 0.04278121737809733,
"kl": 0.0554046630859375,
"learning_rate": 3.6765055290490513e-07,
"loss": 0.0001,
"reward": 1.025669701397419,
"reward_std": 0.08428801316767931,
"rewards/equation_reward_func": 0.030133930034935474,
"rewards/format_reward_func": 0.9955357313156128,
"step": 164
},
{
"completion_length": 281.04465675354004,
"epoch": 0.24371444301706735,
"grad_norm": 0.03589008461446017,
"kl": 0.0649261474609375,
"learning_rate": 3.644595817401501e-07,
"loss": 0.0001,
"reward": 1.0290179178118706,
"reward_std": 0.08672865945845842,
"rewards/equation_reward_func": 0.0334821444703266,
"rewards/format_reward_func": 0.9955357238650322,
"step": 166
},
{
"completion_length": 280.0134057998657,
"epoch": 0.24665076160763444,
"grad_norm": 0.03504932595126223,
"kl": 0.0573883056640625,
"learning_rate": 3.6124484045416483e-07,
"loss": 0.0001,
"reward": 1.0156250447034836,
"reward_std": 0.06922045908868313,
"rewards/equation_reward_func": 0.0212053582072258,
"rewards/format_reward_func": 0.9944196566939354,
"step": 168
},
{
"completion_length": 284.7399673461914,
"epoch": 0.2495870801982015,
"grad_norm": 0.03348991366352892,
"kl": 0.0587158203125,
"learning_rate": 3.580069966606949e-07,
"loss": 0.0001,
"reward": 1.0312500521540642,
"reward_std": 0.08672866132110357,
"rewards/equation_reward_func": 0.03571428719442338,
"rewards/format_reward_func": 0.9955357238650322,
"step": 170
},
{
"completion_length": 287.6384048461914,
"epoch": 0.25252339878876856,
"grad_norm": 0.02297432211698741,
"kl": 0.0548095703125,
"learning_rate": 3.547467227712444e-07,
"loss": 0.0001,
"reward": 1.0212054029107094,
"reward_std": 0.068167376331985,
"rewards/equation_reward_func": 0.025669644586741924,
"rewards/format_reward_func": 0.9955357313156128,
"step": 172
},
{
"completion_length": 276.344877243042,
"epoch": 0.25545971737933565,
"grad_norm": 0.031249403617369484,
"kl": 0.058990478515625,
"learning_rate": 3.5146469585543386e-07,
"loss": 0.0001,
"reward": 1.0223214626312256,
"reward_std": 0.0465588397346437,
"rewards/equation_reward_func": 0.02232142968568951,
"rewards/format_reward_func": 1.0,
"step": 174
},
{
"completion_length": 287.0747871398926,
"epoch": 0.25839603596990274,
"grad_norm": 0.024363445225025017,
"kl": 0.0567626953125,
"learning_rate": 3.481615975003922e-07,
"loss": 0.0001,
"reward": 1.0223214700818062,
"reward_std": 0.05978166777640581,
"rewards/equation_reward_func": 0.02678571583237499,
"rewards/format_reward_func": 0.9955357313156128,
"step": 176
},
{
"completion_length": 273.2667541503906,
"epoch": 0.2613323545604698,
"grad_norm": 0.03673410404509551,
"kl": 0.0570526123046875,
"learning_rate": 3.448381136692089e-07,
"loss": 0.0001,
"reward": 1.0435268357396126,
"reward_std": 0.1020226301625371,
"rewards/equation_reward_func": 0.04799107392318547,
"rewards/format_reward_func": 0.9955357313156128,
"step": 178
},
{
"completion_length": 276.5803737640381,
"epoch": 0.2642686731510369,
"grad_norm": 0.03470857489299649,
"kl": 0.0600433349609375,
"learning_rate": 3.4149493455847897e-07,
"loss": 0.0001,
"reward": 1.0223214626312256,
"reward_std": 0.06948007736355066,
"rewards/equation_reward_func": 0.027901786961592734,
"rewards/format_reward_func": 0.994419664144516,
"step": 180
},
{
"completion_length": 283.47433948516846,
"epoch": 0.26720499174160395,
"grad_norm": 0.04127720885486165,
"kl": 0.0587921142578125,
"learning_rate": 3.3813275445496766e-07,
"loss": 0.0001,
"reward": 1.0267857611179352,
"reward_std": 0.08631567610427737,
"rewards/equation_reward_func": 0.03348214435391128,
"rewards/format_reward_func": 0.9933035895228386,
"step": 182
},
{
"completion_length": 282.8973331451416,
"epoch": 0.27014131033217104,
"grad_norm": 0.032665399093278785,
"kl": 0.0596160888671875,
"learning_rate": 3.347522715914262e-07,
"loss": 0.0001,
"reward": 1.0256696939468384,
"reward_std": 0.07425330020487309,
"rewards/equation_reward_func": 0.02901785878930241,
"rewards/format_reward_func": 0.9966517984867096,
"step": 184
},
{
"completion_length": 290.776798248291,
"epoch": 0.2730776289227381,
"grad_norm": 0.034926325047606295,
"kl": 0.0619354248046875,
"learning_rate": 3.313541880015877e-07,
"loss": 0.0001,
"reward": 1.0267857611179352,
"reward_std": 0.0910613308660686,
"rewards/equation_reward_func": 0.034598216065205634,
"rewards/format_reward_func": 0.9921875149011612,
"step": 186
},
{
"completion_length": 290.6216640472412,
"epoch": 0.2760139475133052,
"grad_norm": 0.03913319668173702,
"kl": 0.0630340576171875,
"learning_rate": 3.279392093743747e-07,
"loss": 0.0001,
"reward": 1.0267857685685158,
"reward_std": 0.09586183680221438,
"rewards/equation_reward_func": 0.033482144703157246,
"rewards/format_reward_func": 0.9933035895228386,
"step": 188
},
{
"completion_length": 286.97992515563965,
"epoch": 0.27895026610387225,
"grad_norm": 0.02680827432830071,
"kl": 0.0650177001953125,
"learning_rate": 3.245080449073459e-07,
"loss": 0.0001,
"reward": 1.0189732536673546,
"reward_std": 0.0477627688087523,
"rewards/equation_reward_func": 0.02008928661234677,
"rewards/format_reward_func": 0.9988839328289032,
"step": 190
},
{
"completion_length": 296.9609498977661,
"epoch": 0.28188658469443933,
"grad_norm": 0.05153256098850436,
"kl": 0.0635528564453125,
"learning_rate": 3.210614071594162e-07,
"loss": 0.0001,
"reward": 1.0491071939468384,
"reward_std": 0.12858914118260145,
"rewards/equation_reward_func": 0.054687502793967724,
"rewards/format_reward_func": 0.9944196566939354,
"step": 192
},
{
"completion_length": 289.1529150009155,
"epoch": 0.2848229032850064,
"grad_norm": 0.04303274228699889,
"kl": 0.0654144287109375,
"learning_rate": 3.1760001190287695e-07,
"loss": 0.0001,
"reward": 1.0345982611179352,
"reward_std": 0.10969929629936814,
"rewards/equation_reward_func": 0.041294644703157246,
"rewards/format_reward_func": 0.9933035895228386,
"step": 194
},
{
"completion_length": 271.6651954650879,
"epoch": 0.2877592218755735,
"grad_norm": 0.05046112376966487,
"kl": 0.0657958984375,
"learning_rate": 3.141245779747502e-07,
"loss": 0.0001,
"reward": 1.0412946939468384,
"reward_std": 0.09476668341085315,
"rewards/equation_reward_func": 0.04352678812574595,
"rewards/format_reward_func": 0.9977678656578064,
"step": 196
},
{
"completion_length": 281.21876430511475,
"epoch": 0.2906955404661406,
"grad_norm": 0.03956244273767483,
"kl": 0.0671844482421875,
"learning_rate": 3.106358271275056e-07,
"loss": 0.0001,
"reward": 1.0446429178118706,
"reward_std": 0.10356425913050771,
"rewards/equation_reward_func": 0.04910714505240321,
"rewards/format_reward_func": 0.9955357313156128,
"step": 198
},
{
"completion_length": 284.19421100616455,
"epoch": 0.29363185905670763,
"grad_norm": 0.04003291703794155,
"kl": 0.0689239501953125,
"learning_rate": 3.0713448387917227e-07,
"loss": 0.0001,
"reward": 1.032366119325161,
"reward_std": 0.10179621493443847,
"rewards/equation_reward_func": 0.039062502211891115,
"rewards/format_reward_func": 0.9933035895228386,
"step": 200
},
{
"completion_length": 289.67523765563965,
"epoch": 0.2965681776472747,
"grad_norm": 0.03818285499902066,
"kl": 0.066192626953125,
"learning_rate": 3.0362127536287636e-07,
"loss": 0.0001,
"reward": 1.0312500521540642,
"reward_std": 0.09165203105658293,
"rewards/equation_reward_func": 0.03906250128056854,
"rewards/format_reward_func": 0.9921875223517418,
"step": 202
},
{
"completion_length": 290.83595180511475,
"epoch": 0.2995044962378418,
"grad_norm": 0.04200317878906047,
"kl": 0.068603515625,
"learning_rate": 3.0009693117583523e-07,
"loss": 0.0001,
"reward": 1.0368304029107094,
"reward_std": 0.08616412943229079,
"rewards/equation_reward_func": 0.04017857275903225,
"rewards/format_reward_func": 0.9966517984867096,
"step": 204
},
{
"completion_length": 291.18751335144043,
"epoch": 0.3024408148284089,
"grad_norm": 0.036055247335712394,
"kl": 0.0648193359375,
"learning_rate": 2.965621832278401e-07,
"loss": 0.0001,
"reward": 1.035714328289032,
"reward_std": 0.08680282393470407,
"rewards/equation_reward_func": 0.04129464505240321,
"rewards/format_reward_func": 0.994419664144516,
"step": 206
},
{
"completion_length": 293.39175605773926,
"epoch": 0.30537713341897593,
"grad_norm": 0.04861484810687108,
"kl": 0.068359375,
"learning_rate": 2.9301776558925875e-07,
"loss": 0.0001,
"reward": 1.0323661267757416,
"reward_std": 0.10604698117822409,
"rewards/equation_reward_func": 0.041294644586741924,
"rewards/format_reward_func": 0.9910714477300644,
"step": 208
},
{
"completion_length": 287.9776916503906,
"epoch": 0.308313452009543,
"grad_norm": 0.03249802953930502,
"kl": 0.068572998046875,
"learning_rate": 2.894644143385885e-07,
"loss": 0.0001,
"reward": 1.0357143208384514,
"reward_std": 0.0992788840085268,
"rewards/equation_reward_func": 0.04352678789291531,
"rewards/format_reward_func": 0.9921875149011612,
"step": 210
},
{
"completion_length": 290.8493432998657,
"epoch": 0.3112497706001101,
"grad_norm": 0.03516633650510248,
"kl": 0.067108154296875,
"learning_rate": 2.859028674095937e-07,
"loss": 0.0001,
"reward": 1.0223214849829674,
"reward_std": 0.06850438052788377,
"rewards/equation_reward_func": 0.025669644004665315,
"rewards/format_reward_func": 0.9966517984867096,
"step": 212
},
{
"completion_length": 293.99108505249023,
"epoch": 0.3141860891906772,
"grad_norm": 0.037137142041911965,
"kl": 0.0703582763671875,
"learning_rate": 2.823338644380566e-07,
"loss": 0.0001,
"reward": 1.0435268431901932,
"reward_std": 0.09563360968604684,
"rewards/equation_reward_func": 0.045758930733427405,
"rewards/format_reward_func": 0.9977678656578064,
"step": 214
},
{
"completion_length": 290.6272430419922,
"epoch": 0.3171224077812443,
"grad_norm": 0.03653835097188836,
"kl": 0.07354736328125,
"learning_rate": 2.7875814660817504e-07,
"loss": 0.0001,
"reward": 1.0301339626312256,
"reward_std": 0.07902582315728068,
"rewards/equation_reward_func": 0.034598216181620955,
"rewards/format_reward_func": 0.9955357313156128,
"step": 216
},
{
"completion_length": 280.104923248291,
"epoch": 0.3200587263718113,
"grad_norm": 0.044884706205922956,
"kl": 0.077239990234375,
"learning_rate": 2.751764564986396e-07,
"loss": 0.0001,
"reward": 1.0379464700818062,
"reward_std": 0.10856953356415033,
"rewards/equation_reward_func": 0.04464285972062498,
"rewards/format_reward_func": 0.9933035895228386,
"step": 218
},
{
"completion_length": 287.18862533569336,
"epoch": 0.3229950449623784,
"grad_norm": 0.04383008320984735,
"kl": 0.074615478515625,
"learning_rate": 2.715895379284194e-07,
"loss": 0.0001,
"reward": 1.0435268208384514,
"reward_std": 0.11731589119881392,
"rewards/equation_reward_func": 0.05357143084984273,
"rewards/format_reward_func": 0.9899553805589676,
"step": 220
},
{
"completion_length": 276.37277603149414,
"epoch": 0.3259313635529455,
"grad_norm": 0.04837055458994073,
"kl": 0.076263427734375,
"learning_rate": 2.6799813580229174e-07,
"loss": 0.0001,
"reward": 1.0591518357396126,
"reward_std": 0.12979829125106335,
"rewards/equation_reward_func": 0.06361607415601611,
"rewards/format_reward_func": 0.9955357313156128,
"step": 222
},
{
"completion_length": 280.39175510406494,
"epoch": 0.3288676821435126,
"grad_norm": 0.04410099259171204,
"kl": 0.08013916015625,
"learning_rate": 2.6440299595614606e-07,
"loss": 0.0001,
"reward": 1.032366119325161,
"reward_std": 0.09281388577073812,
"rewards/equation_reward_func": 0.03906250186264515,
"rewards/format_reward_func": 0.9933035895228386,
"step": 224
},
{
"completion_length": 290.5971097946167,
"epoch": 0.3318040007340797,
"grad_norm": 0.039702613268333235,
"kl": 0.079376220703125,
"learning_rate": 2.6080486500209347e-07,
"loss": 0.0001,
"reward": 1.0267857387661934,
"reward_std": 0.10266906302422285,
"rewards/equation_reward_func": 0.039062502793967724,
"rewards/format_reward_func": 0.9877232424914837,
"step": 226
},
{
"completion_length": 286.54019355773926,
"epoch": 0.3347403193246467,
"grad_norm": 0.043480221809629085,
"kl": 0.07806396484375,
"learning_rate": 2.572044901734166e-07,
"loss": 0.0001,
"reward": 1.0446429029107094,
"reward_std": 0.12140736309811473,
"rewards/equation_reward_func": 0.054687502793967724,
"rewards/format_reward_func": 0.9899553880095482,
"step": 228
},
{
"completion_length": 290.90514373779297,
"epoch": 0.3376766379152138,
"grad_norm": 0.05242052531581984,
"kl": 0.082000732421875,
"learning_rate": 2.536026191693893e-07,
"loss": 0.0001,
"reward": 1.0435268357396126,
"reward_std": 0.11180294072255492,
"rewards/equation_reward_func": 0.05022321769502014,
"rewards/format_reward_func": 0.993303582072258,
"step": 230
},
{
"completion_length": 282.847110748291,
"epoch": 0.3406129565057809,
"grad_norm": 0.05191571705379443,
"kl": 0.0870361328125,
"learning_rate": 2.5e-07,
"loss": 0.0001,
"reward": 1.0446429178118706,
"reward_std": 0.09753446979448199,
"rewards/equation_reward_func": 0.04910714563447982,
"rewards/format_reward_func": 0.9955357313156128,
"step": 232
},
{
"completion_length": 294.40960693359375,
"epoch": 0.343549275096348,
"grad_norm": 0.0432690838716016,
"kl": 0.083526611328125,
"learning_rate": 2.4639738083061073e-07,
"loss": 0.0001,
"reward": 1.0256696864962578,
"reward_std": 0.08049006946384907,
"rewards/equation_reward_func": 0.033482144586741924,
"rewards/format_reward_func": 0.9921875149011612,
"step": 234
},
{
"completion_length": 280.33148670196533,
"epoch": 0.346485593686915,
"grad_norm": 0.038252828001278785,
"kl": 0.087615966796875,
"learning_rate": 2.4279550982658345e-07,
"loss": 0.0001,
"reward": 1.0357143580913544,
"reward_std": 0.09379028435796499,
"rewards/equation_reward_func": 0.04017857392318547,
"rewards/format_reward_func": 0.9955357313156128,
"step": 236
},
{
"completion_length": 295.5837182998657,
"epoch": 0.3494219122774821,
"grad_norm": 0.02719561274312582,
"kl": 0.086944580078125,
"learning_rate": 2.3919513499790646e-07,
"loss": 0.0001,
"reward": 1.0412946864962578,
"reward_std": 0.0848828162997961,
"rewards/equation_reward_func": 0.04464285972062498,
"rewards/format_reward_func": 0.9966517984867096,
"step": 238
},
{
"completion_length": 292.82367038726807,
"epoch": 0.3523582308680492,
"grad_norm": 0.04035208487514967,
"kl": 0.08807373046875,
"learning_rate": 2.3559700404385394e-07,
"loss": 0.0001,
"reward": 1.0334821790456772,
"reward_std": 0.10879184119403362,
"rewards/equation_reward_func": 0.04017857345752418,
"rewards/format_reward_func": 0.9933035895228386,
"step": 240
},
{
"completion_length": 301.75671005249023,
"epoch": 0.35529454945861627,
"grad_norm": 0.03527319627385377,
"kl": 0.080841064453125,
"learning_rate": 2.3200186419770823e-07,
"loss": 0.0001,
"reward": 1.047991119325161,
"reward_std": 0.0909366519190371,
"rewards/equation_reward_func": 0.05357143096625805,
"rewards/format_reward_func": 0.994419664144516,
"step": 242
},
{
"completion_length": 299.40849781036377,
"epoch": 0.35823086804918336,
"grad_norm": 0.03761243995367514,
"kl": 0.084014892578125,
"learning_rate": 2.284104620715807e-07,
"loss": 0.0001,
"reward": 1.0267857685685158,
"reward_std": 0.0821825498715043,
"rewards/equation_reward_func": 0.03348214423749596,
"rewards/format_reward_func": 0.9933035895228386,
"step": 244
},
{
"completion_length": 285.0535879135132,
"epoch": 0.3611671866397504,
"grad_norm": 0.03697547812539244,
"kl": 0.0888671875,
"learning_rate": 2.2482354350136043e-07,
"loss": 0.0001,
"reward": 1.0446428954601288,
"reward_std": 0.08375557232648134,
"rewards/equation_reward_func": 0.045758930733427405,
"rewards/format_reward_func": 0.9988839328289032,
"step": 246
},
{
"completion_length": 288.14063835144043,
"epoch": 0.3641035052303175,
"grad_norm": 0.04071648544501395,
"kl": 0.101226806640625,
"learning_rate": 2.2124185339182496e-07,
"loss": 0.0001,
"reward": 1.0613839700818062,
"reward_std": 0.12005118513479829,
"rewards/equation_reward_func": 0.06584821722935885,
"rewards/format_reward_func": 0.9955357313156128,
"step": 248
},
{
"completion_length": 291.4319305419922,
"epoch": 0.36703982382088457,
"grad_norm": 0.0419233106603903,
"kl": 0.083465576171875,
"learning_rate": 2.1766613556194344e-07,
"loss": 0.0001,
"reward": 1.0424107685685158,
"reward_std": 0.10954663250595331,
"rewards/equation_reward_func": 0.04910714505240321,
"rewards/format_reward_func": 0.9933035783469677,
"step": 250
},
{
"completion_length": 281.173002243042,
"epoch": 0.36997614241145166,
"grad_norm": 0.045983801668794504,
"kl": 0.087982177734375,
"learning_rate": 2.1409713259040628e-07,
"loss": 0.0001,
"reward": 1.0390625447034836,
"reward_std": 0.09796618251129985,
"rewards/equation_reward_func": 0.04241071711294353,
"rewards/format_reward_func": 0.9966517984867096,
"step": 252
},
{
"completion_length": 289.0078248977661,
"epoch": 0.3729124610020187,
"grad_norm": 0.04107695439935304,
"kl": 0.079986572265625,
"learning_rate": 2.105355856614115e-07,
"loss": 0.0001,
"reward": 1.0401786118745804,
"reward_std": 0.08883230574429035,
"rewards/equation_reward_func": 0.04575893038418144,
"rewards/format_reward_func": 0.994419664144516,
"step": 254
},
{
"completion_length": 285.1651887893677,
"epoch": 0.3758487795925858,
"grad_norm": 0.046756981687644784,
"kl": 0.083038330078125,
"learning_rate": 2.069822344107413e-07,
"loss": 0.0001,
"reward": 1.0435268357396126,
"reward_std": 0.09725010581314564,
"rewards/equation_reward_func": 0.04799107403960079,
"rewards/format_reward_func": 0.9955357313156128,
"step": 256
},
{
"completion_length": 297.3203296661377,
"epoch": 0.37878509818315287,
"grad_norm": 0.0392014960390643,
"kl": 0.080810546875,
"learning_rate": 2.034378167721599e-07,
"loss": 0.0001,
"reward": 1.03683041036129,
"reward_std": 0.09191235108301044,
"rewards/equation_reward_func": 0.041294644586741924,
"rewards/format_reward_func": 0.9955357238650322,
"step": 258
},
{
"completion_length": 298.24443340301514,
"epoch": 0.38172141677371996,
"grad_norm": 0.047185735730817774,
"kl": 0.080322265625,
"learning_rate": 1.9990306882416485e-07,
"loss": 0.0001,
"reward": 1.063616119325161,
"reward_std": 0.11580956913530827,
"rewards/equation_reward_func": 0.06696428812574595,
"rewards/format_reward_func": 0.9966517984867096,
"step": 260
},
{
"completion_length": 309.2422037124634,
"epoch": 0.38465773536428705,
"grad_norm": 0.044908671018604095,
"kl": 0.07867431640625,
"learning_rate": 1.9637872463712362e-07,
"loss": 0.0001,
"reward": 1.063616119325161,
"reward_std": 0.12546450505033135,
"rewards/equation_reward_func": 0.0680803598370403,
"rewards/format_reward_func": 0.9955357313156128,
"step": 262
},
{
"completion_length": 322.2745637893677,
"epoch": 0.3875940539548541,
"grad_norm": 0.0532866767137098,
"kl": 0.080291748046875,
"learning_rate": 1.9286551612082773e-07,
"loss": 0.0001,
"reward": 1.0580357611179352,
"reward_std": 0.1573281823657453,
"rewards/equation_reward_func": 0.07254464575089514,
"rewards/format_reward_func": 0.9854911044239998,
"step": 264
},
{
"completion_length": 310.11050605773926,
"epoch": 0.39053037254542117,
"grad_norm": 0.0314648384232076,
"kl": 0.0770263671875,
"learning_rate": 1.8936417287249446e-07,
"loss": 0.0001,
"reward": 1.0345982685685158,
"reward_std": 0.1127313463948667,
"rewards/equation_reward_func": 0.0424107164144516,
"rewards/format_reward_func": 0.9921875149011612,
"step": 266
},
{
"completion_length": 316.63059425354004,
"epoch": 0.39346669113598826,
"grad_norm": 0.03920885901252771,
"kl": 0.080841064453125,
"learning_rate": 1.8587542202524985e-07,
"loss": 0.0001,
"reward": 1.0513393431901932,
"reward_std": 0.1442212238907814,
"rewards/equation_reward_func": 0.06473214633297175,
"rewards/format_reward_func": 0.9866071864962578,
"step": 268
},
{
"completion_length": 330.75001525878906,
"epoch": 0.39640300972655534,
"grad_norm": 0.03988966842604354,
"kl": 0.07989501953125,
"learning_rate": 1.82399988097123e-07,
"loss": 0.0001,
"reward": 1.041294690221548,
"reward_std": 0.15050739981234074,
"rewards/equation_reward_func": 0.059151788242161274,
"rewards/format_reward_func": 0.9821428880095482,
"step": 270
},
{
"completion_length": 323.02680015563965,
"epoch": 0.39933932831712243,
"grad_norm": 0.03862862085447584,
"kl": 0.07598876953125,
"learning_rate": 1.7893859284058378e-07,
"loss": 0.0001,
"reward": 1.0334821790456772,
"reward_std": 0.09288874920457602,
"rewards/equation_reward_func": 0.04129464423749596,
"rewards/format_reward_func": 0.9921875298023224,
"step": 272
},
{
"completion_length": 313.7120609283447,
"epoch": 0.40227564690768947,
"grad_norm": 0.04412481986958316,
"kl": 0.07879638671875,
"learning_rate": 1.7549195509265407e-07,
"loss": 0.0001,
"reward": 1.0747768431901932,
"reward_std": 0.14350473042577505,
"rewards/equation_reward_func": 0.07812500454019755,
"rewards/format_reward_func": 0.9966517984867096,
"step": 274
},
{
"completion_length": 322.9977798461914,
"epoch": 0.40521196549825655,
"grad_norm": 0.04375980104795101,
"kl": 0.0792236328125,
"learning_rate": 1.7206079062562536e-07,
"loss": 0.0001,
"reward": 1.0725446939468384,
"reward_std": 0.12892362661659718,
"rewards/equation_reward_func": 0.07924107450526208,
"rewards/format_reward_func": 0.9933035969734192,
"step": 276
},
{
"completion_length": 320.5089416503906,
"epoch": 0.40814828408882364,
"grad_norm": 0.03561206298117896,
"kl": 0.079742431640625,
"learning_rate": 1.6864581199841226e-07,
"loss": 0.0001,
"reward": 1.0223214626312256,
"reward_std": 0.11314251320436597,
"rewards/equation_reward_func": 0.037946430034935474,
"rewards/format_reward_func": 0.9843750298023224,
"step": 278
},
{
"completion_length": 341.03349685668945,
"epoch": 0.41108460267939073,
"grad_norm": 0.03974030944314017,
"kl": 0.078216552734375,
"learning_rate": 1.6524772840857388e-07,
"loss": 0.0001,
"reward": 1.0167411230504513,
"reward_std": 0.13769615534693003,
"rewards/equation_reward_func": 0.03906250244472176,
"rewards/format_reward_func": 0.9776786006987095,
"step": 280
},
{ | |
"completion_length": 334.4274673461914, | |
"epoch": 0.41402092126995776, | |
"grad_norm": 0.047293808732492636, | |
"kl": 0.079193115234375, | |
"learning_rate": 1.6186724554503237e-07, | |
"loss": 0.0001, | |
"reward": 1.0591518431901932, | |
"reward_std": 0.1618042946793139, | |
"rewards/equation_reward_func": 0.07254464610014111, | |
"rewards/format_reward_func": 0.9866071678698063, | |
"step": 282 | |
}, | |
{ | |
"completion_length": 327.1138553619385, | |
"epoch": 0.41695723986052485, | |
"grad_norm": 0.04331271460083135, | |
"kl": 0.078521728515625, | |
"learning_rate": 1.5850506544152103e-07, | |
"loss": 0.0001, | |
"reward": 1.0625000484287739, | |
"reward_std": 0.12944608414545655, | |
"rewards/equation_reward_func": 0.07142857520375401, | |
"rewards/format_reward_func": 0.9910714514553547, | |
"step": 284 | |
}, | |
{ | |
"completion_length": 335.18974685668945, | |
"epoch": 0.41989355845109194, | |
"grad_norm": 0.03802066786188892, | |
"kl": 0.080474853515625, | |
"learning_rate": 1.5516188633079107e-07, | |
"loss": 0.0001, | |
"reward": 1.0424107536673546, | |
"reward_std": 0.1544543677009642, | |
"rewards/equation_reward_func": 0.06250000279396772, | |
"rewards/format_reward_func": 0.9799107387661934, | |
"step": 286 | |
}, | |
{ | |
"completion_length": 347.41854095458984, | |
"epoch": 0.42282987704165903, | |
"grad_norm": 0.03187971705442499, | |
"kl": 0.08209228515625, | |
"learning_rate": 1.5183840249960784e-07, | |
"loss": 0.0001, | |
"reward": 1.0245536267757416, | |
"reward_std": 0.13140886183828115, | |
"rewards/equation_reward_func": 0.043526786961592734, | |
"rewards/format_reward_func": 0.9810268133878708, | |
"step": 288 | |
}, | |
{ | |
"completion_length": 343.9810371398926, | |
"epoch": 0.4257661956322261, | |
"grad_norm": 0.043022835792256374, | |
"kl": 0.0875244140625, | |
"learning_rate": 1.4853530414456612e-07, | |
"loss": 0.0001, | |
"reward": 1.0625000596046448, | |
"reward_std": 0.1669562510214746, | |
"rewards/equation_reward_func": 0.07812500395812094, | |
"rewards/format_reward_func": 0.9843750298023224, | |
"step": 290 | |
}, | |
{ | |
"completion_length": 337.49108695983887, | |
"epoch": 0.42870251422279315, | |
"grad_norm": 0.047030760078363716, | |
"kl": 0.0819091796875, | |
"learning_rate": 1.4525327722875568e-07, | |
"loss": 0.0001, | |
"reward": 1.0602679029107094, | |
"reward_std": 0.17207134095951915, | |
"rewards/equation_reward_func": 0.07589286204893142, | |
"rewards/format_reward_func": 0.9843750335276127, | |
"step": 292 | |
}, | |
{ | |
"completion_length": 351.33818435668945, | |
"epoch": 0.43163883281336024, | |
"grad_norm": 0.03668725152945335, | |
"kl": 0.082733154296875, | |
"learning_rate": 1.4199300333930515e-07, | |
"loss": 0.0001, | |
"reward": 1.0680803954601288, | |
"reward_std": 0.203396650031209, | |
"rewards/equation_reward_func": 0.09151786146685481, | |
"rewards/format_reward_func": 0.9765625223517418, | |
"step": 294 | |
}, | |
{ | |
"completion_length": 346.4051513671875, | |
"epoch": 0.43457515140392733, | |
"grad_norm": 0.04950076381454808, | |
"kl": 0.086700439453125, | |
"learning_rate": 1.3875515954583523e-07, | |
"loss": 0.0001, | |
"reward": 1.0468750521540642, | |
"reward_std": 0.18728240253403783, | |
"rewards/equation_reward_func": 0.07254464691504836, | |
"rewards/format_reward_func": 0.9743303954601288, | |
"step": 296 | |
}, | |
{ | |
"completion_length": 360.35157775878906, | |
"epoch": 0.4375114699944944, | |
"grad_norm": 0.047052508644481995, | |
"kl": 0.0931396484375, | |
"learning_rate": 1.3554041825985e-07, | |
"loss": 0.0001, | |
"reward": 1.032366119325161, | |
"reward_std": 0.1628238232806325, | |
"rewards/equation_reward_func": 0.05803571699652821, | |
"rewards/format_reward_func": 0.9743303842842579, | |
"step": 298 | |
}, | |
{ | |
"completion_length": 341.84376525878906, | |
"epoch": 0.4404477885850615, | |
"grad_norm": 0.04197536421230838, | |
"kl": 0.087127685546875, | |
"learning_rate": 1.323494470950949e-07, | |
"loss": 0.0001, | |
"reward": 1.0412946939468384, | |
"reward_std": 0.1637354022823274, | |
"rewards/equation_reward_func": 0.061383931431919336, | |
"rewards/format_reward_func": 0.9799107536673546, | |
"step": 300 | |
}, | |
{ | |
"completion_length": 340.64063835144043, | |
"epoch": 0.44338410717562854, | |
"grad_norm": 0.053637375712495786, | |
"kl": 0.09027099609375, | |
"learning_rate": 1.2918290872891236e-07, | |
"loss": 0.0001, | |
"reward": 1.0725446976721287, | |
"reward_std": 0.1795735191553831, | |
"rewards/equation_reward_func": 0.08705357520375401, | |
"rewards/format_reward_func": 0.9854910969734192, | |
"step": 302 | |
}, | |
{ | |
"completion_length": 343.1964473724365, | |
"epoch": 0.4463204257661956, | |
"grad_norm": 0.04823925382374683, | |
"kl": 0.09429931640625, | |
"learning_rate": 1.260414607646213e-07, | |
"loss": 0.0001, | |
"reward": 1.0736607760190964, | |
"reward_std": 0.18405223218724132, | |
"rewards/equation_reward_func": 0.08928571816068143, | |
"rewards/format_reward_func": 0.9843750298023224, | |
"step": 304 | |
}, | |
{ | |
"completion_length": 350.75225257873535, | |
"epoch": 0.4492567443567627, | |
"grad_norm": 0.05097139551388257, | |
"kl": 0.089996337890625, | |
"learning_rate": 1.2292575559495143e-07, | |
"loss": 0.0001, | |
"reward": 1.072544701397419, | |
"reward_std": 0.19982971157878637, | |
"rewards/equation_reward_func": 0.09709821979049593, | |
"rewards/format_reward_func": 0.9754464663565159, | |
"step": 306 | |
}, | |
{ | |
"completion_length": 359.4218921661377, | |
"epoch": 0.4521930629473298, | |
"grad_norm": 0.04853008603673837, | |
"kl": 0.089630126953125, | |
"learning_rate": 1.1983644026655835e-07, | |
"loss": 0.0001, | |
"reward": 1.0814732648432255, | |
"reward_std": 0.19363146228715777, | |
"rewards/equation_reward_func": 0.09598214726429433, | |
"rewards/format_reward_func": 0.9854911006987095, | |
"step": 308 | |
}, | |
{ | |
"completion_length": 346.43081855773926, | |
"epoch": 0.45512938153789684, | |
"grad_norm": 0.04702497359438019, | |
"kl": 0.091400146484375, | |
"learning_rate": 1.1677415634565066e-07, | |
"loss": 0.0001, | |
"reward": 1.0591518357396126, | |
"reward_std": 0.17150792852044106, | |
"rewards/equation_reward_func": 0.07700893189758062, | |
"rewards/format_reward_func": 0.9821428954601288, | |
"step": 310 | |
}, | |
{ | |
"completion_length": 356.02010345458984, | |
"epoch": 0.4580657001284639, | |
"grad_norm": 0.034288823432535455, | |
"kl": 0.088470458984375, | |
"learning_rate": 1.1373953978475353e-07, | |
"loss": 0.0001, | |
"reward": 1.0446429140865803, | |
"reward_std": 0.17355853877961636, | |
"rewards/equation_reward_func": 0.07031250337604433, | |
"rewards/format_reward_func": 0.9743304029107094, | |
"step": 312 | |
}, | |
{ | |
"completion_length": 348.6417579650879, | |
"epoch": 0.461002018719031, | |
"grad_norm": 0.05696162778862769, | |
"kl": 0.09442138671875, | |
"learning_rate": 1.1073322079063913e-07, | |
"loss": 0.0001, | |
"reward": 1.059151828289032, | |
"reward_std": 0.17560010217130184, | |
"rewards/equation_reward_func": 0.07924107648432255, | |
"rewards/format_reward_func": 0.9799107499420643, | |
"step": 314 | |
}, | |
{ | |
"completion_length": 363.2265796661377, | |
"epoch": 0.4639383373095981, | |
"grad_norm": 0.054353334909736144, | |
"kl": 0.08917236328125, | |
"learning_rate": 1.0775582369344946e-07, | |
"loss": 0.0001, | |
"reward": 1.066964328289032, | |
"reward_std": 0.19891262240707874, | |
"rewards/equation_reward_func": 0.09263393236324191, | |
"rewards/format_reward_func": 0.9743303880095482, | |
"step": 316 | |
}, | |
{ | |
"completion_length": 343.60157585144043, | |
"epoch": 0.4668746559001652, | |
"grad_norm": 0.041973139726504065, | |
"kl": 0.093017578125, | |
"learning_rate": 1.0480796681704077e-07, | |
"loss": 0.0001, | |
"reward": 1.0535714775323868, | |
"reward_std": 0.1561298966407776, | |
"rewards/equation_reward_func": 0.07254464726429433, | |
"rewards/format_reward_func": 0.9810268133878708, | |
"step": 318 | |
}, | |
{ | |
"completion_length": 356.78126525878906, | |
"epoch": 0.4698109744907322, | |
"grad_norm": 0.04263629162591363, | |
"kl": 0.093994140625, | |
"learning_rate": 1.018902623505741e-07, | |
"loss": 0.0001, | |
"reward": 1.0602679066359997, | |
"reward_std": 0.1679113474674523, | |
"rewards/equation_reward_func": 0.0803571465658024, | |
"rewards/format_reward_func": 0.979910746216774, | |
"step": 320 | |
}, | |
{ | |
"completion_length": 334.065860748291, | |
"epoch": 0.4727472930812993, | |
"grad_norm": 0.04185249481712557, | |
"kl": 0.099578857421875, | |
"learning_rate": 9.900331622138063e-08, | |
"loss": 0.0001, | |
"reward": 1.0580357648432255, | |
"reward_std": 0.1624443898908794, | |
"rewards/equation_reward_func": 0.0736607180442661, | |
"rewards/format_reward_func": 0.9843750298023224, | |
"step": 322 | |
}, | |
{ | |
"completion_length": 364.9464416503906, | |
"epoch": 0.4756836116718664, | |
"grad_norm": 0.05018023548907757, | |
"kl": 0.09326171875, | |
"learning_rate": 9.614772796912681e-08, | |
"loss": 0.0001, | |
"reward": 1.0502232536673546, | |
"reward_std": 0.17889559408649802, | |
"rewards/equation_reward_func": 0.07924107415601611, | |
"rewards/format_reward_func": 0.9709821790456772, | |
"step": 324 | |
}, | |
{ | |
"completion_length": 361.39957427978516, | |
"epoch": 0.4786199302624335, | |
"grad_norm": 0.0408224554527788, | |
"kl": 0.09619140625, | |
"learning_rate": 9.332409062130686e-08, | |
"loss": 0.0001, | |
"reward": 1.0535714775323868, | |
"reward_std": 0.18046156875789165, | |
"rewards/equation_reward_func": 0.07589286041911691, | |
"rewards/format_reward_func": 0.9776785932481289, | |
"step": 326 | |
}, | |
{ | |
"completion_length": 373.2198791503906, | |
"epoch": 0.4815562488530005, | |
"grad_norm": 0.04378704975777743, | |
"kl": 0.09039306640625, | |
"learning_rate": 9.053299057008699e-08, | |
"loss": 0.0001, | |
"reward": 1.0379464700818062, | |
"reward_std": 0.1679329937323928, | |
"rewards/equation_reward_func": 0.06138393108267337, | |
"rewards/format_reward_func": 0.9765625409781933, | |
"step": 328 | |
}, | |
{ | |
"completion_length": 373.792423248291, | |
"epoch": 0.4844925674435676, | |
"grad_norm": 0.049976975281267676, | |
"kl": 0.0914306640625, | |
"learning_rate": 8.777500745052743e-08, | |
"loss": 0.0001, | |
"reward": 1.0758929178118706, | |
"reward_std": 0.18235793197527528, | |
"rewards/equation_reward_func": 0.09598214726429433, | |
"rewards/format_reward_func": 0.9799107387661934, | |
"step": 330 | |
}, | |
{ | |
"completion_length": 370.22434997558594, | |
"epoch": 0.4874288860341347, | |
"grad_norm": 0.04464940799424063, | |
"kl": 0.091278076171875, | |
"learning_rate": 8.505071402020892e-08, | |
"loss": 0.0001, | |
"reward": 1.0669643506407738, | |
"reward_std": 0.18469198187813163, | |
"rewards/equation_reward_func": 0.08928571664728224, | |
"rewards/format_reward_func": 0.9776785969734192, | |
"step": 332 | |
}, | |
{ | |
"completion_length": 382.16854095458984, | |
"epoch": 0.4903652046247018, | |
"grad_norm": 0.0496328152205412, | |
"kl": 0.132293701171875, | |
"learning_rate": 8.236067604028562e-08, | |
"loss": 0.0001, | |
"reward": 1.0513393431901932, | |
"reward_std": 0.1591361202299595, | |
"rewards/equation_reward_func": 0.07142857427243143, | |
"rewards/format_reward_func": 0.9799107536673546, | |
"step": 334 | |
}, | |
{ | |
"completion_length": 384.12167167663574, | |
"epoch": 0.4933015232152689, | |
"grad_norm": 0.04586325949575296, | |
"kl": 0.093780517578125, | |
"learning_rate": 7.970545215799327e-08, | |
"loss": 0.0001, | |
"reward": 1.0368303954601288, | |
"reward_std": 0.18249922152608633, | |
"rewards/equation_reward_func": 0.06919643189758062, | |
"rewards/format_reward_func": 0.9676339626312256, | |
"step": 336 | |
}, | |
{ | |
"completion_length": 371.5636348724365, | |
"epoch": 0.4962378418058359, | |
"grad_norm": 0.039612951202073324, | |
"kl": 0.094390869140625, | |
"learning_rate": 7.708559379063204e-08, | |
"loss": 0.0001, | |
"reward": 1.069196492433548, | |
"reward_std": 0.20001838542521, | |
"rewards/equation_reward_func": 0.09151786239817739, | |
"rewards/format_reward_func": 0.9776785932481289, | |
"step": 338 | |
}, | |
{ | |
"completion_length": 375.1752414703369, | |
"epoch": 0.499174160396403, | |
"grad_norm": 0.046616949041271845, | |
"kl": 0.092926025390625, | |
"learning_rate": 7.45016450110534e-08, | |
"loss": 0.0001, | |
"reward": 1.0725446864962578, | |
"reward_std": 0.1956021711230278, | |
"rewards/equation_reward_func": 0.09598214738070965, | |
"rewards/format_reward_func": 0.9765625298023224, | |
"step": 340 | |
}, | |
{ | |
"completion_length": 371.14622688293457, | |
"epoch": 0.50211047898697, | |
"grad_norm": 0.04763239881868573, | |
"kl": 0.0966796875, | |
"learning_rate": 7.195414243467029e-08, | |
"loss": 0.0001, | |
"reward": 1.0558036267757416, | |
"reward_std": 0.16503770695999265, | |
"rewards/equation_reward_func": 0.07254464668221772, | |
"rewards/format_reward_func": 0.983258955180645, | |
"step": 342 | |
}, | |
{ | |
"completion_length": 387.4989013671875, | |
"epoch": 0.5050467975775371, | |
"grad_norm": 0.04756655671271559, | |
"kl": 0.08740234375, | |
"learning_rate": 6.944361510801763e-08, | |
"loss": 0.0001, | |
"reward": 1.0658482685685158, | |
"reward_std": 0.16942863073199987, | |
"rewards/equation_reward_func": 0.08482143247965723, | |
"rewards/format_reward_func": 0.9810268245637417, | |
"step": 344 | |
}, | |
{ | |
"completion_length": 386.53461265563965, | |
"epoch": 0.5079831161681042, | |
"grad_norm": 0.044536873399056064, | |
"kl": 0.092315673828125, | |
"learning_rate": 6.697058439888283e-08, | |
"loss": 0.0001, | |
"reward": 1.0993304178118706, | |
"reward_std": 0.25071316212415695, | |
"rewards/equation_reward_func": 0.12946429091971368, | |
"rewards/format_reward_func": 0.9698660969734192, | |
"step": 346 | |
}, | |
{ | |
"completion_length": 380.17635345458984, | |
"epoch": 0.5109194347586713, | |
"grad_norm": 0.038330951190360854, | |
"kl": 0.09490966796875, | |
"learning_rate": 6.453556388803288e-08, | |
"loss": 0.0001, | |
"reward": 1.0825893357396126, | |
"reward_std": 0.19026875868439674, | |
"rewards/equation_reward_func": 0.0993303619325161, | |
"rewards/format_reward_func": 0.9832589589059353, | |
"step": 348 | |
}, | |
{ | |
"completion_length": 389.3169822692871, | |
"epoch": 0.5138557533492384, | |
"grad_norm": 0.042924839902815745, | |
"kl": 0.092254638671875, | |
"learning_rate": 6.213905926255697e-08, | |
"loss": 0.0001, | |
"reward": 1.0569197051227093, | |
"reward_std": 0.19485764298588037, | |
"rewards/equation_reward_func": 0.08482143247965723, | |
"rewards/format_reward_func": 0.9720982499420643, | |
"step": 350 | |
}, | |
{ | |
"completion_length": 386.6774711608887, | |
"epoch": 0.5167920719398055, | |
"grad_norm": 0.052338487607322216, | |
"kl": 0.09173583984375, | |
"learning_rate": 5.978156821084987e-08, | |
"loss": 0.0001, | |
"reward": 1.059151828289032, | |
"reward_std": 0.22638031467795372, | |
"rewards/equation_reward_func": 0.09040179057046771, | |
"rewards/format_reward_func": 0.968750037252903, | |
"step": 352 | |
}, | |
{ | |
"completion_length": 390.3426513671875, | |
"epoch": 0.5197283905303726, | |
"grad_norm": 0.05158535991793682, | |
"kl": 0.0892333984375, | |
"learning_rate": 5.7463580319254853e-08, | |
"loss": 0.0001, | |
"reward": 1.0535714775323868, | |
"reward_std": 0.20130603248253465, | |
"rewards/equation_reward_func": 0.08035714738070965, | |
"rewards/format_reward_func": 0.9732143171131611, | |
"step": 354 | |
}, | |
{ | |
"completion_length": 369.5725612640381, | |
"epoch": 0.5226647091209397, | |
"grad_norm": 0.05330391501700506, | |
"kl": 0.097869873046875, | |
"learning_rate": 5.518557697039081e-08, | |
"loss": 0.0001, | |
"reward": 1.0703125558793545, | |
"reward_std": 0.19206226477399468, | |
"rewards/equation_reward_func": 0.09375000512227416, | |
"rewards/format_reward_func": 0.9765625409781933, | |
"step": 356 | |
}, | |
{ | |
"completion_length": 398.66854667663574, | |
"epoch": 0.5256010277115067, | |
"grad_norm": 0.051887163767993155, | |
"kl": 0.091033935546875, | |
"learning_rate": 5.294803124318145e-08, | |
"loss": 0.0001, | |
"reward": 1.05245541036129, | |
"reward_std": 0.18004788551479578, | |
"rewards/equation_reward_func": 0.07812500337604433, | |
"rewards/format_reward_func": 0.9743303880095482, | |
"step": 358 | |
}, | |
{ | |
"completion_length": 404.6495723724365, | |
"epoch": 0.5285373463020738, | |
"grad_norm": 0.050939622333094535, | |
"kl": 0.095245361328125, | |
"learning_rate": 5.07514078146106e-08, | |
"loss": 0.0001, | |
"reward": 1.0546875484287739, | |
"reward_std": 0.2043363954871893, | |
"rewards/equation_reward_func": 0.08705357497092336, | |
"rewards/format_reward_func": 0.967633955180645, | |
"step": 360 | |
}, | |
{ | |
"completion_length": 388.80581855773926, | |
"epoch": 0.5314736648926408, | |
"grad_norm": 0.05380538494559486, | |
"kl": 0.093353271484375, | |
"learning_rate": 4.859616286322094e-08, | |
"loss": 0.0001, | |
"reward": 1.0770089849829674, | |
"reward_std": 0.20855529373511672, | |
"rewards/equation_reward_func": 0.09821429126895964, | |
"rewards/format_reward_func": 0.9787946604192257, | |
"step": 362 | |
}, | |
{ | |
"completion_length": 391.8716678619385, | |
"epoch": 0.5344099834832079, | |
"grad_norm": 0.053453712412842525, | |
"kl": 0.091156005859375, | |
"learning_rate": 4.648274397437829e-08, | |
"loss": 0.0001, | |
"reward": 1.0647321939468384, | |
"reward_std": 0.2162796063348651, | |
"rewards/equation_reward_func": 0.09263393317814916, | |
"rewards/format_reward_func": 0.972098246216774, | |
"step": 364 | |
}, | |
{ | |
"completion_length": 391.14622497558594, | |
"epoch": 0.537346302073775, | |
"grad_norm": 0.06186318024343802, | |
"kl": 0.095550537109375, | |
"learning_rate": 4.4411590047320617e-08, | |
"loss": 0.0001, | |
"reward": 1.0959821864962578, | |
"reward_std": 0.25807888340204954, | |
"rewards/equation_reward_func": 0.12611607741564512, | |
"rewards/format_reward_func": 0.9698660969734192, | |
"step": 366 | |
}, | |
{ | |
"completion_length": 377.6551513671875, | |
"epoch": 0.5402826206643421, | |
"grad_norm": 0.053715796741151546, | |
"kl": 0.099456787109375, | |
"learning_rate": 4.2383131204010494e-08, | |
"loss": 0.0001, | |
"reward": 1.12276791036129, | |
"reward_std": 0.2649730620905757, | |
"rewards/equation_reward_func": 0.14508929126895964, | |
"rewards/format_reward_func": 0.9776785895228386, | |
"step": 368 | |
}, | |
{ | |
"completion_length": 403.7723388671875, | |
"epoch": 0.5432189392549092, | |
"grad_norm": 0.047203255127769794, | |
"kl": 0.098663330078125, | |
"learning_rate": 4.039778869981064e-08, | |
"loss": 0.0001, | |
"reward": 1.08370541036129, | |
"reward_std": 0.20436841249465942, | |
"rewards/equation_reward_func": 0.10825893376022577, | |
"rewards/format_reward_func": 0.9754464589059353, | |
"step": 370 | |
}, | |
{ | |
"completion_length": 392.40403175354004, | |
"epoch": 0.5461552578454762, | |
"grad_norm": 0.05153714808744298, | |
"kl": 0.097930908203125, | |
"learning_rate": 3.845597483600049e-08, | |
"loss": 0.0001, | |
"reward": 1.1093750447034836, | |
"reward_std": 0.2366180717945099, | |
"rewards/equation_reward_func": 0.13058036309666932, | |
"rewards/format_reward_func": 0.9787946753203869, | |
"step": 372 | |
}, | |
{ | |
"completion_length": 415.2890796661377, | |
"epoch": 0.5490915764360433, | |
"grad_norm": 0.038295429751397964, | |
"kl": 0.10107421875, | |
"learning_rate": 3.655809287415284e-08, | |
"loss": 0.0001, | |
"reward": 1.0401786118745804, | |
"reward_std": 0.19542105589061975, | |
"rewards/equation_reward_func": 0.07589286018628627, | |
"rewards/format_reward_func": 0.9642857387661934, | |
"step": 374 | |
}, | |
{ | |
"completion_length": 403.9542598724365, | |
"epoch": 0.5520278950266104, | |
"grad_norm": 0.04692015937158236, | |
"kl": 0.093994140625, | |
"learning_rate": 3.4704536952387285e-08, | |
"loss": 0.0001, | |
"reward": 1.0703125521540642, | |
"reward_std": 0.18491909513249993, | |
"rewards/equation_reward_func": 0.089285719092004, | |
"rewards/format_reward_func": 0.9810268208384514, | |
"step": 376 | |
}, | |
{ | |
"completion_length": 406.22546577453613, | |
"epoch": 0.5549642136171775, | |
"grad_norm": 0.04774051648965683, | |
"kl": 0.094635009765625, | |
"learning_rate": 3.2895692003518575e-08, | |
"loss": 0.0001, | |
"reward": 1.058035746216774, | |
"reward_std": 0.23594195628538728, | |
"rewards/equation_reward_func": 0.09933036204893142, | |
"rewards/format_reward_func": 0.9587053768336773, | |
"step": 378 | |
}, | |
{ | |
"completion_length": 386.1004638671875, | |
"epoch": 0.5579005322077445, | |
"grad_norm": 0.05077221624579398, | |
"kl": 0.099517822265625, | |
"learning_rate": 3.113193367511635e-08, | |
"loss": 0.0001, | |
"reward": 1.0736607611179352, | |
"reward_std": 0.19753747899085283, | |
"rewards/equation_reward_func": 0.09263393352739513, | |
"rewards/format_reward_func": 0.9810268171131611, | |
"step": 380 | |
}, | |
{ | |
"completion_length": 409.23438835144043, | |
"epoch": 0.5608368507983116, | |
"grad_norm": 0.04807883821547216, | |
"kl": 0.096954345703125, | |
"learning_rate": 2.9413628251493934e-08, | |
"loss": 0.0001, | |
"reward": 1.0703125670552254, | |
"reward_std": 0.20752025907859206, | |
"rewards/equation_reward_func": 0.09598214703146368, | |
"rewards/format_reward_func": 0.9743303842842579, | |
"step": 382 | |
}, | |
{ | |
"completion_length": 404.74109077453613, | |
"epoch": 0.5637731693888787, | |
"grad_norm": 0.05267361932547423, | |
"kl": 0.10076904296875, | |
"learning_rate": 2.774113257764066e-08, | |
"loss": 0.0001, | |
"reward": 1.0959821939468384, | |
"reward_std": 0.2620803425088525, | |
"rewards/equation_reward_func": 0.12946429033763707, | |
"rewards/format_reward_func": 0.9665178917348385, | |
"step": 384 | |
}, | |
{ | |
"completion_length": 398.66966247558594, | |
"epoch": 0.5667094879794458, | |
"grad_norm": 0.045965645927339774, | |
"kl": 0.093353271484375, | |
"learning_rate": 2.611479398511518e-08, | |
"loss": 0.0001, | |
"reward": 1.0959821939468384, | |
"reward_std": 0.22108421614393592, | |
"rewards/equation_reward_func": 0.1205357207218185, | |
"rewards/format_reward_func": 0.9754464514553547, | |
"step": 386 | |
}, | |
{ | |
"completion_length": 395.1495666503906, | |
"epoch": 0.5696458065700128, | |
"grad_norm": 0.05174221228956814, | |
"kl": 0.099700927734375, | |
"learning_rate": 2.4534950219914057e-08, | |
"loss": 0.0001, | |
"reward": 1.1383929029107094, | |
"reward_std": 0.25243138894438744, | |
"rewards/equation_reward_func": 0.16183036472648382, | |
"rewards/format_reward_func": 0.9765625260770321, | |
"step": 388 | |
}, | |
{ | |
"completion_length": 406.16854667663574, | |
"epoch": 0.5725821251605799, | |
"grad_norm": 0.05019021020140579, | |
"kl": 0.09979248046875, | |
"learning_rate": 2.300192937233128e-08, | |
"loss": 0.0001, | |
"reward": 1.0892857573926449, | |
"reward_std": 0.21054052747786045, | |
"rewards/equation_reward_func": 0.11049107869621366, | |
"rewards/format_reward_func": 0.9787946864962578, | |
"step": 390 | |
}, | |
{ | |
"completion_length": 388.2801513671875, | |
"epoch": 0.575518443751147, | |
"grad_norm": 0.04975482536547829, | |
"kl": 0.101959228515625, | |
"learning_rate": 2.1516049808822935e-08, | |
"loss": 0.0001, | |
"reward": 1.102678619325161, | |
"reward_std": 0.2276111119426787, | |
"rewards/equation_reward_func": 0.11941964831203222, | |
"rewards/format_reward_func": 0.9832589514553547, | |
"step": 392 | |
}, | |
{ | |
"completion_length": 397.13059997558594, | |
"epoch": 0.5784547623417141, | |
"grad_norm": 0.0482003869323553, | |
"kl": 0.092620849609375, | |
"learning_rate": 2.007762010589098e-08, | |
"loss": 0.0001, | |
"reward": 1.0915179140865803, | |
"reward_std": 0.22312113596126437, | |
"rewards/equation_reward_func": 0.11607143434230238, | |
"rewards/format_reward_func": 0.9754464589059353, | |
"step": 394 | |
}, | |
{ | |
"completion_length": 405.1875171661377, | |
"epoch": 0.5813910809322812, | |
"grad_norm": 0.058761232555913534, | |
"kl": 0.10430908203125, | |
"learning_rate": 1.8686938986000627e-08, | |
"loss": 0.0001, | |
"reward": 1.0970982611179352, | |
"reward_std": 0.2505375109612942, | |
"rewards/equation_reward_func": 0.12388393399305642, | |
"rewards/format_reward_func": 0.9732143208384514, | |
"step": 396 | |
}, | |
{ | |
"completion_length": 415.2120723724365, | |
"epoch": 0.5843273995228482, | |
"grad_norm": 0.04939032637161742, | |
"kl": 0.098541259765625, | |
"learning_rate": 1.734429525554365e-08, | |
"loss": 0.0001, | |
"reward": 1.0647321939468384, | |
"reward_std": 0.21378363063558936, | |
"rewards/equation_reward_func": 0.09709821967408061, | |
"rewards/format_reward_func": 0.9676339514553547, | |
"step": 398 | |
}, | |
{ | |
"completion_length": 402.12947845458984, | |
"epoch": 0.5872637181134153, | |
"grad_norm": 0.0582226075441157, | |
"kl": 0.101898193359375, | |
"learning_rate": 1.604996774486145e-08, | |
"loss": 0.0001, | |
"reward": 1.0747768357396126, | |
"reward_std": 0.23708053398877382, | |
"rewards/equation_reward_func": 0.10825893213041127, | |
"rewards/format_reward_func": 0.9665178917348385, | |
"step": 400 | |
}, | |
{ | |
"completion_length": 419.4643077850342, | |
"epoch": 0.5902000367039824, | |
"grad_norm": 0.05248475625533865, | |
"kl": 0.09747314453125, | |
"learning_rate": 1.4804225250339281e-08, | |
"loss": 0.0001, | |
"reward": 1.056919701397419, | |
"reward_std": 0.2246052329428494, | |
"rewards/equation_reward_func": 0.09040178917348385, | |
"rewards/format_reward_func": 0.9665178917348385, | |
"step": 402 | |
}, | |
{ | |
"completion_length": 406.70760917663574, | |
"epoch": 0.5931363552945494, | |
"grad_norm": 0.04188127192836941, | |
"kl": 0.10638427734375, | |
"learning_rate": 1.360732647858498e-08, | |
"loss": 0.0001, | |
"reward": 1.0580357648432255, | |
"reward_std": 0.1819898965768516, | |
"rewards/equation_reward_func": 0.08147321932483464, | |
"rewards/format_reward_func": 0.9765625260770321, | |
"step": 404 | |
}, | |
{ | |
"completion_length": 423.35604667663574, | |
"epoch": 0.5960726738851165, | |
"grad_norm": 0.046451328772035054, | |
"kl": 0.09967041015625, | |
"learning_rate": 1.2459519992702311e-08, | |
"loss": 0.0001, | |
"reward": 1.10714291036129, | |
"reward_std": 0.2326993877068162, | |
"rewards/equation_reward_func": 0.13169643562287092, | |
"rewards/format_reward_func": 0.9754464477300644, | |
"step": 406 | |
}, | |
{ | |
"completion_length": 392.1015796661377, | |
"epoch": 0.5990089924756836, | |
"grad_norm": 0.05080696425513128, | |
"kl": 0.105438232421875, | |
"learning_rate": 1.1361044160671629e-08, | |
"loss": 0.0001, | |
"reward": 1.1082589849829674, | |
"reward_std": 0.2411864292807877, | |
"rewards/equation_reward_func": 0.13058036274742335, | |
"rewards/format_reward_func": 0.9776786044239998, | |
"step": 408 | |
}, | |
{ | |
"completion_length": 396.29577445983887, | |
"epoch": 0.6019453110662507, | |
"grad_norm": 0.05576037207187397, | |
"kl": 0.09539794921875, | |
"learning_rate": 1.0312127105846947e-08, | |
"loss": 0.0001, | |
"reward": 1.1026786267757416, | |
"reward_std": 0.2508852328173816, | |
"rewards/equation_reward_func": 0.13281250547152013, | |
"rewards/format_reward_func": 0.9698660969734192, | |
"step": 410 | |
}, | |
{ | |
"completion_length": 399.7053737640381, | |
"epoch": 0.6048816296568178, | |
"grad_norm": 0.059732333663183326, | |
"kl": 0.10064697265625, | |
"learning_rate": 9.312986659581301e-09, | |
"loss": 0.0001, | |
"reward": 1.1049107611179352, | |
"reward_std": 0.23088762862607837, | |
"rewards/equation_reward_func": 0.12834822211880237, | |
"rewards/format_reward_func": 0.9765625298023224, | |
"step": 412 | |
}, | |
{ | |
"completion_length": 398.66184997558594, | |
"epoch": 0.6078179482473849, | |
"grad_norm": 0.05324309506242098, | |
"kl": 0.103302001953125, | |
"learning_rate": 8.363830315988945e-09, | |
"loss": 0.0001, | |
"reward": 1.102678619325161, | |
"reward_std": 0.2296189209446311, | |
"rewards/equation_reward_func": 0.12388393364381045, | |
"rewards/format_reward_func": 0.9787946864962578, | |
"step": 414 | |
}, | |
{ | |
"completion_length": 417.6897506713867, | |
"epoch": 0.6107542668379519, | |
"grad_norm": 0.04683799522299558, | |
"kl": 0.107086181640625, | |
"learning_rate": 7.46485518885462e-09, | |
"loss": 0.0001, | |
"reward": 1.0669643469154835, | |
"reward_std": 0.20856121368706226, | |
"rewards/equation_reward_func": 0.09263393329456449, | |
"rewards/format_reward_func": 0.9743303842842579, | |
"step": 416 | |
}, | |
{ | |
"completion_length": 405.1105079650879, | |
"epoch": 0.613690585428519, | |
"grad_norm": 0.049111930200072484, | |
"kl": 0.096710205078125, | |
"learning_rate": 6.616247970698319e-09, | |
"loss": 0.0001, | |
"reward": 1.09933041036129, | |
"reward_std": 0.26021250896155834, | |
"rewards/equation_reward_func": 0.13281250465661287, | |
"rewards/format_reward_func": 0.9665178917348385, | |
"step": 418 | |
}, | |
{ | |
"completion_length": 412.4944381713867, | |
"epoch": 0.616626904019086, | |
"grad_norm": 0.05400451597158681, | |
"kl": 0.102020263671875, | |
"learning_rate": 5.8181848940044855e-09, | |
"loss": 0.0001, | |
"reward": 1.1149553880095482, | |
"reward_std": 0.26809723395854235, | |
"rewards/equation_reward_func": 0.15290179254952818, | |
"rewards/format_reward_func": 0.9620536044239998, | |
"step": 420 | |
}, | |
{ | |
"completion_length": 416.1339473724365, | |
"epoch": 0.6195632226096531, | |
"grad_norm": 0.04012774328694736, | |
"kl": 0.099609375, | |
"learning_rate": 5.070831694623135e-09, | |
"loss": 0.0001, | |
"reward": 1.0937500447034836, | |
"reward_std": 0.23063845187425613, | |
"rewards/equation_reward_func": 0.1194196492433548, | |
"rewards/format_reward_func": 0.9743303917348385, | |
"step": 422 | |
}, | |
{ | |
"completion_length": 403.98104667663574, | |
"epoch": 0.6224995412002202, | |
"grad_norm": 0.04957974374231148, | |
"kl": 0.097686767578125, | |
"learning_rate": 4.374343577351336e-09, | |
"loss": 0.0001, | |
"reward": 1.0814732536673546, | |
"reward_std": 0.22890522051602602, | |
"rewards/equation_reward_func": 0.10825893364381045, | |
"rewards/format_reward_func": 0.9732143171131611, | |
"step": 424 | |
}, | |
{ | |
"completion_length": 403.1160888671875, | |
"epoch": 0.6254358597907873, | |
"grad_norm": 0.04373844754088721, | |
"kl": 0.10205078125, | |
"learning_rate": 3.7288651837012745e-09, | |
"loss": 0.0001, | |
"reward": 1.0870536155998707, | |
"reward_std": 0.20328176161274314, | |
"rewards/equation_reward_func": 0.1116071492433548, | |
"rewards/format_reward_func": 0.9754464775323868, | |
"step": 426 | |
}, | |
{ | |
"completion_length": 416.70426177978516, | |
"epoch": 0.6283721783813544, | |
"grad_norm": 0.05687104274871836, | |
"kl": 0.099456787109375, | |
"learning_rate": 3.134530561862081e-09, | |
"loss": 0.0001, | |
"reward": 1.103794690221548, | |
"reward_std": 0.2526063285768032, | |
"rewards/equation_reward_func": 0.13616072200238705, | |
"rewards/format_reward_func": 0.9676339626312256, | |
"step": 428 | |
}, | |
{ | |
"completion_length": 402.1964473724365, | |
"epoch": 0.6313084969719215, | |
"grad_norm": 0.044913896708701294, | |
"kl": 0.1141357421875, | |
"learning_rate": 2.5914631388619103e-09, | |
"loss": 0.0001, | |
"reward": 1.0725446864962578, | |
"reward_std": 0.23142389208078384, | |
"rewards/equation_reward_func": 0.10491071839351207, | |
"rewards/format_reward_func": 0.9676339663565159, | |
"step": 430 | |
}, | |
{ | |
"completion_length": 395.0424289703369, | |
"epoch": 0.6342448155624886, | |
"grad_norm": 0.045882245640528084, | |
"kl": 0.109832763671875, | |
"learning_rate": 2.0997756949353297e-09, | |
"loss": 0.0001, | |
"reward": 1.0814732536673546, | |
"reward_std": 0.22362250182777643, | |
"rewards/equation_reward_func": 0.11160714749712497, | |
"rewards/format_reward_func": 0.9698661118745804, | |
"step": 432 | |
}, | |
{ | |
"completion_length": 426.2924289703369, | |
"epoch": 0.6371811341530557, | |
"grad_norm": 0.04758448749390688, | |
"kl": 0.1451416015625, | |
"learning_rate": 1.6595703401020844e-09, | |
"loss": 0.0001, | |
"reward": 1.0669643357396126, | |
"reward_std": 0.23458201717585325, | |
"rewards/equation_reward_func": 0.10044643271248788, | |
"rewards/format_reward_func": 0.9665178842842579, | |
"step": 434 | |
}, | |
{ | |
"completion_length": 406.60493087768555, | |
"epoch": 0.6401174527436226, | |
"grad_norm": 0.06141012337958767, | |
"kl": 0.100128173828125, | |
"learning_rate": 1.2709384929615596e-09, | |
"loss": 0.0001, | |
"reward": 1.0948661044239998, | |
"reward_std": 0.2197393993847072, | |
"rewards/equation_reward_func": 0.11941964959260076, | |
"rewards/format_reward_func": 0.9754464626312256, | |
"step": 436 | |
}, | |
{ | |
"completion_length": 409.0055980682373, | |
"epoch": 0.6430537713341897, | |
"grad_norm": 0.05577362369837321, | |
"kl": 0.113677978515625, | |
"learning_rate": 9.339608617077165e-10, | |
"loss": 0.0001, | |
"reward": 1.0814732611179352, | |
"reward_std": 0.22773250937461853, | |
"rewards/equation_reward_func": 0.111607147147879, | |
"rewards/format_reward_func": 0.9698661006987095, | |
"step": 438 | |
}, | |
{ | |
"completion_length": 423.28126525878906, | |
"epoch": 0.6459900899247568, | |
"grad_norm": 0.04649263712299981, | |
"kl": 0.10736083984375, | |
"learning_rate": 6.487074273681114e-10, | |
"loss": 0.0001, | |
"reward": 1.0379464700818062, | |
"reward_std": 0.2183691617101431, | |
"rewards/equation_reward_func": 0.08035714703146368, | |
"rewards/format_reward_func": 0.9575893208384514, | |
"step": 440 | |
}, | |
{ | |
"completion_length": 424.0346221923828, | |
"epoch": 0.6489264085153239, | |
"grad_norm": 0.051863884871225424, | |
"kl": 0.0963134765625, | |
"learning_rate": 4.152374292708538e-10, | |
"loss": 0.0001, | |
"reward": 1.08370541036129, | |
"reward_std": 0.23894479172304273, | |
"rewards/equation_reward_func": 0.11607143329456449, | |
"rewards/format_reward_func": 0.967633955180645, | |
"step": 442 | |
}, | |
{ | |
"completion_length": 419.2076072692871, | |
"epoch": 0.651862727105891, | |
"grad_norm": 0.041908418740650634, | |
"kl": 0.09466552734375, | |
"learning_rate": 2.3359935274214204e-10, | |
"loss": 0.0001, | |
"reward": 1.0781250521540642, | |
"reward_std": 0.20687641110271215, | |
"rewards/equation_reward_func": 0.10602678940631449, | |
"rewards/format_reward_func": 0.9720982424914837, | |
"step": 444 | |
}, | |
{ | |
"completion_length": 409.03015327453613, | |
"epoch": 0.6547990456964581, | |
"grad_norm": 0.04991210004252944, | |
"kl": 0.096923828125, | |
"learning_rate": 1.0383091903720665e-10, | |
"loss": 0.0001, | |
"reward": 1.0892857760190964, | |
"reward_std": 0.2569359806366265, | |
"rewards/equation_reward_func": 0.12276786111760885, | |
"rewards/format_reward_func": 0.9665179029107094, | |
"step": 446 | |
}, | |
{ | |
"completion_length": 421.3951072692871, | |
"epoch": 0.6577353642870252, | |
"grad_norm": 0.0511078437253658, | |
"kl": 0.09625244140625, | |
"learning_rate": 2.595907750671533e-11, | |
"loss": 0.0001, | |
"reward": 1.1015625447034836, | |
"reward_std": 0.2752464488148689, | |
"rewards/equation_reward_func": 0.13950893515720963, | |
"rewards/format_reward_func": 0.9620535932481289, | |
"step": 448 | |
}, | |
{ | |
"completion_length": 429.61497497558594, | |
"epoch": 0.6606716828775923, | |
"grad_norm": 0.049613253545700765, | |
"kl": 0.095245361328125, | |
"learning_rate": 0.0, | |
"loss": 0.0001, | |
"reward": 1.0691964775323868, | |
"reward_std": 0.2418026770465076, | |
"rewards/equation_reward_func": 0.10602679173462093, | |
"rewards/format_reward_func": 0.9631696715950966, | |
"step": 450 | |
}, | |
{ | |
"epoch": 0.6606716828775923, | |
"step": 450, | |
"total_flos": 0.0, | |
"train_loss": 6.581872487505476e-05, | |
"train_runtime": 36634.2295, | |
"train_samples_per_second": 0.688, | |
"train_steps_per_second": 0.012 | |
} | |
], | |
"logging_steps": 2, | |
"max_steps": 450, | |
"num_input_tokens_seen": 0, | |
"num_train_epochs": 1, | |
"save_steps": 25, | |
"stateful_callbacks": { | |
"TrainerControl": { | |
"args": { | |
"should_epoch_stop": false, | |
"should_evaluate": false, | |
"should_log": false, | |
"should_save": true, | |
"should_training_stop": true | |
}, | |
"attributes": {} | |
} | |
}, | |
"total_flos": 0.0, | |
"train_batch_size": 1, | |
"trial_name": null, | |
"trial_params": null | |
} | |
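
A minimal sketch of how the log above could be summarized, assuming the JSON has been saved locally as "trainer_state.json" (a hypothetical path) and using only the Python standard library; the field names are taken directly from the records shown above, and the final aggregate record (which has no per-step reward fields) is filtered out.

import json

# Load the trainer state dumped above (assumed local copy).
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step log records; the closing summary entry
# (train_loss, train_runtime, ...) has no "reward" key.
history = [e for e in state["log_history"] if "reward" in e]

first, last = history[0], history[-1]
print(f"steps logged: {len(history)} (step {first['step']} -> {last['step']})")
print(f"reward:          {first['reward']:.3f} -> {last['reward']:.3f}")
print(f"equation reward: {first['rewards/equation_reward_func']:.3f} -> "
      f"{last['rewards/equation_reward_func']:.3f}")
print(f"format reward:   {first['rewards/format_reward_func']:.3f} -> "
      f"{last['rewards/format_reward_func']:.3f}")
print(f"mean completion length: "
      f"{sum(e['completion_length'] for e in history) / len(history):.1f} tokens")

Run against this file, a summary like this makes the trend in the log easy to read off: the format reward saturates near 1.0 early on, while the equation reward climbs slowly and drives the remaining gains in the total reward.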