{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6606716828775923, "eval_steps": 500, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 425.11497688293457, "epoch": 0.0029363185905670764, "grad_norm": 0.10666760082017611, "kl": 0.0, "learning_rate": 7.142857142857142e-08, "loss": -0.0, "reward": 0.2834821557626128, "reward_std": 0.4256630390882492, "rewards/equation_reward_func": 0.005580357392318547, "rewards/format_reward_func": 0.27790179941803217, "step": 2 }, { "completion_length": 397.3091697692871, "epoch": 0.005872637181134153, "grad_norm": 0.11172384920713097, "kl": 0.0004100799560546875, "learning_rate": 1.4285714285714285e-07, "loss": 0.0, "reward": 0.31696429941803217, "reward_std": 0.4561923108994961, "rewards/equation_reward_func": 0.004464285913854837, "rewards/format_reward_func": 0.3125000139698386, "step": 4 }, { "completion_length": 387.9654178619385, "epoch": 0.00880895577170123, "grad_norm": 0.12273474882696524, "kl": 0.00041091442108154297, "learning_rate": 2.1428571428571426e-07, "loss": 0.0, "reward": 0.3236607266589999, "reward_std": 0.4472240339964628, "rewards/equation_reward_func": 0.006696428870782256, "rewards/format_reward_func": 0.3169643012806773, "step": 6 }, { "completion_length": 397.4364013671875, "epoch": 0.011745274362268306, "grad_norm": 0.11874955075675916, "kl": 0.0004132986068725586, "learning_rate": 2.857142857142857e-07, "loss": 0.0, "reward": 0.3448660895228386, "reward_std": 0.4638795666396618, "rewards/equation_reward_func": 0.008928571827709675, "rewards/format_reward_func": 0.33593751676380634, "step": 8 }, { "completion_length": 408.18640327453613, "epoch": 0.014681592952835382, "grad_norm": 0.1291957962223834, "kl": 0.0004401206970214844, "learning_rate": 3.5714285714285716e-07, "loss": 0.0, "reward": 0.3448660895228386, "reward_std": 0.47657241858541965, "rewards/equation_reward_func": 0.015625000814907253, "rewards/format_reward_func": 0.3292410857975483, "step": 10 }, { "completion_length": 383.9297065734863, "epoch": 0.01761791154340246, "grad_norm": 0.11615535346233327, "kl": 0.0005271434783935547, "learning_rate": 4.285714285714285e-07, "loss": 0.0, "reward": 0.4218750223517418, "reward_std": 0.4933844096958637, "rewards/equation_reward_func": 0.012276786263100803, "rewards/format_reward_func": 0.40959823317825794, "step": 12 }, { "completion_length": 395.9866237640381, "epoch": 0.020554230133969537, "grad_norm": 0.11975361330667507, "kl": 0.0011701583862304688, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.4676339440047741, "reward_std": 0.5037149954587221, "rewards/equation_reward_func": 0.007812500349245965, "rewards/format_reward_func": 0.45982144586741924, "step": 14 }, { "completion_length": 354.48550033569336, "epoch": 0.02349054872453661, "grad_norm": 0.11506849184873436, "kl": 0.0016536712646484375, "learning_rate": 4.999740409224932e-07, "loss": 0.0, "reward": 0.6116071678698063, "reward_std": 0.4877460356801748, "rewards/equation_reward_func": 0.011160714784637094, "rewards/format_reward_func": 0.6004464514553547, "step": 16 }, { "completion_length": 358.6852779388428, "epoch": 0.02642686731510369, "grad_norm": 0.09667949769991004, "kl": 0.00521087646484375, "learning_rate": 4.998961690809627e-07, "loss": 0.0, "reward": 0.7633928954601288, "reward_std": 0.40501935593783855, "rewards/equation_reward_func": 0.006696428870782256, "rewards/format_reward_func": 0.7566964663565159, "step": 18 }, { "completion_length": 372.5792598724365, "epoch": 0.029363185905670764, "grad_norm": 0.07439885917757044, "kl": 0.0065975189208984375, "learning_rate": 4.997664006472578e-07, "loss": 0.0, "reward": 0.8404018245637417, "reward_std": 0.34173065423965454, "rewards/equation_reward_func": 0.010044643306173384, "rewards/format_reward_func": 0.8303571790456772, "step": 20 }, { "completion_length": 371.32590675354004, "epoch": 0.03229950449623784, "grad_norm": 0.06516039437584277, "kl": 0.0086212158203125, "learning_rate": 4.995847625707292e-07, "loss": 0.0, "reward": 0.8906250484287739, "reward_std": 0.29503875970840454, "rewards/equation_reward_func": 0.013392857741564512, "rewards/format_reward_func": 0.8772321939468384, "step": 22 }, { "completion_length": 361.12055587768555, "epoch": 0.03523582308680492, "grad_norm": 0.06376645250487896, "kl": 0.008119583129882812, "learning_rate": 4.993512925726318e-07, "loss": 0.0, "reward": 0.9162946864962578, "reward_std": 0.2622959101572633, "rewards/equation_reward_func": 0.01785714377183467, "rewards/format_reward_func": 0.8984375447034836, "step": 24 }, { "completion_length": 371.7355079650879, "epoch": 0.03817214167737199, "grad_norm": 0.05836390189597255, "kl": 0.010738372802734375, "learning_rate": 4.990660391382923e-07, "loss": 0.0, "reward": 0.9341518208384514, "reward_std": 0.2500613871961832, "rewards/equation_reward_func": 0.026785715483129025, "rewards/format_reward_func": 0.9073661155998707, "step": 26 }, { "completion_length": 365.8471145629883, "epoch": 0.04110846026793907, "grad_norm": 0.04136502377843216, "kl": 0.0107421875, "learning_rate": 4.987290615070384e-07, "loss": 0.0, "reward": 0.9642857499420643, "reward_std": 0.15522026224061847, "rewards/equation_reward_func": 0.014508929336443543, "rewards/format_reward_func": 0.9497768245637417, "step": 28 }, { "completion_length": 361.58260917663574, "epoch": 0.04404477885850615, "grad_norm": 0.03434405791900483, "kl": 0.012409210205078125, "learning_rate": 4.983404296598978e-07, "loss": 0.0, "reward": 0.9776786081492901, "reward_std": 0.11330996686592698, "rewards/equation_reward_func": 0.012276786379516125, "rewards/format_reward_func": 0.9654018245637417, "step": 30 }, { "completion_length": 346.245548248291, "epoch": 0.04698109744907322, "grad_norm": 0.030663261177108603, "kl": 0.015239715576171875, "learning_rate": 4.979002243050646e-07, "loss": 0.0, "reward": 0.9955357573926449, "reward_std": 0.1037649204954505, "rewards/equation_reward_func": 0.018973215483129025, "rewards/format_reward_func": 0.9765625335276127, "step": 32 }, { "completion_length": 334.4799270629883, "epoch": 0.049917416039640304, "grad_norm": 0.030854717221714126, "kl": 0.0161285400390625, "learning_rate": 4.974085368611381e-07, "loss": 0.0, "reward": 1.0156250596046448, "reward_std": 0.09721619635820389, "rewards/equation_reward_func": 0.0290178582072258, "rewards/format_reward_func": 0.9866071790456772, "step": 34 }, { "completion_length": 336.73773765563965, "epoch": 0.05285373463020738, "grad_norm": 0.03327381233463755, "kl": 0.014263153076171875, "learning_rate": 4.968654694381379e-07, "loss": 0.0, "reward": 1.0122768357396126, "reward_std": 0.08523162081837654, "rewards/equation_reward_func": 0.021205358090810478, "rewards/format_reward_func": 0.991071455180645, "step": 36 }, { "completion_length": 332.9218864440918, "epoch": 0.05579005322077445, "grad_norm": 0.02236423630507305, "kl": 0.0166168212890625, "learning_rate": 4.962711348162987e-07, "loss": 0.0, "reward": 0.9966518208384514, "reward_std": 0.05468874936923385, "rewards/equation_reward_func": 0.008928571827709675, "rewards/format_reward_func": 0.9877232536673546, "step": 38 }, { "completion_length": 331.6495666503906, "epoch": 0.05872637181134153, "grad_norm": 0.02270153959336241, "kl": 0.016017913818359375, "learning_rate": 4.956256564226487e-07, "loss": 0.0, "reward": 1.0089286044239998, "reward_std": 0.07463237782940269, "rewards/equation_reward_func": 0.020089286845177412, "rewards/format_reward_func": 0.9888393022119999, "step": 40 }, { "completion_length": 331.0167541503906, "epoch": 0.06166269040190861, "grad_norm": 0.025458258107277202, "kl": 0.0180816650390625, "learning_rate": 4.949291683053768e-07, "loss": 0.0, "reward": 1.0033482536673546, "reward_std": 0.0625700019299984, "rewards/equation_reward_func": 0.013392857741564512, "rewards/format_reward_func": 0.9899553880095482, "step": 42 }, { "completion_length": 322.00559425354004, "epoch": 0.06459900899247568, "grad_norm": 0.03083960172379516, "kl": 0.016857147216796875, "learning_rate": 4.941818151059955e-07, "loss": 0.0, "reward": 1.0022321827709675, "reward_std": 0.056821079924702644, "rewards/equation_reward_func": 0.011160714784637094, "rewards/format_reward_func": 0.9910714514553547, "step": 44 }, { "completion_length": 336.2935447692871, "epoch": 0.06753532758304276, "grad_norm": 0.03395926937254199, "kl": 0.01757049560546875, "learning_rate": 4.933837520293017e-07, "loss": 0.0, "reward": 1.0044643357396126, "reward_std": 0.10183899104595184, "rewards/equation_reward_func": 0.02232142968568951, "rewards/format_reward_func": 0.9821428880095482, "step": 46 }, { "completion_length": 315.8538112640381, "epoch": 0.07047164617360983, "grad_norm": 0.03156746554424257, "kl": 0.01998138427734375, "learning_rate": 4.925351448111454e-07, "loss": 0.0, "reward": 1.0111607536673546, "reward_std": 0.06636612536385655, "rewards/equation_reward_func": 0.01785714365541935, "rewards/format_reward_func": 0.9933035895228386, "step": 48 }, { "completion_length": 306.9486770629883, "epoch": 0.07340796476417691, "grad_norm": 0.02962714783243483, "kl": 0.019634246826171875, "learning_rate": 4.91636169684011e-07, "loss": 0.0, "reward": 1.0100446864962578, "reward_std": 0.06970830773934722, "rewards/equation_reward_func": 0.018973215366713703, "rewards/format_reward_func": 0.9910714402794838, "step": 50 }, { "completion_length": 314.1015815734863, "epoch": 0.07634428335474398, "grad_norm": 0.029626162068749, "kl": 0.01953125, "learning_rate": 4.906870133404186e-07, "loss": 0.0, "reward": 1.0078125298023224, "reward_std": 0.0697083086706698, "rewards/equation_reward_func": 0.018973215483129025, "rewards/format_reward_func": 0.9888393059372902, "step": 52 }, { "completion_length": 324.9698791503906, "epoch": 0.07928060194531107, "grad_norm": 0.02581923255114933, "kl": 0.03170013427734375, "learning_rate": 4.896878728941531e-07, "loss": 0.0, "reward": 1.0055804029107094, "reward_std": 0.061442055739462376, "rewards/equation_reward_func": 0.014508929220028222, "rewards/format_reward_func": 0.9910714477300644, "step": 54 }, { "completion_length": 313.5446529388428, "epoch": 0.08221692053587815, "grad_norm": 0.0268506054426233, "kl": 0.0242767333984375, "learning_rate": 4.886389558393284e-07, "loss": 0.0, "reward": 1.0100446939468384, "reward_std": 0.0723005011677742, "rewards/equation_reward_func": 0.02008928672876209, "rewards/format_reward_func": 0.989955373108387, "step": 56 }, { "completion_length": 311.1138515472412, "epoch": 0.08515323912644522, "grad_norm": 0.026590096596338642, "kl": 0.0207061767578125, "learning_rate": 4.875404800072976e-07, "loss": 0.0, "reward": 1.0044643208384514, "reward_std": 0.05610500229522586, "rewards/equation_reward_func": 0.01339285762514919, "rewards/format_reward_func": 0.9910714477300644, "step": 58 }, { "completion_length": 299.32813835144043, "epoch": 0.0880895577170123, "grad_norm": 0.029592636449534623, "kl": 0.022125244140625, "learning_rate": 4.86392673521415e-07, "loss": 0.0, "reward": 1.020089328289032, "reward_std": 0.06914377678185701, "rewards/equation_reward_func": 0.024553572526201606, "rewards/format_reward_func": 0.9955357313156128, "step": 60 }, { "completion_length": 297.50894355773926, "epoch": 0.09102587630757937, "grad_norm": 0.0345348188639915, "kl": 0.02149200439453125, "learning_rate": 4.851957747496606e-07, "loss": 0.0, "reward": 1.0078125447034836, "reward_std": 0.07301658112555742, "rewards/equation_reward_func": 0.01785714365541935, "rewards/format_reward_func": 0.9899553805589676, "step": 62 }, { "completion_length": 301.83595275878906, "epoch": 0.09396219489814644, "grad_norm": 0.024731850879510856, "kl": 0.0228271484375, "learning_rate": 4.839500322551386e-07, "loss": 0.0, "reward": 1.0156250298023224, "reward_std": 0.05877388082444668, "rewards/equation_reward_func": 0.01897321513388306, "rewards/format_reward_func": 0.996651791036129, "step": 64 }, { "completion_length": 304.755597114563, "epoch": 0.09689851348871352, "grad_norm": 0.022582520854879787, "kl": 0.022369384765625, "learning_rate": 4.826557047444563e-07, "loss": 0.0, "reward": 1.0111607611179352, "reward_std": 0.06549919955432415, "rewards/equation_reward_func": 0.01897321525029838, "rewards/format_reward_func": 0.9921875223517418, "step": 66 }, { "completion_length": 299.7221088409424, "epoch": 0.09983483207928061, "grad_norm": 0.02378726780795042, "kl": 0.02381134033203125, "learning_rate": 4.813130610139993e-07, "loss": 0.0, "reward": 1.0133928954601288, "reward_std": 0.05174434743821621, "rewards/equation_reward_func": 0.017857143888249993, "rewards/format_reward_func": 0.9955357238650322, "step": 68 }, { "completion_length": 295.27344703674316, "epoch": 0.10277115066984768, "grad_norm": 0.03710159597518268, "kl": 0.02559661865234375, "learning_rate": 4.799223798941089e-07, "loss": 0.0, "reward": 1.0234375521540642, "reward_std": 0.08428801316767931, "rewards/equation_reward_func": 0.02901785832364112, "rewards/format_reward_func": 0.994419664144516, "step": 70 }, { "completion_length": 293.92858505249023, "epoch": 0.10570746926041476, "grad_norm": 0.030422368236480923, "kl": 0.0235748291015625, "learning_rate": 4.78483950191177e-07, "loss": 0.0, "reward": 1.016741119325161, "reward_std": 0.06475032959133387, "rewards/equation_reward_func": 0.02120535832364112, "rewards/format_reward_func": 0.9955357238650322, "step": 72 }, { "completion_length": 301.22099113464355, "epoch": 0.10864378785098183, "grad_norm": 0.027016440417082194, "kl": 0.02487945556640625, "learning_rate": 4.769980706276687e-07, "loss": 0.0, "reward": 1.0089285969734192, "reward_std": 0.06354640237987041, "rewards/equation_reward_func": 0.016741072293370962, "rewards/format_reward_func": 0.9921875149011612, "step": 74 }, { "completion_length": 306.91072845458984, "epoch": 0.1115801064415489, "grad_norm": 0.03756361261599655, "kl": 0.03522491455078125, "learning_rate": 4.7546504978008595e-07, "loss": 0.0, "reward": 1.0133929029107094, "reward_std": 0.06290700566023588, "rewards/equation_reward_func": 0.018973215483129025, "rewards/format_reward_func": 0.994419664144516, "step": 76 }, { "completion_length": 309.30805015563965, "epoch": 0.11451642503211598, "grad_norm": 0.03409602166374047, "kl": 0.02864837646484375, "learning_rate": 4.738852060148848e-07, "loss": 0.0, "reward": 1.004464328289032, "reward_std": 0.07286503352224827, "rewards/equation_reward_func": 0.016741072293370962, "rewards/format_reward_func": 0.9877232313156128, "step": 78 }, { "completion_length": 295.35604095458984, "epoch": 0.11745274362268306, "grad_norm": 0.016373157837020508, "kl": 0.0261993408203125, "learning_rate": 4.722588674223593e-07, "loss": 0.0, "reward": 1.0044643133878708, "reward_std": 0.04678636882454157, "rewards/equation_reward_func": 0.011160714784637094, "rewards/format_reward_func": 0.9933035969734192, "step": 80 }, { "completion_length": 294.49331283569336, "epoch": 0.12038906221325013, "grad_norm": 0.028656622522124857, "kl": 0.02646636962890625, "learning_rate": 4.70586371748506e-07, "loss": 0.0, "reward": 1.0145089775323868, "reward_std": 0.05407622084021568, "rewards/equation_reward_func": 0.01897321513388306, "rewards/format_reward_func": 0.9955357313156128, "step": 82 }, { "completion_length": 295.83595085144043, "epoch": 0.12332538080381722, "grad_norm": 0.042234987959512645, "kl": 0.027313232421875, "learning_rate": 4.6886806632488363e-07, "loss": 0.0, "reward": 1.022321481257677, "reward_std": 0.09619954135268927, "rewards/equation_reward_func": 0.03125000174622983, "rewards/format_reward_func": 0.9910714440047741, "step": 84 }, { "completion_length": 291.2533645629883, "epoch": 0.12626169939438428, "grad_norm": 0.023816703176273206, "kl": 0.0269775390625, "learning_rate": 4.6710430799648143e-07, "loss": 0.0, "reward": 1.0212053954601288, "reward_std": 0.060498448088765144, "rewards/equation_reward_func": 0.024553572409786284, "rewards/format_reward_func": 0.9966517984867096, "step": 86 }, { "completion_length": 280.7332754135132, "epoch": 0.12919801798495137, "grad_norm": 0.0349694305046122, "kl": 0.02933502197265625, "learning_rate": 4.652954630476127e-07, "loss": 0.0, "reward": 1.0279018357396126, "reward_std": 0.08335960982367396, "rewards/equation_reward_func": 0.03348214493598789, "rewards/format_reward_func": 0.994419664144516, "step": 88 }, { "completion_length": 282.4788055419922, "epoch": 0.13213433657551846, "grad_norm": 0.02737283392710255, "kl": 0.03136444091796875, "learning_rate": 4.6344190712584713e-07, "loss": 0.0, "reward": 1.0145089626312256, "reward_std": 0.06711499718949199, "rewards/equation_reward_func": 0.021205358440056443, "rewards/format_reward_func": 0.9933035969734192, "step": 90 }, { "completion_length": 286.83037185668945, "epoch": 0.13507065516608552, "grad_norm": 0.03769576508641649, "kl": 0.03195953369140625, "learning_rate": 4.615440251639995e-07, "loss": 0.0, "reward": 1.0279018431901932, "reward_std": 0.08161913510411978, "rewards/equation_reward_func": 0.031250001629814506, "rewards/format_reward_func": 0.9966517984867096, "step": 92 }, { "completion_length": 284.62947845458984, "epoch": 0.1380069737566526, "grad_norm": 0.026738706985139863, "kl": 0.0301971435546875, "learning_rate": 4.596022113001894e-07, "loss": 0.0, "reward": 1.0167411118745804, "reward_std": 0.0534368259832263, "rewards/equation_reward_func": 0.018973215366713703, "rewards/format_reward_func": 0.9977678656578064, "step": 94 }, { "completion_length": 273.97992610931396, "epoch": 0.14094329234721967, "grad_norm": 0.03717968288620107, "kl": 0.03003692626953125, "learning_rate": 4.576168687959895e-07, "loss": 0.0, "reward": 1.02901791036129, "reward_std": 0.07455569412559271, "rewards/equation_reward_func": 0.033482144703157246, "rewards/format_reward_func": 0.9955357313156128, "step": 96 }, { "completion_length": 283.43974590301514, "epoch": 0.14387961093778676, "grad_norm": 0.03419476151952092, "kl": 0.032470703125, "learning_rate": 4.555884099526793e-07, "loss": 0.0, "reward": 1.018973246216774, "reward_std": 0.05700471764430404, "rewards/equation_reward_func": 0.022321429918520153, "rewards/format_reward_func": 0.9966517984867096, "step": 98 }, { "completion_length": 278.54130840301514, "epoch": 0.14681592952835382, "grad_norm": 0.03878824475168632, "kl": 0.03301239013671875, "learning_rate": 4.5351725602562174e-07, "loss": 0.0, "reward": 1.025669701397419, "reward_std": 0.07384143397212029, "rewards/equation_reward_func": 0.0279017873108387, "rewards/format_reward_func": 0.9977678656578064, "step": 100 }, { "completion_length": 280.62055110931396, "epoch": 0.1497522481189209, "grad_norm": 0.02811906331283331, "kl": 0.033050537109375, "learning_rate": 4.514038371367791e-07, "loss": 0.0, "reward": 1.015625037252903, "reward_std": 0.05238374415785074, "rewards/equation_reward_func": 0.01897321525029838, "rewards/format_reward_func": 0.9966517984867096, "step": 102 }, { "completion_length": 271.188627243042, "epoch": 0.15268856670948797, "grad_norm": 0.03367420492297424, "kl": 0.03389739990234375, "learning_rate": 4.4924859218538936e-07, "loss": 0.0, "reward": 1.020089328289032, "reward_std": 0.0539246741682291, "rewards/equation_reward_func": 0.02120535832364112, "rewards/format_reward_func": 0.9988839328289032, "step": 104 }, { "completion_length": 274.41965198516846, "epoch": 0.15562488530005505, "grad_norm": 0.032704500738241556, "kl": 0.0342559814453125, "learning_rate": 4.470519687568185e-07, "loss": 0.0, "reward": 1.0156250298023224, "reward_std": 0.06005267146974802, "rewards/equation_reward_func": 0.021205357974395156, "rewards/format_reward_func": 0.994419664144516, "step": 106 }, { "completion_length": 267.667423248291, "epoch": 0.15856120389062214, "grad_norm": 0.022983000288007625, "kl": 0.03482818603515625, "learning_rate": 4.4481442302960923e-07, "loss": 0.0, "reward": 1.0145089700818062, "reward_std": 0.05625654757022858, "rewards/equation_reward_func": 0.01785714377183467, "rewards/format_reward_func": 0.9966517984867096, "step": 108 }, { "completion_length": 276.2734498977661, "epoch": 0.1614975224811892, "grad_norm": 0.03293451132142616, "kl": 0.03582000732421875, "learning_rate": 4.4253641968074505e-07, "loss": 0.0, "reward": 1.0212053954601288, "reward_std": 0.051668363623321056, "rewards/equation_reward_func": 0.02343750116415322, "rewards/format_reward_func": 0.9977678582072258, "step": 110 }, { "completion_length": 259.26229190826416, "epoch": 0.1644338410717563, "grad_norm": 0.030451551813034692, "kl": 0.0374908447265625, "learning_rate": 4.402184317891501e-07, "loss": 0.0, "reward": 1.0279018357396126, "reward_std": 0.07869063876569271, "rewards/equation_reward_func": 0.029017858556471765, "rewards/format_reward_func": 0.9988839328289032, "step": 112 }, { "completion_length": 263.8515748977661, "epoch": 0.16737015966232335, "grad_norm": 0.03266847523674648, "kl": 0.03509521484375, "learning_rate": 4.37860940737443e-07, "loss": 0.0, "reward": 1.024553619325161, "reward_std": 0.06911690765991807, "rewards/equation_reward_func": 0.026785715715959668, "rewards/format_reward_func": 0.9977678656578064, "step": 114 }, { "completion_length": 270.2511262893677, "epoch": 0.17030647825289044, "grad_norm": 0.029203115735684546, "kl": 0.035980224609375, "learning_rate": 4.354644361119671e-07, "loss": 0.0, "reward": 1.0145089700818062, "reward_std": 0.0429902458563447, "rewards/equation_reward_func": 0.015625000814907253, "rewards/format_reward_func": 0.9988839328289032, "step": 116 }, { "completion_length": 265.80135345458984, "epoch": 0.1732427968434575, "grad_norm": 0.03659231709237539, "kl": 0.0381317138671875, "learning_rate": 4.3302941560111716e-07, "loss": 0.0, "reward": 1.0189732685685158, "reward_std": 0.0753056826069951, "rewards/equation_reward_func": 0.025669644121080637, "rewards/format_reward_func": 0.9933035895228386, "step": 118 }, { "completion_length": 261.29130935668945, "epoch": 0.1761791154340246, "grad_norm": 0.01985992912461338, "kl": 0.03851318359375, "learning_rate": 4.3055638489198236e-07, "loss": 0.0, "reward": 1.007812537252903, "reward_std": 0.04881514888256788, "rewards/equation_reward_func": 0.013392857974395156, "rewards/format_reward_func": 0.994419664144516, "step": 120 }, { "completion_length": 258.831485748291, "epoch": 0.17911543402459168, "grad_norm": 0.028601661901634927, "kl": 0.0393218994140625, "learning_rate": 4.280458575653296e-07, "loss": 0.0, "reward": 1.0178571864962578, "reward_std": 0.06534695206210017, "rewards/equation_reward_func": 0.023437501513399184, "rewards/format_reward_func": 0.994419664144516, "step": 122 }, { "completion_length": 257.6707715988159, "epoch": 0.18205175261515874, "grad_norm": 0.020698279128715157, "kl": 0.04107666015625, "learning_rate": 4.2549835498894665e-07, "loss": 0.0, "reward": 1.018973246216774, "reward_std": 0.057493268977850676, "rewards/equation_reward_func": 0.023437501047737896, "rewards/format_reward_func": 0.9955357238650322, "step": 124 }, { "completion_length": 251.8906373977661, "epoch": 0.18498807120572583, "grad_norm": 0.022202862136424105, "kl": 0.041351318359375, "learning_rate": 4.229144062093679e-07, "loss": 0.0, "reward": 1.0234375447034836, "reward_std": 0.0553896245546639, "rewards/equation_reward_func": 0.023437501047737896, "rewards/format_reward_func": 1.0, "step": 126 }, { "completion_length": 251.66853618621826, "epoch": 0.1879243897962929, "grad_norm": 0.032026257478678814, "kl": 0.0420074462890625, "learning_rate": 4.2029454784200675e-07, "loss": 0.0, "reward": 1.0245536267757416, "reward_std": 0.057569249998778105, "rewards/equation_reward_func": 0.024553572642616928, "rewards/format_reward_func": 1.0, "step": 128 }, { "completion_length": 257.19309520721436, "epoch": 0.19086070838685998, "grad_norm": 0.035245007765867475, "kl": 0.0410308837890625, "learning_rate": 4.1763932395971433e-07, "loss": 0.0, "reward": 1.020089328289032, "reward_std": 0.057341722305864096, "rewards/equation_reward_func": 0.02343750128056854, "rewards/format_reward_func": 0.9966517984867096, "step": 130 }, { "completion_length": 263.37054920196533, "epoch": 0.19379702697742704, "grad_norm": 0.036559422829769254, "kl": 0.04827880859375, "learning_rate": 4.1494928597979117e-07, "loss": 0.0, "reward": 1.024553619325161, "reward_std": 0.07117325672879815, "rewards/equation_reward_func": 0.026785715483129025, "rewards/format_reward_func": 0.9977678582072258, "step": 132 }, { "completion_length": 250.35491847991943, "epoch": 0.19673334556799413, "grad_norm": 0.0355452816503214, "kl": 0.04229736328125, "learning_rate": 4.122249925494726e-07, "loss": 0.0, "reward": 1.023437537252903, "reward_std": 0.07714970735833049, "rewards/equation_reward_func": 0.026785715599544346, "rewards/format_reward_func": 0.996651791036129, "step": 134 }, { "completion_length": 249.38951969146729, "epoch": 0.19966966415856122, "grad_norm": 0.04139312520171231, "kl": 0.0432281494140625, "learning_rate": 4.094670094299131e-07, "loss": 0.0, "reward": 1.032366119325161, "reward_std": 0.07797456067055464, "rewards/equation_reward_func": 0.03236607275903225, "rewards/format_reward_func": 1.0, "step": 136 }, { "completion_length": 258.31921100616455, "epoch": 0.20260598274912828, "grad_norm": 0.04587399823733233, "kl": 0.04254150390625, "learning_rate": 4.066759093786931e-07, "loss": 0.0, "reward": 1.033482201397419, "reward_std": 0.09353066422045231, "rewards/equation_reward_func": 0.03794643084984273, "rewards/format_reward_func": 0.9955357238650322, "step": 138 }, { "completion_length": 257.14510345458984, "epoch": 0.20554230133969537, "grad_norm": 0.03418838476363922, "kl": 0.04620361328125, "learning_rate": 4.038522720308732e-07, "loss": 0.0, "reward": 1.0223214775323868, "reward_std": 0.061702375300228596, "rewards/equation_reward_func": 0.02455357275903225, "rewards/format_reward_func": 0.9977678656578064, "step": 140 }, { "completion_length": 257.2689838409424, "epoch": 0.20847861993026243, "grad_norm": 0.04383610634106021, "kl": 0.0451202392578125, "learning_rate": 4.009966837786194e-07, "loss": 0.0, "reward": 1.03683041036129, "reward_std": 0.10074383299797773, "rewards/equation_reward_func": 0.04241071583237499, "rewards/format_reward_func": 0.9944196566939354, "step": 142 }, { "completion_length": 257.14287090301514, "epoch": 0.21141493852082952, "grad_norm": 0.03707968717170861, "kl": 0.046173095703125, "learning_rate": 3.981097376494259e-07, "loss": 0.0, "reward": 1.0200893208384514, "reward_std": 0.0691437772475183, "rewards/equation_reward_func": 0.02455357275903225, "rewards/format_reward_func": 0.9955357313156128, "step": 144 }, { "completion_length": 256.9732303619385, "epoch": 0.21435125711139658, "grad_norm": 0.03163136571449981, "kl": 0.0549163818359375, "learning_rate": 3.951920331829592e-07, "loss": 0.0001, "reward": 1.0279018208384514, "reward_std": 0.07846311014145613, "rewards/equation_reward_func": 0.030133930034935474, "rewards/format_reward_func": 0.9977678656578064, "step": 146 }, { "completion_length": 258.79242038726807, "epoch": 0.21728757570196366, "grad_norm": 0.03734197816068146, "kl": 0.0498809814453125, "learning_rate": 3.922441763065506e-07, "loss": 0.0, "reward": 1.0200893357396126, "reward_std": 0.05246042646467686, "rewards/equation_reward_func": 0.02008928661234677, "rewards/format_reward_func": 1.0, "step": 148 }, { "completion_length": 265.50113010406494, "epoch": 0.22022389429253075, "grad_norm": 0.032988418690538694, "kl": 0.0509033203125, "learning_rate": 3.8926677920936093e-07, "loss": 0.0001, "reward": 1.0200893357396126, "reward_std": 0.07917848788201809, "rewards/equation_reward_func": 0.025669644004665315, "rewards/format_reward_func": 0.9944196566939354, "step": 150 }, { "completion_length": 262.2790298461914, "epoch": 0.2231602128830978, "grad_norm": 0.03947725857640478, "kl": 0.05499267578125, "learning_rate": 3.862604602152464e-07, "loss": 0.0001, "reward": 1.0412946864962578, "reward_std": 0.10787475202232599, "rewards/equation_reward_func": 0.04687500232830644, "rewards/format_reward_func": 0.994419664144516, "step": 152 }, { "completion_length": 270.6116189956665, "epoch": 0.2260965314736649, "grad_norm": 0.032374211940469313, "kl": 0.0520782470703125, "learning_rate": 3.8322584365434934e-07, "loss": 0.0001, "reward": 1.03683041036129, "reward_std": 0.07872272981330752, "rewards/equation_reward_func": 0.03683035902213305, "rewards/format_reward_func": 1.0, "step": 154 }, { "completion_length": 261.8973340988159, "epoch": 0.22903285006423196, "grad_norm": 0.03908993116978629, "kl": 0.053985595703125, "learning_rate": 3.8016355973344173e-07, "loss": 0.0001, "reward": 1.0267857611179352, "reward_std": 0.07741002831608057, "rewards/equation_reward_func": 0.030133930151350796, "rewards/format_reward_func": 0.9966517984867096, "step": 156 }, { "completion_length": 271.08595180511475, "epoch": 0.23196916865479905, "grad_norm": 0.043196101404256254, "kl": 0.054473876953125, "learning_rate": 3.7707424440504863e-07, "loss": 0.0001, "reward": 1.032366119325161, "reward_std": 0.09447245439514518, "rewards/equation_reward_func": 0.037946430034935474, "rewards/format_reward_func": 0.9944196566939354, "step": 158 }, { "completion_length": 265.0502338409424, "epoch": 0.2349054872453661, "grad_norm": 0.03806362103891916, "kl": 0.0577392578125, "learning_rate": 3.739585392353787e-07, "loss": 0.0001, "reward": 1.0212054029107094, "reward_std": 0.07966633653268218, "rewards/equation_reward_func": 0.026785715599544346, "rewards/format_reward_func": 0.994419664144516, "step": 160 }, { "completion_length": 268.6663064956665, "epoch": 0.2378418058359332, "grad_norm": 0.029412731757100335, "kl": 0.0699005126953125, "learning_rate": 3.7081709127108767e-07, "loss": 0.0001, "reward": 1.0256696827709675, "reward_std": 0.08432010188698769, "rewards/equation_reward_func": 0.03459821583237499, "rewards/format_reward_func": 0.9910714440047741, "step": 162 }, { "completion_length": 286.89733505249023, "epoch": 0.24077812442650026, "grad_norm": 0.04278121737809733, "kl": 0.0554046630859375, "learning_rate": 3.6765055290490513e-07, "loss": 0.0001, "reward": 1.025669701397419, "reward_std": 0.08428801316767931, "rewards/equation_reward_func": 0.030133930034935474, "rewards/format_reward_func": 0.9955357313156128, "step": 164 }, { "completion_length": 281.04465675354004, "epoch": 0.24371444301706735, "grad_norm": 0.03589008461446017, "kl": 0.0649261474609375, "learning_rate": 3.644595817401501e-07, "loss": 0.0001, "reward": 1.0290179178118706, "reward_std": 0.08672865945845842, "rewards/equation_reward_func": 0.0334821444703266, "rewards/format_reward_func": 0.9955357238650322, "step": 166 }, { "completion_length": 280.0134057998657, "epoch": 0.24665076160763444, "grad_norm": 0.03504932595126223, "kl": 0.0573883056640625, "learning_rate": 3.6124484045416483e-07, "loss": 0.0001, "reward": 1.0156250447034836, "reward_std": 0.06922045908868313, "rewards/equation_reward_func": 0.0212053582072258, "rewards/format_reward_func": 0.9944196566939354, "step": 168 }, { "completion_length": 284.7399673461914, "epoch": 0.2495870801982015, "grad_norm": 0.03348991366352892, "kl": 0.0587158203125, "learning_rate": 3.580069966606949e-07, "loss": 0.0001, "reward": 1.0312500521540642, "reward_std": 0.08672866132110357, "rewards/equation_reward_func": 0.03571428719442338, "rewards/format_reward_func": 0.9955357238650322, "step": 170 }, { "completion_length": 287.6384048461914, "epoch": 0.25252339878876856, "grad_norm": 0.02297432211698741, "kl": 0.0548095703125, "learning_rate": 3.547467227712444e-07, "loss": 0.0001, "reward": 1.0212054029107094, "reward_std": 0.068167376331985, "rewards/equation_reward_func": 0.025669644586741924, "rewards/format_reward_func": 0.9955357313156128, "step": 172 }, { "completion_length": 276.344877243042, "epoch": 0.25545971737933565, "grad_norm": 0.031249403617369484, "kl": 0.058990478515625, "learning_rate": 3.5146469585543386e-07, "loss": 0.0001, "reward": 1.0223214626312256, "reward_std": 0.0465588397346437, "rewards/equation_reward_func": 0.02232142968568951, "rewards/format_reward_func": 1.0, "step": 174 }, { "completion_length": 287.0747871398926, "epoch": 0.25839603596990274, "grad_norm": 0.024363445225025017, "kl": 0.0567626953125, "learning_rate": 3.481615975003922e-07, "loss": 0.0001, "reward": 1.0223214700818062, "reward_std": 0.05978166777640581, "rewards/equation_reward_func": 0.02678571583237499, "rewards/format_reward_func": 0.9955357313156128, "step": 176 }, { "completion_length": 273.2667541503906, "epoch": 0.2613323545604698, "grad_norm": 0.03673410404509551, "kl": 0.0570526123046875, "learning_rate": 3.448381136692089e-07, "loss": 0.0001, "reward": 1.0435268357396126, "reward_std": 0.1020226301625371, "rewards/equation_reward_func": 0.04799107392318547, "rewards/format_reward_func": 0.9955357313156128, "step": 178 }, { "completion_length": 276.5803737640381, "epoch": 0.2642686731510369, "grad_norm": 0.03470857489299649, "kl": 0.0600433349609375, "learning_rate": 3.4149493455847897e-07, "loss": 0.0001, "reward": 1.0223214626312256, "reward_std": 0.06948007736355066, "rewards/equation_reward_func": 0.027901786961592734, "rewards/format_reward_func": 0.994419664144516, "step": 180 }, { "completion_length": 283.47433948516846, "epoch": 0.26720499174160395, "grad_norm": 0.04127720885486165, "kl": 0.0587921142578125, "learning_rate": 3.3813275445496766e-07, "loss": 0.0001, "reward": 1.0267857611179352, "reward_std": 0.08631567610427737, "rewards/equation_reward_func": 0.03348214435391128, "rewards/format_reward_func": 0.9933035895228386, "step": 182 }, { "completion_length": 282.8973331451416, "epoch": 0.27014131033217104, "grad_norm": 0.032665399093278785, "kl": 0.0596160888671875, "learning_rate": 3.347522715914262e-07, "loss": 0.0001, "reward": 1.0256696939468384, "reward_std": 0.07425330020487309, "rewards/equation_reward_func": 0.02901785878930241, "rewards/format_reward_func": 0.9966517984867096, "step": 184 }, { "completion_length": 290.776798248291, "epoch": 0.2730776289227381, "grad_norm": 0.034926325047606295, "kl": 0.0619354248046875, "learning_rate": 3.313541880015877e-07, "loss": 0.0001, "reward": 1.0267857611179352, "reward_std": 0.0910613308660686, "rewards/equation_reward_func": 0.034598216065205634, "rewards/format_reward_func": 0.9921875149011612, "step": 186 }, { "completion_length": 290.6216640472412, "epoch": 0.2760139475133052, "grad_norm": 0.03913319668173702, "kl": 0.0630340576171875, "learning_rate": 3.279392093743747e-07, "loss": 0.0001, "reward": 1.0267857685685158, "reward_std": 0.09586183680221438, "rewards/equation_reward_func": 0.033482144703157246, "rewards/format_reward_func": 0.9933035895228386, "step": 188 }, { "completion_length": 286.97992515563965, "epoch": 0.27895026610387225, "grad_norm": 0.02680827432830071, "kl": 0.0650177001953125, "learning_rate": 3.245080449073459e-07, "loss": 0.0001, "reward": 1.0189732536673546, "reward_std": 0.0477627688087523, "rewards/equation_reward_func": 0.02008928661234677, "rewards/format_reward_func": 0.9988839328289032, "step": 190 }, { "completion_length": 296.9609498977661, "epoch": 0.28188658469443933, "grad_norm": 0.05153256098850436, "kl": 0.0635528564453125, "learning_rate": 3.210614071594162e-07, "loss": 0.0001, "reward": 1.0491071939468384, "reward_std": 0.12858914118260145, "rewards/equation_reward_func": 0.054687502793967724, "rewards/format_reward_func": 0.9944196566939354, "step": 192 }, { "completion_length": 289.1529150009155, "epoch": 0.2848229032850064, "grad_norm": 0.04303274228699889, "kl": 0.0654144287109375, "learning_rate": 3.1760001190287695e-07, "loss": 0.0001, "reward": 1.0345982611179352, "reward_std": 0.10969929629936814, "rewards/equation_reward_func": 0.041294644703157246, "rewards/format_reward_func": 0.9933035895228386, "step": 194 }, { "completion_length": 271.6651954650879, "epoch": 0.2877592218755735, "grad_norm": 0.05046112376966487, "kl": 0.0657958984375, "learning_rate": 3.141245779747502e-07, "loss": 0.0001, "reward": 1.0412946939468384, "reward_std": 0.09476668341085315, "rewards/equation_reward_func": 0.04352678812574595, "rewards/format_reward_func": 0.9977678656578064, "step": 196 }, { "completion_length": 281.21876430511475, "epoch": 0.2906955404661406, "grad_norm": 0.03956244273767483, "kl": 0.0671844482421875, "learning_rate": 3.106358271275056e-07, "loss": 0.0001, "reward": 1.0446429178118706, "reward_std": 0.10356425913050771, "rewards/equation_reward_func": 0.04910714505240321, "rewards/format_reward_func": 0.9955357313156128, "step": 198 }, { "completion_length": 284.19421100616455, "epoch": 0.29363185905670763, "grad_norm": 0.04003291703794155, "kl": 0.0689239501953125, "learning_rate": 3.0713448387917227e-07, "loss": 0.0001, "reward": 1.032366119325161, "reward_std": 0.10179621493443847, "rewards/equation_reward_func": 0.039062502211891115, "rewards/format_reward_func": 0.9933035895228386, "step": 200 }, { "completion_length": 289.67523765563965, "epoch": 0.2965681776472747, "grad_norm": 0.03818285499902066, "kl": 0.066192626953125, "learning_rate": 3.0362127536287636e-07, "loss": 0.0001, "reward": 1.0312500521540642, "reward_std": 0.09165203105658293, "rewards/equation_reward_func": 0.03906250128056854, "rewards/format_reward_func": 0.9921875223517418, "step": 202 }, { "completion_length": 290.83595180511475, "epoch": 0.2995044962378418, "grad_norm": 0.04200317878906047, "kl": 0.068603515625, "learning_rate": 3.0009693117583523e-07, "loss": 0.0001, "reward": 1.0368304029107094, "reward_std": 0.08616412943229079, "rewards/equation_reward_func": 0.04017857275903225, "rewards/format_reward_func": 0.9966517984867096, "step": 204 }, { "completion_length": 291.18751335144043, "epoch": 0.3024408148284089, "grad_norm": 0.036055247335712394, "kl": 0.0648193359375, "learning_rate": 2.965621832278401e-07, "loss": 0.0001, "reward": 1.035714328289032, "reward_std": 0.08680282393470407, "rewards/equation_reward_func": 0.04129464505240321, "rewards/format_reward_func": 0.994419664144516, "step": 206 }, { "completion_length": 293.39175605773926, "epoch": 0.30537713341897593, "grad_norm": 0.04861484810687108, "kl": 0.068359375, "learning_rate": 2.9301776558925875e-07, "loss": 0.0001, "reward": 1.0323661267757416, "reward_std": 0.10604698117822409, "rewards/equation_reward_func": 0.041294644586741924, "rewards/format_reward_func": 0.9910714477300644, "step": 208 }, { "completion_length": 287.9776916503906, "epoch": 0.308313452009543, "grad_norm": 0.03249802953930502, "kl": 0.068572998046875, "learning_rate": 2.894644143385885e-07, "loss": 0.0001, "reward": 1.0357143208384514, "reward_std": 0.0992788840085268, "rewards/equation_reward_func": 0.04352678789291531, "rewards/format_reward_func": 0.9921875149011612, "step": 210 }, { "completion_length": 290.8493432998657, "epoch": 0.3112497706001101, "grad_norm": 0.03516633650510248, "kl": 0.067108154296875, "learning_rate": 2.859028674095937e-07, "loss": 0.0001, "reward": 1.0223214849829674, "reward_std": 0.06850438052788377, "rewards/equation_reward_func": 0.025669644004665315, "rewards/format_reward_func": 0.9966517984867096, "step": 212 }, { "completion_length": 293.99108505249023, "epoch": 0.3141860891906772, "grad_norm": 0.037137142041911965, "kl": 0.0703582763671875, "learning_rate": 2.823338644380566e-07, "loss": 0.0001, "reward": 1.0435268431901932, "reward_std": 0.09563360968604684, "rewards/equation_reward_func": 0.045758930733427405, "rewards/format_reward_func": 0.9977678656578064, "step": 214 }, { "completion_length": 290.6272430419922, "epoch": 0.3171224077812443, "grad_norm": 0.03653835097188836, "kl": 0.07354736328125, "learning_rate": 2.7875814660817504e-07, "loss": 0.0001, "reward": 1.0301339626312256, "reward_std": 0.07902582315728068, "rewards/equation_reward_func": 0.034598216181620955, "rewards/format_reward_func": 0.9955357313156128, "step": 216 }, { "completion_length": 280.104923248291, "epoch": 0.3200587263718113, "grad_norm": 0.044884706205922956, "kl": 0.077239990234375, "learning_rate": 2.751764564986396e-07, "loss": 0.0001, "reward": 1.0379464700818062, "reward_std": 0.10856953356415033, "rewards/equation_reward_func": 0.04464285972062498, "rewards/format_reward_func": 0.9933035895228386, "step": 218 }, { "completion_length": 287.18862533569336, "epoch": 0.3229950449623784, "grad_norm": 0.04383008320984735, "kl": 0.074615478515625, "learning_rate": 2.715895379284194e-07, "loss": 0.0001, "reward": 1.0435268208384514, "reward_std": 0.11731589119881392, "rewards/equation_reward_func": 0.05357143084984273, "rewards/format_reward_func": 0.9899553805589676, "step": 220 }, { "completion_length": 276.37277603149414, "epoch": 0.3259313635529455, "grad_norm": 0.04837055458994073, "kl": 0.076263427734375, "learning_rate": 2.6799813580229174e-07, "loss": 0.0001, "reward": 1.0591518357396126, "reward_std": 0.12979829125106335, "rewards/equation_reward_func": 0.06361607415601611, "rewards/format_reward_func": 0.9955357313156128, "step": 222 }, { "completion_length": 280.39175510406494, "epoch": 0.3288676821435126, "grad_norm": 0.04410099259171204, "kl": 0.08013916015625, "learning_rate": 2.6440299595614606e-07, "loss": 0.0001, "reward": 1.032366119325161, "reward_std": 0.09281388577073812, "rewards/equation_reward_func": 0.03906250186264515, "rewards/format_reward_func": 0.9933035895228386, "step": 224 }, { "completion_length": 290.5971097946167, "epoch": 0.3318040007340797, "grad_norm": 0.039702613268333235, "kl": 0.079376220703125, "learning_rate": 2.6080486500209347e-07, "loss": 0.0001, "reward": 1.0267857387661934, "reward_std": 0.10266906302422285, "rewards/equation_reward_func": 0.039062502793967724, "rewards/format_reward_func": 0.9877232424914837, "step": 226 }, { "completion_length": 286.54019355773926, "epoch": 0.3347403193246467, "grad_norm": 0.043480221809629085, "kl": 0.07806396484375, "learning_rate": 2.572044901734166e-07, "loss": 0.0001, "reward": 1.0446429029107094, "reward_std": 0.12140736309811473, "rewards/equation_reward_func": 0.054687502793967724, "rewards/format_reward_func": 0.9899553880095482, "step": 228 }, { "completion_length": 290.90514373779297, "epoch": 0.3376766379152138, "grad_norm": 0.05242052531581984, "kl": 0.082000732421875, "learning_rate": 2.536026191693893e-07, "loss": 0.0001, "reward": 1.0435268357396126, "reward_std": 0.11180294072255492, "rewards/equation_reward_func": 0.05022321769502014, "rewards/format_reward_func": 0.993303582072258, "step": 230 }, { "completion_length": 282.847110748291, "epoch": 0.3406129565057809, "grad_norm": 0.05191571705379443, "kl": 0.0870361328125, "learning_rate": 2.5e-07, "loss": 0.0001, "reward": 1.0446429178118706, "reward_std": 0.09753446979448199, "rewards/equation_reward_func": 0.04910714563447982, "rewards/format_reward_func": 0.9955357313156128, "step": 232 }, { "completion_length": 294.40960693359375, "epoch": 0.343549275096348, "grad_norm": 0.0432690838716016, "kl": 0.083526611328125, "learning_rate": 2.4639738083061073e-07, "loss": 0.0001, "reward": 1.0256696864962578, "reward_std": 0.08049006946384907, "rewards/equation_reward_func": 0.033482144586741924, "rewards/format_reward_func": 0.9921875149011612, "step": 234 }, { "completion_length": 280.33148670196533, "epoch": 0.346485593686915, "grad_norm": 0.038252828001278785, "kl": 0.087615966796875, "learning_rate": 2.4279550982658345e-07, "loss": 0.0001, "reward": 1.0357143580913544, "reward_std": 0.09379028435796499, "rewards/equation_reward_func": 0.04017857392318547, "rewards/format_reward_func": 0.9955357313156128, "step": 236 }, { "completion_length": 295.5837182998657, "epoch": 0.3494219122774821, "grad_norm": 0.02719561274312582, "kl": 0.086944580078125, "learning_rate": 2.3919513499790646e-07, "loss": 0.0001, "reward": 1.0412946864962578, "reward_std": 0.0848828162997961, "rewards/equation_reward_func": 0.04464285972062498, "rewards/format_reward_func": 0.9966517984867096, "step": 238 }, { "completion_length": 292.82367038726807, "epoch": 0.3523582308680492, "grad_norm": 0.04035208487514967, "kl": 0.08807373046875, "learning_rate": 2.3559700404385394e-07, "loss": 0.0001, "reward": 1.0334821790456772, "reward_std": 0.10879184119403362, "rewards/equation_reward_func": 0.04017857345752418, "rewards/format_reward_func": 0.9933035895228386, "step": 240 }, { "completion_length": 301.75671005249023, "epoch": 0.35529454945861627, "grad_norm": 0.03527319627385377, "kl": 0.080841064453125, "learning_rate": 2.3200186419770823e-07, "loss": 0.0001, "reward": 1.047991119325161, "reward_std": 0.0909366519190371, "rewards/equation_reward_func": 0.05357143096625805, "rewards/format_reward_func": 0.994419664144516, "step": 242 }, { "completion_length": 299.40849781036377, "epoch": 0.35823086804918336, "grad_norm": 0.03761243995367514, "kl": 0.084014892578125, "learning_rate": 2.284104620715807e-07, "loss": 0.0001, "reward": 1.0267857685685158, "reward_std": 0.0821825498715043, "rewards/equation_reward_func": 0.03348214423749596, "rewards/format_reward_func": 0.9933035895228386, "step": 244 }, { "completion_length": 285.0535879135132, "epoch": 0.3611671866397504, "grad_norm": 0.03697547812539244, "kl": 0.0888671875, "learning_rate": 2.2482354350136043e-07, "loss": 0.0001, "reward": 1.0446428954601288, "reward_std": 0.08375557232648134, "rewards/equation_reward_func": 0.045758930733427405, "rewards/format_reward_func": 0.9988839328289032, "step": 246 }, { "completion_length": 288.14063835144043, "epoch": 0.3641035052303175, "grad_norm": 0.04071648544501395, "kl": 0.101226806640625, "learning_rate": 2.2124185339182496e-07, "loss": 0.0001, "reward": 1.0613839700818062, "reward_std": 0.12005118513479829, "rewards/equation_reward_func": 0.06584821722935885, "rewards/format_reward_func": 0.9955357313156128, "step": 248 }, { "completion_length": 291.4319305419922, "epoch": 0.36703982382088457, "grad_norm": 0.0419233106603903, "kl": 0.083465576171875, "learning_rate": 2.1766613556194344e-07, "loss": 0.0001, "reward": 1.0424107685685158, "reward_std": 0.10954663250595331, "rewards/equation_reward_func": 0.04910714505240321, "rewards/format_reward_func": 0.9933035783469677, "step": 250 }, { "completion_length": 281.173002243042, "epoch": 0.36997614241145166, "grad_norm": 0.045983801668794504, "kl": 0.087982177734375, "learning_rate": 2.1409713259040628e-07, "loss": 0.0001, "reward": 1.0390625447034836, "reward_std": 0.09796618251129985, "rewards/equation_reward_func": 0.04241071711294353, "rewards/format_reward_func": 0.9966517984867096, "step": 252 }, { "completion_length": 289.0078248977661, "epoch": 0.3729124610020187, "grad_norm": 0.04107695439935304, "kl": 0.079986572265625, "learning_rate": 2.105355856614115e-07, "loss": 0.0001, "reward": 1.0401786118745804, "reward_std": 0.08883230574429035, "rewards/equation_reward_func": 0.04575893038418144, "rewards/format_reward_func": 0.994419664144516, "step": 254 }, { "completion_length": 285.1651887893677, "epoch": 0.3758487795925858, "grad_norm": 0.046756981687644784, "kl": 0.083038330078125, "learning_rate": 2.069822344107413e-07, "loss": 0.0001, "reward": 1.0435268357396126, "reward_std": 0.09725010581314564, "rewards/equation_reward_func": 0.04799107403960079, "rewards/format_reward_func": 0.9955357313156128, "step": 256 }, { "completion_length": 297.3203296661377, "epoch": 0.37878509818315287, "grad_norm": 0.0392014960390643, "kl": 0.080810546875, "learning_rate": 2.034378167721599e-07, "loss": 0.0001, "reward": 1.03683041036129, "reward_std": 0.09191235108301044, "rewards/equation_reward_func": 0.041294644586741924, "rewards/format_reward_func": 0.9955357238650322, "step": 258 }, { "completion_length": 298.24443340301514, "epoch": 0.38172141677371996, "grad_norm": 0.047185735730817774, "kl": 0.080322265625, "learning_rate": 1.9990306882416485e-07, "loss": 0.0001, "reward": 1.063616119325161, "reward_std": 0.11580956913530827, "rewards/equation_reward_func": 0.06696428812574595, "rewards/format_reward_func": 0.9966517984867096, "step": 260 }, { "completion_length": 309.2422037124634, "epoch": 0.38465773536428705, "grad_norm": 0.044908671018604095, "kl": 0.07867431640625, "learning_rate": 1.9637872463712362e-07, "loss": 0.0001, "reward": 1.063616119325161, "reward_std": 0.12546450505033135, "rewards/equation_reward_func": 0.0680803598370403, "rewards/format_reward_func": 0.9955357313156128, "step": 262 }, { "completion_length": 322.2745637893677, "epoch": 0.3875940539548541, "grad_norm": 0.0532866767137098, "kl": 0.080291748046875, "learning_rate": 1.9286551612082773e-07, "loss": 0.0001, "reward": 1.0580357611179352, "reward_std": 0.1573281823657453, "rewards/equation_reward_func": 0.07254464575089514, "rewards/format_reward_func": 0.9854911044239998, "step": 264 }, { "completion_length": 310.11050605773926, "epoch": 0.39053037254542117, "grad_norm": 0.0314648384232076, "kl": 0.0770263671875, "learning_rate": 1.8936417287249446e-07, "loss": 0.0001, "reward": 1.0345982685685158, "reward_std": 0.1127313463948667, "rewards/equation_reward_func": 0.0424107164144516, "rewards/format_reward_func": 0.9921875149011612, "step": 266 }, { "completion_length": 316.63059425354004, "epoch": 0.39346669113598826, "grad_norm": 0.03920885901252771, "kl": 0.080841064453125, "learning_rate": 1.8587542202524985e-07, "loss": 0.0001, "reward": 1.0513393431901932, "reward_std": 0.1442212238907814, "rewards/equation_reward_func": 0.06473214633297175, "rewards/format_reward_func": 0.9866071864962578, "step": 268 }, { "completion_length": 330.75001525878906, "epoch": 0.39640300972655534, "grad_norm": 0.03988966842604354, "kl": 0.07989501953125, "learning_rate": 1.82399988097123e-07, "loss": 0.0001, "reward": 1.041294690221548, "reward_std": 0.15050739981234074, "rewards/equation_reward_func": 0.059151788242161274, "rewards/format_reward_func": 0.9821428880095482, "step": 270 }, { "completion_length": 323.02680015563965, "epoch": 0.39933932831712243, "grad_norm": 0.03862862085447584, "kl": 0.07598876953125, "learning_rate": 1.7893859284058378e-07, "loss": 0.0001, "reward": 1.0334821790456772, "reward_std": 0.09288874920457602, "rewards/equation_reward_func": 0.04129464423749596, "rewards/format_reward_func": 0.9921875298023224, "step": 272 }, { "completion_length": 313.7120609283447, "epoch": 0.40227564690768947, "grad_norm": 0.04412481986958316, "kl": 0.07879638671875, "learning_rate": 1.7549195509265407e-07, "loss": 0.0001, "reward": 1.0747768431901932, "reward_std": 0.14350473042577505, "rewards/equation_reward_func": 0.07812500454019755, "rewards/format_reward_func": 0.9966517984867096, "step": 274 }, { "completion_length": 322.9977798461914, "epoch": 0.40521196549825655, "grad_norm": 0.04375980104795101, "kl": 0.0792236328125, "learning_rate": 1.7206079062562536e-07, "loss": 0.0001, "reward": 1.0725446939468384, "reward_std": 0.12892362661659718, "rewards/equation_reward_func": 0.07924107450526208, "rewards/format_reward_func": 0.9933035969734192, "step": 276 }, { "completion_length": 320.5089416503906, "epoch": 0.40814828408882364, "grad_norm": 0.03561206298117896, "kl": 0.079742431640625, "learning_rate": 1.6864581199841226e-07, "loss": 0.0001, "reward": 1.0223214626312256, "reward_std": 0.11314251320436597, "rewards/equation_reward_func": 0.037946430034935474, "rewards/format_reward_func": 0.9843750298023224, "step": 278 }, { "completion_length": 341.03349685668945, "epoch": 0.41108460267939073, "grad_norm": 0.03974030944314017, "kl": 0.078216552734375, "learning_rate": 1.6524772840857388e-07, "loss": 0.0001, "reward": 1.0167411230504513, "reward_std": 0.13769615534693003, "rewards/equation_reward_func": 0.03906250244472176, "rewards/format_reward_func": 0.9776786006987095, "step": 280 }, { "completion_length": 334.4274673461914, "epoch": 0.41402092126995776, "grad_norm": 0.047293808732492636, "kl": 0.079193115234375, "learning_rate": 1.6186724554503237e-07, "loss": 0.0001, "reward": 1.0591518431901932, "reward_std": 0.1618042946793139, "rewards/equation_reward_func": 0.07254464610014111, "rewards/format_reward_func": 0.9866071678698063, "step": 282 }, { "completion_length": 327.1138553619385, "epoch": 0.41695723986052485, "grad_norm": 0.04331271460083135, "kl": 0.078521728515625, "learning_rate": 1.5850506544152103e-07, "loss": 0.0001, "reward": 1.0625000484287739, "reward_std": 0.12944608414545655, "rewards/equation_reward_func": 0.07142857520375401, "rewards/format_reward_func": 0.9910714514553547, "step": 284 }, { "completion_length": 335.18974685668945, "epoch": 0.41989355845109194, "grad_norm": 0.03802066786188892, "kl": 0.080474853515625, "learning_rate": 1.5516188633079107e-07, "loss": 0.0001, "reward": 1.0424107536673546, "reward_std": 0.1544543677009642, "rewards/equation_reward_func": 0.06250000279396772, "rewards/format_reward_func": 0.9799107387661934, "step": 286 }, { "completion_length": 347.41854095458984, "epoch": 0.42282987704165903, "grad_norm": 0.03187971705442499, "kl": 0.08209228515625, "learning_rate": 1.5183840249960784e-07, "loss": 0.0001, "reward": 1.0245536267757416, "reward_std": 0.13140886183828115, "rewards/equation_reward_func": 0.043526786961592734, "rewards/format_reward_func": 0.9810268133878708, "step": 288 }, { "completion_length": 343.9810371398926, "epoch": 0.4257661956322261, "grad_norm": 0.043022835792256374, "kl": 0.0875244140625, "learning_rate": 1.4853530414456612e-07, "loss": 0.0001, "reward": 1.0625000596046448, "reward_std": 0.1669562510214746, "rewards/equation_reward_func": 0.07812500395812094, "rewards/format_reward_func": 0.9843750298023224, "step": 290 }, { "completion_length": 337.49108695983887, "epoch": 0.42870251422279315, "grad_norm": 0.047030760078363716, "kl": 0.0819091796875, "learning_rate": 1.4525327722875568e-07, "loss": 0.0001, "reward": 1.0602679029107094, "reward_std": 0.17207134095951915, "rewards/equation_reward_func": 0.07589286204893142, "rewards/format_reward_func": 0.9843750335276127, "step": 292 }, { "completion_length": 351.33818435668945, "epoch": 0.43163883281336024, "grad_norm": 0.03668725152945335, "kl": 0.082733154296875, "learning_rate": 1.4199300333930515e-07, "loss": 0.0001, "reward": 1.0680803954601288, "reward_std": 0.203396650031209, "rewards/equation_reward_func": 0.09151786146685481, "rewards/format_reward_func": 0.9765625223517418, "step": 294 }, { "completion_length": 346.4051513671875, "epoch": 0.43457515140392733, "grad_norm": 0.04950076381454808, "kl": 0.086700439453125, "learning_rate": 1.3875515954583523e-07, "loss": 0.0001, "reward": 1.0468750521540642, "reward_std": 0.18728240253403783, "rewards/equation_reward_func": 0.07254464691504836, "rewards/format_reward_func": 0.9743303954601288, "step": 296 }, { "completion_length": 360.35157775878906, "epoch": 0.4375114699944944, "grad_norm": 0.047052508644481995, "kl": 0.0931396484375, "learning_rate": 1.3554041825985e-07, "loss": 0.0001, "reward": 1.032366119325161, "reward_std": 0.1628238232806325, "rewards/equation_reward_func": 0.05803571699652821, "rewards/format_reward_func": 0.9743303842842579, "step": 298 }, { "completion_length": 341.84376525878906, "epoch": 0.4404477885850615, "grad_norm": 0.04197536421230838, "kl": 0.087127685546875, "learning_rate": 1.323494470950949e-07, "loss": 0.0001, "reward": 1.0412946939468384, "reward_std": 0.1637354022823274, "rewards/equation_reward_func": 0.061383931431919336, "rewards/format_reward_func": 0.9799107536673546, "step": 300 }, { "completion_length": 340.64063835144043, "epoch": 0.44338410717562854, "grad_norm": 0.053637375712495786, "kl": 0.09027099609375, "learning_rate": 1.2918290872891236e-07, "loss": 0.0001, "reward": 1.0725446976721287, "reward_std": 0.1795735191553831, "rewards/equation_reward_func": 0.08705357520375401, "rewards/format_reward_func": 0.9854910969734192, "step": 302 }, { "completion_length": 343.1964473724365, "epoch": 0.4463204257661956, "grad_norm": 0.04823925382374683, "kl": 0.09429931640625, "learning_rate": 1.260414607646213e-07, "loss": 0.0001, "reward": 1.0736607760190964, "reward_std": 0.18405223218724132, "rewards/equation_reward_func": 0.08928571816068143, "rewards/format_reward_func": 0.9843750298023224, "step": 304 }, { "completion_length": 350.75225257873535, "epoch": 0.4492567443567627, "grad_norm": 0.05097139551388257, "kl": 0.089996337890625, "learning_rate": 1.2292575559495143e-07, "loss": 0.0001, "reward": 1.072544701397419, "reward_std": 0.19982971157878637, "rewards/equation_reward_func": 0.09709821979049593, "rewards/format_reward_func": 0.9754464663565159, "step": 306 }, { "completion_length": 359.4218921661377, "epoch": 0.4521930629473298, "grad_norm": 0.04853008603673837, "kl": 0.089630126953125, "learning_rate": 1.1983644026655835e-07, "loss": 0.0001, "reward": 1.0814732648432255, "reward_std": 0.19363146228715777, "rewards/equation_reward_func": 0.09598214726429433, "rewards/format_reward_func": 0.9854911006987095, "step": 308 }, { "completion_length": 346.43081855773926, "epoch": 0.45512938153789684, "grad_norm": 0.04702497359438019, "kl": 0.091400146484375, "learning_rate": 1.1677415634565066e-07, "loss": 0.0001, "reward": 1.0591518357396126, "reward_std": 0.17150792852044106, "rewards/equation_reward_func": 0.07700893189758062, "rewards/format_reward_func": 0.9821428954601288, "step": 310 }, { "completion_length": 356.02010345458984, "epoch": 0.4580657001284639, "grad_norm": 0.034288823432535455, "kl": 0.088470458984375, "learning_rate": 1.1373953978475353e-07, "loss": 0.0001, "reward": 1.0446429140865803, "reward_std": 0.17355853877961636, "rewards/equation_reward_func": 0.07031250337604433, "rewards/format_reward_func": 0.9743304029107094, "step": 312 }, { "completion_length": 348.6417579650879, "epoch": 0.461002018719031, "grad_norm": 0.05696162778862769, "kl": 0.09442138671875, "learning_rate": 1.1073322079063913e-07, "loss": 0.0001, "reward": 1.059151828289032, "reward_std": 0.17560010217130184, "rewards/equation_reward_func": 0.07924107648432255, "rewards/format_reward_func": 0.9799107499420643, "step": 314 }, { "completion_length": 363.2265796661377, "epoch": 0.4639383373095981, "grad_norm": 0.054353334909736144, "kl": 0.08917236328125, "learning_rate": 1.0775582369344946e-07, "loss": 0.0001, "reward": 1.066964328289032, "reward_std": 0.19891262240707874, "rewards/equation_reward_func": 0.09263393236324191, "rewards/format_reward_func": 0.9743303880095482, "step": 316 }, { "completion_length": 343.60157585144043, "epoch": 0.4668746559001652, "grad_norm": 0.041973139726504065, "kl": 0.093017578125, "learning_rate": 1.0480796681704077e-07, "loss": 0.0001, "reward": 1.0535714775323868, "reward_std": 0.1561298966407776, "rewards/equation_reward_func": 0.07254464726429433, "rewards/format_reward_func": 0.9810268133878708, "step": 318 }, { "completion_length": 356.78126525878906, "epoch": 0.4698109744907322, "grad_norm": 0.04263629162591363, "kl": 0.093994140625, "learning_rate": 1.018902623505741e-07, "loss": 0.0001, "reward": 1.0602679066359997, "reward_std": 0.1679113474674523, "rewards/equation_reward_func": 0.0803571465658024, "rewards/format_reward_func": 0.979910746216774, "step": 320 }, { "completion_length": 334.065860748291, "epoch": 0.4727472930812993, "grad_norm": 0.04185249481712557, "kl": 0.099578857421875, "learning_rate": 9.900331622138063e-08, "loss": 0.0001, "reward": 1.0580357648432255, "reward_std": 0.1624443898908794, "rewards/equation_reward_func": 0.0736607180442661, "rewards/format_reward_func": 0.9843750298023224, "step": 322 }, { "completion_length": 364.9464416503906, "epoch": 0.4756836116718664, "grad_norm": 0.05018023548907757, "kl": 0.09326171875, "learning_rate": 9.614772796912681e-08, "loss": 0.0001, "reward": 1.0502232536673546, "reward_std": 0.17889559408649802, "rewards/equation_reward_func": 0.07924107415601611, "rewards/format_reward_func": 0.9709821790456772, "step": 324 }, { "completion_length": 361.39957427978516, "epoch": 0.4786199302624335, "grad_norm": 0.0408224554527788, "kl": 0.09619140625, "learning_rate": 9.332409062130686e-08, "loss": 0.0001, "reward": 1.0535714775323868, "reward_std": 0.18046156875789165, "rewards/equation_reward_func": 0.07589286041911691, "rewards/format_reward_func": 0.9776785932481289, "step": 326 }, { "completion_length": 373.2198791503906, "epoch": 0.4815562488530005, "grad_norm": 0.04378704975777743, "kl": 0.09039306640625, "learning_rate": 9.053299057008699e-08, "loss": 0.0001, "reward": 1.0379464700818062, "reward_std": 0.1679329937323928, "rewards/equation_reward_func": 0.06138393108267337, "rewards/format_reward_func": 0.9765625409781933, "step": 328 }, { "completion_length": 373.792423248291, "epoch": 0.4844925674435676, "grad_norm": 0.049976975281267676, "kl": 0.0914306640625, "learning_rate": 8.777500745052743e-08, "loss": 0.0001, "reward": 1.0758929178118706, "reward_std": 0.18235793197527528, "rewards/equation_reward_func": 0.09598214726429433, "rewards/format_reward_func": 0.9799107387661934, "step": 330 }, { "completion_length": 370.22434997558594, "epoch": 0.4874288860341347, "grad_norm": 0.04464940799424063, "kl": 0.091278076171875, "learning_rate": 8.505071402020892e-08, "loss": 0.0001, "reward": 1.0669643506407738, "reward_std": 0.18469198187813163, "rewards/equation_reward_func": 0.08928571664728224, "rewards/format_reward_func": 0.9776785969734192, "step": 332 }, { "completion_length": 382.16854095458984, "epoch": 0.4903652046247018, "grad_norm": 0.0496328152205412, "kl": 0.132293701171875, "learning_rate": 8.236067604028562e-08, "loss": 0.0001, "reward": 1.0513393431901932, "reward_std": 0.1591361202299595, "rewards/equation_reward_func": 0.07142857427243143, "rewards/format_reward_func": 0.9799107536673546, "step": 334 }, { "completion_length": 384.12167167663574, "epoch": 0.4933015232152689, "grad_norm": 0.04586325949575296, "kl": 0.093780517578125, "learning_rate": 7.970545215799327e-08, "loss": 0.0001, "reward": 1.0368303954601288, "reward_std": 0.18249922152608633, "rewards/equation_reward_func": 0.06919643189758062, "rewards/format_reward_func": 0.9676339626312256, "step": 336 }, { "completion_length": 371.5636348724365, "epoch": 0.4962378418058359, "grad_norm": 0.039612951202073324, "kl": 0.094390869140625, "learning_rate": 7.708559379063204e-08, "loss": 0.0001, "reward": 1.069196492433548, "reward_std": 0.20001838542521, "rewards/equation_reward_func": 0.09151786239817739, "rewards/format_reward_func": 0.9776785932481289, "step": 338 }, { "completion_length": 375.1752414703369, "epoch": 0.499174160396403, "grad_norm": 0.046616949041271845, "kl": 0.092926025390625, "learning_rate": 7.45016450110534e-08, "loss": 0.0001, "reward": 1.0725446864962578, "reward_std": 0.1956021711230278, "rewards/equation_reward_func": 0.09598214738070965, "rewards/format_reward_func": 0.9765625298023224, "step": 340 }, { "completion_length": 371.14622688293457, "epoch": 0.50211047898697, "grad_norm": 0.04763239881868573, "kl": 0.0966796875, "learning_rate": 7.195414243467029e-08, "loss": 0.0001, "reward": 1.0558036267757416, "reward_std": 0.16503770695999265, "rewards/equation_reward_func": 0.07254464668221772, "rewards/format_reward_func": 0.983258955180645, "step": 342 }, { "completion_length": 387.4989013671875, "epoch": 0.5050467975775371, "grad_norm": 0.04756655671271559, "kl": 0.08740234375, "learning_rate": 6.944361510801763e-08, "loss": 0.0001, "reward": 1.0658482685685158, "reward_std": 0.16942863073199987, "rewards/equation_reward_func": 0.08482143247965723, "rewards/format_reward_func": 0.9810268245637417, "step": 344 }, { "completion_length": 386.53461265563965, "epoch": 0.5079831161681042, "grad_norm": 0.044536873399056064, "kl": 0.092315673828125, "learning_rate": 6.697058439888283e-08, "loss": 0.0001, "reward": 1.0993304178118706, "reward_std": 0.25071316212415695, "rewards/equation_reward_func": 0.12946429091971368, "rewards/format_reward_func": 0.9698660969734192, "step": 346 }, { "completion_length": 380.17635345458984, "epoch": 0.5109194347586713, "grad_norm": 0.038330951190360854, "kl": 0.09490966796875, "learning_rate": 6.453556388803288e-08, "loss": 0.0001, "reward": 1.0825893357396126, "reward_std": 0.19026875868439674, "rewards/equation_reward_func": 0.0993303619325161, "rewards/format_reward_func": 0.9832589589059353, "step": 348 }, { "completion_length": 389.3169822692871, "epoch": 0.5138557533492384, "grad_norm": 0.042924839902815745, "kl": 0.092254638671875, "learning_rate": 6.213905926255697e-08, "loss": 0.0001, "reward": 1.0569197051227093, "reward_std": 0.19485764298588037, "rewards/equation_reward_func": 0.08482143247965723, "rewards/format_reward_func": 0.9720982499420643, "step": 350 }, { "completion_length": 386.6774711608887, "epoch": 0.5167920719398055, "grad_norm": 0.052338487607322216, "kl": 0.09173583984375, "learning_rate": 5.978156821084987e-08, "loss": 0.0001, "reward": 1.059151828289032, "reward_std": 0.22638031467795372, "rewards/equation_reward_func": 0.09040179057046771, "rewards/format_reward_func": 0.968750037252903, "step": 352 }, { "completion_length": 390.3426513671875, "epoch": 0.5197283905303726, "grad_norm": 0.05158535991793682, "kl": 0.0892333984375, "learning_rate": 5.7463580319254853e-08, "loss": 0.0001, "reward": 1.0535714775323868, "reward_std": 0.20130603248253465, "rewards/equation_reward_func": 0.08035714738070965, "rewards/format_reward_func": 0.9732143171131611, "step": 354 }, { "completion_length": 369.5725612640381, "epoch": 0.5226647091209397, "grad_norm": 0.05330391501700506, "kl": 0.097869873046875, "learning_rate": 5.518557697039081e-08, "loss": 0.0001, "reward": 1.0703125558793545, "reward_std": 0.19206226477399468, "rewards/equation_reward_func": 0.09375000512227416, "rewards/format_reward_func": 0.9765625409781933, "step": 356 }, { "completion_length": 398.66854667663574, "epoch": 0.5256010277115067, "grad_norm": 0.051887163767993155, "kl": 0.091033935546875, "learning_rate": 5.294803124318145e-08, "loss": 0.0001, "reward": 1.05245541036129, "reward_std": 0.18004788551479578, "rewards/equation_reward_func": 0.07812500337604433, "rewards/format_reward_func": 0.9743303880095482, "step": 358 }, { "completion_length": 404.6495723724365, "epoch": 0.5285373463020738, "grad_norm": 0.050939622333094535, "kl": 0.095245361328125, "learning_rate": 5.07514078146106e-08, "loss": 0.0001, "reward": 1.0546875484287739, "reward_std": 0.2043363954871893, "rewards/equation_reward_func": 0.08705357497092336, "rewards/format_reward_func": 0.967633955180645, "step": 360 }, { "completion_length": 388.80581855773926, "epoch": 0.5314736648926408, "grad_norm": 0.05380538494559486, "kl": 0.093353271484375, "learning_rate": 4.859616286322094e-08, "loss": 0.0001, "reward": 1.0770089849829674, "reward_std": 0.20855529373511672, "rewards/equation_reward_func": 0.09821429126895964, "rewards/format_reward_func": 0.9787946604192257, "step": 362 }, { "completion_length": 391.8716678619385, "epoch": 0.5344099834832079, "grad_norm": 0.053453712412842525, "kl": 0.091156005859375, "learning_rate": 4.648274397437829e-08, "loss": 0.0001, "reward": 1.0647321939468384, "reward_std": 0.2162796063348651, "rewards/equation_reward_func": 0.09263393317814916, "rewards/format_reward_func": 0.972098246216774, "step": 364 }, { "completion_length": 391.14622497558594, "epoch": 0.537346302073775, "grad_norm": 0.06186318024343802, "kl": 0.095550537109375, "learning_rate": 4.4411590047320617e-08, "loss": 0.0001, "reward": 1.0959821864962578, "reward_std": 0.25807888340204954, "rewards/equation_reward_func": 0.12611607741564512, "rewards/format_reward_func": 0.9698660969734192, "step": 366 }, { "completion_length": 377.6551513671875, "epoch": 0.5402826206643421, "grad_norm": 0.053715796741151546, "kl": 0.099456787109375, "learning_rate": 4.2383131204010494e-08, "loss": 0.0001, "reward": 1.12276791036129, "reward_std": 0.2649730620905757, "rewards/equation_reward_func": 0.14508929126895964, "rewards/format_reward_func": 0.9776785895228386, "step": 368 }, { "completion_length": 403.7723388671875, "epoch": 0.5432189392549092, "grad_norm": 0.047203255127769794, "kl": 0.098663330078125, "learning_rate": 4.039778869981064e-08, "loss": 0.0001, "reward": 1.08370541036129, "reward_std": 0.20436841249465942, "rewards/equation_reward_func": 0.10825893376022577, "rewards/format_reward_func": 0.9754464589059353, "step": 370 }, { "completion_length": 392.40403175354004, "epoch": 0.5461552578454762, "grad_norm": 0.05153714808744298, "kl": 0.097930908203125, "learning_rate": 3.845597483600049e-08, "loss": 0.0001, "reward": 1.1093750447034836, "reward_std": 0.2366180717945099, "rewards/equation_reward_func": 0.13058036309666932, "rewards/format_reward_func": 0.9787946753203869, "step": 372 }, { "completion_length": 415.2890796661377, "epoch": 0.5490915764360433, "grad_norm": 0.038295429751397964, "kl": 0.10107421875, "learning_rate": 3.655809287415284e-08, "loss": 0.0001, "reward": 1.0401786118745804, "reward_std": 0.19542105589061975, "rewards/equation_reward_func": 0.07589286018628627, "rewards/format_reward_func": 0.9642857387661934, "step": 374 }, { "completion_length": 403.9542598724365, "epoch": 0.5520278950266104, "grad_norm": 0.04692015937158236, "kl": 0.093994140625, "learning_rate": 3.4704536952387285e-08, "loss": 0.0001, "reward": 1.0703125521540642, "reward_std": 0.18491909513249993, "rewards/equation_reward_func": 0.089285719092004, "rewards/format_reward_func": 0.9810268208384514, "step": 376 }, { "completion_length": 406.22546577453613, "epoch": 0.5549642136171775, "grad_norm": 0.04774051648965683, "kl": 0.094635009765625, "learning_rate": 3.2895692003518575e-08, "loss": 0.0001, "reward": 1.058035746216774, "reward_std": 0.23594195628538728, "rewards/equation_reward_func": 0.09933036204893142, "rewards/format_reward_func": 0.9587053768336773, "step": 378 }, { "completion_length": 386.1004638671875, "epoch": 0.5579005322077445, "grad_norm": 0.05077221624579398, "kl": 0.099517822265625, "learning_rate": 3.113193367511635e-08, "loss": 0.0001, "reward": 1.0736607611179352, "reward_std": 0.19753747899085283, "rewards/equation_reward_func": 0.09263393352739513, "rewards/format_reward_func": 0.9810268171131611, "step": 380 }, { "completion_length": 409.23438835144043, "epoch": 0.5608368507983116, "grad_norm": 0.04807883821547216, "kl": 0.096954345703125, "learning_rate": 2.9413628251493934e-08, "loss": 0.0001, "reward": 1.0703125670552254, "reward_std": 0.20752025907859206, "rewards/equation_reward_func": 0.09598214703146368, "rewards/format_reward_func": 0.9743303842842579, "step": 382 }, { "completion_length": 404.74109077453613, "epoch": 0.5637731693888787, "grad_norm": 0.05267361932547423, "kl": 0.10076904296875, "learning_rate": 2.774113257764066e-08, "loss": 0.0001, "reward": 1.0959821939468384, "reward_std": 0.2620803425088525, "rewards/equation_reward_func": 0.12946429033763707, "rewards/format_reward_func": 0.9665178917348385, "step": 384 }, { "completion_length": 398.66966247558594, "epoch": 0.5667094879794458, "grad_norm": 0.045965645927339774, "kl": 0.093353271484375, "learning_rate": 2.611479398511518e-08, "loss": 0.0001, "reward": 1.0959821939468384, "reward_std": 0.22108421614393592, "rewards/equation_reward_func": 0.1205357207218185, "rewards/format_reward_func": 0.9754464514553547, "step": 386 }, { "completion_length": 395.1495666503906, "epoch": 0.5696458065700128, "grad_norm": 0.05174221228956814, "kl": 0.099700927734375, "learning_rate": 2.4534950219914057e-08, "loss": 0.0001, "reward": 1.1383929029107094, "reward_std": 0.25243138894438744, "rewards/equation_reward_func": 0.16183036472648382, "rewards/format_reward_func": 0.9765625260770321, "step": 388 }, { "completion_length": 406.16854667663574, "epoch": 0.5725821251605799, "grad_norm": 0.05019021020140579, "kl": 0.09979248046875, "learning_rate": 2.300192937233128e-08, "loss": 0.0001, "reward": 1.0892857573926449, "reward_std": 0.21054052747786045, "rewards/equation_reward_func": 0.11049107869621366, "rewards/format_reward_func": 0.9787946864962578, "step": 390 }, { "completion_length": 388.2801513671875, "epoch": 0.575518443751147, "grad_norm": 0.04975482536547829, "kl": 0.101959228515625, "learning_rate": 2.1516049808822935e-08, "loss": 0.0001, "reward": 1.102678619325161, "reward_std": 0.2276111119426787, "rewards/equation_reward_func": 0.11941964831203222, "rewards/format_reward_func": 0.9832589514553547, "step": 392 }, { "completion_length": 397.13059997558594, "epoch": 0.5784547623417141, "grad_norm": 0.0482003869323553, "kl": 0.092620849609375, "learning_rate": 2.007762010589098e-08, "loss": 0.0001, "reward": 1.0915179140865803, "reward_std": 0.22312113596126437, "rewards/equation_reward_func": 0.11607143434230238, "rewards/format_reward_func": 0.9754464589059353, "step": 394 }, { "completion_length": 405.1875171661377, "epoch": 0.5813910809322812, "grad_norm": 0.058761232555913534, "kl": 0.10430908203125, "learning_rate": 1.8686938986000627e-08, "loss": 0.0001, "reward": 1.0970982611179352, "reward_std": 0.2505375109612942, "rewards/equation_reward_func": 0.12388393399305642, "rewards/format_reward_func": 0.9732143208384514, "step": 396 }, { "completion_length": 415.2120723724365, "epoch": 0.5843273995228482, "grad_norm": 0.04939032637161742, "kl": 0.098541259765625, "learning_rate": 1.734429525554365e-08, "loss": 0.0001, "reward": 1.0647321939468384, "reward_std": 0.21378363063558936, "rewards/equation_reward_func": 0.09709821967408061, "rewards/format_reward_func": 0.9676339514553547, "step": 398 }, { "completion_length": 402.12947845458984, "epoch": 0.5872637181134153, "grad_norm": 0.0582226075441157, "kl": 0.101898193359375, "learning_rate": 1.604996774486145e-08, "loss": 0.0001, "reward": 1.0747768357396126, "reward_std": 0.23708053398877382, "rewards/equation_reward_func": 0.10825893213041127, "rewards/format_reward_func": 0.9665178917348385, "step": 400 }, { "completion_length": 419.4643077850342, "epoch": 0.5902000367039824, "grad_norm": 0.05248475625533865, "kl": 0.09747314453125, "learning_rate": 1.4804225250339281e-08, "loss": 0.0001, "reward": 1.056919701397419, "reward_std": 0.2246052329428494, "rewards/equation_reward_func": 0.09040178917348385, "rewards/format_reward_func": 0.9665178917348385, "step": 402 }, { "completion_length": 406.70760917663574, "epoch": 0.5931363552945494, "grad_norm": 0.04188127192836941, "kl": 0.10638427734375, "learning_rate": 1.360732647858498e-08, "loss": 0.0001, "reward": 1.0580357648432255, "reward_std": 0.1819898965768516, "rewards/equation_reward_func": 0.08147321932483464, "rewards/format_reward_func": 0.9765625260770321, "step": 404 }, { "completion_length": 423.35604667663574, "epoch": 0.5960726738851165, "grad_norm": 0.046451328772035054, "kl": 0.09967041015625, "learning_rate": 1.2459519992702311e-08, "loss": 0.0001, "reward": 1.10714291036129, "reward_std": 0.2326993877068162, "rewards/equation_reward_func": 0.13169643562287092, "rewards/format_reward_func": 0.9754464477300644, "step": 406 }, { "completion_length": 392.1015796661377, "epoch": 0.5990089924756836, "grad_norm": 0.05080696425513128, "kl": 0.105438232421875, "learning_rate": 1.1361044160671629e-08, "loss": 0.0001, "reward": 1.1082589849829674, "reward_std": 0.2411864292807877, "rewards/equation_reward_func": 0.13058036274742335, "rewards/format_reward_func": 0.9776786044239998, "step": 408 }, { "completion_length": 396.29577445983887, "epoch": 0.6019453110662507, "grad_norm": 0.05576037207187397, "kl": 0.09539794921875, "learning_rate": 1.0312127105846947e-08, "loss": 0.0001, "reward": 1.1026786267757416, "reward_std": 0.2508852328173816, "rewards/equation_reward_func": 0.13281250547152013, "rewards/format_reward_func": 0.9698660969734192, "step": 410 }, { "completion_length": 399.7053737640381, "epoch": 0.6048816296568178, "grad_norm": 0.059732333663183326, "kl": 0.10064697265625, "learning_rate": 9.312986659581301e-09, "loss": 0.0001, "reward": 1.1049107611179352, "reward_std": 0.23088762862607837, "rewards/equation_reward_func": 0.12834822211880237, "rewards/format_reward_func": 0.9765625298023224, "step": 412 }, { "completion_length": 398.66184997558594, "epoch": 0.6078179482473849, "grad_norm": 0.05324309506242098, "kl": 0.103302001953125, "learning_rate": 8.363830315988945e-09, "loss": 0.0001, "reward": 1.102678619325161, "reward_std": 0.2296189209446311, "rewards/equation_reward_func": 0.12388393364381045, "rewards/format_reward_func": 0.9787946864962578, "step": 414 }, { "completion_length": 417.6897506713867, "epoch": 0.6107542668379519, "grad_norm": 0.04683799522299558, "kl": 0.107086181640625, "learning_rate": 7.46485518885462e-09, "loss": 0.0001, "reward": 1.0669643469154835, "reward_std": 0.20856121368706226, "rewards/equation_reward_func": 0.09263393329456449, "rewards/format_reward_func": 0.9743303842842579, "step": 416 }, { "completion_length": 405.1105079650879, "epoch": 0.613690585428519, "grad_norm": 0.049111930200072484, "kl": 0.096710205078125, "learning_rate": 6.616247970698319e-09, "loss": 0.0001, "reward": 1.09933041036129, "reward_std": 0.26021250896155834, "rewards/equation_reward_func": 0.13281250465661287, "rewards/format_reward_func": 0.9665178917348385, "step": 418 }, { "completion_length": 412.4944381713867, "epoch": 0.616626904019086, "grad_norm": 0.05400451597158681, "kl": 0.102020263671875, "learning_rate": 5.8181848940044855e-09, "loss": 0.0001, "reward": 1.1149553880095482, "reward_std": 0.26809723395854235, "rewards/equation_reward_func": 0.15290179254952818, "rewards/format_reward_func": 0.9620536044239998, "step": 420 }, { "completion_length": 416.1339473724365, "epoch": 0.6195632226096531, "grad_norm": 0.04012774328694736, "kl": 0.099609375, "learning_rate": 5.070831694623135e-09, "loss": 0.0001, "reward": 1.0937500447034836, "reward_std": 0.23063845187425613, "rewards/equation_reward_func": 0.1194196492433548, "rewards/format_reward_func": 0.9743303917348385, "step": 422 }, { "completion_length": 403.98104667663574, "epoch": 0.6224995412002202, "grad_norm": 0.04957974374231148, "kl": 0.097686767578125, "learning_rate": 4.374343577351336e-09, "loss": 0.0001, "reward": 1.0814732536673546, "reward_std": 0.22890522051602602, "rewards/equation_reward_func": 0.10825893364381045, "rewards/format_reward_func": 0.9732143171131611, "step": 424 }, { "completion_length": 403.1160888671875, "epoch": 0.6254358597907873, "grad_norm": 0.04373844754088721, "kl": 0.10205078125, "learning_rate": 3.7288651837012745e-09, "loss": 0.0001, "reward": 1.0870536155998707, "reward_std": 0.20328176161274314, "rewards/equation_reward_func": 0.1116071492433548, "rewards/format_reward_func": 0.9754464775323868, "step": 426 }, { "completion_length": 416.70426177978516, "epoch": 0.6283721783813544, "grad_norm": 0.05687104274871836, "kl": 0.099456787109375, "learning_rate": 3.134530561862081e-09, "loss": 0.0001, "reward": 1.103794690221548, "reward_std": 0.2526063285768032, "rewards/equation_reward_func": 0.13616072200238705, "rewards/format_reward_func": 0.9676339626312256, "step": 428 }, { "completion_length": 402.1964473724365, "epoch": 0.6313084969719215, "grad_norm": 0.044913896708701294, "kl": 0.1141357421875, "learning_rate": 2.5914631388619103e-09, "loss": 0.0001, "reward": 1.0725446864962578, "reward_std": 0.23142389208078384, "rewards/equation_reward_func": 0.10491071839351207, "rewards/format_reward_func": 0.9676339663565159, "step": 430 }, { "completion_length": 395.0424289703369, "epoch": 0.6342448155624886, "grad_norm": 0.045882245640528084, "kl": 0.109832763671875, "learning_rate": 2.0997756949353297e-09, "loss": 0.0001, "reward": 1.0814732536673546, "reward_std": 0.22362250182777643, "rewards/equation_reward_func": 0.11160714749712497, "rewards/format_reward_func": 0.9698661118745804, "step": 432 }, { "completion_length": 426.2924289703369, "epoch": 0.6371811341530557, "grad_norm": 0.04758448749390688, "kl": 0.1451416015625, "learning_rate": 1.6595703401020844e-09, "loss": 0.0001, "reward": 1.0669643357396126, "reward_std": 0.23458201717585325, "rewards/equation_reward_func": 0.10044643271248788, "rewards/format_reward_func": 0.9665178842842579, "step": 434 }, { "completion_length": 406.60493087768555, "epoch": 0.6401174527436226, "grad_norm": 0.06141012337958767, "kl": 0.100128173828125, "learning_rate": 1.2709384929615596e-09, "loss": 0.0001, "reward": 1.0948661044239998, "reward_std": 0.2197393993847072, "rewards/equation_reward_func": 0.11941964959260076, "rewards/format_reward_func": 0.9754464626312256, "step": 436 }, { "completion_length": 409.0055980682373, "epoch": 0.6430537713341897, "grad_norm": 0.05577362369837321, "kl": 0.113677978515625, "learning_rate": 9.339608617077165e-10, "loss": 0.0001, "reward": 1.0814732611179352, "reward_std": 0.22773250937461853, "rewards/equation_reward_func": 0.111607147147879, "rewards/format_reward_func": 0.9698661006987095, "step": 438 }, { "completion_length": 423.28126525878906, "epoch": 0.6459900899247568, "grad_norm": 0.04649263712299981, "kl": 0.10736083984375, "learning_rate": 6.487074273681114e-10, "loss": 0.0001, "reward": 1.0379464700818062, "reward_std": 0.2183691617101431, "rewards/equation_reward_func": 0.08035714703146368, "rewards/format_reward_func": 0.9575893208384514, "step": 440 }, { "completion_length": 424.0346221923828, "epoch": 0.6489264085153239, "grad_norm": 0.051863884871225424, "kl": 0.0963134765625, "learning_rate": 4.152374292708538e-10, "loss": 0.0001, "reward": 1.08370541036129, "reward_std": 0.23894479172304273, "rewards/equation_reward_func": 0.11607143329456449, "rewards/format_reward_func": 0.967633955180645, "step": 442 }, { "completion_length": 419.2076072692871, "epoch": 0.651862727105891, "grad_norm": 0.041908418740650634, "kl": 0.09466552734375, "learning_rate": 2.3359935274214204e-10, "loss": 0.0001, "reward": 1.0781250521540642, "reward_std": 0.20687641110271215, "rewards/equation_reward_func": 0.10602678940631449, "rewards/format_reward_func": 0.9720982424914837, "step": 444 }, { "completion_length": 409.03015327453613, "epoch": 0.6547990456964581, "grad_norm": 0.04991210004252944, "kl": 0.096923828125, "learning_rate": 1.0383091903720665e-10, "loss": 0.0001, "reward": 1.0892857760190964, "reward_std": 0.2569359806366265, "rewards/equation_reward_func": 0.12276786111760885, "rewards/format_reward_func": 0.9665179029107094, "step": 446 }, { "completion_length": 421.3951072692871, "epoch": 0.6577353642870252, "grad_norm": 0.0511078437253658, "kl": 0.09625244140625, "learning_rate": 2.595907750671533e-11, "loss": 0.0001, "reward": 1.1015625447034836, "reward_std": 0.2752464488148689, "rewards/equation_reward_func": 0.13950893515720963, "rewards/format_reward_func": 0.9620535932481289, "step": 448 }, { "completion_length": 429.61497497558594, "epoch": 0.6606716828775923, "grad_norm": 0.049613253545700765, "kl": 0.095245361328125, "learning_rate": 0.0, "loss": 0.0001, "reward": 1.0691964775323868, "reward_std": 0.2418026770465076, "rewards/equation_reward_func": 0.10602679173462093, "rewards/format_reward_func": 0.9631696715950966, "step": 450 }, { "epoch": 0.6606716828775923, "step": 450, "total_flos": 0.0, "train_loss": 6.581872487505476e-05, "train_runtime": 36634.2295, "train_samples_per_second": 0.688, "train_steps_per_second": 0.012 } ], "logging_steps": 2, "max_steps": 450, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }