|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9998343548119927, |
|
"eval_steps": 50, |
|
"global_step": 1509, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 384.27969818115236, |
|
"epoch": 0.006625807520291536, |
|
"grad_norm": 0.9638224244117737, |
|
"kl": 0.000667405128479004, |
|
"learning_rate": 1.3245033112582784e-06, |
|
"loss": 0.0, |
|
"reward": 0.44505209624767306, |
|
"reward_std": 0.42465767413377764, |
|
"rewards/accuracy_reward": 0.14088542019017042, |
|
"rewards/format_reward": 0.3041666761040688, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 186.6330778121948, |
|
"epoch": 0.013251615040583071, |
|
"grad_norm": 0.6953328847885132, |
|
"kl": 0.033650970458984374, |
|
"learning_rate": 2.6490066225165567e-06, |
|
"loss": 0.0013, |
|
"reward": 0.9434896126389504, |
|
"reward_std": 0.258835174748674, |
|
"rewards/accuracy_reward": 0.06432291923556477, |
|
"rewards/format_reward": 0.8791666895151138, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 147.85651473999025, |
|
"epoch": 0.019877422560874606, |
|
"grad_norm": 0.45314687490463257, |
|
"kl": 0.04227294921875, |
|
"learning_rate": 3.973509933774835e-06, |
|
"loss": 0.0017, |
|
"reward": 1.0822917133569718, |
|
"reward_std": 0.1591762812808156, |
|
"rewards/accuracy_reward": 0.09479166874662041, |
|
"rewards/format_reward": 0.9875000178813934, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 210.63177680969238, |
|
"epoch": 0.026503230081166142, |
|
"grad_norm": 0.4808696508407593, |
|
"kl": 0.0356292724609375, |
|
"learning_rate": 5.2980132450331135e-06, |
|
"loss": 0.0014, |
|
"reward": 1.1015625283122064, |
|
"reward_std": 0.22788139712065458, |
|
"rewards/accuracy_reward": 0.1317708377726376, |
|
"rewards/format_reward": 0.9697916865348816, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 262.94844551086425, |
|
"epoch": 0.033129037601457675, |
|
"grad_norm": 0.42699357867240906, |
|
"kl": 0.048345947265625, |
|
"learning_rate": 6.622516556291392e-06, |
|
"loss": 0.0019, |
|
"reward": 1.179687538743019, |
|
"reward_std": 0.28927393443882465, |
|
"rewards/accuracy_reward": 0.21250000605359673, |
|
"rewards/format_reward": 0.9671875178813935, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.033129037601457675, |
|
"eval_completion_length": 312.76158820258246, |
|
"eval_kl": 0.046305338541666664, |
|
"eval_loss": 0.001854513306170702, |
|
"eval_reward": 1.339120414521959, |
|
"eval_reward_std": 0.2885145727131102, |
|
"eval_rewards/accuracy_reward": 0.37615741623772514, |
|
"eval_rewards/format_reward": 0.962962978416019, |
|
"eval_runtime": 54.3599, |
|
"eval_samples_per_second": 1.821, |
|
"eval_steps_per_second": 0.166, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 367.77709159851076, |
|
"epoch": 0.03975484512174921, |
|
"grad_norm": 0.30974721908569336, |
|
"kl": 0.0423248291015625, |
|
"learning_rate": 7.94701986754967e-06, |
|
"loss": 0.0017, |
|
"reward": 1.28098963201046, |
|
"reward_std": 0.3665796037763357, |
|
"rewards/accuracy_reward": 0.33697917461395266, |
|
"rewards/format_reward": 0.9440104380249977, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 389.0981872558594, |
|
"epoch": 0.04638065264204075, |
|
"grad_norm": 0.46462583541870117, |
|
"kl": 0.054632568359375, |
|
"learning_rate": 9.271523178807948e-06, |
|
"loss": 0.0022, |
|
"reward": 1.3294271290302277, |
|
"reward_std": 0.3420734729617834, |
|
"rewards/accuracy_reward": 0.36953126192092894, |
|
"rewards/format_reward": 0.9598958507180214, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 371.92214546203616, |
|
"epoch": 0.053006460162332285, |
|
"grad_norm": 0.3880268633365631, |
|
"kl": 0.068634033203125, |
|
"learning_rate": 1.0596026490066227e-05, |
|
"loss": 0.0027, |
|
"reward": 1.3507812947034836, |
|
"reward_std": 0.3280396033078432, |
|
"rewards/accuracy_reward": 0.3901041779667139, |
|
"rewards/format_reward": 0.9606771051883698, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 326.34297904968264, |
|
"epoch": 0.05963226768262382, |
|
"grad_norm": 0.3763599693775177, |
|
"kl": 1.29837646484375, |
|
"learning_rate": 1.1920529801324505e-05, |
|
"loss": 0.0519, |
|
"reward": 1.323177123069763, |
|
"reward_std": 0.32737944051623347, |
|
"rewards/accuracy_reward": 0.36510417610406876, |
|
"rewards/format_reward": 0.9580729365348816, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 358.67058486938475, |
|
"epoch": 0.06625807520291535, |
|
"grad_norm": 0.9632219672203064, |
|
"kl": 6.1518310546875, |
|
"learning_rate": 1.3245033112582784e-05, |
|
"loss": 0.2462, |
|
"reward": 1.2625000327825546, |
|
"reward_std": 0.4080970410257578, |
|
"rewards/accuracy_reward": 0.369270845875144, |
|
"rewards/format_reward": 0.8932291895151139, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06625807520291535, |
|
"eval_completion_length": 325.4479259914822, |
|
"eval_kl": 0.2577582465277778, |
|
"eval_loss": 0.01030731201171875, |
|
"eval_reward": 1.400462998284234, |
|
"eval_reward_std": 0.3469897309939067, |
|
"eval_rewards/accuracy_reward": 0.49768519401550293, |
|
"eval_rewards/format_reward": 0.902777804268731, |
|
"eval_runtime": 51.973, |
|
"eval_samples_per_second": 1.905, |
|
"eval_steps_per_second": 0.173, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 340.5109470367432, |
|
"epoch": 0.0728838827232069, |
|
"grad_norm": 0.47404468059539795, |
|
"kl": 0.2045654296875, |
|
"learning_rate": 1.456953642384106e-05, |
|
"loss": 0.0082, |
|
"reward": 1.336718785762787, |
|
"reward_std": 0.4285093888640404, |
|
"rewards/accuracy_reward": 0.4596354283392429, |
|
"rewards/format_reward": 0.8770833566784859, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 255.94688339233397, |
|
"epoch": 0.07950969024349842, |
|
"grad_norm": 0.4434707462787628, |
|
"kl": 0.19765625, |
|
"learning_rate": 1.589403973509934e-05, |
|
"loss": 0.0079, |
|
"reward": 1.2223958760499953, |
|
"reward_std": 0.3857662923634052, |
|
"rewards/accuracy_reward": 0.3195312611758709, |
|
"rewards/format_reward": 0.9028646096587181, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 277.33933143615724, |
|
"epoch": 0.08613549776378997, |
|
"grad_norm": 0.3981403410434723, |
|
"kl": 1.035498046875, |
|
"learning_rate": 1.7218543046357617e-05, |
|
"loss": 0.0415, |
|
"reward": 1.249479204416275, |
|
"reward_std": 0.38627928495407104, |
|
"rewards/accuracy_reward": 0.3450520932674408, |
|
"rewards/format_reward": 0.9044271036982536, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 257.51510963439944, |
|
"epoch": 0.0927613052840815, |
|
"grad_norm": 0.6040926575660706, |
|
"kl": 0.2771484375, |
|
"learning_rate": 1.8543046357615895e-05, |
|
"loss": 0.0111, |
|
"reward": 1.186458373069763, |
|
"reward_std": 0.3947778932750225, |
|
"rewards/accuracy_reward": 0.28750000707805157, |
|
"rewards/format_reward": 0.8989583507180214, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 270.6711009979248, |
|
"epoch": 0.09938711280437303, |
|
"grad_norm": 0.3564830422401428, |
|
"kl": 0.252197265625, |
|
"learning_rate": 1.9867549668874173e-05, |
|
"loss": 0.0101, |
|
"reward": 1.2130208671092988, |
|
"reward_std": 0.4041451971977949, |
|
"rewards/accuracy_reward": 0.33229167833924295, |
|
"rewards/format_reward": 0.8807291880249977, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.09938711280437303, |
|
"eval_completion_length": 208.04398727416992, |
|
"eval_kl": 0.3232421875, |
|
"eval_loss": 0.013497698120772839, |
|
"eval_reward": 1.3125000264909532, |
|
"eval_reward_std": 0.30849772029452854, |
|
"eval_rewards/accuracy_reward": 0.3773148192299737, |
|
"eval_rewards/format_reward": 0.9351852072609795, |
|
"eval_runtime": 58.9942, |
|
"eval_samples_per_second": 1.678, |
|
"eval_steps_per_second": 0.153, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 186.75338973999024, |
|
"epoch": 0.10601292032466457, |
|
"grad_norm": 0.7736382484436035, |
|
"kl": 0.6962646484375, |
|
"learning_rate": 1.999783259765003e-05, |
|
"loss": 0.0279, |
|
"reward": 1.183854204416275, |
|
"reward_std": 0.3419460911303759, |
|
"rewards/accuracy_reward": 0.2510416740551591, |
|
"rewards/format_reward": 0.9328125193715096, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 169.17083854675292, |
|
"epoch": 0.1126387278449561, |
|
"grad_norm": 0.3634221851825714, |
|
"kl": 0.459130859375, |
|
"learning_rate": 1.99903415488154e-05, |
|
"loss": 0.0183, |
|
"reward": 1.1283854573965073, |
|
"reward_std": 0.3242658941075206, |
|
"rewards/accuracy_reward": 0.20260417312383652, |
|
"rewards/format_reward": 0.9257812693715095, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 275.55860290527346, |
|
"epoch": 0.11926453536524764, |
|
"grad_norm": 0.38542240858078003, |
|
"kl": 0.3050048828125, |
|
"learning_rate": 1.997750410337147e-05, |
|
"loss": 0.0122, |
|
"reward": 1.182031288743019, |
|
"reward_std": 0.32699903920292855, |
|
"rewards/accuracy_reward": 0.24817709140479566, |
|
"rewards/format_reward": 0.9338541835546493, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 373.55287437438966, |
|
"epoch": 0.12589034288553919, |
|
"grad_norm": 1.1466913223266602, |
|
"kl": 0.5844970703125, |
|
"learning_rate": 1.995932713136112e-05, |
|
"loss": 0.0234, |
|
"reward": 1.1554687857627868, |
|
"reward_std": 0.3822615996003151, |
|
"rewards/accuracy_reward": 0.2549479236826301, |
|
"rewards/format_reward": 0.9005208566784859, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 239.1380271911621, |
|
"epoch": 0.1325161504058307, |
|
"grad_norm": 2.0522491931915283, |
|
"kl": 0.338720703125, |
|
"learning_rate": 1.993582036030978e-05, |
|
"loss": 0.0135, |
|
"reward": 1.128385452926159, |
|
"reward_std": 0.33877944238483904, |
|
"rewards/accuracy_reward": 0.2083333382382989, |
|
"rewards/format_reward": 0.9200521066784859, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1325161504058307, |
|
"eval_completion_length": 243.033571879069, |
|
"eval_kl": 0.3569878472222222, |
|
"eval_loss": 0.013369406573474407, |
|
"eval_reward": 1.207175976700253, |
|
"eval_reward_std": 0.37689801057179767, |
|
"eval_rewards/accuracy_reward": 0.32175926284657586, |
|
"eval_rewards/format_reward": 0.8854166865348816, |
|
"eval_runtime": 50.2709, |
|
"eval_samples_per_second": 1.969, |
|
"eval_steps_per_second": 0.179, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 236.56641330718995, |
|
"epoch": 0.13914195792612225, |
|
"grad_norm": 0.3385501503944397, |
|
"kl": 0.2685546875, |
|
"learning_rate": 1.9906996370019692e-05, |
|
"loss": 0.0107, |
|
"reward": 1.1578125387430191, |
|
"reward_std": 0.3522339530289173, |
|
"rewards/accuracy_reward": 0.24557292349636556, |
|
"rewards/format_reward": 0.912239608168602, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 201.9429744720459, |
|
"epoch": 0.1457677654464138, |
|
"grad_norm": 0.7260228395462036, |
|
"kl": 0.4362060546875, |
|
"learning_rate": 1.9872870585837757e-05, |
|
"loss": 0.0174, |
|
"reward": 1.1442708656191827, |
|
"reward_std": 0.3241072274744511, |
|
"rewards/accuracy_reward": 0.22395833912305535, |
|
"rewards/format_reward": 0.9203125178813935, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 171.1460983276367, |
|
"epoch": 0.1523935729667053, |
|
"grad_norm": 0.9234119057655334, |
|
"kl": 0.371435546875, |
|
"learning_rate": 1.983346127040053e-05, |
|
"loss": 0.0149, |
|
"reward": 1.1585937857627868, |
|
"reward_std": 0.3430036876350641, |
|
"rewards/accuracy_reward": 0.2291666740551591, |
|
"rewards/format_reward": 0.9294271036982537, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 184.18906688690186, |
|
"epoch": 0.15901938048699685, |
|
"grad_norm": 0.41350895166397095, |
|
"kl": 0.4388671875, |
|
"learning_rate": 1.9788789513860875e-05, |
|
"loss": 0.0176, |
|
"reward": 1.1466146260499954, |
|
"reward_std": 0.3448056776076555, |
|
"rewards/accuracy_reward": 0.22135417200624943, |
|
"rewards/format_reward": 0.9252604350447655, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 170.5622449874878, |
|
"epoch": 0.1656451880072884, |
|
"grad_norm": 0.4960261881351471, |
|
"kl": 0.430224609375, |
|
"learning_rate": 1.9738879222601425e-05, |
|
"loss": 0.0172, |
|
"reward": 1.1361979573965073, |
|
"reward_std": 0.3519858349114656, |
|
"rewards/accuracy_reward": 0.21927083879709244, |
|
"rewards/format_reward": 0.9169271022081376, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1656451880072884, |
|
"eval_completion_length": 177.629635281033, |
|
"eval_kl": 0.4537760416666667, |
|
"eval_loss": 0.018489297479391098, |
|
"eval_reward": 1.1678241226408217, |
|
"eval_reward_std": 0.40494963857862687, |
|
"eval_rewards/accuracy_reward": 0.2627314892080095, |
|
"eval_rewards/format_reward": 0.9050926036304898, |
|
"eval_runtime": 49.3339, |
|
"eval_samples_per_second": 2.007, |
|
"eval_steps_per_second": 0.182, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 165.07448387145996, |
|
"epoch": 0.17227099552757993, |
|
"grad_norm": 0.4660806655883789, |
|
"kl": 0.3287353515625, |
|
"learning_rate": 1.968375710644093e-05, |
|
"loss": 0.0132, |
|
"reward": 1.1317708671092988, |
|
"reward_std": 0.33136086612939836, |
|
"rewards/accuracy_reward": 0.20364583972841502, |
|
"rewards/format_reward": 0.9281250208616256, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 153.50963916778565, |
|
"epoch": 0.17889680304787145, |
|
"grad_norm": 0.46810558438301086, |
|
"kl": 0.4310546875, |
|
"learning_rate": 1.9623452664340305e-05, |
|
"loss": 0.0173, |
|
"reward": 1.1289062976837159, |
|
"reward_std": 0.3033852633088827, |
|
"rewards/accuracy_reward": 0.19479167063254862, |
|
"rewards/format_reward": 0.9341146022081375, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 127.1726598739624, |
|
"epoch": 0.185522610568163, |
|
"grad_norm": 0.36899781227111816, |
|
"kl": 0.368310546875, |
|
"learning_rate": 1.9557998168616087e-05, |
|
"loss": 0.0147, |
|
"reward": 1.1588542029261588, |
|
"reward_std": 0.2841993160545826, |
|
"rewards/accuracy_reward": 0.20598958879709245, |
|
"rewards/format_reward": 0.9528646022081375, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 126.30937976837158, |
|
"epoch": 0.19214841808845454, |
|
"grad_norm": 0.38908788561820984, |
|
"kl": 0.504345703125, |
|
"learning_rate": 1.9487428647669688e-05, |
|
"loss": 0.0202, |
|
"reward": 1.1286458730697633, |
|
"reward_std": 0.2689166348427534, |
|
"rewards/accuracy_reward": 0.18385417251847685, |
|
"rewards/format_reward": 0.9447916865348815, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 164.8523473739624, |
|
"epoch": 0.19877422560874605, |
|
"grad_norm": 0.580731213092804, |
|
"kl": 0.39619140625, |
|
"learning_rate": 1.9411781867241718e-05, |
|
"loss": 0.0159, |
|
"reward": 1.1351562827825545, |
|
"reward_std": 0.2917447902262211, |
|
"rewards/accuracy_reward": 0.19661458898335696, |
|
"rewards/format_reward": 0.9385416880249977, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.19877422560874605, |
|
"eval_completion_length": 169.12963443332248, |
|
"eval_kl": 0.7450086805555556, |
|
"eval_loss": 0.031219787895679474, |
|
"eval_reward": 1.1620370944341023, |
|
"eval_reward_std": 0.3717506031195323, |
|
"eval_rewards/accuracy_reward": 0.22916667411724725, |
|
"eval_rewards/format_reward": 0.9328703946537442, |
|
"eval_runtime": 48.3046, |
|
"eval_samples_per_second": 2.049, |
|
"eval_steps_per_second": 0.186, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 137.1604206085205, |
|
"epoch": 0.2054000331290376, |
|
"grad_norm": 0.38375967741012573, |
|
"kl": 0.378857421875, |
|
"learning_rate": 1.9331098310201392e-05, |
|
"loss": 0.0152, |
|
"reward": 1.1098958775401115, |
|
"reward_std": 0.265699202939868, |
|
"rewards/accuracy_reward": 0.16588542177341878, |
|
"rewards/format_reward": 0.9440104380249977, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 188.78802585601807, |
|
"epoch": 0.21202584064932914, |
|
"grad_norm": 0.33487841486930847, |
|
"kl": 0.496142578125, |
|
"learning_rate": 1.9245421154881873e-05, |
|
"loss": 0.0199, |
|
"reward": 1.1182292073965072, |
|
"reward_std": 0.31800296930596234, |
|
"rewards/accuracy_reward": 0.18828125610016286, |
|
"rewards/format_reward": 0.9299479365348816, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 186.27344207763673, |
|
"epoch": 0.21865164816962068, |
|
"grad_norm": 1.0283613204956055, |
|
"kl": 0.3497314453125, |
|
"learning_rate": 1.9154796251973092e-05, |
|
"loss": 0.014, |
|
"reward": 1.1596354573965073, |
|
"reward_std": 0.3103053130209446, |
|
"rewards/accuracy_reward": 0.217968756519258, |
|
"rewards/format_reward": 0.9416666910052299, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 204.39453773498536, |
|
"epoch": 0.2252774556899122, |
|
"grad_norm": 0.43050748109817505, |
|
"kl": 0.498681640625, |
|
"learning_rate": 1.905927209998447e-05, |
|
"loss": 0.0199, |
|
"reward": 1.1023437835276126, |
|
"reward_std": 0.30628957897424697, |
|
"rewards/accuracy_reward": 0.212239589355886, |
|
"rewards/format_reward": 0.8901041835546494, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 211.14583930969238, |
|
"epoch": 0.23190326321020374, |
|
"grad_norm": 0.9810725450515747, |
|
"kl": 0.328759765625, |
|
"learning_rate": 1.8958899819290592e-05, |
|
"loss": 0.0132, |
|
"reward": 1.0622396171092987, |
|
"reward_std": 0.2948887083679438, |
|
"rewards/accuracy_reward": 0.15156250395812093, |
|
"rewards/format_reward": 0.9106771036982536, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.23190326321020374, |
|
"eval_completion_length": 187.81829155815973, |
|
"eval_kl": 0.3449435763888889, |
|
"eval_loss": 0.014129959046840668, |
|
"eval_reward": 1.2037037346098158, |
|
"eval_reward_std": 0.3060726622740428, |
|
"eval_rewards/accuracy_reward": 0.27893519235981834, |
|
"eval_rewards/format_reward": 0.9247685339715745, |
|
"eval_runtime": 46.4056, |
|
"eval_samples_per_second": 2.133, |
|
"eval_steps_per_second": 0.194, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 170.20156650543214, |
|
"epoch": 0.23852907073049529, |
|
"grad_norm": 0.31330692768096924, |
|
"kl": 0.476318359375, |
|
"learning_rate": 1.8853733124773837e-05, |
|
"loss": 0.019, |
|
"reward": 1.0606771275401115, |
|
"reward_std": 0.2639134880155325, |
|
"rewards/accuracy_reward": 0.12161458658520133, |
|
"rewards/format_reward": 0.9390625208616257, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 125.54062900543212, |
|
"epoch": 0.24515487825078683, |
|
"grad_norm": 0.34919577836990356, |
|
"kl": 0.339111328125, |
|
"learning_rate": 1.8743828297078485e-05, |
|
"loss": 0.0136, |
|
"reward": 1.1145833671092986, |
|
"reward_std": 0.24877706002444028, |
|
"rewards/accuracy_reward": 0.15442708847112954, |
|
"rewards/format_reward": 0.9601562678813934, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 124.51458683013917, |
|
"epoch": 0.25178068577107837, |
|
"grad_norm": 0.6795300841331482, |
|
"kl": 0.314599609375, |
|
"learning_rate": 1.8629244152491773e-05, |
|
"loss": 0.0126, |
|
"reward": 1.1388021245598794, |
|
"reward_std": 0.25418675877153873, |
|
"rewards/accuracy_reward": 0.17187500512227416, |
|
"rewards/format_reward": 0.9669271022081375, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 185.66432704925538, |
|
"epoch": 0.2584064932913699, |
|
"grad_norm": 0.29387614130973816, |
|
"kl": 0.362744140625, |
|
"learning_rate": 1.8510042011467978e-05, |
|
"loss": 0.0145, |
|
"reward": 1.0606771200895309, |
|
"reward_std": 0.30348861529491844, |
|
"rewards/accuracy_reward": 0.1322916704695672, |
|
"rewards/format_reward": 0.9283854335546493, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 110.68646202087402, |
|
"epoch": 0.2650323008116614, |
|
"grad_norm": 0.3515387177467346, |
|
"kl": 0.326708984375, |
|
"learning_rate": 1.838628566581236e-05, |
|
"loss": 0.0131, |
|
"reward": 1.1135417073965073, |
|
"reward_std": 0.20253405962139368, |
|
"rewards/accuracy_reward": 0.13723958749324083, |
|
"rewards/format_reward": 0.9763021036982537, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2650323008116614, |
|
"eval_completion_length": 116.7835676405165, |
|
"eval_kl": 0.3328993055555556, |
|
"eval_loss": 0.013369088061153889, |
|
"eval_reward": 1.2650463183720906, |
|
"eval_reward_std": 0.2229729178878996, |
|
"eval_rewards/accuracy_reward": 0.2812500033113692, |
|
"eval_rewards/format_reward": 0.9837963117493523, |
|
"eval_runtime": 40.8337, |
|
"eval_samples_per_second": 2.424, |
|
"eval_steps_per_second": 0.22, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 166.33490180969238, |
|
"epoch": 0.271658108331953, |
|
"grad_norm": 0.3290473520755768, |
|
"kl": 0.31640625, |
|
"learning_rate": 1.8258041344542567e-05, |
|
"loss": 0.0126, |
|
"reward": 1.115625037252903, |
|
"reward_std": 0.24871433693915607, |
|
"rewards/accuracy_reward": 0.15520833851769567, |
|
"rewards/format_reward": 0.9604166850447655, |
|
"step": 410 |
|
}, |
|
{ |
|
"completion_length": 162.28073406219482, |
|
"epoch": 0.2782839158522445, |
|
"grad_norm": 1.5391435623168945, |
|
"kl": 0.419970703125, |
|
"learning_rate": 1.8125377678445755e-05, |
|
"loss": 0.0168, |
|
"reward": 1.1242187842726707, |
|
"reward_std": 0.2836728408932686, |
|
"rewards/accuracy_reward": 0.17578125637955963, |
|
"rewards/format_reward": 0.9484375208616257, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 84.54088821411133, |
|
"epoch": 0.284909723372536, |
|
"grad_norm": 0.4486382305622101, |
|
"kl": 0.465869140625, |
|
"learning_rate": 1.7988365663350352e-05, |
|
"loss": 0.0186, |
|
"reward": 1.1148437857627869, |
|
"reward_std": 0.25627183392643926, |
|
"rewards/accuracy_reward": 0.16406250447034837, |
|
"rewards/format_reward": 0.9507812678813934, |
|
"step": 430 |
|
}, |
|
{ |
|
"completion_length": 111.79870109558105, |
|
"epoch": 0.2915355308928276, |
|
"grad_norm": 0.47133392095565796, |
|
"kl": 0.50244140625, |
|
"learning_rate": 1.7847078622132202e-05, |
|
"loss": 0.0201, |
|
"reward": 1.1361979499459267, |
|
"reward_std": 0.25920494105666875, |
|
"rewards/accuracy_reward": 0.19583333916962148, |
|
"rewards/format_reward": 0.9403646036982536, |
|
"step": 440 |
|
}, |
|
{ |
|
"completion_length": 95.4588568687439, |
|
"epoch": 0.2981613384131191, |
|
"grad_norm": 0.4222451150417328, |
|
"kl": 0.48828125, |
|
"learning_rate": 1.770159216547532e-05, |
|
"loss": 0.0195, |
|
"reward": 1.1565104603767395, |
|
"reward_std": 0.2345227889716625, |
|
"rewards/accuracy_reward": 0.1872395884245634, |
|
"rewards/format_reward": 0.9692708507180214, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.2981613384131191, |
|
"eval_completion_length": 139.7500059339735, |
|
"eval_kl": 0.4025607638888889, |
|
"eval_loss": 0.016312483698129654, |
|
"eval_reward": 1.1759259833229914, |
|
"eval_reward_std": 0.3269110951158736, |
|
"eval_rewards/accuracy_reward": 0.23148148589664036, |
|
"eval_rewards/format_reward": 0.9444444643126594, |
|
"eval_runtime": 45.7275, |
|
"eval_samples_per_second": 2.165, |
|
"eval_steps_per_second": 0.197, |
|
"step": 450 |
|
}, |
|
{ |
|
"completion_length": 169.73099479675292, |
|
"epoch": 0.3047871459334106, |
|
"grad_norm": 0.41585874557495117, |
|
"kl": 0.4705078125, |
|
"learning_rate": 1.7551984151408363e-05, |
|
"loss": 0.0188, |
|
"reward": 1.0976562932133676, |
|
"reward_std": 0.3228786814957857, |
|
"rewards/accuracy_reward": 0.17473958879709245, |
|
"rewards/format_reward": 0.9229166880249977, |
|
"step": 460 |
|
}, |
|
{ |
|
"completion_length": 94.51797103881836, |
|
"epoch": 0.3114129534537022, |
|
"grad_norm": 0.40987807512283325, |
|
"kl": 0.56259765625, |
|
"learning_rate": 1.739833464363838e-05, |
|
"loss": 0.0225, |
|
"reward": 1.1330729693174362, |
|
"reward_std": 0.22887265272438526, |
|
"rewards/accuracy_reward": 0.16380208819173275, |
|
"rewards/format_reward": 0.9692708566784859, |
|
"step": 470 |
|
}, |
|
{ |
|
"completion_length": 124.16875343322754, |
|
"epoch": 0.3180387609739937, |
|
"grad_norm": 0.4068983495235443, |
|
"kl": 0.44365234375, |
|
"learning_rate": 1.7240725868704218e-05, |
|
"loss": 0.0177, |
|
"reward": 1.0838542029261589, |
|
"reward_std": 0.25959088616073134, |
|
"rewards/accuracy_reward": 0.13828125388827175, |
|
"rewards/format_reward": 0.9455729335546493, |
|
"step": 480 |
|
}, |
|
{ |
|
"completion_length": 167.1687551498413, |
|
"epoch": 0.32466456849428527, |
|
"grad_norm": 0.34697845578193665, |
|
"kl": 0.405126953125, |
|
"learning_rate": 1.7079242171972417e-05, |
|
"loss": 0.0162, |
|
"reward": 1.1450521230697632, |
|
"reward_std": 0.280022681877017, |
|
"rewards/accuracy_reward": 0.19843750591389836, |
|
"rewards/format_reward": 0.946614608168602, |
|
"step": 490 |
|
}, |
|
{ |
|
"completion_length": 148.58568172454835, |
|
"epoch": 0.3312903760145768, |
|
"grad_norm": 0.3142317235469818, |
|
"kl": 0.427001953125, |
|
"learning_rate": 1.6913969972499272e-05, |
|
"loss": 0.0171, |
|
"reward": 1.1440104603767396, |
|
"reward_std": 0.30612033531069754, |
|
"rewards/accuracy_reward": 0.20625000540167093, |
|
"rewards/format_reward": 0.9377604395151138, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3312903760145768, |
|
"eval_completion_length": 123.2951431274414, |
|
"eval_kl": 0.4292534722222222, |
|
"eval_loss": 0.017204057425260544, |
|
"eval_reward": 1.2835648589664035, |
|
"eval_reward_std": 0.30847717821598053, |
|
"eval_rewards/accuracy_reward": 0.32060185737080044, |
|
"eval_rewards/format_reward": 0.9629629850387573, |
|
"eval_runtime": 44.5786, |
|
"eval_samples_per_second": 2.221, |
|
"eval_steps_per_second": 0.202, |
|
"step": 500 |
|
}, |
|
{ |
|
"completion_length": 114.76849279403686, |
|
"epoch": 0.3379161835348683, |
|
"grad_norm": 0.4374355673789978, |
|
"kl": 0.5587890625, |
|
"learning_rate": 1.674499771678309e-05, |
|
"loss": 0.0224, |
|
"reward": 1.1583333671092988, |
|
"reward_std": 0.2752871666103601, |
|
"rewards/accuracy_reward": 0.20286458814516664, |
|
"rewards/format_reward": 0.9554687708616256, |
|
"step": 510 |
|
}, |
|
{ |
|
"completion_length": 118.95989866256714, |
|
"epoch": 0.34454199105515987, |
|
"grad_norm": 0.3690480887889862, |
|
"kl": 0.47080078125, |
|
"learning_rate": 1.6572415831431466e-05, |
|
"loss": 0.0188, |
|
"reward": 1.1442708760499953, |
|
"reward_std": 0.266701377555728, |
|
"rewards/accuracy_reward": 0.188541672937572, |
|
"rewards/format_reward": 0.9557291850447655, |
|
"step": 520 |
|
}, |
|
{ |
|
"completion_length": 154.7580778121948, |
|
"epoch": 0.3511677985754514, |
|
"grad_norm": 0.33155742287635803, |
|
"kl": 0.51572265625, |
|
"learning_rate": 1.6396316674768914e-05, |
|
"loss": 0.0206, |
|
"reward": 1.108593787252903, |
|
"reward_std": 0.3096882740035653, |
|
"rewards/accuracy_reward": 0.18203125561121852, |
|
"rewards/format_reward": 0.9265625208616257, |
|
"step": 530 |
|
}, |
|
{ |
|
"completion_length": 76.65052275657654, |
|
"epoch": 0.3577936060957429, |
|
"grad_norm": 0.534902036190033, |
|
"kl": 0.69091796875, |
|
"learning_rate": 1.621679448741067e-05, |
|
"loss": 0.0276, |
|
"reward": 1.1270833760499954, |
|
"reward_std": 0.25888577867299317, |
|
"rewards/accuracy_reward": 0.17552083907648922, |
|
"rewards/format_reward": 0.9515625208616256, |
|
"step": 540 |
|
}, |
|
{ |
|
"completion_length": 140.45651397705078, |
|
"epoch": 0.36441941361603447, |
|
"grad_norm": 0.7749128937721252, |
|
"kl": 0.670166015625, |
|
"learning_rate": 1.603394534182925e-05, |
|
"loss": 0.0268, |
|
"reward": 1.0437500402331352, |
|
"reward_std": 0.2872411595657468, |
|
"rewards/accuracy_reward": 0.12812500363215804, |
|
"rewards/format_reward": 0.9156250163912774, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.36441941361603447, |
|
"eval_completion_length": 162.37384711371527, |
|
"eval_kl": 0.5069444444444444, |
|
"eval_loss": 0.020352717489004135, |
|
"eval_reward": 1.1203704012764826, |
|
"eval_reward_std": 0.34640828768412274, |
|
"eval_rewards/accuracy_reward": 0.20717593530813852, |
|
"eval_rewards/format_reward": 0.9131944643126594, |
|
"eval_runtime": 48.3205, |
|
"eval_samples_per_second": 2.049, |
|
"eval_steps_per_second": 0.186, |
|
"step": 550 |
|
}, |
|
{ |
|
"completion_length": 134.0356803894043, |
|
"epoch": 0.371045221136326, |
|
"grad_norm": 0.3628806173801422, |
|
"kl": 0.53544921875, |
|
"learning_rate": 1.5847867090940602e-05, |
|
"loss": 0.0214, |
|
"reward": 1.1044271185994148, |
|
"reward_std": 0.2886835677549243, |
|
"rewards/accuracy_reward": 0.16770833884365857, |
|
"rewards/format_reward": 0.9367187678813934, |
|
"step": 560 |
|
}, |
|
{ |
|
"completion_length": 130.9809928894043, |
|
"epoch": 0.3776710286566175, |
|
"grad_norm": 0.4081813395023346, |
|
"kl": 0.500341796875, |
|
"learning_rate": 1.5658659315737505e-05, |
|
"loss": 0.02, |
|
"reward": 1.13567713201046, |
|
"reward_std": 0.25305410884320734, |
|
"rewards/accuracy_reward": 0.18567708935588598, |
|
"rewards/format_reward": 0.9500000193715096, |
|
"step": 570 |
|
}, |
|
{ |
|
"completion_length": 114.95755519866944, |
|
"epoch": 0.3842968361769091, |
|
"grad_norm": 0.37676241993904114, |
|
"kl": 0.564794921875, |
|
"learning_rate": 1.5466423271998144e-05, |
|
"loss": 0.0226, |
|
"reward": 1.135416714847088, |
|
"reward_std": 0.2524235276505351, |
|
"rewards/accuracy_reward": 0.18697917149402202, |
|
"rewards/format_reward": 0.9484375223517418, |
|
"step": 580 |
|
}, |
|
{ |
|
"completion_length": 115.24349269866943, |
|
"epoch": 0.3909226436972006, |
|
"grad_norm": 0.4308485686779022, |
|
"kl": 0.5263671875, |
|
"learning_rate": 1.5271261836098403e-05, |
|
"loss": 0.0211, |
|
"reward": 1.1546875417232514, |
|
"reward_std": 0.255348096229136, |
|
"rewards/accuracy_reward": 0.19895834159106016, |
|
"rewards/format_reward": 0.9557291865348816, |
|
"step": 590 |
|
}, |
|
{ |
|
"completion_length": 128.86224336624144, |
|
"epoch": 0.3975484512174921, |
|
"grad_norm": 0.42378589510917664, |
|
"kl": 0.762548828125, |
|
"learning_rate": 1.5073279449956916e-05, |
|
"loss": 0.0305, |
|
"reward": 1.1497396290302277, |
|
"reward_std": 0.27525530084967614, |
|
"rewards/accuracy_reward": 0.2031250052154064, |
|
"rewards/format_reward": 0.9466146022081375, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.3975484512174921, |
|
"eval_completion_length": 103.54282718234592, |
|
"eval_kl": 0.5060763888888888, |
|
"eval_loss": 0.020621497184038162, |
|
"eval_reward": 1.269675956832038, |
|
"eval_reward_std": 0.27875811523861355, |
|
"eval_rewards/accuracy_reward": 0.29398149251937866, |
|
"eval_rewards/format_reward": 0.9756944643126594, |
|
"eval_runtime": 40.4616, |
|
"eval_samples_per_second": 2.447, |
|
"eval_steps_per_second": 0.222, |
|
"step": 600 |
|
}, |
|
{ |
|
"completion_length": 119.92604503631591, |
|
"epoch": 0.4041742587377837, |
|
"grad_norm": 0.346579909324646, |
|
"kl": 0.48173828125, |
|
"learning_rate": 1.4872582065142285e-05, |
|
"loss": 0.0193, |
|
"reward": 1.184895870089531, |
|
"reward_std": 0.25066950926557185, |
|
"rewards/accuracy_reward": 0.22057292337995021, |
|
"rewards/format_reward": 0.9643229380249977, |
|
"step": 610 |
|
}, |
|
{ |
|
"completion_length": 166.857816696167, |
|
"epoch": 0.4108000662580752, |
|
"grad_norm": 0.3131767809391022, |
|
"kl": 0.425244140625, |
|
"learning_rate": 1.4669277086172406e-05, |
|
"loss": 0.017, |
|
"reward": 1.1731771290302277, |
|
"reward_std": 0.2866227850317955, |
|
"rewards/accuracy_reward": 0.22500000558793545, |
|
"rewards/format_reward": 0.9481771036982536, |
|
"step": 620 |
|
}, |
|
{ |
|
"completion_length": 177.0984432220459, |
|
"epoch": 0.41742587377836676, |
|
"grad_norm": 0.3359943926334381, |
|
"kl": 0.46962890625, |
|
"learning_rate": 1.4463473313036241e-05, |
|
"loss": 0.0188, |
|
"reward": 1.171875038743019, |
|
"reward_std": 0.27172743044793607, |
|
"rewards/accuracy_reward": 0.22317709103226663, |
|
"rewards/format_reward": 0.9486979335546494, |
|
"step": 630 |
|
}, |
|
{ |
|
"completion_length": 160.54766025543213, |
|
"epoch": 0.4240516812986583, |
|
"grad_norm": 0.3603960871696472, |
|
"kl": 0.39130859375, |
|
"learning_rate": 1.4255280882968787e-05, |
|
"loss": 0.0157, |
|
"reward": 1.1869792103767396, |
|
"reward_std": 0.2906502477824688, |
|
"rewards/accuracy_reward": 0.23411459047347308, |
|
"rewards/format_reward": 0.9528646036982537, |
|
"step": 640 |
|
}, |
|
{ |
|
"completion_length": 164.8755266189575, |
|
"epoch": 0.4306774888189498, |
|
"grad_norm": 0.336618572473526, |
|
"kl": 0.445849609375, |
|
"learning_rate": 1.4044811211510419e-05, |
|
"loss": 0.0178, |
|
"reward": 1.1713542014360427, |
|
"reward_std": 0.30316176153719426, |
|
"rewards/accuracy_reward": 0.23437500689178706, |
|
"rewards/format_reward": 0.9369791880249977, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.4306774888189498, |
|
"eval_completion_length": 113.57755109998915, |
|
"eval_kl": 0.4815538194444444, |
|
"eval_loss": 0.01964624412357807, |
|
"eval_reward": 1.2708333730697632, |
|
"eval_reward_std": 0.3395750116970804, |
|
"eval_rewards/accuracy_reward": 0.3090277901954121, |
|
"eval_rewards/format_reward": 0.9618055688010322, |
|
"eval_runtime": 43.035, |
|
"eval_samples_per_second": 2.3, |
|
"eval_steps_per_second": 0.209, |
|
"step": 650 |
|
}, |
|
{ |
|
"completion_length": 125.93802452087402, |
|
"epoch": 0.43730329633924137, |
|
"grad_norm": 0.4743092656135559, |
|
"kl": 0.461376953125, |
|
"learning_rate": 1.3832176932882136e-05, |
|
"loss": 0.0184, |
|
"reward": 1.1882812917232513, |
|
"reward_std": 0.2539562493562698, |
|
"rewards/accuracy_reward": 0.22656250707805156, |
|
"rewards/format_reward": 0.9617187693715096, |
|
"step": 660 |
|
}, |
|
{ |
|
"completion_length": 139.05703525543214, |
|
"epoch": 0.4439291038595329, |
|
"grad_norm": 0.4012630581855774, |
|
"kl": 0.507470703125, |
|
"learning_rate": 1.3617491839708614e-05, |
|
"loss": 0.0203, |
|
"reward": 1.171093788743019, |
|
"reward_std": 0.2858675643801689, |
|
"rewards/accuracy_reward": 0.22265625689178706, |
|
"rewards/format_reward": 0.9484375208616257, |
|
"step": 670 |
|
}, |
|
{ |
|
"completion_length": 117.47630653381347, |
|
"epoch": 0.4505549113798244, |
|
"grad_norm": 0.3696196675300598, |
|
"kl": 0.542919921875, |
|
"learning_rate": 1.3400870822121348e-05, |
|
"loss": 0.0217, |
|
"reward": 1.1838542059063912, |
|
"reward_std": 0.2748897645622492, |
|
"rewards/accuracy_reward": 0.2341145918238908, |
|
"rewards/format_reward": 0.9497396051883698, |
|
"step": 680 |
|
}, |
|
{ |
|
"completion_length": 145.46432704925536, |
|
"epoch": 0.45718071890011597, |
|
"grad_norm": 5.13590145111084, |
|
"kl": 0.692041015625, |
|
"learning_rate": 1.3182429806274442e-05, |
|
"loss": 0.0277, |
|
"reward": 1.1864583760499954, |
|
"reward_std": 0.270385118573904, |
|
"rewards/accuracy_reward": 0.23776042237877845, |
|
"rewards/format_reward": 0.9486979350447655, |
|
"step": 690 |
|
}, |
|
{ |
|
"completion_length": 135.6513063430786, |
|
"epoch": 0.4638065264204075, |
|
"grad_norm": 0.39522725343704224, |
|
"kl": 0.557568359375, |
|
"learning_rate": 1.2962285692305964e-05, |
|
"loss": 0.0223, |
|
"reward": 1.1851562783122063, |
|
"reward_std": 0.2556427549570799, |
|
"rewards/accuracy_reward": 0.23046875661239027, |
|
"rewards/format_reward": 0.9546875208616257, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.4638065264204075, |
|
"eval_completion_length": 148.33218044704861, |
|
"eval_kl": 0.4103732638888889, |
|
"eval_loss": 0.016540652140975, |
|
"eval_reward": 1.2708333730697632, |
|
"eval_reward_std": 0.33350396156311035, |
|
"eval_rewards/accuracy_reward": 0.3240740845600764, |
|
"eval_rewards/format_reward": 0.9467592835426331, |
|
"eval_runtime": 46.6863, |
|
"eval_samples_per_second": 2.121, |
|
"eval_steps_per_second": 0.193, |
|
"step": 700 |
|
}, |
|
{ |
|
"completion_length": 148.985941696167, |
|
"epoch": 0.470432333940699, |
|
"grad_norm": 0.41000673174858093, |
|
"kl": 0.51171875, |
|
"learning_rate": 1.2740556291778096e-05, |
|
"loss": 0.0205, |
|
"reward": 1.1614583790302277, |
|
"reward_std": 0.3124166313558817, |
|
"rewards/accuracy_reward": 0.23463542368263007, |
|
"rewards/format_reward": 0.9268229380249977, |
|
"step": 710 |
|
}, |
|
{ |
|
"completion_length": 69.19583520889282, |
|
"epoch": 0.47705814146099057, |
|
"grad_norm": 0.3913906216621399, |
|
"kl": 0.63359375, |
|
"learning_rate": 1.2517360264629463e-05, |
|
"loss": 0.0254, |
|
"reward": 1.1914062932133676, |
|
"reward_std": 0.2280671002343297, |
|
"rewards/accuracy_reward": 0.2223958398681134, |
|
"rewards/format_reward": 0.9690104380249978, |
|
"step": 720 |
|
}, |
|
{ |
|
"completion_length": 108.44713830947876, |
|
"epoch": 0.4836839489812821, |
|
"grad_norm": 0.6712866425514221, |
|
"kl": 0.609033203125, |
|
"learning_rate": 1.2292817055673543e-05, |
|
"loss": 0.0244, |
|
"reward": 1.1658854559063911, |
|
"reward_std": 0.25000986782833934, |
|
"rewards/accuracy_reward": 0.21458333956543357, |
|
"rewards/format_reward": 0.9513021036982536, |
|
"step": 730 |
|
}, |
|
{ |
|
"completion_length": 144.48672218322753, |
|
"epoch": 0.49030975650157366, |
|
"grad_norm": 0.5536672472953796, |
|
"kl": 0.535302734375, |
|
"learning_rate": 1.2067046830676947e-05, |
|
"loss": 0.0214, |
|
"reward": 1.1919271260499955, |
|
"reward_std": 0.29237424544990065, |
|
"rewards/accuracy_reward": 0.24843750819563865, |
|
"rewards/format_reward": 0.9434896007180213, |
|
"step": 740 |
|
}, |
|
{ |
|
"completion_length": 139.08802452087403, |
|
"epoch": 0.4969355640218652, |
|
"grad_norm": 0.48829564452171326, |
|
"kl": 0.497509765625, |
|
"learning_rate": 1.1840170412051957e-05, |
|
"loss": 0.0199, |
|
"reward": 1.1994792014360427, |
|
"reward_std": 0.24491893574595452, |
|
"rewards/accuracy_reward": 0.2432291721459478, |
|
"rewards/format_reward": 0.9562500178813934, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.4969355640218652, |
|
"eval_completion_length": 172.40972900390625, |
|
"eval_kl": 0.3891059027777778, |
|
"eval_loss": 0.015797864645719528, |
|
"eval_reward": 1.290509303410848, |
|
"eval_reward_std": 0.2870280941327413, |
|
"eval_rewards/accuracy_reward": 0.3414351973268721, |
|
"eval_rewards/format_reward": 0.9490740829043918, |
|
"eval_runtime": 46.928, |
|
"eval_samples_per_second": 2.11, |
|
"eval_steps_per_second": 0.192, |
|
"step": 750 |
|
}, |
|
{ |
|
"completion_length": 154.06302490234376, |
|
"epoch": 0.5035613715421567, |
|
"grad_norm": 0.4157249331474304, |
|
"kl": 0.80810546875, |
|
"learning_rate": 1.1612309214197599e-05, |
|
"loss": 0.0323, |
|
"reward": 1.2119792103767395, |
|
"reward_std": 0.2926496058702469, |
|
"rewards/accuracy_reward": 0.265885423310101, |
|
"rewards/format_reward": 0.9460937693715096, |
|
"step": 760 |
|
}, |
|
{ |
|
"completion_length": 109.2606798171997, |
|
"epoch": 0.5101871790624483, |
|
"grad_norm": 0.35986068844795227, |
|
"kl": 0.56611328125, |
|
"learning_rate": 1.1383585178523955e-05, |
|
"loss": 0.0227, |
|
"reward": 1.204166702926159, |
|
"reward_std": 0.2575360298156738, |
|
"rewards/accuracy_reward": 0.2502604253590107, |
|
"rewards/format_reward": 0.9539062708616257, |
|
"step": 770 |
|
}, |
|
{ |
|
"completion_length": 144.10937929153442, |
|
"epoch": 0.5168129865827398, |
|
"grad_norm": 0.30369916558265686, |
|
"kl": 0.592578125, |
|
"learning_rate": 1.1154120708194398e-05, |
|
"loss": 0.0237, |
|
"reward": 1.1294271171092987, |
|
"reward_std": 0.30647876001894475, |
|
"rewards/accuracy_reward": 0.20937500689178706, |
|
"rewards/format_reward": 0.9200521036982536, |
|
"step": 780 |
|
}, |
|
{ |
|
"completion_length": 92.92942972183228, |
|
"epoch": 0.5234387941030313, |
|
"grad_norm": 0.40557360649108887, |
|
"kl": 0.62392578125, |
|
"learning_rate": 1.0924038602620757e-05, |
|
"loss": 0.025, |
|
"reward": 1.1776042118668557, |
|
"reward_std": 0.2612913876771927, |
|
"rewards/accuracy_reward": 0.2195312575204298, |
|
"rewards/format_reward": 0.9580729380249977, |
|
"step": 790 |
|
}, |
|
{ |
|
"completion_length": 139.11432666778563, |
|
"epoch": 0.5300646016233228, |
|
"grad_norm": 0.4020339846611023, |
|
"kl": 0.4591796875, |
|
"learning_rate": 1.0693461991746389e-05, |
|
"loss": 0.0184, |
|
"reward": 1.1885417088866235, |
|
"reward_std": 0.29046612568199637, |
|
"rewards/accuracy_reward": 0.24661459056660534, |
|
"rewards/format_reward": 0.9419271036982536, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5300646016233228, |
|
"eval_completion_length": 95.66782718234592, |
|
"eval_kl": 0.5199652777777778, |
|
"eval_loss": 0.021310841664671898, |
|
"eval_reward": 1.271990762816535, |
|
"eval_reward_std": 0.2451818229423629, |
|
"eval_rewards/accuracy_reward": 0.305555565489663, |
|
"eval_rewards/format_reward": 0.9664352072609795, |
|
"eval_runtime": 41.3833, |
|
"eval_samples_per_second": 2.392, |
|
"eval_steps_per_second": 0.217, |
|
"step": 800 |
|
}, |
|
{ |
|
"completion_length": 88.05521087646484, |
|
"epoch": 0.5366904091436144, |
|
"grad_norm": 0.3212931156158447, |
|
"kl": 0.6345703125, |
|
"learning_rate": 1.046251427015241e-05, |
|
"loss": 0.0254, |
|
"reward": 1.2151042073965073, |
|
"reward_std": 0.21537913139909506, |
|
"rewards/accuracy_reward": 0.24713542337995023, |
|
"rewards/format_reward": 0.9679687708616257, |
|
"step": 810 |
|
}, |
|
{ |
|
"completion_length": 114.12812786102295, |
|
"epoch": 0.543316216663906, |
|
"grad_norm": 0.36236339807510376, |
|
"kl": 0.535205078125, |
|
"learning_rate": 1.023131903102226e-05, |
|
"loss": 0.0214, |
|
"reward": 1.1872396260499953, |
|
"reward_std": 0.25381856635212896, |
|
"rewards/accuracy_reward": 0.22812500651925802, |
|
"rewards/format_reward": 0.9591146022081375, |
|
"step": 820 |
|
}, |
|
{ |
|
"completion_length": 120.23437843322753, |
|
"epoch": 0.5499420241841975, |
|
"grad_norm": 0.33346128463745117, |
|
"kl": 0.491455078125, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0197, |
|
"reward": 1.1893229603767395, |
|
"reward_std": 0.26861554831266404, |
|
"rewards/accuracy_reward": 0.2398437585681677, |
|
"rewards/format_reward": 0.9494791865348816, |
|
"step": 830 |
|
}, |
|
{ |
|
"completion_length": 104.39948310852051, |
|
"epoch": 0.556567831704489, |
|
"grad_norm": 0.287349671125412, |
|
"kl": 0.52109375, |
|
"learning_rate": 9.768680968977743e-06, |
|
"loss": 0.0208, |
|
"reward": 1.2109375417232513, |
|
"reward_std": 0.23620927650481463, |
|
"rewards/accuracy_reward": 0.24895834140479564, |
|
"rewards/format_reward": 0.9619791895151139, |
|
"step": 840 |
|
}, |
|
{ |
|
"completion_length": 141.53802490234375, |
|
"epoch": 0.5631936392247805, |
|
"grad_norm": 0.40245750546455383, |
|
"kl": 0.515869140625, |
|
"learning_rate": 9.537485729847594e-06, |
|
"loss": 0.0206, |
|
"reward": 1.1757812902331353, |
|
"reward_std": 0.2983800694346428, |
|
"rewards/accuracy_reward": 0.23984375763684512, |
|
"rewards/format_reward": 0.9359375193715096, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.5631936392247805, |
|
"eval_completion_length": 127.03241221110027, |
|
"eval_kl": 0.5015190972222222, |
|
"eval_loss": 0.020167144015431404, |
|
"eval_reward": 1.275462998284234, |
|
"eval_reward_std": 0.2603969905111525, |
|
"eval_rewards/accuracy_reward": 0.3298611177338494, |
|
"eval_rewards/format_reward": 0.9456018673049079, |
|
"eval_runtime": 45.7247, |
|
"eval_samples_per_second": 2.165, |
|
"eval_steps_per_second": 0.197, |
|
"step": 850 |
|
}, |
|
{ |
|
"completion_length": 102.3765655517578, |
|
"epoch": 0.569819446745072, |
|
"grad_norm": 0.3569597601890564, |
|
"kl": 0.54814453125, |
|
"learning_rate": 9.306538008253611e-06, |
|
"loss": 0.0219, |
|
"reward": 1.191406287252903, |
|
"reward_std": 0.2607411756180227, |
|
"rewards/accuracy_reward": 0.23203125474974512, |
|
"rewards/format_reward": 0.9593750193715096, |
|
"step": 860 |
|
}, |
|
{ |
|
"completion_length": 92.75234651565552, |
|
"epoch": 0.5764452542653636, |
|
"grad_norm": 1.2177760601043701, |
|
"kl": 0.564990234375, |
|
"learning_rate": 9.075961397379247e-06, |
|
"loss": 0.0226, |
|
"reward": 1.2388021230697632, |
|
"reward_std": 0.224729376193136, |
|
"rewards/accuracy_reward": 0.2682291749864817, |
|
"rewards/format_reward": 0.9705729380249977, |
|
"step": 870 |
|
}, |
|
{ |
|
"completion_length": 129.5666706085205, |
|
"epoch": 0.5830710617856552, |
|
"grad_norm": 0.3381529450416565, |
|
"kl": 0.604443359375, |
|
"learning_rate": 8.845879291805605e-06, |
|
"loss": 0.0242, |
|
"reward": 1.163541705906391, |
|
"reward_std": 0.27646171739324926, |
|
"rewards/accuracy_reward": 0.22343750707805157, |
|
"rewards/format_reward": 0.9401041865348816, |
|
"step": 880 |
|
}, |
|
{ |
|
"completion_length": 111.10937881469727, |
|
"epoch": 0.5896968693059467, |
|
"grad_norm": 0.3694465160369873, |
|
"kl": 0.60166015625, |
|
"learning_rate": 8.616414821476048e-06, |
|
"loss": 0.0241, |
|
"reward": 1.1927083715796472, |
|
"reward_std": 0.2659361926838756, |
|
"rewards/accuracy_reward": 0.24088542442768812, |
|
"rewards/format_reward": 0.9518229395151139, |
|
"step": 890 |
|
}, |
|
{ |
|
"completion_length": 120.31693096160889, |
|
"epoch": 0.5963226768262382, |
|
"grad_norm": 0.2243836671113968, |
|
"kl": 0.60830078125, |
|
"learning_rate": 8.387690785802403e-06, |
|
"loss": 0.0243, |
|
"reward": 1.1526041999459267, |
|
"reward_std": 0.25904723536223173, |
|
"rewards/accuracy_reward": 0.2093750056810677, |
|
"rewards/format_reward": 0.9432291865348816, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.5963226768262382, |
|
"eval_completion_length": 118.34143914116754, |
|
"eval_kl": 0.5698784722222222, |
|
"eval_loss": 0.023158123716711998, |
|
"eval_reward": 1.29166669315762, |
|
"eval_reward_std": 0.26393843856122756, |
|
"eval_rewards/accuracy_reward": 0.34143519401550293, |
|
"eval_rewards/format_reward": 0.9502315057648553, |
|
"eval_runtime": 45.2036, |
|
"eval_samples_per_second": 2.19, |
|
"eval_steps_per_second": 0.199, |
|
"step": 900 |
|
}, |
|
{ |
|
"completion_length": 120.57187824249267, |
|
"epoch": 0.6029484843465297, |
|
"grad_norm": 0.4404103755950928, |
|
"kl": 0.54873046875, |
|
"learning_rate": 8.159829587948048e-06, |
|
"loss": 0.0219, |
|
"reward": 1.199218788743019, |
|
"reward_std": 0.28078800477087495, |
|
"rewards/accuracy_reward": 0.2500000085681677, |
|
"rewards/format_reward": 0.9492187678813935, |
|
"step": 910 |
|
}, |
|
{ |
|
"completion_length": 118.68646268844604, |
|
"epoch": 0.6095742918668212, |
|
"grad_norm": 0.33018258213996887, |
|
"kl": 0.52841796875, |
|
"learning_rate": 7.932953169323057e-06, |
|
"loss": 0.0211, |
|
"reward": 1.2174479514360428, |
|
"reward_std": 0.2616381015628576, |
|
"rewards/accuracy_reward": 0.26302084103226664, |
|
"rewards/format_reward": 0.9544271022081375, |
|
"step": 920 |
|
}, |
|
{ |
|
"completion_length": 109.28151321411133, |
|
"epoch": 0.6162000993871128, |
|
"grad_norm": 0.2895837128162384, |
|
"kl": 0.546044921875, |
|
"learning_rate": 7.70718294432646e-06, |
|
"loss": 0.0218, |
|
"reward": 1.2164062917232514, |
|
"reward_std": 0.2531652105972171, |
|
"rewards/accuracy_reward": 0.25807292461395265, |
|
"rewards/format_reward": 0.958333358168602, |
|
"step": 930 |
|
}, |
|
{ |
|
"completion_length": 119.46328525543213, |
|
"epoch": 0.6228259069074044, |
|
"grad_norm": 0.41079920530319214, |
|
"kl": 0.64697265625, |
|
"learning_rate": 7.482639735370536e-06, |
|
"loss": 0.0259, |
|
"reward": 1.1802083820104599, |
|
"reward_std": 0.2645995236933231, |
|
"rewards/accuracy_reward": 0.23177084028720857, |
|
"rewards/format_reward": 0.9484375163912773, |
|
"step": 940 |
|
}, |
|
{ |
|
"completion_length": 121.86719093322753, |
|
"epoch": 0.6294517144276959, |
|
"grad_norm": 0.3652746379375458, |
|
"kl": 0.551953125, |
|
"learning_rate": 7.2594437082219074e-06, |
|
"loss": 0.0221, |
|
"reward": 1.2138021171092988, |
|
"reward_std": 0.2821790289133787, |
|
"rewards/accuracy_reward": 0.26484375884756445, |
|
"rewards/format_reward": 0.9489583566784858, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.6294517144276959, |
|
"eval_completion_length": 100.6828727722168, |
|
"eval_kl": 0.53515625, |
|
"eval_loss": 0.021696042269468307, |
|
"eval_reward": 1.31250003973643, |
|
"eval_reward_std": 0.2856230421198739, |
|
"eval_rewards/accuracy_reward": 0.346064825852712, |
|
"eval_rewards/format_reward": 0.9664352138837179, |
|
"eval_runtime": 45.1594, |
|
"eval_samples_per_second": 2.192, |
|
"eval_steps_per_second": 0.199, |
|
"step": 950 |
|
}, |
|
{ |
|
"completion_length": 118.0130241394043, |
|
"epoch": 0.6360775219479874, |
|
"grad_norm": 0.34392473101615906, |
|
"kl": 0.523291015625, |
|
"learning_rate": 7.037714307694038e-06, |
|
"loss": 0.0209, |
|
"reward": 1.214843788743019, |
|
"reward_std": 0.26948558650910853, |
|
"rewards/accuracy_reward": 0.2541666762903333, |
|
"rewards/format_reward": 0.9606771022081375, |
|
"step": 960 |
|
}, |
|
{ |
|
"completion_length": 129.96614933013916, |
|
"epoch": 0.6427033294682789, |
|
"grad_norm": 0.352062463760376, |
|
"kl": 0.509033203125, |
|
"learning_rate": 6.8175701937255645e-06, |
|
"loss": 0.0204, |
|
"reward": 1.208854216337204, |
|
"reward_std": 0.2845765814185143, |
|
"rewards/accuracy_reward": 0.25598959121853115, |
|
"rewards/format_reward": 0.9528646022081375, |
|
"step": 970 |
|
}, |
|
{ |
|
"completion_length": 164.04010906219483, |
|
"epoch": 0.6493291369885705, |
|
"grad_norm": 0.3061555325984955, |
|
"kl": 0.554931640625, |
|
"learning_rate": 6.5991291778786556e-06, |
|
"loss": 0.0222, |
|
"reward": 1.2091146275401115, |
|
"reward_std": 0.34185091145336627, |
|
"rewards/accuracy_reward": 0.29166667480021713, |
|
"rewards/format_reward": 0.9174479395151138, |
|
"step": 980 |
|
}, |
|
{ |
|
"completion_length": 109.25807552337646, |
|
"epoch": 0.655954944508862, |
|
"grad_norm": 0.42008864879608154, |
|
"kl": 0.569482421875, |
|
"learning_rate": 6.38250816029139e-06, |
|
"loss": 0.0228, |
|
"reward": 1.2244792103767395, |
|
"reward_std": 0.2647375027649105, |
|
"rewards/accuracy_reward": 0.2721354251727462, |
|
"rewards/format_reward": 0.9523437708616257, |
|
"step": 990 |
|
}, |
|
{ |
|
"completion_length": 90.79297113418579, |
|
"epoch": 0.6625807520291536, |
|
"grad_norm": 0.37791749835014343, |
|
"kl": 0.554296875, |
|
"learning_rate": 6.167823067117868e-06, |
|
"loss": 0.0222, |
|
"reward": 1.2403646260499954, |
|
"reward_std": 0.2317359633743763, |
|
"rewards/accuracy_reward": 0.27578125735744835, |
|
"rewards/format_reward": 0.9645833507180214, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6625807520291536, |
|
"eval_completion_length": 90.40393786960178, |
|
"eval_kl": 0.5368923611111112, |
|
"eval_loss": 0.021688800305128098, |
|
"eval_reward": 1.3587963183720906, |
|
"eval_reward_std": 0.21392729216151768, |
|
"eval_rewards/accuracy_reward": 0.38657407959302265, |
|
"eval_rewards/format_reward": 0.9722222288449606, |
|
"eval_runtime": 38.2921, |
|
"eval_samples_per_second": 2.585, |
|
"eval_steps_per_second": 0.235, |
|
"step": 1000 |
|
}, |
|
{ |
|
"completion_length": 118.42370166778565, |
|
"epoch": 0.6692065595494451, |
|
"grad_norm": 0.32570692896842957, |
|
"kl": 0.533837890625, |
|
"learning_rate": 5.955188788489583e-06, |
|
"loss": 0.0214, |
|
"reward": 1.2507812827825546, |
|
"reward_std": 0.27148876488208773, |
|
"rewards/accuracy_reward": 0.30078125968575475, |
|
"rewards/format_reward": 0.9500000193715096, |
|
"step": 1010 |
|
}, |
|
{ |
|
"completion_length": 134.26146202087403, |
|
"epoch": 0.6758323670697366, |
|
"grad_norm": 0.42975538969039917, |
|
"kl": 0.4955078125, |
|
"learning_rate": 5.744719117031217e-06, |
|
"loss": 0.0198, |
|
"reward": 1.2127604454755783, |
|
"reward_std": 0.2835965741425753, |
|
"rewards/accuracy_reward": 0.26354167312383653, |
|
"rewards/format_reward": 0.9492187723517418, |
|
"step": 1020 |
|
}, |
|
{ |
|
"completion_length": 134.5783903121948, |
|
"epoch": 0.6824581745900281, |
|
"grad_norm": 0.36342352628707886, |
|
"kl": 0.459912109375, |
|
"learning_rate": 5.536526686963762e-06, |
|
"loss": 0.0184, |
|
"reward": 1.2119792059063912, |
|
"reward_std": 0.283594464790076, |
|
"rewards/accuracy_reward": 0.2687500087544322, |
|
"rewards/format_reward": 0.9432291865348816, |
|
"step": 1030 |
|
}, |
|
{ |
|
"completion_length": 108.19531602859497, |
|
"epoch": 0.6890839821103197, |
|
"grad_norm": 0.332270085811615, |
|
"kl": 0.5037109375, |
|
"learning_rate": 5.330722913827594e-06, |
|
"loss": 0.0202, |
|
"reward": 1.2489583656191825, |
|
"reward_std": 0.26699374951422217, |
|
"rewards/accuracy_reward": 0.29557292610406877, |
|
"rewards/format_reward": 0.9533854365348816, |
|
"step": 1040 |
|
}, |
|
{ |
|
"completion_length": 120.81276388168335, |
|
"epoch": 0.6957097896306113, |
|
"grad_norm": 0.32219088077545166, |
|
"kl": 0.5189453125, |
|
"learning_rate": 5.127417934857718e-06, |
|
"loss": 0.0208, |
|
"reward": 1.211718785762787, |
|
"reward_std": 0.2897366590797901, |
|
"rewards/accuracy_reward": 0.2705729234963655, |
|
"rewards/format_reward": 0.9411458536982537, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.6957097896306113, |
|
"eval_completion_length": 106.52430894639757, |
|
"eval_kl": 0.5453559027777778, |
|
"eval_loss": 0.02215980552136898, |
|
"eval_reward": 1.3611111508475409, |
|
"eval_reward_std": 0.23599610891607073, |
|
"eval_rewards/accuracy_reward": 0.4062500099341075, |
|
"eval_rewards/format_reward": 0.9548611243565878, |
|
"eval_runtime": 44.9455, |
|
"eval_samples_per_second": 2.203, |
|
"eval_steps_per_second": 0.2, |
|
"step": 1050 |
|
}, |
|
{ |
|
"completion_length": 107.07890901565551, |
|
"epoch": 0.7023355971509028, |
|
"grad_norm": 0.2685917019844055, |
|
"kl": 0.576708984375, |
|
"learning_rate": 4.926720550043089e-06, |
|
"loss": 0.0231, |
|
"reward": 1.229166704416275, |
|
"reward_std": 0.2693328620865941, |
|
"rewards/accuracy_reward": 0.2820312589406967, |
|
"rewards/format_reward": 0.9471354424953461, |
|
"step": 1060 |
|
}, |
|
{ |
|
"completion_length": 102.21718997955323, |
|
"epoch": 0.7089614046711943, |
|
"grad_norm": 0.42000600695610046, |
|
"kl": 0.53173828125, |
|
"learning_rate": 4.728738163901597e-06, |
|
"loss": 0.0213, |
|
"reward": 1.2791667014360428, |
|
"reward_std": 0.2512880745343864, |
|
"rewards/accuracy_reward": 0.32005209028720855, |
|
"rewards/format_reward": 0.9591146051883698, |
|
"step": 1070 |
|
}, |
|
{ |
|
"completion_length": 106.98437767028808, |
|
"epoch": 0.7155872121914858, |
|
"grad_norm": 0.350292444229126, |
|
"kl": 0.598828125, |
|
"learning_rate": 4.533576728001858e-06, |
|
"loss": 0.0239, |
|
"reward": 1.210156288743019, |
|
"reward_std": 0.2518887486308813, |
|
"rewards/accuracy_reward": 0.257031256519258, |
|
"rewards/format_reward": 0.9531250268220901, |
|
"step": 1080 |
|
}, |
|
{ |
|
"completion_length": 113.01172313690185, |
|
"epoch": 0.7222130197117774, |
|
"grad_norm": 0.41227367520332336, |
|
"kl": 0.553515625, |
|
"learning_rate": 4.341340684262498e-06, |
|
"loss": 0.0221, |
|
"reward": 1.2361979573965072, |
|
"reward_std": 0.24922715383581817, |
|
"rewards/accuracy_reward": 0.28906251015141604, |
|
"rewards/format_reward": 0.9471354365348816, |
|
"step": 1090 |
|
}, |
|
{ |
|
"completion_length": 120.65026445388794, |
|
"epoch": 0.7288388272320689, |
|
"grad_norm": 0.35520121455192566, |
|
"kl": 0.53955078125, |
|
"learning_rate": 4.152132909059402e-06, |
|
"loss": 0.0216, |
|
"reward": 1.2221354484558105, |
|
"reward_std": 0.26356869330629706, |
|
"rewards/accuracy_reward": 0.27500000689178705, |
|
"rewards/format_reward": 0.9471354350447655, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.7288388272320689, |
|
"eval_completion_length": 118.43750423855252, |
|
"eval_kl": 0.4963107638888889, |
|
"eval_loss": 0.01998673938214779, |
|
"eval_reward": 1.339120414521959, |
|
"eval_reward_std": 0.21001804454459083, |
|
"eval_rewards/accuracy_reward": 0.3865740829043918, |
|
"eval_rewards/format_reward": 0.9525463117493523, |
|
"eval_runtime": 45.4912, |
|
"eval_samples_per_second": 2.176, |
|
"eval_steps_per_second": 0.198, |
|
"step": 1100 |
|
}, |
|
{ |
|
"completion_length": 137.40937881469728, |
|
"epoch": 0.7354646347523605, |
|
"grad_norm": 0.48482638597488403, |
|
"kl": 0.568701171875, |
|
"learning_rate": 3.966054658170754e-06, |
|
"loss": 0.0227, |
|
"reward": 1.2109375342726707, |
|
"reward_std": 0.29270930401980877, |
|
"rewards/accuracy_reward": 0.2791666746139526, |
|
"rewards/format_reward": 0.931770858168602, |
|
"step": 1110 |
|
}, |
|
{ |
|
"completion_length": 126.11432609558105, |
|
"epoch": 0.742090442272652, |
|
"grad_norm": 0.27705782651901245, |
|
"kl": 0.532177734375, |
|
"learning_rate": 3.7832055125893318e-06, |
|
"loss": 0.0213, |
|
"reward": 1.207291704416275, |
|
"reward_std": 0.246893934533, |
|
"rewards/accuracy_reward": 0.26119792368263006, |
|
"rewards/format_reward": 0.9460937708616257, |
|
"step": 1120 |
|
}, |
|
{ |
|
"completion_length": 131.4484426498413, |
|
"epoch": 0.7487162497929435, |
|
"grad_norm": 0.530246376991272, |
|
"kl": 0.55673828125, |
|
"learning_rate": 3.6036833252310887e-06, |
|
"loss": 0.0223, |
|
"reward": 1.2307292073965073, |
|
"reward_std": 0.29129046984016893, |
|
"rewards/accuracy_reward": 0.2937500076368451, |
|
"rewards/format_reward": 0.9369791865348815, |
|
"step": 1130 |
|
}, |
|
{ |
|
"completion_length": 120.24609689712524, |
|
"epoch": 0.755342057313235, |
|
"grad_norm": 0.24524401128292084, |
|
"kl": 0.5646484375, |
|
"learning_rate": 3.427584168568535e-06, |
|
"loss": 0.0226, |
|
"reward": 1.2190104603767395, |
|
"reward_std": 0.2685457500629127, |
|
"rewards/accuracy_reward": 0.27187500726431607, |
|
"rewards/format_reward": 0.9471354380249977, |
|
"step": 1140 |
|
}, |
|
{ |
|
"completion_length": 102.51015863418579, |
|
"epoch": 0.7619678648335266, |
|
"grad_norm": 0.3422842025756836, |
|
"kl": 0.618212890625, |
|
"learning_rate": 3.2550022832169125e-06, |
|
"loss": 0.0247, |
|
"reward": 1.238281285762787, |
|
"reward_std": 0.23695164285600184, |
|
"rewards/accuracy_reward": 0.2778645927086473, |
|
"rewards/format_reward": 0.9604166835546494, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.7619678648335266, |
|
"eval_completion_length": 107.59259626600478, |
|
"eval_kl": 0.6030815972222222, |
|
"eval_loss": 0.023637007921934128, |
|
"eval_reward": 1.3310185670852661, |
|
"eval_reward_std": 0.23936733272340563, |
|
"eval_rewards/accuracy_reward": 0.37731482254134285, |
|
"eval_rewards/format_reward": 0.9537037213643392, |
|
"eval_runtime": 44.103, |
|
"eval_samples_per_second": 2.245, |
|
"eval_steps_per_second": 0.204, |
|
"step": 1150 |
|
}, |
|
{ |
|
"completion_length": 113.57943124771118, |
|
"epoch": 0.7685936723538181, |
|
"grad_norm": 0.5340375900268555, |
|
"kl": 0.68115234375, |
|
"learning_rate": 3.086030027500728e-06, |
|
"loss": 0.0272, |
|
"reward": 1.2208333790302277, |
|
"reward_std": 0.27482070587575436, |
|
"rewards/accuracy_reward": 0.2721354249864817, |
|
"rewards/format_reward": 0.94869794100523, |
|
"step": 1160 |
|
}, |
|
{ |
|
"completion_length": 121.99713821411133, |
|
"epoch": 0.7752194798741097, |
|
"grad_norm": 0.42114803194999695, |
|
"kl": 0.538671875, |
|
"learning_rate": 2.920757828027586e-06, |
|
"loss": 0.0216, |
|
"reward": 1.2221354573965073, |
|
"reward_std": 0.29092769548296926, |
|
"rewards/accuracy_reward": 0.2809895912185311, |
|
"rewards/format_reward": 0.9411458536982537, |
|
"step": 1170 |
|
}, |
|
{ |
|
"completion_length": 109.28125295639038, |
|
"epoch": 0.7818452873944012, |
|
"grad_norm": 0.39234262704849243, |
|
"kl": 0.55908203125, |
|
"learning_rate": 2.759274131295787e-06, |
|
"loss": 0.0224, |
|
"reward": 1.261979202926159, |
|
"reward_std": 0.2803305257111788, |
|
"rewards/accuracy_reward": 0.3132812598254532, |
|
"rewards/format_reward": 0.9486979350447655, |
|
"step": 1180 |
|
}, |
|
{ |
|
"completion_length": 104.11823234558105, |
|
"epoch": 0.7884710949146927, |
|
"grad_norm": 0.49963539838790894, |
|
"kl": 0.56357421875, |
|
"learning_rate": 2.60166535636162e-06, |
|
"loss": 0.0225, |
|
"reward": 1.253125038743019, |
|
"reward_std": 0.25115325963124635, |
|
"rewards/accuracy_reward": 0.2994791757315397, |
|
"rewards/format_reward": 0.9536458536982536, |
|
"step": 1190 |
|
}, |
|
{ |
|
"completion_length": 106.06250343322753, |
|
"epoch": 0.7950969024349842, |
|
"grad_norm": 0.42556238174438477, |
|
"kl": 0.54150390625, |
|
"learning_rate": 2.448015848591638e-06, |
|
"loss": 0.0217, |
|
"reward": 1.253906285762787, |
|
"reward_std": 0.2610521188005805, |
|
"rewards/accuracy_reward": 0.2997395915910602, |
|
"rewards/format_reward": 0.9541666850447654, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.7950969024349842, |
|
"eval_completion_length": 104.84143914116754, |
|
"eval_kl": 0.5355902777777778, |
|
"eval_loss": 0.02168433926999569, |
|
"eval_reward": 1.376157455974155, |
|
"eval_reward_std": 0.18614381965663698, |
|
"eval_rewards/accuracy_reward": 0.4178240829043918, |
|
"eval_rewards/format_reward": 0.9583333532015482, |
|
"eval_runtime": 41.5934, |
|
"eval_samples_per_second": 2.38, |
|
"eval_steps_per_second": 0.216, |
|
"step": 1200 |
|
}, |
|
{ |
|
"completion_length": 113.52969074249268, |
|
"epoch": 0.8017227099552758, |
|
"grad_norm": 0.3254588842391968, |
|
"kl": 0.56806640625, |
|
"learning_rate": 2.298407834524682e-06, |
|
"loss": 0.0227, |
|
"reward": 1.269270870089531, |
|
"reward_std": 0.253192731551826, |
|
"rewards/accuracy_reward": 0.31822917647659776, |
|
"rewards/format_reward": 0.9510416895151138, |
|
"step": 1210 |
|
}, |
|
{ |
|
"completion_length": 112.07031574249268, |
|
"epoch": 0.8083485174755674, |
|
"grad_norm": 0.2393941879272461, |
|
"kl": 0.50380859375, |
|
"learning_rate": 2.1529213778677993e-06, |
|
"loss": 0.0202, |
|
"reward": 1.2791667133569717, |
|
"reward_std": 0.24380711056292056, |
|
"rewards/accuracy_reward": 0.32239584140479566, |
|
"rewards/format_reward": 0.9567708551883698, |
|
"step": 1220 |
|
}, |
|
{ |
|
"completion_length": 103.30625295639038, |
|
"epoch": 0.8149743249958589, |
|
"grad_norm": 0.5480020046234131, |
|
"kl": 0.520361328125, |
|
"learning_rate": 2.0116343366496493e-06, |
|
"loss": 0.0208, |
|
"reward": 1.289062535762787, |
|
"reward_std": 0.2560120256617665, |
|
"rewards/accuracy_reward": 0.33281251094304026, |
|
"rewards/format_reward": 0.9562500178813934, |
|
"step": 1230 |
|
}, |
|
{ |
|
"completion_length": 111.6716178894043, |
|
"epoch": 0.8216001325161504, |
|
"grad_norm": 0.4021623134613037, |
|
"kl": 0.5818359375, |
|
"learning_rate": 1.8746223215542482e-06, |
|
"loss": 0.0233, |
|
"reward": 1.2690104588866233, |
|
"reward_std": 0.2605089288204908, |
|
"rewards/accuracy_reward": 0.31901042386889455, |
|
"rewards/format_reward": 0.9500000193715096, |
|
"step": 1240 |
|
}, |
|
{ |
|
"completion_length": 110.94271154403687, |
|
"epoch": 0.8282259400364419, |
|
"grad_norm": 1.2661796808242798, |
|
"kl": 0.56083984375, |
|
"learning_rate": 1.7419586554574364e-06, |
|
"loss": 0.0224, |
|
"reward": 1.2833333700895309, |
|
"reward_std": 0.27457276433706285, |
|
"rewards/accuracy_reward": 0.32838542591780423, |
|
"rewards/format_reward": 0.95494794100523, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.8282259400364419, |
|
"eval_completion_length": 103.17708587646484, |
|
"eval_kl": 0.4845920138888889, |
|
"eval_loss": 0.019484883174300194, |
|
"eval_reward": 1.3888889286253188, |
|
"eval_reward_std": 0.19302751620610556, |
|
"eval_rewards/accuracy_reward": 0.4236111210452186, |
|
"eval_rewards/format_reward": 0.9652777910232544, |
|
"eval_runtime": 44.455, |
|
"eval_samples_per_second": 2.227, |
|
"eval_steps_per_second": 0.202, |
|
"step": 1250 |
|
}, |
|
{ |
|
"completion_length": 113.02031555175782, |
|
"epoch": 0.8348517475567335, |
|
"grad_norm": 0.4074391722679138, |
|
"kl": 0.49287109375, |
|
"learning_rate": 1.6137143341876439e-06, |
|
"loss": 0.0197, |
|
"reward": 1.2604166984558105, |
|
"reward_std": 0.25771117191761733, |
|
"rewards/accuracy_reward": 0.3046875074505806, |
|
"rewards/format_reward": 0.9557291865348816, |
|
"step": 1260 |
|
}, |
|
{ |
|
"completion_length": 128.30312900543214, |
|
"epoch": 0.841477555077025, |
|
"grad_norm": 0.3753833472728729, |
|
"kl": 0.550244140625, |
|
"learning_rate": 1.4899579885320237e-06, |
|
"loss": 0.022, |
|
"reward": 1.2690104633569717, |
|
"reward_std": 0.29735342264175413, |
|
"rewards/accuracy_reward": 0.3283854264765978, |
|
"rewards/format_reward": 0.9406250223517418, |
|
"step": 1270 |
|
}, |
|
{ |
|
"completion_length": 131.21823406219482, |
|
"epoch": 0.8481033625973166, |
|
"grad_norm": 0.3492971658706665, |
|
"kl": 0.51591796875, |
|
"learning_rate": 1.370755847508226e-06, |
|
"loss": 0.0206, |
|
"reward": 1.251562537252903, |
|
"reward_std": 0.28482332453131676, |
|
"rewards/accuracy_reward": 0.31432292619720104, |
|
"rewards/format_reward": 0.9372396051883698, |
|
"step": 1280 |
|
}, |
|
{ |
|
"completion_length": 122.89974346160889, |
|
"epoch": 0.8547291701176081, |
|
"grad_norm": 0.4898810088634491, |
|
"kl": 0.5181640625, |
|
"learning_rate": 1.256171702921516e-06, |
|
"loss": 0.0207, |
|
"reward": 1.2828125476837158, |
|
"reward_std": 0.271611943654716, |
|
"rewards/accuracy_reward": 0.34062500968575476, |
|
"rewards/format_reward": 0.9421875193715096, |
|
"step": 1290 |
|
}, |
|
{ |
|
"completion_length": 113.66250410079957, |
|
"epoch": 0.8613549776378996, |
|
"grad_norm": 0.4513114392757416, |
|
"kl": 0.523193359375, |
|
"learning_rate": 1.1462668752261652e-06, |
|
"loss": 0.0209, |
|
"reward": 1.2484375417232514, |
|
"reward_std": 0.2623301435261965, |
|
"rewards/accuracy_reward": 0.2979166738688946, |
|
"rewards/format_reward": 0.9505208566784858, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.8613549776378996, |
|
"eval_completion_length": 99.94213189019098, |
|
"eval_kl": 0.4971788194444444, |
|
"eval_loss": 0.02001408487558365, |
|
"eval_reward": 1.378472261958652, |
|
"eval_reward_std": 0.20051221052805582, |
|
"eval_rewards/accuracy_reward": 0.41203704807493424, |
|
"eval_rewards/format_reward": 0.9664352072609795, |
|
"eval_runtime": 44.0864, |
|
"eval_samples_per_second": 2.246, |
|
"eval_steps_per_second": 0.204, |
|
"step": 1300 |
|
}, |
|
{ |
|
"completion_length": 104.8481806755066, |
|
"epoch": 0.8679807851581911, |
|
"grad_norm": 0.4572502672672272, |
|
"kl": 0.547412109375, |
|
"learning_rate": 1.04110018070941e-06, |
|
"loss": 0.0219, |
|
"reward": 1.236718800663948, |
|
"reward_std": 0.2456181443296373, |
|
"rewards/accuracy_reward": 0.2802083441987634, |
|
"rewards/format_reward": 0.9565104335546494, |
|
"step": 1310 |
|
}, |
|
{ |
|
"completion_length": 106.76484680175781, |
|
"epoch": 0.8746065926784827, |
|
"grad_norm": 0.4408782422542572, |
|
"kl": 0.542333984375, |
|
"learning_rate": 9.407279000155311e-07, |
|
"loss": 0.0217, |
|
"reward": 1.2541667014360427, |
|
"reward_std": 0.24311191439628602, |
|
"rewards/accuracy_reward": 0.3036458421498537, |
|
"rewards/format_reward": 0.9505208551883697, |
|
"step": 1320 |
|
}, |
|
{ |
|
"completion_length": 107.94062728881836, |
|
"epoch": 0.8812324001987742, |
|
"grad_norm": 0.3998057246208191, |
|
"kl": 0.515771484375, |
|
"learning_rate": 8.452037480269082e-07, |
|
"loss": 0.0206, |
|
"reward": 1.2924479633569717, |
|
"reward_std": 0.2722974482923746, |
|
"rewards/accuracy_reward": 0.3382812574505806, |
|
"rewards/format_reward": 0.9541666865348816, |
|
"step": 1330 |
|
}, |
|
{ |
|
"completion_length": 95.11302337646484, |
|
"epoch": 0.8878582077190658, |
|
"grad_norm": 0.6737642288208008, |
|
"kl": 0.559130859375, |
|
"learning_rate": 7.545788451181313e-07, |
|
"loss": 0.0224, |
|
"reward": 1.2661458790302276, |
|
"reward_std": 0.23880463000386953, |
|
"rewards/accuracy_reward": 0.3049479253590107, |
|
"rewards/format_reward": 0.9611979395151138, |
|
"step": 1340 |
|
}, |
|
{ |
|
"completion_length": 101.32344017028808, |
|
"epoch": 0.8944840152393573, |
|
"grad_norm": 0.4838802218437195, |
|
"kl": 0.556005859375, |
|
"learning_rate": 6.689016897986123e-07, |
|
"loss": 0.0222, |
|
"reward": 1.2570312947034836, |
|
"reward_std": 0.2493739674333483, |
|
"rewards/accuracy_reward": 0.30286459140479566, |
|
"rewards/format_reward": 0.9541666895151139, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.8944840152393573, |
|
"eval_completion_length": 93.02315097384982, |
|
"eval_kl": 0.5193142361111112, |
|
"eval_loss": 0.021245010197162628, |
|
"eval_reward": 1.3842592901653714, |
|
"eval_reward_std": 0.20025065706835854, |
|
"eval_rewards/accuracy_reward": 0.41550926367441815, |
|
"eval_rewards/format_reward": 0.968750019868215, |
|
"eval_runtime": 40.9157, |
|
"eval_samples_per_second": 2.42, |
|
"eval_steps_per_second": 0.22, |
|
"step": 1350 |
|
}, |
|
{ |
|
"completion_length": 101.27943019866943, |
|
"epoch": 0.9011098227596488, |
|
"grad_norm": 0.5689716339111328, |
|
"kl": 0.5677734375, |
|
"learning_rate": 5.88218132758287e-07, |
|
"loss": 0.0227, |
|
"reward": 1.2895833730697632, |
|
"reward_std": 0.25514377616345885, |
|
"rewards/accuracy_reward": 0.3346354255452752, |
|
"rewards/format_reward": 0.9549479365348816, |
|
"step": 1360 |
|
}, |
|
{ |
|
"completion_length": 106.12890930175782, |
|
"epoch": 0.9077356302799404, |
|
"grad_norm": 0.39908653497695923, |
|
"kl": 0.56484375, |
|
"learning_rate": 5.125713523303133e-07, |
|
"loss": 0.0226, |
|
"reward": 1.2458333760499953, |
|
"reward_std": 0.24504322968423367, |
|
"rewards/accuracy_reward": 0.29401042591780424, |
|
"rewards/format_reward": 0.9518229365348816, |
|
"step": 1370 |
|
}, |
|
{ |
|
"completion_length": 115.99088821411132, |
|
"epoch": 0.9143614378002319, |
|
"grad_norm": 1.2312554121017456, |
|
"kl": 0.5458984375, |
|
"learning_rate": 4.420018313839147e-07, |
|
"loss": 0.0218, |
|
"reward": 1.2518229544162751, |
|
"reward_std": 0.27393123134970665, |
|
"rewards/accuracy_reward": 0.30729167610406877, |
|
"rewards/format_reward": 0.9445312738418579, |
|
"step": 1380 |
|
}, |
|
{ |
|
"completion_length": 103.67317962646484, |
|
"epoch": 0.9209872453205235, |
|
"grad_norm": 0.2879469394683838, |
|
"kl": 0.53359375, |
|
"learning_rate": 3.7654733565969826e-07, |
|
"loss": 0.0213, |
|
"reward": 1.284375038743019, |
|
"reward_std": 0.26313075572252276, |
|
"rewards/accuracy_reward": 0.33307292610406875, |
|
"rewards/format_reward": 0.9513021066784859, |
|
"step": 1390 |
|
}, |
|
{ |
|
"completion_length": 108.21693058013916, |
|
"epoch": 0.927613052840815, |
|
"grad_norm": 0.45352187752723694, |
|
"kl": 0.548974609375, |
|
"learning_rate": 3.1624289355907334e-07, |
|
"loss": 0.022, |
|
"reward": 1.2638021171092988, |
|
"reward_std": 0.25347734354436396, |
|
"rewards/accuracy_reward": 0.31223959382623434, |
|
"rewards/format_reward": 0.9515625208616256, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.927613052840815, |
|
"eval_completion_length": 89.58449384901259, |
|
"eval_kl": 0.5271267361111112, |
|
"eval_loss": 0.021207302808761597, |
|
"eval_reward": 1.3958333730697632, |
|
"eval_reward_std": 0.21908769508202872, |
|
"eval_rewards/accuracy_reward": 0.42939815587467617, |
|
"eval_rewards/format_reward": 0.9664352006382413, |
|
"eval_runtime": 40.9484, |
|
"eval_samples_per_second": 2.418, |
|
"eval_steps_per_second": 0.22, |
|
"step": 1400 |
|
}, |
|
{ |
|
"completion_length": 103.04479503631592, |
|
"epoch": 0.9342388603611065, |
|
"grad_norm": 0.3517305552959442, |
|
"kl": 0.640283203125, |
|
"learning_rate": 2.6112077739857465e-07, |
|
"loss": 0.0256, |
|
"reward": 1.2666667103767395, |
|
"reward_std": 0.24853656738996505, |
|
"rewards/accuracy_reward": 0.3114583415910602, |
|
"rewards/format_reward": 0.9552083522081375, |
|
"step": 1410 |
|
}, |
|
{ |
|
"completion_length": 101.159898853302, |
|
"epoch": 0.940864667881398, |
|
"grad_norm": 0.5444666743278503, |
|
"kl": 0.58466796875, |
|
"learning_rate": 2.1121048613912843e-07, |
|
"loss": 0.0234, |
|
"reward": 1.269270870089531, |
|
"reward_std": 0.2637180283665657, |
|
"rewards/accuracy_reward": 0.31562500819563866, |
|
"rewards/format_reward": 0.9536458507180214, |
|
"step": 1420 |
|
}, |
|
{ |
|
"completion_length": 107.42942943572999, |
|
"epoch": 0.9474904754016896, |
|
"grad_norm": 0.3939830958843231, |
|
"kl": 0.549267578125, |
|
"learning_rate": 1.665387295994747e-07, |
|
"loss": 0.022, |
|
"reward": 1.2708333641290666, |
|
"reward_std": 0.2713921457529068, |
|
"rewards/accuracy_reward": 0.3205729253590107, |
|
"rewards/format_reward": 0.9502604350447654, |
|
"step": 1430 |
|
}, |
|
{ |
|
"completion_length": 110.02474231719971, |
|
"epoch": 0.9541162829219811, |
|
"grad_norm": 0.45775726437568665, |
|
"kl": 0.557666015625, |
|
"learning_rate": 1.271294141622459e-07, |
|
"loss": 0.0223, |
|
"reward": 1.2721354573965074, |
|
"reward_std": 0.2720890769734979, |
|
"rewards/accuracy_reward": 0.3260416738688946, |
|
"rewards/format_reward": 0.9460937708616257, |
|
"step": 1440 |
|
}, |
|
{ |
|
"completion_length": 107.73151302337646, |
|
"epoch": 0.9607420904422727, |
|
"grad_norm": 0.5883992910385132, |
|
"kl": 0.538671875, |
|
"learning_rate": 9.300362998030832e-08, |
|
"loss": 0.0215, |
|
"reward": 1.290364620089531, |
|
"reward_std": 0.27429741993546486, |
|
"rewards/accuracy_reward": 0.3398437587544322, |
|
"rewards/format_reward": 0.9505208551883697, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.9607420904422727, |
|
"eval_completion_length": 102.1273176405165, |
|
"eval_kl": 0.5319010416666666, |
|
"eval_loss": 0.021446138620376587, |
|
"eval_reward": 1.3912037346098158, |
|
"eval_reward_std": 0.23611565927664438, |
|
"eval_rewards/accuracy_reward": 0.4351851973268721, |
|
"eval_rewards/format_reward": 0.9560185339715745, |
|
"eval_runtime": 44.1752, |
|
"eval_samples_per_second": 2.241, |
|
"eval_steps_per_second": 0.204, |
|
"step": 1450 |
|
}, |
|
{ |
|
"completion_length": 107.99531545639039, |
|
"epoch": 0.9673678979625642, |
|
"grad_norm": 0.46655991673469543, |
|
"kl": 0.56240234375, |
|
"learning_rate": 6.417963969022389e-08, |
|
"loss": 0.0225, |
|
"reward": 1.2661458730697632, |
|
"reward_std": 0.2660699520260096, |
|
"rewards/accuracy_reward": 0.3182291751727462, |
|
"rewards/format_reward": 0.9479166835546493, |
|
"step": 1460 |
|
}, |
|
{ |
|
"completion_length": 113.53489923477173, |
|
"epoch": 0.9739937054828557, |
|
"grad_norm": 0.38386690616607666, |
|
"kl": 0.53984375, |
|
"learning_rate": 4.067286863888131e-08, |
|
"loss": 0.0216, |
|
"reward": 1.2630208790302277, |
|
"reward_std": 0.2672860164195299, |
|
"rewards/accuracy_reward": 0.3145833432674408, |
|
"rewards/format_reward": 0.9484375193715096, |
|
"step": 1470 |
|
}, |
|
{ |
|
"completion_length": 101.46328430175781, |
|
"epoch": 0.9806195130031473, |
|
"grad_norm": 0.6313057541847229, |
|
"kl": 0.540576171875, |
|
"learning_rate": 2.2495896628529355e-08, |
|
"loss": 0.0216, |
|
"reward": 1.2971354603767395, |
|
"reward_std": 0.24062121249735355, |
|
"rewards/accuracy_reward": 0.34166667591780425, |
|
"rewards/format_reward": 0.9554687708616256, |
|
"step": 1480 |
|
}, |
|
{ |
|
"completion_length": 106.94739875793456, |
|
"epoch": 0.9872453205234388, |
|
"grad_norm": 0.3773626685142517, |
|
"kl": 0.53203125, |
|
"learning_rate": 9.658451184600959e-09, |
|
"loss": 0.0213, |
|
"reward": 1.2572917103767396, |
|
"reward_std": 0.25889183739200233, |
|
"rewards/accuracy_reward": 0.3059895936399698, |
|
"rewards/format_reward": 0.9513021022081375, |
|
"step": 1490 |
|
}, |
|
{ |
|
"completion_length": 110.40052452087403, |
|
"epoch": 0.9938711280437303, |
|
"grad_norm": 0.40007439255714417, |
|
"kl": 0.56728515625, |
|
"learning_rate": 2.167402349972925e-09, |
|
"loss": 0.0227, |
|
"reward": 1.252343788743019, |
|
"reward_std": 0.2644004987552762, |
|
"rewards/accuracy_reward": 0.30546875689178704, |
|
"rewards/format_reward": 0.946875025331974, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.9938711280437303, |
|
"eval_completion_length": 100.44560368855794, |
|
"eval_kl": 0.5397135416666666, |
|
"eval_loss": 0.0218037161976099, |
|
"eval_reward": 1.371527804268731, |
|
"eval_reward_std": 0.23327534563011593, |
|
"eval_rewards/accuracy_reward": 0.41782407959302265, |
|
"eval_rewards/format_reward": 0.9537037147416009, |
|
"eval_runtime": 43.5394, |
|
"eval_samples_per_second": 2.274, |
|
"eval_steps_per_second": 0.207, |
|
"step": 1500 |
|
}, |
|
{ |
|
"completion_length": 108.79543187883165, |
|
"epoch": 0.9998343548119927, |
|
"kl": 0.5859917534722222, |
|
"reward": 1.2821180986033545, |
|
"reward_std": 0.28485159451762837, |
|
"rewards/accuracy_reward": 0.3333333449231254, |
|
"rewards/format_reward": 0.9487847404347526, |
|
"step": 1509, |
|
"total_flos": 0.0, |
|
"train_loss": 0.021261748100807297, |
|
"train_runtime": 42718.5249, |
|
"train_samples_per_second": 1.696, |
|
"train_steps_per_second": 0.035 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1509, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |