|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9991673605328892,
  "eval_steps": 50,
  "global_step": 450,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011101859561476548,
      "grad_norm": 1.140625,
      "learning_rate": 2.222222222222222e-06,
      "loss": 1.0928,
      "mean_token_accuracy": 0.717196784696785,
      "step": 5
    },
    {
      "epoch": 0.022203719122953096,
      "grad_norm": 1.1015625,
      "learning_rate": 4.444444444444444e-06,
      "loss": 1.1024,
      "mean_token_accuracy": 0.7158291392437733,
      "step": 10
    },
    {
      "epoch": 0.03330557868442964,
      "grad_norm": 1.03125,
      "learning_rate": 6.666666666666667e-06,
      "loss": 1.0782,
      "mean_token_accuracy": 0.7194658119658119,
      "step": 15
    },
    {
      "epoch": 0.04440743824590619,
      "grad_norm": 0.96484375,
      "learning_rate": 8.888888888888888e-06,
      "loss": 1.0777,
      "mean_token_accuracy": 0.7197588522588524,
      "step": 20
    },
    {
      "epoch": 0.055509297807382736,
      "grad_norm": 0.7734375,
      "learning_rate": 1.1111111111111113e-05,
      "loss": 1.0614,
      "mean_token_accuracy": 0.7216748066748067,
      "step": 25
    },
    {
      "epoch": 0.06661115736885928,
      "grad_norm": 0.6796875,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 1.059,
      "mean_token_accuracy": 0.719067969067969,
      "step": 30
    },
    {
      "epoch": 0.07771301693033583,
      "grad_norm": 0.6171875,
      "learning_rate": 1.555555555555556e-05,
      "loss": 1.0328,
      "mean_token_accuracy": 0.7237962962962966,
      "step": 35
    },
    {
      "epoch": 0.08881487649181238,
      "grad_norm": 0.55859375,
      "learning_rate": 1.7777777777777777e-05,
      "loss": 1.014,
      "mean_token_accuracy": 0.7259676434676435,
      "step": 40
    },
    {
      "epoch": 0.09991673605328892,
      "grad_norm": 0.46484375,
      "learning_rate": 2e-05,
      "loss": 1.0066,
      "mean_token_accuracy": 0.725960520960521,
      "step": 45
    },
    {
      "epoch": 0.11101859561476547,
      "grad_norm": 0.4140625,
      "learning_rate": 1.9992479525042305e-05,
      "loss": 0.9537,
      "mean_token_accuracy": 0.7386558811558809,
      "step": 50
    },
    {
      "epoch": 0.11101859561476547,
      "eval_loss": 0.9796226024627686,
      "eval_mean_token_accuracy": 0.7271781595311009,
      "eval_runtime": 9.7194,
      "eval_samples_per_second": 13.272,
      "eval_steps_per_second": 3.395,
      "step": 50
    },
    {
      "epoch": 0.12212045517624202,
      "grad_norm": 0.359375,
      "learning_rate": 1.996992941167792e-05,
      "loss": 0.9602,
      "mean_token_accuracy": 0.7342826617826618,
      "step": 55
    },
    {
      "epoch": 0.13322231473771856,
      "grad_norm": 0.326171875,
      "learning_rate": 1.9932383577419432e-05,
      "loss": 0.916,
      "mean_token_accuracy": 0.74437307661244,
      "step": 60
    },
    {
      "epoch": 0.1443241742991951,
      "grad_norm": 0.3515625,
      "learning_rate": 1.9879898494768093e-05,
      "loss": 0.909,
      "mean_token_accuracy": 0.7461548636548637,
      "step": 65
    },
    {
      "epoch": 0.15542603386067166,
      "grad_norm": 0.3125,
      "learning_rate": 1.9812553106273848e-05,
      "loss": 0.9079,
      "mean_token_accuracy": 0.74519129019129,
      "step": 70
    },
    {
      "epoch": 0.16652789342214822,
      "grad_norm": 0.333984375,
      "learning_rate": 1.973044870579824e-05,
      "loss": 0.9071,
      "mean_token_accuracy": 0.7446357346357346,
      "step": 75
    },
    {
      "epoch": 0.17762975298362477,
      "grad_norm": 0.30078125,
      "learning_rate": 1.9633708786158803e-05,
      "loss": 0.8953,
      "mean_token_accuracy": 0.746184371184371,
      "step": 80
    },
    {
      "epoch": 0.18873161254510132,
      "grad_norm": 0.275390625,
      "learning_rate": 1.9522478853384154e-05,
      "loss": 0.8582,
      "mean_token_accuracy": 0.7562728937728938,
      "step": 85
    },
    {
      "epoch": 0.19983347210657784,
      "grad_norm": 0.296875,
      "learning_rate": 1.9396926207859085e-05,
      "loss": 0.8719,
      "mean_token_accuracy": 0.7528663003663003,
      "step": 90
    },
    {
      "epoch": 0.2109353316680544,
      "grad_norm": 0.271484375,
      "learning_rate": 1.9257239692688907e-05,
      "loss": 0.8735,
      "mean_token_accuracy": 0.7512274542096703,
      "step": 95
    },
    {
      "epoch": 0.22203719122953094,
      "grad_norm": 0.2734375,
      "learning_rate": 1.9103629409661468e-05,
      "loss": 0.8656,
      "mean_token_accuracy": 0.7536752136752136,
      "step": 100
    },
    {
      "epoch": 0.22203719122953094,
      "eval_loss": 0.9016062617301941,
      "eval_mean_token_accuracy": 0.7405376758317936,
      "eval_runtime": 9.7188,
      "eval_samples_per_second": 13.273,
      "eval_steps_per_second": 3.395,
      "step": 100
    },
    {
      "epoch": 0.2331390507910075,
      "grad_norm": 0.2734375,
      "learning_rate": 1.8936326403234125e-05,
      "loss": 0.8687,
      "mean_token_accuracy": 0.7531064306064306,
      "step": 105
    },
    {
      "epoch": 0.24424091035248405,
      "grad_norm": 0.267578125,
      "learning_rate": 1.8755582313020912e-05,
      "loss": 0.8548,
      "mean_token_accuracy": 0.75515422674015,
      "step": 110
    },
    {
      "epoch": 0.2553427699139606,
      "grad_norm": 0.26953125,
      "learning_rate": 1.8561668995302668e-05,
      "loss": 0.8543,
      "mean_token_accuracy": 0.7542501017501018,
      "step": 115
    },
    {
      "epoch": 0.2664446294754371,
      "grad_norm": 0.26953125,
      "learning_rate": 1.8354878114129368e-05,
      "loss": 0.8373,
      "mean_token_accuracy": 0.7589896214896215,
      "step": 120
    },
    {
      "epoch": 0.2775464890369137,
      "grad_norm": 0.265625,
      "learning_rate": 1.8135520702629677e-05,
      "loss": 0.8489,
      "mean_token_accuracy": 0.7568091168091166,
      "step": 125
    },
    {
      "epoch": 0.2886483485983902,
      "grad_norm": 0.267578125,
      "learning_rate": 1.7903926695187595e-05,
      "loss": 0.8557,
      "mean_token_accuracy": 0.7542979242979242,
      "step": 130
    },
    {
      "epoch": 0.2997502081598668,
      "grad_norm": 0.26171875,
      "learning_rate": 1.766044443118978e-05,
      "loss": 0.8329,
      "mean_token_accuracy": 0.7595054945054944,
      "step": 135
    },
    {
      "epoch": 0.31085206772134333,
      "grad_norm": 0.2490234375,
      "learning_rate": 1.740544013109005e-05,
      "loss": 0.874,
      "mean_token_accuracy": 0.748472730972731,
      "step": 140
    },
    {
      "epoch": 0.32195392728281985,
      "grad_norm": 0.255859375,
      "learning_rate": 1.7139297345578992e-05,
      "loss": 0.8295,
      "mean_token_accuracy": 0.7612077737077737,
      "step": 145
    },
    {
      "epoch": 0.33305578684429643,
      "grad_norm": 0.251953125,
      "learning_rate": 1.686241637868734e-05,
      "loss": 0.8092,
      "mean_token_accuracy": 0.7669424094424093,
      "step": 150
    },
    {
      "epoch": 0.33305578684429643,
      "eval_loss": 0.8762778639793396,
      "eval_mean_token_accuracy": 0.7449022654905009,
      "eval_runtime": 9.7018,
      "eval_samples_per_second": 13.296,
      "eval_steps_per_second": 3.401,
      "step": 150
    },
    {
      "epoch": 0.34415764640577295,
      "grad_norm": 0.251953125,
      "learning_rate": 1.657521368569064e-05,
      "loss": 0.8355,
      "mean_token_accuracy": 0.7576159951159951,
      "step": 155
    },
    {
      "epoch": 0.35525950596724953,
      "grad_norm": 0.2578125,
      "learning_rate": 1.627812124672099e-05,
      "loss": 0.8398,
      "mean_token_accuracy": 0.7564778061483249,
      "step": 160
    },
    {
      "epoch": 0.36636136552872606,
      "grad_norm": 0.251953125,
      "learning_rate": 1.5971585917027864e-05,
      "loss": 0.8576,
      "mean_token_accuracy": 0.7524267399267397,
      "step": 165
    },
    {
      "epoch": 0.37746322509020264,
      "grad_norm": 0.2578125,
      "learning_rate": 1.5656068754865388e-05,
      "loss": 0.8254,
      "mean_token_accuracy": 0.760271918721826,
      "step": 170
    },
    {
      "epoch": 0.38856508465167916,
      "grad_norm": 0.255859375,
      "learning_rate": 1.5332044328016916e-05,
      "loss": 0.8502,
      "mean_token_accuracy": 0.7546072446072445,
      "step": 175
    },
    {
      "epoch": 0.3996669442131557,
      "grad_norm": 0.25390625,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 0.8418,
      "mean_token_accuracy": 0.7569129019129018,
      "step": 180
    },
    {
      "epoch": 0.41076880377463226,
      "grad_norm": 0.255859375,
      "learning_rate": 1.4660435197025391e-05,
      "loss": 0.8346,
      "mean_token_accuracy": 0.7576149776149775,
      "step": 185
    },
    {
      "epoch": 0.4218706633361088,
      "grad_norm": 0.24609375,
      "learning_rate": 1.4313860656812537e-05,
      "loss": 0.8478,
      "mean_token_accuracy": 0.7535816035816034,
      "step": 190
    },
    {
      "epoch": 0.43297252289758537,
      "grad_norm": 0.259765625,
      "learning_rate": 1.396079766039157e-05,
      "loss": 0.8107,
      "mean_token_accuracy": 0.7654395604395604,
      "step": 195
    },
    {
      "epoch": 0.4440743824590619,
      "grad_norm": 0.251953125,
      "learning_rate": 1.3601777248047105e-05,
      "loss": 0.8124,
      "mean_token_accuracy": 0.763194953194953,
      "step": 200
    },
    {
      "epoch": 0.4440743824590619,
      "eval_loss": 0.8637130260467529,
      "eval_mean_token_accuracy": 0.7472996502408268,
      "eval_runtime": 9.6971,
      "eval_samples_per_second": 13.303,
      "eval_steps_per_second": 3.403,
      "step": 200
    },
    {
      "epoch": 0.4551762420205384,
      "grad_norm": 0.25390625,
      "learning_rate": 1.3237339420583213e-05,
      "loss": 0.8375,
      "mean_token_accuracy": 0.757132682132682,
      "step": 205
    },
    {
      "epoch": 0.466278101582015,
      "grad_norm": 0.251953125,
      "learning_rate": 1.2868032327110904e-05,
      "loss": 0.8577,
      "mean_token_accuracy": 0.750907610907611,
      "step": 210
    },
    {
      "epoch": 0.4773799611434915,
      "grad_norm": 0.2431640625,
      "learning_rate": 1.2494411440579814e-05,
      "loss": 0.82,
      "mean_token_accuracy": 0.7625814000814001,
      "step": 215
    },
    {
      "epoch": 0.4884818207049681,
      "grad_norm": 0.25390625,
      "learning_rate": 1.211703872229411e-05,
      "loss": 0.8099,
      "mean_token_accuracy": 0.7646743996743997,
      "step": 220
    },
    {
      "epoch": 0.4995836802664446,
      "grad_norm": 0.2490234375,
      "learning_rate": 1.1736481776669307e-05,
      "loss": 0.8195,
      "mean_token_accuracy": 0.7627518372346456,
      "step": 225
    },
    {
      "epoch": 0.5106855398279212,
      "grad_norm": 0.2490234375,
      "learning_rate": 1.1353312997501313e-05,
      "loss": 0.8248,
      "mean_token_accuracy": 0.7611874236874236,
      "step": 230
    },
    {
      "epoch": 0.5217873993893978,
      "grad_norm": 0.23828125,
      "learning_rate": 1.0968108707031792e-05,
      "loss": 0.8174,
      "mean_token_accuracy": 0.7629131054131055,
      "step": 235
    },
    {
      "epoch": 0.5328892589508742,
      "grad_norm": 0.244140625,
      "learning_rate": 1.0581448289104759e-05,
      "loss": 0.8105,
      "mean_token_accuracy": 0.7650734632542214,
      "step": 240
    },
    {
      "epoch": 0.5439911185123508,
      "grad_norm": 0.244140625,
      "learning_rate": 1.0193913317718245e-05,
      "loss": 0.81,
      "mean_token_accuracy": 0.7640618640618639,
      "step": 245
    },
    {
      "epoch": 0.5550929780738274,
      "grad_norm": 0.2421875,
      "learning_rate": 9.806086682281759e-06,
      "loss": 0.8205,
      "mean_token_accuracy": 0.7610907610907611,
      "step": 250
    },
    {
      "epoch": 0.5550929780738274,
      "eval_loss": 0.8572535514831543,
      "eval_mean_token_accuracy": 0.748435551376728,
      "eval_runtime": 9.7051,
      "eval_samples_per_second": 13.292,
      "eval_steps_per_second": 3.4,
      "step": 250
    },
    {
      "epoch": 0.5661948376353039,
      "grad_norm": 0.255859375,
      "learning_rate": 9.418551710895243e-06,
      "loss": 0.8235,
      "mean_token_accuracy": 0.7599084249084251,
      "step": 255
    },
    {
      "epoch": 0.5772966971967805,
      "grad_norm": 0.251953125,
      "learning_rate": 9.03189129296821e-06,
      "loss": 0.8286,
      "mean_token_accuracy": 0.7585877085877086,
      "step": 260
    },
    {
      "epoch": 0.588398556758257,
      "grad_norm": 0.259765625,
      "learning_rate": 8.646687002498692e-06,
      "loss": 0.8256,
      "mean_token_accuracy": 0.7592775742775744,
      "step": 265
    },
    {
      "epoch": 0.5995004163197336,
      "grad_norm": 0.251953125,
      "learning_rate": 8.263518223330698e-06,
      "loss": 0.8217,
      "mean_token_accuracy": 0.7614479039479042,
      "step": 270
    },
    {
      "epoch": 0.6106022758812101,
      "grad_norm": 0.2431640625,
      "learning_rate": 7.882961277705897e-06,
      "loss": 0.8354,
      "mean_token_accuracy": 0.7570135306548853,
      "step": 275
    },
    {
      "epoch": 0.6217041354426867,
      "grad_norm": 0.2490234375,
      "learning_rate": 7.505588559420188e-06,
      "loss": 0.8222,
      "mean_token_accuracy": 0.7614336589336589,
      "step": 280
    },
    {
      "epoch": 0.6328059950041632,
      "grad_norm": 0.244140625,
      "learning_rate": 7.131967672889101e-06,
      "loss": 0.8579,
      "mean_token_accuracy": 0.7512146494805758,
      "step": 285
    },
    {
      "epoch": 0.6439078545656397,
      "grad_norm": 0.25390625,
      "learning_rate": 6.762660579416791e-06,
      "loss": 0.8253,
      "mean_token_accuracy": 0.7587220187220187,
      "step": 290
    },
    {
      "epoch": 0.6550097141271163,
      "grad_norm": 0.2412109375,
      "learning_rate": 6.3982227519528986e-06,
      "loss": 0.8294,
      "mean_token_accuracy": 0.7600905599174735,
      "step": 295
    },
    {
      "epoch": 0.6661115736885929,
      "grad_norm": 0.2470703125,
      "learning_rate": 6.039202339608432e-06,
      "loss": 0.8129,
      "mean_token_accuracy": 0.763727106227106,
      "step": 300
    },
    {
      "epoch": 0.6661115736885929,
      "eval_loss": 0.854372501373291,
      "eval_mean_token_accuracy": 0.7488219841161018,
      "eval_runtime": 9.716,
      "eval_samples_per_second": 13.277,
      "eval_steps_per_second": 3.396,
      "step": 300
    },
    {
      "epoch": 0.6772134332500694,
      "grad_norm": 0.2431640625,
      "learning_rate": 5.686139343187468e-06,
      "loss": 0.8089,
      "mean_token_accuracy": 0.7635978835978834,
      "step": 305
    },
    {
      "epoch": 0.6883152928115459,
      "grad_norm": 0.2421875,
      "learning_rate": 5.339564802974615e-06,
      "loss": 0.8162,
      "mean_token_accuracy": 0.7625620675620676,
      "step": 310
    },
    {
      "epoch": 0.6994171523730225,
      "grad_norm": 0.25,
      "learning_rate": 5.000000000000003e-06,
      "loss": 0.818,
      "mean_token_accuracy": 0.7617165242165239,
      "step": 315
    },
    {
      "epoch": 0.7105190119344991,
      "grad_norm": 0.251953125,
      "learning_rate": 4.66795567198309e-06,
      "loss": 0.8164,
      "mean_token_accuracy": 0.7618395600557701,
      "step": 320
    },
    {
      "epoch": 0.7216208714959755,
      "grad_norm": 0.248046875,
      "learning_rate": 4.343931245134616e-06,
      "loss": 0.8391,
      "mean_token_accuracy": 0.7558740333740334,
      "step": 325
    },
    {
      "epoch": 0.7327227310574521,
      "grad_norm": 0.251953125,
      "learning_rate": 4.028414082972141e-06,
      "loss": 0.8288,
      "mean_token_accuracy": 0.7581603581603582,
      "step": 330
    },
    {
      "epoch": 0.7438245906189287,
      "grad_norm": 0.2470703125,
      "learning_rate": 3.7218787532790167e-06,
      "loss": 0.8266,
      "mean_token_accuracy": 0.758972323972324,
      "step": 335
    },
    {
      "epoch": 0.7549264501804053,
      "grad_norm": 0.23828125,
      "learning_rate": 3.424786314309365e-06,
      "loss": 0.8169,
      "mean_token_accuracy": 0.7615511468202194,
      "step": 340
    },
    {
      "epoch": 0.7660283097418817,
      "grad_norm": 0.248046875,
      "learning_rate": 3.1375836213126653e-06,
      "loss": 0.8207,
      "mean_token_accuracy": 0.7615923890923892,
      "step": 345
    },
    {
      "epoch": 0.7771301693033583,
      "grad_norm": 0.23828125,
      "learning_rate": 2.8607026544210115e-06,
      "loss": 0.8251,
      "mean_token_accuracy": 0.7595614570614571,
      "step": 350
    },
    {
      "epoch": 0.7771301693033583,
      "eval_loss": 0.8534859418869019,
      "eval_mean_token_accuracy": 0.7491665197547551,
      "eval_runtime": 9.7117,
      "eval_samples_per_second": 13.283,
      "eval_steps_per_second": 3.398,
      "step": 350
    },
    {
      "epoch": 0.7882320288648349,
      "grad_norm": 0.2314453125,
      "learning_rate": 2.594559868909956e-06,
      "loss": 0.789,
      "mean_token_accuracy": 0.7690638990638987,
      "step": 355
    },
    {
      "epoch": 0.7993338884263114,
      "grad_norm": 0.2490234375,
      "learning_rate": 2.339555568810221e-06,
      "loss": 0.8195,
      "mean_token_accuracy": 0.76007733007733,
      "step": 360
    },
    {
      "epoch": 0.810435747987788,
      "grad_norm": 0.2578125,
      "learning_rate": 2.0960733048124082e-06,
      "loss": 0.8358,
      "mean_token_accuracy": 0.7567429792429792,
      "step": 365
    },
    {
      "epoch": 0.8215376075492645,
      "grad_norm": 0.234375,
      "learning_rate": 1.8644792973703252e-06,
      "loss": 0.797,
      "mean_token_accuracy": 0.7672700447700449,
      "step": 370
    },
    {
      "epoch": 0.832639467110741,
      "grad_norm": 0.2333984375,
      "learning_rate": 1.6451218858706374e-06,
      "loss": 0.8269,
      "mean_token_accuracy": 0.7588715913715915,
      "step": 375
    },
    {
      "epoch": 0.8437413266722176,
      "grad_norm": 0.244140625,
      "learning_rate": 1.4383310046973365e-06,
      "loss": 0.8243,
      "mean_token_accuracy": 0.7594324261790015,
      "step": 380
    },
    {
      "epoch": 0.8548431862336942,
      "grad_norm": 0.2412109375,
      "learning_rate": 1.2444176869790925e-06,
      "loss": 0.8229,
      "mean_token_accuracy": 0.7609747659747659,
      "step": 385
    },
    {
      "epoch": 0.8659450457951707,
      "grad_norm": 0.24609375,
      "learning_rate": 1.0636735967658785e-06,
      "loss": 0.804,
      "mean_token_accuracy": 0.7651831501831503,
      "step": 390
    },
    {
      "epoch": 0.8770469053566472,
      "grad_norm": 0.2470703125,
      "learning_rate": 8.963705903385344e-07,
      "loss": 0.8512,
      "mean_token_accuracy": 0.7523127798127797,
      "step": 395
    },
    {
      "epoch": 0.8881487649181238,
      "grad_norm": 0.259765625,
      "learning_rate": 7.427603073110967e-07,
      "loss": 0.8437,
      "mean_token_accuracy": 0.7555453805453806,
      "step": 400
    },
    {
      "epoch": 0.8881487649181238,
      "eval_loss": 0.8532679080963135,
      "eval_mean_token_accuracy": 0.748890434184552,
      "eval_runtime": 9.7147,
      "eval_samples_per_second": 13.279,
      "eval_steps_per_second": 3.397,
      "step": 400
    },
    {
      "epoch": 0.8992506244796004,
      "grad_norm": 0.2490234375,
      "learning_rate": 6.030737921409169e-07,
      "loss": 0.8155,
      "mean_token_accuracy": 0.7616107041107041,
      "step": 405
    },
    {
      "epoch": 0.9103524840410768,
      "grad_norm": 0.25390625,
      "learning_rate": 4.775211466158469e-07,
      "loss": 0.8045,
      "mean_token_accuracy": 0.7646845746845747,
      "step": 410
    },
    {
      "epoch": 0.9214543436025534,
      "grad_norm": 0.2412109375,
      "learning_rate": 3.662912138411967e-07,
      "loss": 0.848,
      "mean_token_accuracy": 0.7535592185592186,
      "step": 415
    },
    {
      "epoch": 0.93255620316403,
      "grad_norm": 0.2412109375,
      "learning_rate": 2.6955129420176193e-07,
      "loss": 0.796,
      "mean_token_accuracy": 0.7675030525030523,
      "step": 420
    },
    {
      "epoch": 0.9436580627255066,
      "grad_norm": 0.2451171875,
      "learning_rate": 1.874468937261531e-07,
      "loss": 0.8322,
      "mean_token_accuracy": 0.7581685062193277,
      "step": 425
    },
    {
      "epoch": 0.954759922286983,
      "grad_norm": 0.265625,
      "learning_rate": 1.201015052319099e-07,
      "loss": 0.8023,
      "mean_token_accuracy": 0.7657336182336185,
      "step": 430
    },
    {
      "epoch": 0.9658617818484596,
      "grad_norm": 0.23828125,
      "learning_rate": 6.761642258056977e-08,
      "loss": 0.8094,
      "mean_token_accuracy": 0.7642338217338217,
      "step": 435
    },
    {
      "epoch": 0.9769636414099362,
      "grad_norm": 0.2392578125,
      "learning_rate": 3.0070588322079765e-08,
      "loss": 0.8206,
      "mean_token_accuracy": 0.7607519332519332,
      "step": 440
    },
    {
      "epoch": 0.9880655009714127,
      "grad_norm": 0.2490234375,
      "learning_rate": 7.520474957699586e-09,
      "loss": 0.8151,
      "mean_token_accuracy": 0.7614092389092388,
      "step": 445
    },
    {
      "epoch": 0.9991673605328892,
      "grad_norm": 0.23828125,
      "learning_rate": 0.0,
      "loss": 0.7954,
      "mean_token_accuracy": 0.7673087098087098,
      "step": 450
    },
    {
      "epoch": 0.9991673605328892,
      "eval_loss": 0.8533338904380798,
      "eval_mean_token_accuracy": 0.748884884179002,
      "eval_runtime": 9.7042,
      "eval_samples_per_second": 13.293,
      "eval_steps_per_second": 3.401,
      "step": 450
    },
    {
      "epoch": 0.9991673605328892,
      "step": 450,
      "total_flos": 6.955833048956928e+17,
      "train_loss": 0.8593901687198215,
      "train_runtime": 5693.7016,
      "train_samples_per_second": 3.797,
      "train_steps_per_second": 0.079
    }
  ],
  "logging_steps": 5,
  "max_steps": 450,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.955833048956928e+17,
  "train_batch_size": 6,
  "trial_name": null,
  "trial_params": null
}