{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.07635336336565626,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0015270672673131252,
      "grad_norm": 1.7919381856918335,
      "learning_rate": 0.00019980267284282717,
      "loss": 2.5081,
      "mean_token_accuracy": 0.5467605337500572,
      "step": 10
    },
    {
      "epoch": 0.0030541345346262504,
      "grad_norm": 2.8579838275909424,
      "learning_rate": 0.0001992114701314478,
      "loss": 1.7298,
      "mean_token_accuracy": 0.6468371748924255,
      "step": 20
    },
    {
      "epoch": 0.004581201801939375,
      "grad_norm": 1.7089707851409912,
      "learning_rate": 0.0001982287250728689,
      "loss": 1.1637,
      "mean_token_accuracy": 0.7540688991546631,
      "step": 30
    },
    {
      "epoch": 0.006108269069252501,
      "grad_norm": 0.9711195826530457,
      "learning_rate": 0.0001968583161128631,
      "loss": 0.8899,
      "mean_token_accuracy": 0.8186888545751572,
      "step": 40
    },
    {
      "epoch": 0.007635336336565626,
      "grad_norm": 0.8973957896232605,
      "learning_rate": 0.00019510565162951537,
      "loss": 0.8256,
      "mean_token_accuracy": 0.8281645506620408,
      "step": 50
    },
    {
      "epoch": 0.00916240360387875,
      "grad_norm": 0.9836689829826355,
      "learning_rate": 0.00019297764858882514,
      "loss": 0.754,
      "mean_token_accuracy": 0.8368939191102982,
      "step": 60
    },
    {
      "epoch": 0.010689470871191877,
      "grad_norm": 1.0244220495224,
      "learning_rate": 0.00019048270524660196,
      "loss": 0.7019,
      "mean_token_accuracy": 0.8408633172512054,
      "step": 70
    },
    {
      "epoch": 0.012216538138505002,
      "grad_norm": 1.1119804382324219,
      "learning_rate": 0.00018763066800438636,
      "loss": 0.6498,
      "mean_token_accuracy": 0.8521306127309799,
      "step": 80
    },
    {
      "epoch": 0.013743605405818126,
      "grad_norm": 0.7104642987251282,
      "learning_rate": 0.00018443279255020152,
      "loss": 0.6679,
      "mean_token_accuracy": 0.8470089048147201,
      "step": 90
    },
    {
      "epoch": 0.015270672673131251,
      "grad_norm": 0.7890557646751404,
      "learning_rate": 0.00018090169943749476,
      "loss": 0.6581,
      "mean_token_accuracy": 0.8512038260698318,
      "step": 100
    },
    {
      "epoch": 0.016797739940444378,
      "grad_norm": 0.6953688859939575,
      "learning_rate": 0.00017705132427757895,
      "loss": 0.6358,
      "mean_token_accuracy": 0.8534704208374023,
      "step": 110
    },
    {
      "epoch": 0.0183248072077575,
      "grad_norm": 0.8764053583145142,
      "learning_rate": 0.00017289686274214118,
      "loss": 0.6124,
      "mean_token_accuracy": 0.857587480545044,
      "step": 120
    },
    {
      "epoch": 0.019851874475070627,
      "grad_norm": 0.8632748126983643,
      "learning_rate": 0.00016845471059286887,
      "loss": 0.6382,
      "mean_token_accuracy": 0.8533297270536423,
      "step": 130
    },
    {
      "epoch": 0.021378941742383754,
      "grad_norm": 0.9536803960800171,
      "learning_rate": 0.000163742398974869,
      "loss": 0.6252,
      "mean_token_accuracy": 0.8563201695680618,
      "step": 140
    },
    {
      "epoch": 0.022906009009696877,
      "grad_norm": 0.7280343174934387,
      "learning_rate": 0.00015877852522924732,
      "loss": 0.6105,
      "mean_token_accuracy": 0.8567106693983078,
      "step": 150
    },
    {
      "epoch": 0.024433076277010003,
      "grad_norm": 0.6883084177970886,
      "learning_rate": 0.00015358267949789966,
      "loss": 0.6134,
      "mean_token_accuracy": 0.8576973646879196,
      "step": 160
    },
    {
      "epoch": 0.025960143544323126,
      "grad_norm": 0.7537200450897217,
      "learning_rate": 0.00014817536741017152,
      "loss": 0.6182,
      "mean_token_accuracy": 0.8572364389896393,
      "step": 170
    },
    {
      "epoch": 0.027487210811636253,
      "grad_norm": 0.8564761877059937,
      "learning_rate": 0.00014257792915650728,
      "loss": 0.604,
      "mean_token_accuracy": 0.8564972043037414,
      "step": 180
    },
    {
      "epoch": 0.02901427807894938,
      "grad_norm": 0.9395449757575989,
      "learning_rate": 0.00013681245526846783,
      "loss": 0.6082,
      "mean_token_accuracy": 0.8573319047689438,
      "step": 190
    },
    {
      "epoch": 0.030541345346262502,
      "grad_norm": 1.4290846586227417,
      "learning_rate": 0.00013090169943749476,
      "loss": 0.5852,
      "mean_token_accuracy": 0.859686890244484,
      "step": 200
    },
    {
      "epoch": 0.032068412613575625,
      "grad_norm": 0.8474317789077759,
      "learning_rate": 0.0001248689887164855,
      "loss": 0.5912,
      "mean_token_accuracy": 0.863548994064331,
      "step": 210
    },
    {
      "epoch": 0.033595479880888755,
      "grad_norm": 0.8646250367164612,
      "learning_rate": 0.00011873813145857249,
      "loss": 0.5862,
      "mean_token_accuracy": 0.861778911948204,
      "step": 220
    },
    {
      "epoch": 0.03512254714820188,
      "grad_norm": 0.9591971635818481,
      "learning_rate": 0.00011253332335643043,
      "loss": 0.5656,
      "mean_token_accuracy": 0.8678788006305694,
      "step": 230
    },
    {
      "epoch": 0.036649614415515,
      "grad_norm": 0.7952109575271606,
      "learning_rate": 0.00010627905195293135,
      "loss": 0.581,
      "mean_token_accuracy": 0.8648124188184738,
      "step": 240
    },
    {
      "epoch": 0.03817668168282813,
      "grad_norm": 0.7618094086647034,
      "learning_rate": 0.0001,
      "loss": 0.576,
      "mean_token_accuracy": 0.8639664649963379,
      "step": 250
    },
    {
      "epoch": 0.039703748950141254,
      "grad_norm": 0.730375349521637,
      "learning_rate": 9.372094804706867e-05,
      "loss": 0.5915,
      "mean_token_accuracy": 0.8597521275281906,
      "step": 260
    },
    {
      "epoch": 0.04123081621745438,
      "grad_norm": 0.8533547520637512,
      "learning_rate": 8.746667664356956e-05,
      "loss": 0.5891,
      "mean_token_accuracy": 0.859322988986969,
      "step": 270
    },
    {
      "epoch": 0.04275788348476751,
      "grad_norm": 0.7875514626502991,
      "learning_rate": 8.126186854142752e-05,
      "loss": 0.5957,
      "mean_token_accuracy": 0.8590130746364594,
      "step": 280
    },
    {
      "epoch": 0.04428495075208063,
      "grad_norm": 0.8987193703651428,
      "learning_rate": 7.513101128351454e-05,
      "loss": 0.5997,
      "mean_token_accuracy": 0.8575950294733048,
      "step": 290
    },
    {
      "epoch": 0.04581201801939375,
      "grad_norm": 0.7341880202293396,
      "learning_rate": 6.909830056250527e-05,
      "loss": 0.5614,
      "mean_token_accuracy": 0.8668889582157135,
      "step": 300
    },
    {
      "epoch": 0.047339085286706876,
      "grad_norm": 0.7949256300926208,
      "learning_rate": 6.318754473153221e-05,
      "loss": 0.5936,
      "mean_token_accuracy": 0.8569323867559433,
      "step": 310
    },
    {
      "epoch": 0.048866152554020006,
      "grad_norm": 0.840946614742279,
      "learning_rate": 5.7422070843492734e-05,
      "loss": 0.5901,
      "mean_token_accuracy": 0.863608232140541,
      "step": 320
    },
    {
      "epoch": 0.05039321982133313,
      "grad_norm": 0.9876859188079834,
      "learning_rate": 5.182463258982846e-05,
      "loss": 0.5787,
      "mean_token_accuracy": 0.8655537277460098,
      "step": 330
    },
    {
      "epoch": 0.05192028708864625,
      "grad_norm": 0.7635581493377686,
      "learning_rate": 4.6417320502100316e-05,
      "loss": 0.6074,
      "mean_token_accuracy": 0.8565149992704392,
      "step": 340
    },
    {
      "epoch": 0.05344735435595938,
      "grad_norm": 0.7946358919143677,
      "learning_rate": 4.12214747707527e-05,
      "loss": 0.6015,
      "mean_token_accuracy": 0.8584772288799286,
      "step": 350
    },
    {
      "epoch": 0.054974421623272506,
      "grad_norm": 0.8052517771720886,
      "learning_rate": 3.6257601025131026e-05,
      "loss": 0.5552,
      "mean_token_accuracy": 0.8673796206712723,
      "step": 360
    },
    {
      "epoch": 0.05650148889058563,
      "grad_norm": 0.7466617226600647,
      "learning_rate": 3.154528940713113e-05,
      "loss": 0.5954,
      "mean_token_accuracy": 0.8603360831737519,
      "step": 370
    },
    {
      "epoch": 0.05802855615789876,
      "grad_norm": 0.7672128081321716,
      "learning_rate": 2.7103137257858868e-05,
      "loss": 0.5764,
      "mean_token_accuracy": 0.8672617733478546,
      "step": 380
    },
    {
      "epoch": 0.05955562342521188,
      "grad_norm": 0.7813590168952942,
      "learning_rate": 2.2948675722421086e-05,
      "loss": 0.5812,
      "mean_token_accuracy": 0.8635528743267059,
      "step": 390
    },
    {
      "epoch": 0.061082690692525005,
      "grad_norm": 0.7313287854194641,
      "learning_rate": 1.9098300562505266e-05,
      "loss": 0.5563,
      "mean_token_accuracy": 0.8667043030261994,
      "step": 400
    },
    {
      "epoch": 0.06260975795983813,
      "grad_norm": 0.9430689215660095,
      "learning_rate": 1.5567207449798515e-05,
      "loss": 0.5452,
      "mean_token_accuracy": 0.871171161532402,
      "step": 410
    },
    {
      "epoch": 0.06413682522715125,
      "grad_norm": 0.8281370401382446,
      "learning_rate": 1.2369331995613665e-05,
      "loss": 0.5809,
      "mean_token_accuracy": 0.8636080652475357,
      "step": 420
    },
    {
      "epoch": 0.06566389249446439,
      "grad_norm": 0.704873263835907,
      "learning_rate": 9.517294753398064e-06,
      "loss": 0.5975,
      "mean_token_accuracy": 0.859060087800026,
      "step": 430
    },
    {
      "epoch": 0.06719095976177751,
      "grad_norm": 0.8515197038650513,
      "learning_rate": 7.022351411174866e-06,
      "loss": 0.5867,
      "mean_token_accuracy": 0.8638981640338897,
      "step": 440
    },
    {
      "epoch": 0.06871802702909063,
      "grad_norm": 0.7860581874847412,
      "learning_rate": 4.8943483704846475e-06,
      "loss": 0.5802,
      "mean_token_accuracy": 0.8683864206075669,
      "step": 450
    },
    {
      "epoch": 0.07024509429640376,
      "grad_norm": 0.8138625621795654,
      "learning_rate": 3.1416838871368924e-06,
      "loss": 0.5918,
      "mean_token_accuracy": 0.8596341729164123,
      "step": 460
    },
    {
      "epoch": 0.07177216156371688,
      "grad_norm": 0.8401088714599609,
      "learning_rate": 1.771274927131139e-06,
      "loss": 0.6209,
      "mean_token_accuracy": 0.8542895227670669,
      "step": 470
    },
    {
      "epoch": 0.07329922883103,
      "grad_norm": 0.993623673915863,
      "learning_rate": 7.885298685522235e-07,
      "loss": 0.5791,
      "mean_token_accuracy": 0.865888985991478,
      "step": 480
    },
    {
      "epoch": 0.07482629609834313,
      "grad_norm": 0.8312510848045349,
      "learning_rate": 1.973271571728441e-07,
      "loss": 0.5615,
      "mean_token_accuracy": 0.8679140955209732,
      "step": 490
    },
    {
      "epoch": 0.07635336336565626,
      "grad_norm": 0.9254234433174133,
      "learning_rate": 0.0,
      "loss": 0.5822,
      "mean_token_accuracy": 0.8632416158914566,
      "step": 500
    }
  ],
  "logging_steps": 10,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5806155567955968.0,
  "train_batch_size": 6,
  "trial_name": null,
  "trial_params": null
}