|
{ |
|
"best_metric": 0.48943638801574707, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.0940203083866115, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.000188040616773223, |
|
"eval_loss": 1.0817307233810425, |
|
"eval_runtime": 133.3846, |
|
"eval_samples_per_second": 16.794, |
|
"eval_steps_per_second": 4.198, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00188040616773223, |
|
"grad_norm": 5.176900863647461, |
|
"learning_rate": 4.22e-05, |
|
"loss": 1.7009, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.00376081233546446, |
|
"grad_norm": 3.7416343688964844, |
|
"learning_rate": 8.44e-05, |
|
"loss": 1.21, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.005641218503196691, |
|
"grad_norm": 3.444662570953369, |
|
"learning_rate": 0.0001266, |
|
"loss": 1.1852, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.00752162467092892, |
|
"grad_norm": 3.319375991821289, |
|
"learning_rate": 0.0001688, |
|
"loss": 1.1785, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.009402030838661151, |
|
"grad_norm": 5.216305255889893, |
|
"learning_rate": 0.000211, |
|
"loss": 1.3661, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.009402030838661151, |
|
"eval_loss": 0.7779666185379028, |
|
"eval_runtime": 133.0975, |
|
"eval_samples_per_second": 16.83, |
|
"eval_steps_per_second": 4.207, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.011282437006393382, |
|
"grad_norm": 2.7655200958251953, |
|
"learning_rate": 0.00021074300730241147, |
|
"loss": 1.3996, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.013162843174125612, |
|
"grad_norm": 2.7388007640838623, |
|
"learning_rate": 0.00020997328125223568, |
|
"loss": 1.2458, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.01504324934185784, |
|
"grad_norm": 2.6569647789001465, |
|
"learning_rate": 0.0002086945718774165, |
|
"loss": 1.3008, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.01692365550959007, |
|
"grad_norm": 2.989269733428955, |
|
"learning_rate": 0.00020691310892149265, |
|
"loss": 1.2504, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.018804061677322303, |
|
"grad_norm": 4.817940711975098, |
|
"learning_rate": 0.00020463757149291335, |
|
"loss": 1.4111, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.018804061677322303, |
|
"eval_loss": 0.8434422016143799, |
|
"eval_runtime": 133.1617, |
|
"eval_samples_per_second": 16.822, |
|
"eval_steps_per_second": 4.205, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02068446784505453, |
|
"grad_norm": 2.5777533054351807, |
|
"learning_rate": 0.0002018790457812944, |
|
"loss": 1.5197, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.022564874012786763, |
|
"grad_norm": 2.3088767528533936, |
|
"learning_rate": 0.0001986509710466168, |
|
"loss": 1.2673, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.024445280180518992, |
|
"grad_norm": 2.4268999099731445, |
|
"learning_rate": 0.00019496907414450293, |
|
"loss": 1.2102, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.026325686348251224, |
|
"grad_norm": 2.8331105709075928, |
|
"learning_rate": 0.00019085129290655697, |
|
"loss": 1.192, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.028206092515983452, |
|
"grad_norm": 5.091329574584961, |
|
"learning_rate": 0.00018631768874905217, |
|
"loss": 1.3753, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.028206092515983452, |
|
"eval_loss": 0.7753247618675232, |
|
"eval_runtime": 133.6028, |
|
"eval_samples_per_second": 16.766, |
|
"eval_steps_per_second": 4.192, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.03008649868371568, |
|
"grad_norm": 251.54493713378906, |
|
"learning_rate": 0.0001813903489357277, |
|
"loss": 1.5184, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.03196690485144791, |
|
"grad_norm": 2.6304538249969482, |
|
"learning_rate": 0.00017609327897085954, |
|
"loss": 1.2063, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.03384731101918014, |
|
"grad_norm": 2.115901231765747, |
|
"learning_rate": 0.00017045228564685694, |
|
"loss": 1.1798, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.035727717186912374, |
|
"grad_norm": 2.3057830333709717, |
|
"learning_rate": 0.0001644948513161638, |
|
"loss": 1.0979, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.037608123354644606, |
|
"grad_norm": 4.125862121582031, |
|
"learning_rate": 0.00015825, |
|
"loss": 1.356, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.037608123354644606, |
|
"eval_loss": 0.7662692666053772, |
|
"eval_runtime": 133.4554, |
|
"eval_samples_per_second": 16.785, |
|
"eval_steps_per_second": 4.196, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.03948852952237683, |
|
"grad_norm": 2.323758125305176, |
|
"learning_rate": 0.00015174815598624768, |
|
"loss": 1.4656, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.04136893569010906, |
|
"grad_norm": 2.185595750808716, |
|
"learning_rate": 0.00014502099560537873, |
|
"loss": 1.1591, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.043249341857841295, |
|
"grad_norm": 2.143188953399658, |
|
"learning_rate": 0.00013810129290655696, |
|
"loss": 1.0983, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.04512974802557353, |
|
"grad_norm": 2.9297232627868652, |
|
"learning_rate": 0.00013102275998576495, |
|
"loss": 1.158, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.04701015419330575, |
|
"grad_norm": 3.86443829536438, |
|
"learning_rate": 0.00012381988274386116, |
|
"loss": 1.2593, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.04701015419330575, |
|
"eval_loss": 0.669979989528656, |
|
"eval_runtime": 134.6215, |
|
"eval_samples_per_second": 16.639, |
|
"eval_steps_per_second": 4.16, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.048890560361037984, |
|
"grad_norm": 1.8978098630905151, |
|
"learning_rate": 0.00011652775287473745, |
|
"loss": 1.3589, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.050770966528770216, |
|
"grad_norm": 1.964809536933899, |
|
"learning_rate": 0.00010918189690211387, |
|
"loss": 1.1163, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.05265137269650245, |
|
"grad_norm": 1.9690378904342651, |
|
"learning_rate": 0.00010181810309788618, |
|
"loss": 1.1062, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.05453177886423467, |
|
"grad_norm": 2.6126410961151123, |
|
"learning_rate": 9.447224712526258e-05, |
|
"loss": 1.0986, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.056412185031966905, |
|
"grad_norm": 3.1689352989196777, |
|
"learning_rate": 8.718011725613886e-05, |
|
"loss": 1.1686, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.056412185031966905, |
|
"eval_loss": 0.6255946755409241, |
|
"eval_runtime": 133.189, |
|
"eval_samples_per_second": 16.818, |
|
"eval_steps_per_second": 4.205, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.05829259119969914, |
|
"grad_norm": 2.104492664337158, |
|
"learning_rate": 7.997724001423507e-05, |
|
"loss": 1.3156, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.06017299736743136, |
|
"grad_norm": 2.1799798011779785, |
|
"learning_rate": 7.289870709344306e-05, |
|
"loss": 1.0996, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.062053403535163594, |
|
"grad_norm": 2.0943219661712646, |
|
"learning_rate": 6.597900439462128e-05, |
|
"loss": 0.9772, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.06393380970289582, |
|
"grad_norm": 2.013209819793701, |
|
"learning_rate": 5.9251844013752326e-05, |
|
"loss": 0.9912, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.06581421587062805, |
|
"grad_norm": 3.92903733253479, |
|
"learning_rate": 5.275000000000002e-05, |
|
"loss": 1.1292, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.06581421587062805, |
|
"eval_loss": 0.5762282609939575, |
|
"eval_runtime": 133.4566, |
|
"eval_samples_per_second": 16.784, |
|
"eval_steps_per_second": 4.196, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.06769462203836028, |
|
"grad_norm": 1.7651203870773315, |
|
"learning_rate": 4.650514868383623e-05, |
|
"loss": 1.2481, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.06957502820609252, |
|
"grad_norm": 1.6707273721694946, |
|
"learning_rate": 4.054771435314305e-05, |
|
"loss": 1.0573, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.07145543437382475, |
|
"grad_norm": 1.9455937147140503, |
|
"learning_rate": 3.4906721029140495e-05, |
|
"loss": 1.0288, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.07333584054155698, |
|
"grad_norm": 1.6634033918380737, |
|
"learning_rate": 2.9609651064272323e-05, |
|
"loss": 0.9095, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.07521624670928921, |
|
"grad_norm": 2.96443510055542, |
|
"learning_rate": 2.468231125094783e-05, |
|
"loss": 1.0613, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.07521624670928921, |
|
"eval_loss": 0.5196256637573242, |
|
"eval_runtime": 133.6892, |
|
"eval_samples_per_second": 16.755, |
|
"eval_steps_per_second": 4.189, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.07709665287702144, |
|
"grad_norm": 1.5204122066497803, |
|
"learning_rate": 2.0148707093443057e-05, |
|
"loss": 1.1007, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.07897705904475366, |
|
"grad_norm": 1.678540587425232, |
|
"learning_rate": 1.603092585549706e-05, |
|
"loss": 0.9935, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.0808574652124859, |
|
"grad_norm": 1.6980760097503662, |
|
"learning_rate": 1.2349028953383204e-05, |
|
"loss": 0.9701, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.08273787138021813, |
|
"grad_norm": 1.9784950017929077, |
|
"learning_rate": 9.120954218705596e-06, |
|
"loss": 0.908, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.08461827754795036, |
|
"grad_norm": 2.8648924827575684, |
|
"learning_rate": 6.362428507086673e-06, |
|
"loss": 1.1085, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.08461827754795036, |
|
"eval_loss": 0.49475958943367004, |
|
"eval_runtime": 133.6063, |
|
"eval_samples_per_second": 16.766, |
|
"eval_steps_per_second": 4.191, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.08649868371568259, |
|
"grad_norm": 1.6609992980957031, |
|
"learning_rate": 4.0868910785073565e-06, |
|
"loss": 1.0254, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.08837908988341482, |
|
"grad_norm": 1.700790524482727, |
|
"learning_rate": 2.3054281225835e-06, |
|
"loss": 0.9438, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.09025949605114705, |
|
"grad_norm": 1.5010980367660522, |
|
"learning_rate": 1.026718747764327e-06, |
|
"loss": 0.897, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.09213990221887927, |
|
"grad_norm": 1.916372537612915, |
|
"learning_rate": 2.5699269758854715e-07, |
|
"loss": 0.9306, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.0940203083866115, |
|
"grad_norm": 3.343224048614502, |
|
"learning_rate": 0.0, |
|
"loss": 1.0691, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.0940203083866115, |
|
"eval_loss": 0.48943638801574707, |
|
"eval_runtime": 134.0884, |
|
"eval_samples_per_second": 16.705, |
|
"eval_steps_per_second": 4.176, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.988466360614912e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|