|
{ |
|
"best_metric": 1.5858772993087769, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.01620929440941436, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 3.241858881882872e-05, |
|
"eval_loss": 2.906552791595459, |
|
"eval_runtime": 397.3517, |
|
"eval_samples_per_second": 32.686, |
|
"eval_steps_per_second": 8.172, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0003241858881882872, |
|
"grad_norm": 6.263719081878662, |
|
"learning_rate": 4.1400000000000003e-05, |
|
"loss": 2.7332, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0006483717763765744, |
|
"grad_norm": 5.530974388122559, |
|
"learning_rate": 8.280000000000001e-05, |
|
"loss": 1.9108, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0009725576645648615, |
|
"grad_norm": 3.9441070556640625, |
|
"learning_rate": 0.00012419999999999998, |
|
"loss": 1.8745, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0012967435527531487, |
|
"grad_norm": 4.260671138763428, |
|
"learning_rate": 0.00016560000000000001, |
|
"loss": 1.7333, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0016209294409414358, |
|
"grad_norm": 4.642955303192139, |
|
"learning_rate": 0.000207, |
|
"loss": 1.6797, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0016209294409414358, |
|
"eval_loss": 1.8519707918167114, |
|
"eval_runtime": 398.2732, |
|
"eval_samples_per_second": 32.611, |
|
"eval_steps_per_second": 8.153, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.001945115329129723, |
|
"grad_norm": 4.400924205780029, |
|
"learning_rate": 0.00020674787920189178, |
|
"loss": 1.868, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.00226930121731801, |
|
"grad_norm": 5.454312324523926, |
|
"learning_rate": 0.00020599274511475253, |
|
"loss": 2.0069, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0025934871055062975, |
|
"grad_norm": 4.467811107635498, |
|
"learning_rate": 0.00020473827667594888, |
|
"loss": 1.7519, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0029176729936945846, |
|
"grad_norm": 4.802621841430664, |
|
"learning_rate": 0.00020299058552961598, |
|
"loss": 1.6975, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0032418588818828717, |
|
"grad_norm": 4.396114826202393, |
|
"learning_rate": 0.00020075818625134152, |
|
"loss": 1.6838, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0032418588818828717, |
|
"eval_loss": 1.8345727920532227, |
|
"eval_runtime": 398.358, |
|
"eval_samples_per_second": 32.604, |
|
"eval_steps_per_second": 8.151, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0035660447700711587, |
|
"grad_norm": 3.908578872680664, |
|
"learning_rate": 0.00019805195486600916, |
|
"loss": 2.0062, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.003890230658259446, |
|
"grad_norm": 3.713029623031616, |
|
"learning_rate": 0.00019488507586089894, |
|
"loss": 1.7925, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.004214416546447733, |
|
"grad_norm": 4.107682704925537, |
|
"learning_rate": 0.00019127297795219008, |
|
"loss": 1.8635, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.00453860243463602, |
|
"grad_norm": 5.115502834320068, |
|
"learning_rate": 0.00018723325891780706, |
|
"loss": 1.8397, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.004862788322824307, |
|
"grad_norm": 4.171757221221924, |
|
"learning_rate": 0.0001827855998628142, |
|
"loss": 1.8328, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.004862788322824307, |
|
"eval_loss": 1.7966933250427246, |
|
"eval_runtime": 397.1601, |
|
"eval_samples_per_second": 32.702, |
|
"eval_steps_per_second": 8.176, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.005186974211012595, |
|
"grad_norm": 3.389646053314209, |
|
"learning_rate": 0.0001779516693350504, |
|
"loss": 1.8123, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.005511160099200882, |
|
"grad_norm": 4.33231258392334, |
|
"learning_rate": 0.00017275501775814182, |
|
"loss": 1.8088, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.005835345987389169, |
|
"grad_norm": 4.176562309265137, |
|
"learning_rate": 0.00016722096269620562, |
|
"loss": 1.7369, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.006159531875577456, |
|
"grad_norm": 3.4682700634002686, |
|
"learning_rate": 0.00016137646550922228, |
|
"loss": 1.6268, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.006483717763765743, |
|
"grad_norm": 4.2149200439453125, |
|
"learning_rate": 0.00015525, |
|
"loss": 1.6822, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.006483717763765743, |
|
"eval_loss": 1.7772059440612793, |
|
"eval_runtime": 397.9979, |
|
"eval_samples_per_second": 32.633, |
|
"eval_steps_per_second": 8.158, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.00680790365195403, |
|
"grad_norm": 2.8696839809417725, |
|
"learning_rate": 0.0001488714136926695, |
|
"loss": 1.8906, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.0071320895401423175, |
|
"grad_norm": 3.76540207862854, |
|
"learning_rate": 0.0001422717824185469, |
|
"loss": 1.7974, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.0074562754283306045, |
|
"grad_norm": 4.557528972625732, |
|
"learning_rate": 0.00013548325891780705, |
|
"loss": 1.8021, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.007780461316518892, |
|
"grad_norm": 4.325812339782715, |
|
"learning_rate": 0.0001285389161945656, |
|
"loss": 1.7236, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.00810464720470718, |
|
"grad_norm": 3.879993200302124, |
|
"learning_rate": 0.0001214725863885273, |
|
"loss": 1.7255, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.00810464720470718, |
|
"eval_loss": 1.7434029579162598, |
|
"eval_runtime": 397.2014, |
|
"eval_samples_per_second": 32.699, |
|
"eval_steps_per_second": 8.175, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.008428833092895466, |
|
"grad_norm": 3.901002883911133, |
|
"learning_rate": 0.00011431869594820213, |
|
"loss": 1.7344, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.008753018981083754, |
|
"grad_norm": 4.044349670410156, |
|
"learning_rate": 0.00010711209790870886, |
|
"loss": 1.576, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.00907720486927204, |
|
"grad_norm": 3.8595011234283447, |
|
"learning_rate": 9.988790209129117e-05, |
|
"loss": 1.802, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.009401390757460328, |
|
"grad_norm": 3.7126569747924805, |
|
"learning_rate": 9.268130405179787e-05, |
|
"loss": 1.6513, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.009725576645648614, |
|
"grad_norm": 4.118965148925781, |
|
"learning_rate": 8.55274136114727e-05, |
|
"loss": 1.5466, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.009725576645648614, |
|
"eval_loss": 1.6769126653671265, |
|
"eval_runtime": 399.0787, |
|
"eval_samples_per_second": 32.545, |
|
"eval_steps_per_second": 8.136, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.010049762533836902, |
|
"grad_norm": 3.541172981262207, |
|
"learning_rate": 7.84610838054344e-05, |
|
"loss": 1.8639, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.01037394842202519, |
|
"grad_norm": 4.0853753089904785, |
|
"learning_rate": 7.151674108219295e-05, |
|
"loss": 1.6957, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.010698134310213476, |
|
"grad_norm": 3.440962076187134, |
|
"learning_rate": 6.472821758145309e-05, |
|
"loss": 1.6691, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.011022320198401764, |
|
"grad_norm": 3.6532485485076904, |
|
"learning_rate": 5.8128586307330475e-05, |
|
"loss": 1.5867, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.01134650608659005, |
|
"grad_norm": 3.979214906692505, |
|
"learning_rate": 5.175000000000002e-05, |
|
"loss": 1.5499, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.01134650608659005, |
|
"eval_loss": 1.6310955286026, |
|
"eval_runtime": 398.0556, |
|
"eval_samples_per_second": 32.629, |
|
"eval_steps_per_second": 8.157, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.011670691974778338, |
|
"grad_norm": 3.600574254989624, |
|
"learning_rate": 4.5623534490777714e-05, |
|
"loss": 1.8409, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.011994877862966625, |
|
"grad_norm": 3.5220589637756348, |
|
"learning_rate": 3.9779037303794365e-05, |
|
"loss": 1.5889, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.012319063751154912, |
|
"grad_norm": 3.7355663776397705, |
|
"learning_rate": 3.42449822418582e-05, |
|
"loss": 1.5124, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.012643249639343199, |
|
"grad_norm": 3.8457112312316895, |
|
"learning_rate": 2.9048330664949622e-05, |
|
"loss": 1.5491, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.012967435527531487, |
|
"grad_norm": 3.4412333965301514, |
|
"learning_rate": 2.4214400137185785e-05, |
|
"loss": 1.549, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.012967435527531487, |
|
"eval_loss": 1.606103539466858, |
|
"eval_runtime": 396.0436, |
|
"eval_samples_per_second": 32.794, |
|
"eval_steps_per_second": 8.199, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.013291621415719775, |
|
"grad_norm": 3.6701438426971436, |
|
"learning_rate": 1.976674108219295e-05, |
|
"loss": 1.6744, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.01361580730390806, |
|
"grad_norm": 4.545055389404297, |
|
"learning_rate": 1.572702204780991e-05, |
|
"loss": 1.6089, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.013939993192096349, |
|
"grad_norm": 3.4341204166412354, |
|
"learning_rate": 1.2114924139101056e-05, |
|
"loss": 1.5792, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.014264179080284635, |
|
"grad_norm": 3.332812547683716, |
|
"learning_rate": 8.948045133990798e-06, |
|
"loss": 1.5303, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.014588364968472923, |
|
"grad_norm": 3.597540855407715, |
|
"learning_rate": 6.241813748658489e-06, |
|
"loss": 1.5332, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.014588364968472923, |
|
"eval_loss": 1.5877035856246948, |
|
"eval_runtime": 397.7506, |
|
"eval_samples_per_second": 32.654, |
|
"eval_steps_per_second": 8.163, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.014912550856661209, |
|
"grad_norm": 3.2884576320648193, |
|
"learning_rate": 4.009414470383994e-06, |
|
"loss": 1.8572, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.015236736744849497, |
|
"grad_norm": 3.6394126415252686, |
|
"learning_rate": 2.261723324051111e-06, |
|
"loss": 1.7304, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.015560922633037783, |
|
"grad_norm": 2.9843626022338867, |
|
"learning_rate": 1.0072548852474675e-06, |
|
"loss": 1.631, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.01588510852122607, |
|
"grad_norm": 3.780142068862915, |
|
"learning_rate": 2.5212079810819554e-07, |
|
"loss": 1.2314, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.01620929440941436, |
|
"grad_norm": 3.420269012451172, |
|
"learning_rate": 0.0, |
|
"loss": 1.4883, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.01620929440941436, |
|
"eval_loss": 1.5858772993087769, |
|
"eval_runtime": 398.3709, |
|
"eval_samples_per_second": 32.603, |
|
"eval_steps_per_second": 8.151, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.789971686378701e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|