{
  "best_metric": 0.7794634103775024,
  "best_model_checkpoint": "miner_id_24/checkpoint-500",
  "epoch": 0.030554876558298704,
  "eval_steps": 50,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 6.11097531165974e-05,
      "eval_loss": 1.0963929891586304,
      "eval_runtime": 283.1419,
      "eval_samples_per_second": 24.334,
      "eval_steps_per_second": 6.085,
      "step": 1
    },
    {
      "epoch": 0.0006110975311659741,
      "grad_norm": 0.1534706950187683,
      "learning_rate": 4.2000000000000004e-05,
      "loss": 1.0834,
      "step": 10
    },
    {
      "epoch": 0.0012221950623319481,
      "grad_norm": 0.1410927176475525,
      "learning_rate": 8.400000000000001e-05,
      "loss": 0.962,
      "step": 20
    },
    {
      "epoch": 0.0018332925934979223,
      "grad_norm": 0.14470386505126953,
      "learning_rate": 0.000126,
      "loss": 1.0437,
      "step": 30
    },
    {
      "epoch": 0.0024443901246638962,
      "grad_norm": 0.1557340919971466,
      "learning_rate": 0.00016800000000000002,
      "loss": 0.9856,
      "step": 40
    },
    {
      "epoch": 0.0030554876558298704,
      "grad_norm": 0.14385275542736053,
      "learning_rate": 0.00021,
      "loss": 1.0426,
      "step": 50
    },
    {
      "epoch": 0.0030554876558298704,
      "eval_loss": 0.9096205830574036,
      "eval_runtime": 283.3102,
      "eval_samples_per_second": 24.32,
      "eval_steps_per_second": 6.082,
      "step": 50
    },
    {
      "epoch": 0.0036665851869958446,
      "grad_norm": 0.1434783935546875,
      "learning_rate": 0.00020974422527728155,
      "loss": 0.8451,
      "step": 60
    },
    {
      "epoch": 0.004277682718161819,
      "grad_norm": 0.13006432354450226,
      "learning_rate": 0.0002089781472178649,
      "loss": 0.8604,
      "step": 70
    },
    {
      "epoch": 0.0048887802493277925,
      "grad_norm": 0.13252924382686615,
      "learning_rate": 0.0002077054980770496,
      "loss": 0.8868,
      "step": 80
    },
    {
      "epoch": 0.005499877780493767,
      "grad_norm": 0.12162353843450546,
      "learning_rate": 0.00020593247807352348,
      "loss": 0.8893,
      "step": 90
    },
    {
      "epoch": 0.006110975311659741,
      "grad_norm": 0.1493881493806839,
      "learning_rate": 0.00020366772518252038,
      "loss": 0.9245,
      "step": 100
    },
    {
      "epoch": 0.006110975311659741,
      "eval_loss": 0.8621656894683838,
      "eval_runtime": 283.2071,
      "eval_samples_per_second": 24.328,
      "eval_steps_per_second": 6.084,
      "step": 100
    },
    {
      "epoch": 0.006722072842825715,
      "grad_norm": 0.12663449347019196,
      "learning_rate": 0.0002009222730524731,
      "loss": 0.8044,
      "step": 110
    },
    {
      "epoch": 0.007333170373991689,
      "grad_norm": 0.13809515535831451,
      "learning_rate": 0.00019770949725018733,
      "loss": 0.852,
      "step": 120
    },
    {
      "epoch": 0.007944267905157664,
      "grad_norm": 0.13587471842765808,
      "learning_rate": 0.00019404505009642473,
      "loss": 0.8401,
      "step": 130
    },
    {
      "epoch": 0.008555365436323637,
      "grad_norm": 0.13504283130168915,
      "learning_rate": 0.0001899467844093695,
      "loss": 0.8743,
      "step": 140
    },
    {
      "epoch": 0.009166462967489611,
      "grad_norm": 0.14619627594947815,
      "learning_rate": 0.00018543466652749268,
      "loss": 0.9113,
      "step": 150
    },
    {
      "epoch": 0.009166462967489611,
      "eval_loss": 0.837764322757721,
      "eval_runtime": 283.8331,
      "eval_samples_per_second": 24.275,
      "eval_steps_per_second": 6.07,
      "step": 150
    },
    {
      "epoch": 0.009777560498655585,
      "grad_norm": 0.12556147575378418,
      "learning_rate": 0.00018053067903555837,
      "loss": 0.7812,
      "step": 160
    },
    {
      "epoch": 0.010388658029821559,
      "grad_norm": 0.13120540976524353,
      "learning_rate": 0.00017525871366768012,
      "loss": 0.8049,
      "step": 170
    },
    {
      "epoch": 0.010999755560987534,
      "grad_norm": 0.1468738466501236,
      "learning_rate": 0.00016964445490919413,
      "loss": 0.8328,
      "step": 180
    },
    {
      "epoch": 0.011610853092153508,
      "grad_norm": 0.13799065351486206,
      "learning_rate": 0.00016371525486442843,
      "loss": 0.8505,
      "step": 190
    },
    {
      "epoch": 0.012221950623319482,
      "grad_norm": 0.1520494669675827,
      "learning_rate": 0.0001575,
      "loss": 0.8542,
      "step": 200
    },
    {
      "epoch": 0.012221950623319482,
      "eval_loss": 0.8198666572570801,
      "eval_runtime": 284.1903,
      "eval_samples_per_second": 24.244,
      "eval_steps_per_second": 6.063,
      "step": 200
    },
    {
      "epoch": 0.012833048154485455,
      "grad_norm": 0.13851197063922882,
      "learning_rate": 0.00015102897041285315,
      "loss": 0.8161,
      "step": 210
    },
    {
      "epoch": 0.01344414568565143,
      "grad_norm": 0.14233830571174622,
      "learning_rate": 0.00014433369230867077,
      "loss": 0.7929,
      "step": 220
    },
    {
      "epoch": 0.014055243216817405,
      "grad_norm": 0.13967365026474,
      "learning_rate": 0.0001374467844093695,
      "loss": 0.8082,
      "step": 230
    },
    {
      "epoch": 0.014666340747983378,
      "grad_norm": 0.14153410494327545,
      "learning_rate": 0.0001304017990379651,
      "loss": 0.787,
      "step": 240
    },
    {
      "epoch": 0.015277438279149352,
      "grad_norm": 0.15393155813217163,
      "learning_rate": 0.0001232330586550277,
      "loss": 0.8947,
      "step": 250
    },
    {
      "epoch": 0.015277438279149352,
      "eval_loss": 0.8052677512168884,
      "eval_runtime": 283.3575,
      "eval_samples_per_second": 24.316,
      "eval_steps_per_second": 6.081,
      "step": 250
    },
    {
      "epoch": 0.015888535810315327,
      "grad_norm": 0.15014930069446564,
      "learning_rate": 0.00011597548864310363,
      "loss": 0.7755,
      "step": 260
    },
    {
      "epoch": 0.0164996333414813,
      "grad_norm": 0.14606203138828278,
      "learning_rate": 0.00010866444715376263,
      "loss": 0.7551,
      "step": 270
    },
    {
      "epoch": 0.017110730872647275,
      "grad_norm": 0.14750610291957855,
      "learning_rate": 0.00010133555284623744,
      "loss": 0.81,
      "step": 280
    },
    {
      "epoch": 0.01772182840381325,
      "grad_norm": 0.18205717206001282,
      "learning_rate": 9.402451135689641e-05,
      "loss": 0.804,
      "step": 290
    },
    {
      "epoch": 0.018332925934979222,
      "grad_norm": 0.14796751737594604,
      "learning_rate": 8.676694134497232e-05,
      "loss": 0.872,
      "step": 300
    },
    {
      "epoch": 0.018332925934979222,
      "eval_loss": 0.7966746091842651,
      "eval_runtime": 282.5751,
      "eval_samples_per_second": 24.383,
      "eval_steps_per_second": 6.097,
      "step": 300
    },
    {
      "epoch": 0.018944023466145196,
      "grad_norm": 0.13817603886127472,
      "learning_rate": 7.95982009620349e-05,
      "loss": 0.746,
      "step": 310
    },
    {
      "epoch": 0.01955512099731117,
      "grad_norm": 0.14924216270446777,
      "learning_rate": 7.255321559063053e-05,
      "loss": 0.7323,
      "step": 320
    },
    {
      "epoch": 0.020166218528477144,
      "grad_norm": 0.14804905652999878,
      "learning_rate": 6.566630769132923e-05,
      "loss": 0.8057,
      "step": 330
    },
    {
      "epoch": 0.020777316059643117,
      "grad_norm": 0.1513608694076538,
      "learning_rate": 5.897102958714686e-05,
      "loss": 0.781,
      "step": 340
    },
    {
      "epoch": 0.021388413590809095,
      "grad_norm": 0.176563560962677,
      "learning_rate": 5.250000000000002e-05,
      "loss": 0.8638,
      "step": 350
    },
    {
      "epoch": 0.021388413590809095,
      "eval_loss": 0.7875542640686035,
      "eval_runtime": 283.0303,
      "eval_samples_per_second": 24.344,
      "eval_steps_per_second": 6.088,
      "step": 350
    },
    {
      "epoch": 0.02199951112197507,
      "grad_norm": 0.13537642359733582,
      "learning_rate": 4.62847451355716e-05,
      "loss": 0.7125,
      "step": 360
    },
    {
      "epoch": 0.022610608653141042,
      "grad_norm": 0.15358619391918182,
      "learning_rate": 4.035554509080588e-05,
      "loss": 0.7337,
      "step": 370
    },
    {
      "epoch": 0.023221706184307016,
      "grad_norm": 0.16124075651168823,
      "learning_rate": 3.474128633231992e-05,
      "loss": 0.7682,
      "step": 380
    },
    {
      "epoch": 0.02383280371547299,
      "grad_norm": 0.15972477197647095,
      "learning_rate": 2.946932096444165e-05,
      "loss": 0.7327,
      "step": 390
    },
    {
      "epoch": 0.024443901246638963,
      "grad_norm": 0.16827918589115143,
      "learning_rate": 2.456533347250732e-05,
      "loss": 0.819,
      "step": 400
    },
    {
      "epoch": 0.024443901246638963,
      "eval_loss": 0.7825099229812622,
      "eval_runtime": 284.0898,
      "eval_samples_per_second": 24.253,
      "eval_steps_per_second": 6.065,
      "step": 400
    },
    {
      "epoch": 0.025054998777804937,
      "grad_norm": 0.1699787676334381,
      "learning_rate": 2.005321559063053e-05,
      "loss": 0.7788,
      "step": 410
    },
    {
      "epoch": 0.02566609630897091,
      "grad_norm": 0.16614125669002533,
      "learning_rate": 1.5954949903575276e-05,
      "loss": 0.7578,
      "step": 420
    },
    {
      "epoch": 0.026277193840136884,
      "grad_norm": 0.1688673496246338,
      "learning_rate": 1.2290502749812666e-05,
      "loss": 0.7544,
      "step": 430
    },
    {
      "epoch": 0.02688829137130286,
      "grad_norm": 0.16808409988880157,
      "learning_rate": 9.077726947526898e-06,
      "loss": 0.8112,
      "step": 440
    },
    {
      "epoch": 0.027499388902468835,
      "grad_norm": 0.17651791870594025,
      "learning_rate": 6.332274817479627e-06,
      "loss": 0.8681,
      "step": 450
    },
    {
      "epoch": 0.027499388902468835,
      "eval_loss": 0.7797022461891174,
      "eval_runtime": 283.4645,
      "eval_samples_per_second": 24.306,
      "eval_steps_per_second": 6.078,
      "step": 450
    },
    {
      "epoch": 0.02811048643363481,
      "grad_norm": 0.15380319952964783,
      "learning_rate": 4.067521926476516e-06,
      "loss": 0.7337,
      "step": 460
    },
    {
      "epoch": 0.028721583964800783,
      "grad_norm": 0.1493958830833435,
      "learning_rate": 2.294501922950403e-06,
      "loss": 0.7669,
      "step": 470
    },
    {
      "epoch": 0.029332681495966757,
      "grad_norm": 0.1568552553653717,
      "learning_rate": 1.021852782135112e-06,
      "loss": 0.8023,
      "step": 480
    },
    {
      "epoch": 0.02994377902713273,
      "grad_norm": 0.15516310930252075,
      "learning_rate": 2.5577472271845927e-07,
      "loss": 0.7542,
      "step": 490
    },
    {
      "epoch": 0.030554876558298704,
      "grad_norm": 0.15687133371829987,
      "learning_rate": 0.0,
      "loss": 0.8238,
      "step": 500
    },
    {
      "epoch": 0.030554876558298704,
      "eval_loss": 0.7794634103775024,
      "eval_runtime": 283.3578,
      "eval_samples_per_second": 24.316,
      "eval_steps_per_second": 6.081,
      "step": 500
    }
  ],
  "logging_steps": 10,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.130095680926515e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}