lesso10's picture
Training in progress, step 500, checkpoint
6d69296 verified
{
"best_metric": 0.7794634103775024,
"best_model_checkpoint": "miner_id_24/checkpoint-500",
"epoch": 0.030554876558298704,
"eval_steps": 50,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 6.11097531165974e-05,
"eval_loss": 1.0963929891586304,
"eval_runtime": 283.1419,
"eval_samples_per_second": 24.334,
"eval_steps_per_second": 6.085,
"step": 1
},
{
"epoch": 0.0006110975311659741,
"grad_norm": 0.1534706950187683,
"learning_rate": 4.2000000000000004e-05,
"loss": 1.0834,
"step": 10
},
{
"epoch": 0.0012221950623319481,
"grad_norm": 0.1410927176475525,
"learning_rate": 8.400000000000001e-05,
"loss": 0.962,
"step": 20
},
{
"epoch": 0.0018332925934979223,
"grad_norm": 0.14470386505126953,
"learning_rate": 0.000126,
"loss": 1.0437,
"step": 30
},
{
"epoch": 0.0024443901246638962,
"grad_norm": 0.1557340919971466,
"learning_rate": 0.00016800000000000002,
"loss": 0.9856,
"step": 40
},
{
"epoch": 0.0030554876558298704,
"grad_norm": 0.14385275542736053,
"learning_rate": 0.00021,
"loss": 1.0426,
"step": 50
},
{
"epoch": 0.0030554876558298704,
"eval_loss": 0.9096205830574036,
"eval_runtime": 283.3102,
"eval_samples_per_second": 24.32,
"eval_steps_per_second": 6.082,
"step": 50
},
{
"epoch": 0.0036665851869958446,
"grad_norm": 0.1434783935546875,
"learning_rate": 0.00020974422527728155,
"loss": 0.8451,
"step": 60
},
{
"epoch": 0.004277682718161819,
"grad_norm": 0.13006432354450226,
"learning_rate": 0.0002089781472178649,
"loss": 0.8604,
"step": 70
},
{
"epoch": 0.0048887802493277925,
"grad_norm": 0.13252924382686615,
"learning_rate": 0.0002077054980770496,
"loss": 0.8868,
"step": 80
},
{
"epoch": 0.005499877780493767,
"grad_norm": 0.12162353843450546,
"learning_rate": 0.00020593247807352348,
"loss": 0.8893,
"step": 90
},
{
"epoch": 0.006110975311659741,
"grad_norm": 0.1493881493806839,
"learning_rate": 0.00020366772518252038,
"loss": 0.9245,
"step": 100
},
{
"epoch": 0.006110975311659741,
"eval_loss": 0.8621656894683838,
"eval_runtime": 283.2071,
"eval_samples_per_second": 24.328,
"eval_steps_per_second": 6.084,
"step": 100
},
{
"epoch": 0.006722072842825715,
"grad_norm": 0.12663449347019196,
"learning_rate": 0.0002009222730524731,
"loss": 0.8044,
"step": 110
},
{
"epoch": 0.007333170373991689,
"grad_norm": 0.13809515535831451,
"learning_rate": 0.00019770949725018733,
"loss": 0.852,
"step": 120
},
{
"epoch": 0.007944267905157664,
"grad_norm": 0.13587471842765808,
"learning_rate": 0.00019404505009642473,
"loss": 0.8401,
"step": 130
},
{
"epoch": 0.008555365436323637,
"grad_norm": 0.13504283130168915,
"learning_rate": 0.0001899467844093695,
"loss": 0.8743,
"step": 140
},
{
"epoch": 0.009166462967489611,
"grad_norm": 0.14619627594947815,
"learning_rate": 0.00018543466652749268,
"loss": 0.9113,
"step": 150
},
{
"epoch": 0.009166462967489611,
"eval_loss": 0.837764322757721,
"eval_runtime": 283.8331,
"eval_samples_per_second": 24.275,
"eval_steps_per_second": 6.07,
"step": 150
},
{
"epoch": 0.009777560498655585,
"grad_norm": 0.12556147575378418,
"learning_rate": 0.00018053067903555837,
"loss": 0.7812,
"step": 160
},
{
"epoch": 0.010388658029821559,
"grad_norm": 0.13120540976524353,
"learning_rate": 0.00017525871366768012,
"loss": 0.8049,
"step": 170
},
{
"epoch": 0.010999755560987534,
"grad_norm": 0.1468738466501236,
"learning_rate": 0.00016964445490919413,
"loss": 0.8328,
"step": 180
},
{
"epoch": 0.011610853092153508,
"grad_norm": 0.13799065351486206,
"learning_rate": 0.00016371525486442843,
"loss": 0.8505,
"step": 190
},
{
"epoch": 0.012221950623319482,
"grad_norm": 0.1520494669675827,
"learning_rate": 0.0001575,
"loss": 0.8542,
"step": 200
},
{
"epoch": 0.012221950623319482,
"eval_loss": 0.8198666572570801,
"eval_runtime": 284.1903,
"eval_samples_per_second": 24.244,
"eval_steps_per_second": 6.063,
"step": 200
},
{
"epoch": 0.012833048154485455,
"grad_norm": 0.13851197063922882,
"learning_rate": 0.00015102897041285315,
"loss": 0.8161,
"step": 210
},
{
"epoch": 0.01344414568565143,
"grad_norm": 0.14233830571174622,
"learning_rate": 0.00014433369230867077,
"loss": 0.7929,
"step": 220
},
{
"epoch": 0.014055243216817405,
"grad_norm": 0.13967365026474,
"learning_rate": 0.0001374467844093695,
"loss": 0.8082,
"step": 230
},
{
"epoch": 0.014666340747983378,
"grad_norm": 0.14153410494327545,
"learning_rate": 0.0001304017990379651,
"loss": 0.787,
"step": 240
},
{
"epoch": 0.015277438279149352,
"grad_norm": 0.15393155813217163,
"learning_rate": 0.0001232330586550277,
"loss": 0.8947,
"step": 250
},
{
"epoch": 0.015277438279149352,
"eval_loss": 0.8052677512168884,
"eval_runtime": 283.3575,
"eval_samples_per_second": 24.316,
"eval_steps_per_second": 6.081,
"step": 250
},
{
"epoch": 0.015888535810315327,
"grad_norm": 0.15014930069446564,
"learning_rate": 0.00011597548864310363,
"loss": 0.7755,
"step": 260
},
{
"epoch": 0.0164996333414813,
"grad_norm": 0.14606203138828278,
"learning_rate": 0.00010866444715376263,
"loss": 0.7551,
"step": 270
},
{
"epoch": 0.017110730872647275,
"grad_norm": 0.14750610291957855,
"learning_rate": 0.00010133555284623744,
"loss": 0.81,
"step": 280
},
{
"epoch": 0.01772182840381325,
"grad_norm": 0.18205717206001282,
"learning_rate": 9.402451135689641e-05,
"loss": 0.804,
"step": 290
},
{
"epoch": 0.018332925934979222,
"grad_norm": 0.14796751737594604,
"learning_rate": 8.676694134497232e-05,
"loss": 0.872,
"step": 300
},
{
"epoch": 0.018332925934979222,
"eval_loss": 0.7966746091842651,
"eval_runtime": 282.5751,
"eval_samples_per_second": 24.383,
"eval_steps_per_second": 6.097,
"step": 300
},
{
"epoch": 0.018944023466145196,
"grad_norm": 0.13817603886127472,
"learning_rate": 7.95982009620349e-05,
"loss": 0.746,
"step": 310
},
{
"epoch": 0.01955512099731117,
"grad_norm": 0.14924216270446777,
"learning_rate": 7.255321559063053e-05,
"loss": 0.7323,
"step": 320
},
{
"epoch": 0.020166218528477144,
"grad_norm": 0.14804905652999878,
"learning_rate": 6.566630769132923e-05,
"loss": 0.8057,
"step": 330
},
{
"epoch": 0.020777316059643117,
"grad_norm": 0.1513608694076538,
"learning_rate": 5.897102958714686e-05,
"loss": 0.781,
"step": 340
},
{
"epoch": 0.021388413590809095,
"grad_norm": 0.176563560962677,
"learning_rate": 5.250000000000002e-05,
"loss": 0.8638,
"step": 350
},
{
"epoch": 0.021388413590809095,
"eval_loss": 0.7875542640686035,
"eval_runtime": 283.0303,
"eval_samples_per_second": 24.344,
"eval_steps_per_second": 6.088,
"step": 350
},
{
"epoch": 0.02199951112197507,
"grad_norm": 0.13537642359733582,
"learning_rate": 4.62847451355716e-05,
"loss": 0.7125,
"step": 360
},
{
"epoch": 0.022610608653141042,
"grad_norm": 0.15358619391918182,
"learning_rate": 4.035554509080588e-05,
"loss": 0.7337,
"step": 370
},
{
"epoch": 0.023221706184307016,
"grad_norm": 0.16124075651168823,
"learning_rate": 3.474128633231992e-05,
"loss": 0.7682,
"step": 380
},
{
"epoch": 0.02383280371547299,
"grad_norm": 0.15972477197647095,
"learning_rate": 2.946932096444165e-05,
"loss": 0.7327,
"step": 390
},
{
"epoch": 0.024443901246638963,
"grad_norm": 0.16827918589115143,
"learning_rate": 2.456533347250732e-05,
"loss": 0.819,
"step": 400
},
{
"epoch": 0.024443901246638963,
"eval_loss": 0.7825099229812622,
"eval_runtime": 284.0898,
"eval_samples_per_second": 24.253,
"eval_steps_per_second": 6.065,
"step": 400
},
{
"epoch": 0.025054998777804937,
"grad_norm": 0.1699787676334381,
"learning_rate": 2.005321559063053e-05,
"loss": 0.7788,
"step": 410
},
{
"epoch": 0.02566609630897091,
"grad_norm": 0.16614125669002533,
"learning_rate": 1.5954949903575276e-05,
"loss": 0.7578,
"step": 420
},
{
"epoch": 0.026277193840136884,
"grad_norm": 0.1688673496246338,
"learning_rate": 1.2290502749812666e-05,
"loss": 0.7544,
"step": 430
},
{
"epoch": 0.02688829137130286,
"grad_norm": 0.16808409988880157,
"learning_rate": 9.077726947526898e-06,
"loss": 0.8112,
"step": 440
},
{
"epoch": 0.027499388902468835,
"grad_norm": 0.17651791870594025,
"learning_rate": 6.332274817479627e-06,
"loss": 0.8681,
"step": 450
},
{
"epoch": 0.027499388902468835,
"eval_loss": 0.7797022461891174,
"eval_runtime": 283.4645,
"eval_samples_per_second": 24.306,
"eval_steps_per_second": 6.078,
"step": 450
},
{
"epoch": 0.02811048643363481,
"grad_norm": 0.15380319952964783,
"learning_rate": 4.067521926476516e-06,
"loss": 0.7337,
"step": 460
},
{
"epoch": 0.028721583964800783,
"grad_norm": 0.1493958830833435,
"learning_rate": 2.294501922950403e-06,
"loss": 0.7669,
"step": 470
},
{
"epoch": 0.029332681495966757,
"grad_norm": 0.1568552553653717,
"learning_rate": 1.021852782135112e-06,
"loss": 0.8023,
"step": 480
},
{
"epoch": 0.02994377902713273,
"grad_norm": 0.15516310930252075,
"learning_rate": 2.5577472271845927e-07,
"loss": 0.7542,
"step": 490
},
{
"epoch": 0.030554876558298704,
"grad_norm": 0.15687133371829987,
"learning_rate": 0.0,
"loss": 0.8238,
"step": 500
},
{
"epoch": 0.030554876558298704,
"eval_loss": 0.7794634103775024,
"eval_runtime": 283.3578,
"eval_samples_per_second": 24.316,
"eval_steps_per_second": 6.081,
"step": 500
}
],
"logging_steps": 10,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.130095680926515e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}