lesso's picture
Training in progress, step 100, checkpoint
735efb4 verified
raw
history blame
18.8 kB
{
"best_metric": 0.6350612640380859,
"best_model_checkpoint": "miner_id_24/checkpoint-100",
"epoch": 0.10085728693898134,
"eval_steps": 50,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010085728693898135,
"grad_norm": 1.3238667249679565,
"learning_rate": 1.013e-05,
"loss": 0.928,
"step": 1
},
{
"epoch": 0.0010085728693898135,
"eval_loss": 1.1040126085281372,
"eval_runtime": 101.7212,
"eval_samples_per_second": 4.109,
"eval_steps_per_second": 1.032,
"step": 1
},
{
"epoch": 0.002017145738779627,
"grad_norm": 1.4647407531738281,
"learning_rate": 2.026e-05,
"loss": 1.1291,
"step": 2
},
{
"epoch": 0.0030257186081694403,
"grad_norm": 1.7250173091888428,
"learning_rate": 3.039e-05,
"loss": 1.0937,
"step": 3
},
{
"epoch": 0.004034291477559254,
"grad_norm": 1.4756126403808594,
"learning_rate": 4.052e-05,
"loss": 1.0637,
"step": 4
},
{
"epoch": 0.005042864346949067,
"grad_norm": 1.4971727132797241,
"learning_rate": 5.065e-05,
"loss": 1.1541,
"step": 5
},
{
"epoch": 0.006051437216338881,
"grad_norm": 1.1345500946044922,
"learning_rate": 6.078e-05,
"loss": 0.8789,
"step": 6
},
{
"epoch": 0.0070600100857286935,
"grad_norm": 1.1694258451461792,
"learning_rate": 7.091e-05,
"loss": 0.7942,
"step": 7
},
{
"epoch": 0.008068582955118508,
"grad_norm": 0.9738374948501587,
"learning_rate": 8.104e-05,
"loss": 0.7897,
"step": 8
},
{
"epoch": 0.009077155824508321,
"grad_norm": 1.2070674896240234,
"learning_rate": 9.117e-05,
"loss": 0.8907,
"step": 9
},
{
"epoch": 0.010085728693898134,
"grad_norm": 0.9936267733573914,
"learning_rate": 0.0001013,
"loss": 0.7356,
"step": 10
},
{
"epoch": 0.011094301563287948,
"grad_norm": 0.9822749495506287,
"learning_rate": 0.00010076684210526316,
"loss": 0.787,
"step": 11
},
{
"epoch": 0.012102874432677761,
"grad_norm": 0.9206593632698059,
"learning_rate": 0.0001002336842105263,
"loss": 0.7268,
"step": 12
},
{
"epoch": 0.013111447302067574,
"grad_norm": 0.8007137179374695,
"learning_rate": 9.970052631578946e-05,
"loss": 0.6785,
"step": 13
},
{
"epoch": 0.014120020171457387,
"grad_norm": 0.7873572707176208,
"learning_rate": 9.916736842105263e-05,
"loss": 0.65,
"step": 14
},
{
"epoch": 0.015128593040847202,
"grad_norm": 0.8010468482971191,
"learning_rate": 9.863421052631579e-05,
"loss": 0.6508,
"step": 15
},
{
"epoch": 0.016137165910237016,
"grad_norm": 0.7741969227790833,
"learning_rate": 9.810105263157895e-05,
"loss": 0.5805,
"step": 16
},
{
"epoch": 0.01714573877962683,
"grad_norm": 0.7493349313735962,
"learning_rate": 9.756789473684211e-05,
"loss": 0.564,
"step": 17
},
{
"epoch": 0.018154311649016642,
"grad_norm": 0.9663587212562561,
"learning_rate": 9.703473684210525e-05,
"loss": 0.808,
"step": 18
},
{
"epoch": 0.019162884518406455,
"grad_norm": 0.7793949246406555,
"learning_rate": 9.650157894736842e-05,
"loss": 0.5491,
"step": 19
},
{
"epoch": 0.020171457387796268,
"grad_norm": 0.7455626726150513,
"learning_rate": 9.596842105263158e-05,
"loss": 0.6404,
"step": 20
},
{
"epoch": 0.02118003025718608,
"grad_norm": 0.7438361048698425,
"learning_rate": 9.543526315789474e-05,
"loss": 0.6354,
"step": 21
},
{
"epoch": 0.022188603126575897,
"grad_norm": 0.740470826625824,
"learning_rate": 9.49021052631579e-05,
"loss": 0.5225,
"step": 22
},
{
"epoch": 0.02319717599596571,
"grad_norm": 0.8656465411186218,
"learning_rate": 9.436894736842105e-05,
"loss": 0.7408,
"step": 23
},
{
"epoch": 0.024205748865355523,
"grad_norm": 0.9166726469993591,
"learning_rate": 9.38357894736842e-05,
"loss": 0.6548,
"step": 24
},
{
"epoch": 0.025214321734745335,
"grad_norm": 0.8170962333679199,
"learning_rate": 9.330263157894737e-05,
"loss": 0.654,
"step": 25
},
{
"epoch": 0.026222894604135148,
"grad_norm": 0.7673491835594177,
"learning_rate": 9.276947368421051e-05,
"loss": 0.6428,
"step": 26
},
{
"epoch": 0.02723146747352496,
"grad_norm": 0.8028613924980164,
"learning_rate": 9.223631578947369e-05,
"loss": 0.5204,
"step": 27
},
{
"epoch": 0.028240040342914774,
"grad_norm": 0.8541864156723022,
"learning_rate": 9.170315789473684e-05,
"loss": 0.6858,
"step": 28
},
{
"epoch": 0.02924861321230459,
"grad_norm": 0.8092413544654846,
"learning_rate": 9.117e-05,
"loss": 0.6384,
"step": 29
},
{
"epoch": 0.030257186081694403,
"grad_norm": 0.9186341762542725,
"learning_rate": 9.063684210526316e-05,
"loss": 0.7882,
"step": 30
},
{
"epoch": 0.031265758951084216,
"grad_norm": 0.9697185158729553,
"learning_rate": 9.010368421052632e-05,
"loss": 0.799,
"step": 31
},
{
"epoch": 0.03227433182047403,
"grad_norm": 0.7349818348884583,
"learning_rate": 8.957052631578946e-05,
"loss": 0.6457,
"step": 32
},
{
"epoch": 0.03328290468986384,
"grad_norm": 0.843620240688324,
"learning_rate": 8.903736842105263e-05,
"loss": 0.7368,
"step": 33
},
{
"epoch": 0.03429147755925366,
"grad_norm": 0.8225664496421814,
"learning_rate": 8.850421052631579e-05,
"loss": 0.7159,
"step": 34
},
{
"epoch": 0.03530005042864347,
"grad_norm": 0.9680421352386475,
"learning_rate": 8.797105263157895e-05,
"loss": 0.8112,
"step": 35
},
{
"epoch": 0.036308623298033284,
"grad_norm": 0.9144193530082703,
"learning_rate": 8.743789473684211e-05,
"loss": 0.7829,
"step": 36
},
{
"epoch": 0.03731719616742309,
"grad_norm": 0.8283833265304565,
"learning_rate": 8.690473684210526e-05,
"loss": 0.7398,
"step": 37
},
{
"epoch": 0.03832576903681291,
"grad_norm": 0.7781999707221985,
"learning_rate": 8.637157894736842e-05,
"loss": 0.7255,
"step": 38
},
{
"epoch": 0.039334341906202726,
"grad_norm": 0.7448036074638367,
"learning_rate": 8.583842105263158e-05,
"loss": 0.6482,
"step": 39
},
{
"epoch": 0.040342914775592535,
"grad_norm": 0.7694168090820312,
"learning_rate": 8.530526315789472e-05,
"loss": 0.6282,
"step": 40
},
{
"epoch": 0.04135148764498235,
"grad_norm": 0.9006367325782776,
"learning_rate": 8.47721052631579e-05,
"loss": 0.7293,
"step": 41
},
{
"epoch": 0.04236006051437216,
"grad_norm": 0.9051785469055176,
"learning_rate": 8.423894736842105e-05,
"loss": 0.8256,
"step": 42
},
{
"epoch": 0.04336863338376198,
"grad_norm": 0.9707128405570984,
"learning_rate": 8.37057894736842e-05,
"loss": 0.7974,
"step": 43
},
{
"epoch": 0.044377206253151794,
"grad_norm": 0.885073721408844,
"learning_rate": 8.317263157894737e-05,
"loss": 0.7655,
"step": 44
},
{
"epoch": 0.0453857791225416,
"grad_norm": 0.9013693928718567,
"learning_rate": 8.263947368421053e-05,
"loss": 0.7205,
"step": 45
},
{
"epoch": 0.04639435199193142,
"grad_norm": 1.1316449642181396,
"learning_rate": 8.210631578947368e-05,
"loss": 0.7543,
"step": 46
},
{
"epoch": 0.04740292486132123,
"grad_norm": 0.8760470151901245,
"learning_rate": 8.157315789473684e-05,
"loss": 0.7771,
"step": 47
},
{
"epoch": 0.048411497730711045,
"grad_norm": 1.0497716665267944,
"learning_rate": 8.104e-05,
"loss": 0.8032,
"step": 48
},
{
"epoch": 0.049420070600100854,
"grad_norm": 0.9932529926300049,
"learning_rate": 8.050684210526316e-05,
"loss": 0.7941,
"step": 49
},
{
"epoch": 0.05042864346949067,
"grad_norm": 1.1281291246414185,
"learning_rate": 7.997368421052632e-05,
"loss": 0.845,
"step": 50
},
{
"epoch": 0.05042864346949067,
"eval_loss": 0.6652013063430786,
"eval_runtime": 101.679,
"eval_samples_per_second": 4.111,
"eval_steps_per_second": 1.033,
"step": 50
},
{
"epoch": 0.05143721633888049,
"grad_norm": 0.7430902123451233,
"learning_rate": 7.944052631578947e-05,
"loss": 0.4593,
"step": 51
},
{
"epoch": 0.052445789208270296,
"grad_norm": 0.6915680170059204,
"learning_rate": 7.890736842105263e-05,
"loss": 0.6674,
"step": 52
},
{
"epoch": 0.05345436207766011,
"grad_norm": 0.6954211592674255,
"learning_rate": 7.837421052631579e-05,
"loss": 0.7528,
"step": 53
},
{
"epoch": 0.05446293494704992,
"grad_norm": 0.6784757971763611,
"learning_rate": 7.784105263157893e-05,
"loss": 0.617,
"step": 54
},
{
"epoch": 0.05547150781643974,
"grad_norm": 0.750968337059021,
"learning_rate": 7.730789473684211e-05,
"loss": 0.6296,
"step": 55
},
{
"epoch": 0.05648008068582955,
"grad_norm": 0.6823384761810303,
"learning_rate": 7.677473684210526e-05,
"loss": 0.6229,
"step": 56
},
{
"epoch": 0.057488653555219364,
"grad_norm": 0.9757019877433777,
"learning_rate": 7.624157894736842e-05,
"loss": 0.6533,
"step": 57
},
{
"epoch": 0.05849722642460918,
"grad_norm": 0.9269747734069824,
"learning_rate": 7.570842105263158e-05,
"loss": 0.5794,
"step": 58
},
{
"epoch": 0.05950579929399899,
"grad_norm": 0.6206308007240295,
"learning_rate": 7.517526315789474e-05,
"loss": 0.5007,
"step": 59
},
{
"epoch": 0.060514372163388806,
"grad_norm": 0.6674503684043884,
"learning_rate": 7.464210526315789e-05,
"loss": 0.4885,
"step": 60
},
{
"epoch": 0.061522945032778616,
"grad_norm": 0.7698457837104797,
"learning_rate": 7.410894736842106e-05,
"loss": 0.773,
"step": 61
},
{
"epoch": 0.06253151790216843,
"grad_norm": 0.7389516234397888,
"learning_rate": 7.35757894736842e-05,
"loss": 0.5738,
"step": 62
},
{
"epoch": 0.06354009077155824,
"grad_norm": 0.770261287689209,
"learning_rate": 7.304263157894737e-05,
"loss": 0.6907,
"step": 63
},
{
"epoch": 0.06454866364094806,
"grad_norm": 0.7029135227203369,
"learning_rate": 7.250947368421053e-05,
"loss": 0.6071,
"step": 64
},
{
"epoch": 0.06555723651033787,
"grad_norm": 0.7364175319671631,
"learning_rate": 7.197631578947368e-05,
"loss": 0.6433,
"step": 65
},
{
"epoch": 0.06656580937972768,
"grad_norm": 0.6604887247085571,
"learning_rate": 7.144315789473684e-05,
"loss": 0.5419,
"step": 66
},
{
"epoch": 0.06757438224911749,
"grad_norm": 0.6892684102058411,
"learning_rate": 7.091e-05,
"loss": 0.6809,
"step": 67
},
{
"epoch": 0.06858295511850732,
"grad_norm": 0.6906418204307556,
"learning_rate": 7.037684210526316e-05,
"loss": 0.5438,
"step": 68
},
{
"epoch": 0.06959152798789713,
"grad_norm": 0.7817425727844238,
"learning_rate": 6.984368421052632e-05,
"loss": 0.7386,
"step": 69
},
{
"epoch": 0.07060010085728693,
"grad_norm": 0.5860413312911987,
"learning_rate": 6.931052631578947e-05,
"loss": 0.4939,
"step": 70
},
{
"epoch": 0.07160867372667676,
"grad_norm": 0.6654998660087585,
"learning_rate": 6.877736842105263e-05,
"loss": 0.5478,
"step": 71
},
{
"epoch": 0.07261724659606657,
"grad_norm": 0.7202038168907166,
"learning_rate": 6.824421052631579e-05,
"loss": 0.5561,
"step": 72
},
{
"epoch": 0.07362581946545638,
"grad_norm": 0.7074757218360901,
"learning_rate": 6.771105263157895e-05,
"loss": 0.5294,
"step": 73
},
{
"epoch": 0.07463439233484619,
"grad_norm": 0.7609388828277588,
"learning_rate": 6.71778947368421e-05,
"loss": 0.6816,
"step": 74
},
{
"epoch": 0.07564296520423601,
"grad_norm": 0.7042875289916992,
"learning_rate": 6.664473684210527e-05,
"loss": 0.6686,
"step": 75
},
{
"epoch": 0.07665153807362582,
"grad_norm": 0.697859525680542,
"learning_rate": 6.611157894736842e-05,
"loss": 0.5751,
"step": 76
},
{
"epoch": 0.07766011094301563,
"grad_norm": 0.7064348459243774,
"learning_rate": 6.557842105263158e-05,
"loss": 0.5744,
"step": 77
},
{
"epoch": 0.07866868381240545,
"grad_norm": 0.5865401029586792,
"learning_rate": 6.504526315789474e-05,
"loss": 0.5026,
"step": 78
},
{
"epoch": 0.07967725668179526,
"grad_norm": 0.7729213237762451,
"learning_rate": 6.451210526315789e-05,
"loss": 0.5335,
"step": 79
},
{
"epoch": 0.08068582955118507,
"grad_norm": 0.791968584060669,
"learning_rate": 6.397894736842105e-05,
"loss": 0.6654,
"step": 80
},
{
"epoch": 0.08169440242057488,
"grad_norm": 0.8126956820487976,
"learning_rate": 6.344578947368421e-05,
"loss": 0.6091,
"step": 81
},
{
"epoch": 0.0827029752899647,
"grad_norm": 0.7012320160865784,
"learning_rate": 6.291263157894737e-05,
"loss": 0.587,
"step": 82
},
{
"epoch": 0.08371154815935451,
"grad_norm": 0.7842673659324646,
"learning_rate": 6.237947368421053e-05,
"loss": 0.6438,
"step": 83
},
{
"epoch": 0.08472012102874432,
"grad_norm": 0.7788791656494141,
"learning_rate": 6.184631578947368e-05,
"loss": 0.6173,
"step": 84
},
{
"epoch": 0.08572869389813415,
"grad_norm": 0.7247044444084167,
"learning_rate": 6.131315789473684e-05,
"loss": 0.6625,
"step": 85
},
{
"epoch": 0.08673726676752395,
"grad_norm": 0.7117682099342346,
"learning_rate": 6.078e-05,
"loss": 0.6324,
"step": 86
},
{
"epoch": 0.08774583963691376,
"grad_norm": 0.7648577094078064,
"learning_rate": 6.024684210526315e-05,
"loss": 0.6915,
"step": 87
},
{
"epoch": 0.08875441250630359,
"grad_norm": 0.8128008842468262,
"learning_rate": 5.9713684210526305e-05,
"loss": 0.6515,
"step": 88
},
{
"epoch": 0.0897629853756934,
"grad_norm": 0.8586528301239014,
"learning_rate": 5.918052631578947e-05,
"loss": 0.7476,
"step": 89
},
{
"epoch": 0.0907715582450832,
"grad_norm": 0.9298937916755676,
"learning_rate": 5.8647368421052634e-05,
"loss": 0.792,
"step": 90
},
{
"epoch": 0.09178013111447302,
"grad_norm": 0.8197916746139526,
"learning_rate": 5.811421052631579e-05,
"loss": 0.6175,
"step": 91
},
{
"epoch": 0.09278870398386284,
"grad_norm": 0.7706024050712585,
"learning_rate": 5.758105263157894e-05,
"loss": 0.719,
"step": 92
},
{
"epoch": 0.09379727685325265,
"grad_norm": 0.853099524974823,
"learning_rate": 5.70478947368421e-05,
"loss": 0.6546,
"step": 93
},
{
"epoch": 0.09480584972264246,
"grad_norm": 0.796420156955719,
"learning_rate": 5.6514736842105256e-05,
"loss": 0.6689,
"step": 94
},
{
"epoch": 0.09581442259203228,
"grad_norm": 0.7574595212936401,
"learning_rate": 5.5981578947368424e-05,
"loss": 0.5644,
"step": 95
},
{
"epoch": 0.09682299546142209,
"grad_norm": 0.9858140349388123,
"learning_rate": 5.544842105263158e-05,
"loss": 0.83,
"step": 96
},
{
"epoch": 0.0978315683308119,
"grad_norm": 0.9185313582420349,
"learning_rate": 5.491526315789474e-05,
"loss": 0.7675,
"step": 97
},
{
"epoch": 0.09884014120020171,
"grad_norm": 0.9572092890739441,
"learning_rate": 5.438210526315789e-05,
"loss": 0.7159,
"step": 98
},
{
"epoch": 0.09984871406959153,
"grad_norm": 1.0749398469924927,
"learning_rate": 5.384894736842105e-05,
"loss": 0.8607,
"step": 99
},
{
"epoch": 0.10085728693898134,
"grad_norm": 1.266366958618164,
"learning_rate": 5.331578947368421e-05,
"loss": 0.8676,
"step": 100
},
{
"epoch": 0.10085728693898134,
"eval_loss": 0.6350612640380859,
"eval_runtime": 101.2691,
"eval_samples_per_second": 4.128,
"eval_steps_per_second": 1.037,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.648580773235917e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}