brixeus's picture
Training in progress, step 200, checkpoint
d8a62d2 verified
{
"best_metric": 1.135033369064331,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.011536853034913402,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 5.768426517456701e-05,
"eval_loss": 2.3238277435302734,
"eval_runtime": 495.8901,
"eval_samples_per_second": 58.878,
"eval_steps_per_second": 14.721,
"step": 1
},
{
"epoch": 0.00017305279552370101,
"grad_norm": 0.45083099603652954,
"learning_rate": 3e-05,
"loss": 1.4648,
"step": 3
},
{
"epoch": 0.00034610559104740203,
"grad_norm": 0.5587688684463501,
"learning_rate": 6e-05,
"loss": 1.8717,
"step": 6
},
{
"epoch": 0.0005191583865711031,
"grad_norm": 0.5804035067558289,
"learning_rate": 9e-05,
"loss": 1.8528,
"step": 9
},
{
"epoch": 0.0006922111820948041,
"grad_norm": 0.5394102931022644,
"learning_rate": 9.997266286704631e-05,
"loss": 1.8216,
"step": 12
},
{
"epoch": 0.0008652639776185052,
"grad_norm": 0.5287687182426453,
"learning_rate": 9.98292246503335e-05,
"loss": 1.4867,
"step": 15
},
{
"epoch": 0.0010383167731422061,
"grad_norm": 0.5580219626426697,
"learning_rate": 9.956320346634876e-05,
"loss": 1.553,
"step": 18
},
{
"epoch": 0.0012113695686659072,
"grad_norm": 0.5071175694465637,
"learning_rate": 9.917525374361912e-05,
"loss": 1.4905,
"step": 21
},
{
"epoch": 0.0013844223641896081,
"grad_norm": 0.618292510509491,
"learning_rate": 9.86663298624003e-05,
"loss": 1.3951,
"step": 24
},
{
"epoch": 0.0015574751597133092,
"grad_norm": 0.46973147988319397,
"learning_rate": 9.803768380684242e-05,
"loss": 1.5449,
"step": 27
},
{
"epoch": 0.0017305279552370103,
"grad_norm": 0.49171945452690125,
"learning_rate": 9.729086208503174e-05,
"loss": 1.3479,
"step": 30
},
{
"epoch": 0.0019035807507607112,
"grad_norm": 0.48666226863861084,
"learning_rate": 9.642770192448536e-05,
"loss": 1.4163,
"step": 33
},
{
"epoch": 0.0020766335462844123,
"grad_norm": 0.5254663825035095,
"learning_rate": 9.545032675245813e-05,
"loss": 1.3606,
"step": 36
},
{
"epoch": 0.0022496863418081134,
"grad_norm": 0.4926294684410095,
"learning_rate": 9.43611409721806e-05,
"loss": 1.3109,
"step": 39
},
{
"epoch": 0.0024227391373318145,
"grad_norm": 0.5055580735206604,
"learning_rate": 9.316282404787871e-05,
"loss": 1.3493,
"step": 42
},
{
"epoch": 0.002595791932855515,
"grad_norm": 0.5364347100257874,
"learning_rate": 9.185832391312644e-05,
"loss": 1.4115,
"step": 45
},
{
"epoch": 0.0027688447283792162,
"grad_norm": 0.7558141350746155,
"learning_rate": 9.045084971874738e-05,
"loss": 1.4071,
"step": 48
},
{
"epoch": 0.0028842132587283504,
"eval_loss": 1.2356951236724854,
"eval_runtime": 500.8722,
"eval_samples_per_second": 58.292,
"eval_steps_per_second": 14.575,
"step": 50
},
{
"epoch": 0.0029418975239029173,
"grad_norm": 0.4775656461715698,
"learning_rate": 8.894386393810563e-05,
"loss": 1.2319,
"step": 51
},
{
"epoch": 0.0031149503194266184,
"grad_norm": 0.5036152601242065,
"learning_rate": 8.73410738492077e-05,
"loss": 0.9718,
"step": 54
},
{
"epoch": 0.0032880031149503195,
"grad_norm": 0.39522024989128113,
"learning_rate": 8.564642241456986e-05,
"loss": 1.1598,
"step": 57
},
{
"epoch": 0.0034610559104740206,
"grad_norm": 0.35405585169792175,
"learning_rate": 8.386407858128706e-05,
"loss": 1.0674,
"step": 60
},
{
"epoch": 0.0036341087059977213,
"grad_norm": 0.4121133089065552,
"learning_rate": 8.199842702516583e-05,
"loss": 1.0174,
"step": 63
},
{
"epoch": 0.0038071615015214224,
"grad_norm": 0.32193905115127563,
"learning_rate": 8.005405736415126e-05,
"loss": 1.0159,
"step": 66
},
{
"epoch": 0.003980214297045124,
"grad_norm": 0.3453183174133301,
"learning_rate": 7.803575286758364e-05,
"loss": 1.0557,
"step": 69
},
{
"epoch": 0.0041532670925688246,
"grad_norm": 0.3785501718521118,
"learning_rate": 7.594847868906076e-05,
"loss": 1.2081,
"step": 72
},
{
"epoch": 0.004326319888092525,
"grad_norm": 0.3871416747570038,
"learning_rate": 7.379736965185368e-05,
"loss": 1.1649,
"step": 75
},
{
"epoch": 0.004499372683616227,
"grad_norm": 0.4219188988208771,
"learning_rate": 7.158771761692464e-05,
"loss": 1.1471,
"step": 78
},
{
"epoch": 0.004672425479139927,
"grad_norm": 0.40537410974502563,
"learning_rate": 6.932495846462261e-05,
"loss": 1.3124,
"step": 81
},
{
"epoch": 0.004845478274663629,
"grad_norm": 0.36390063166618347,
"learning_rate": 6.701465872208216e-05,
"loss": 1.2589,
"step": 84
},
{
"epoch": 0.00501853107018733,
"grad_norm": 0.4167007803916931,
"learning_rate": 6.466250186922325e-05,
"loss": 1.0981,
"step": 87
},
{
"epoch": 0.00519158386571103,
"grad_norm": 0.4835963845252991,
"learning_rate": 6.227427435703997e-05,
"loss": 1.3458,
"step": 90
},
{
"epoch": 0.005364636661234732,
"grad_norm": 0.46142578125,
"learning_rate": 5.985585137257401e-05,
"loss": 1.1397,
"step": 93
},
{
"epoch": 0.0055376894567584324,
"grad_norm": 0.5481660962104797,
"learning_rate": 5.74131823855921e-05,
"loss": 1.1947,
"step": 96
},
{
"epoch": 0.005710742252282134,
"grad_norm": 0.6424285173416138,
"learning_rate": 5.495227651252315e-05,
"loss": 1.4071,
"step": 99
},
{
"epoch": 0.005768426517456701,
"eval_loss": 1.1644848585128784,
"eval_runtime": 495.3076,
"eval_samples_per_second": 58.947,
"eval_steps_per_second": 14.738,
"step": 100
},
{
"epoch": 0.005883795047805835,
"grad_norm": 0.46381473541259766,
"learning_rate": 5.247918773366112e-05,
"loss": 1.1083,
"step": 102
},
{
"epoch": 0.006056847843329536,
"grad_norm": 0.5000247955322266,
"learning_rate": 5e-05,
"loss": 1.0165,
"step": 105
},
{
"epoch": 0.006229900638853237,
"grad_norm": 0.38582298159599304,
"learning_rate": 4.7520812266338885e-05,
"loss": 1.059,
"step": 108
},
{
"epoch": 0.0064029534343769375,
"grad_norm": 0.40634816884994507,
"learning_rate": 4.504772348747687e-05,
"loss": 1.021,
"step": 111
},
{
"epoch": 0.006576006229900639,
"grad_norm": 0.32036200165748596,
"learning_rate": 4.2586817614407895e-05,
"loss": 0.9665,
"step": 114
},
{
"epoch": 0.00674905902542434,
"grad_norm": 0.3690759241580963,
"learning_rate": 4.0144148627425993e-05,
"loss": 1.1389,
"step": 117
},
{
"epoch": 0.006922111820948041,
"grad_norm": 0.3883983790874481,
"learning_rate": 3.772572564296005e-05,
"loss": 1.1566,
"step": 120
},
{
"epoch": 0.007095164616471742,
"grad_norm": 0.4271601438522339,
"learning_rate": 3.533749813077677e-05,
"loss": 1.1872,
"step": 123
},
{
"epoch": 0.0072682174119954425,
"grad_norm": 0.358237087726593,
"learning_rate": 3.298534127791785e-05,
"loss": 1.1692,
"step": 126
},
{
"epoch": 0.007441270207519144,
"grad_norm": 0.365348219871521,
"learning_rate": 3.0675041535377405e-05,
"loss": 1.1708,
"step": 129
},
{
"epoch": 0.007614323003042845,
"grad_norm": 0.41103696823120117,
"learning_rate": 2.8412282383075363e-05,
"loss": 1.2572,
"step": 132
},
{
"epoch": 0.007787375798566546,
"grad_norm": 0.41762009263038635,
"learning_rate": 2.6202630348146324e-05,
"loss": 1.1534,
"step": 135
},
{
"epoch": 0.007960428594090248,
"grad_norm": 0.39923688769340515,
"learning_rate": 2.405152131093926e-05,
"loss": 1.3375,
"step": 138
},
{
"epoch": 0.008133481389613948,
"grad_norm": 0.47532254457473755,
"learning_rate": 2.196424713241637e-05,
"loss": 1.3354,
"step": 141
},
{
"epoch": 0.008306534185137649,
"grad_norm": 0.3896162211894989,
"learning_rate": 1.9945942635848748e-05,
"loss": 1.3604,
"step": 144
},
{
"epoch": 0.00847958698066135,
"grad_norm": 0.4989997446537018,
"learning_rate": 1.800157297483417e-05,
"loss": 1.3699,
"step": 147
},
{
"epoch": 0.00865263977618505,
"grad_norm": 0.7773590087890625,
"learning_rate": 1.6135921418712956e-05,
"loss": 1.333,
"step": 150
},
{
"epoch": 0.00865263977618505,
"eval_loss": 1.139582872390747,
"eval_runtime": 496.6517,
"eval_samples_per_second": 58.788,
"eval_steps_per_second": 14.698,
"step": 150
},
{
"epoch": 0.008825692571708753,
"grad_norm": 0.343904584646225,
"learning_rate": 1.435357758543015e-05,
"loss": 0.9637,
"step": 153
},
{
"epoch": 0.008998745367232453,
"grad_norm": 0.38003867864608765,
"learning_rate": 1.2658926150792322e-05,
"loss": 0.9447,
"step": 156
},
{
"epoch": 0.009171798162756154,
"grad_norm": 0.3449487090110779,
"learning_rate": 1.1056136061894384e-05,
"loss": 1.0876,
"step": 159
},
{
"epoch": 0.009344850958279855,
"grad_norm": 0.4446622431278229,
"learning_rate": 9.549150281252633e-06,
"loss": 1.1895,
"step": 162
},
{
"epoch": 0.009517903753803555,
"grad_norm": 0.4309278428554535,
"learning_rate": 8.141676086873572e-06,
"loss": 1.0161,
"step": 165
},
{
"epoch": 0.009690956549327258,
"grad_norm": 0.3661138713359833,
"learning_rate": 6.837175952121306e-06,
"loss": 1.1177,
"step": 168
},
{
"epoch": 0.009864009344850959,
"grad_norm": 0.3448355793952942,
"learning_rate": 5.6388590278194096e-06,
"loss": 1.0711,
"step": 171
},
{
"epoch": 0.01003706214037466,
"grad_norm": 0.35449743270874023,
"learning_rate": 4.549673247541875e-06,
"loss": 1.1541,
"step": 174
},
{
"epoch": 0.01021011493589836,
"grad_norm": 0.37603992223739624,
"learning_rate": 3.5722980755146517e-06,
"loss": 1.1487,
"step": 177
},
{
"epoch": 0.01038316773142206,
"grad_norm": 0.4178571105003357,
"learning_rate": 2.7091379149682685e-06,
"loss": 1.1862,
"step": 180
},
{
"epoch": 0.010556220526945763,
"grad_norm": 0.5295783281326294,
"learning_rate": 1.962316193157593e-06,
"loss": 1.2777,
"step": 183
},
{
"epoch": 0.010729273322469464,
"grad_norm": 0.5361846089363098,
"learning_rate": 1.333670137599713e-06,
"loss": 1.1756,
"step": 186
},
{
"epoch": 0.010902326117993164,
"grad_norm": 0.42806971073150635,
"learning_rate": 8.247462563808817e-07,
"loss": 1.2366,
"step": 189
},
{
"epoch": 0.011075378913516865,
"grad_norm": 0.45333951711654663,
"learning_rate": 4.367965336512403e-07,
"loss": 1.0958,
"step": 192
},
{
"epoch": 0.011248431709040567,
"grad_norm": 0.8909216523170471,
"learning_rate": 1.7077534966650766e-07,
"loss": 1.1446,
"step": 195
},
{
"epoch": 0.011421484504564268,
"grad_norm": 0.8104519248008728,
"learning_rate": 2.7337132953697554e-08,
"loss": 1.4137,
"step": 198
},
{
"epoch": 0.011536853034913402,
"eval_loss": 1.135033369064331,
"eval_runtime": 500.4661,
"eval_samples_per_second": 58.34,
"eval_steps_per_second": 14.586,
"step": 200
}
],
"logging_steps": 3,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3737759080448e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}