{
"best_metric": 1.2906723022460938,
"best_model_checkpoint": "miner_id_24/checkpoint-100",
"epoch": 0.01271304278733463,
"eval_steps": 100,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0001271304278733463,
"grad_norm": 0.5326213836669922,
"learning_rate": 2e-05,
"loss": 1.7856,
"step": 1
},
{
"epoch": 0.0001271304278733463,
"eval_loss": 1.7346340417861938,
"eval_runtime": 1249.3361,
"eval_samples_per_second": 4.002,
"eval_steps_per_second": 1.001,
"step": 1
},
{
"epoch": 0.0002542608557466926,
"grad_norm": 0.5624520778656006,
"learning_rate": 4e-05,
"loss": 1.7626,
"step": 2
},
{
"epoch": 0.00038139128362003893,
"grad_norm": 0.5890633463859558,
"learning_rate": 6e-05,
"loss": 1.739,
"step": 3
},
{
"epoch": 0.0005085217114933852,
"grad_norm": 0.5437400937080383,
"learning_rate": 8e-05,
"loss": 1.6671,
"step": 4
},
{
"epoch": 0.0006356521393667316,
"grad_norm": 0.6639446020126343,
"learning_rate": 0.0001,
"loss": 1.7684,
"step": 5
},
{
"epoch": 0.0007627825672400779,
"grad_norm": 0.7031175494194031,
"learning_rate": 0.00012,
"loss": 1.7247,
"step": 6
},
{
"epoch": 0.0008899129951134241,
"grad_norm": 0.5311002731323242,
"learning_rate": 0.00014,
"loss": 1.6195,
"step": 7
},
{
"epoch": 0.0010170434229867704,
"grad_norm": 0.25101518630981445,
"learning_rate": 0.00016,
"loss": 1.5182,
"step": 8
},
{
"epoch": 0.0011441738508601168,
"grad_norm": 0.8389205932617188,
"learning_rate": 0.00018,
"loss": 1.6646,
"step": 9
},
{
"epoch": 0.0012713042787334632,
"grad_norm": 0.9317983388900757,
"learning_rate": 0.0002,
"loss": 1.6395,
"step": 10
},
{
"epoch": 0.0013984347066068094,
"grad_norm": 0.48066481947898865,
"learning_rate": 0.00019999832015210023,
"loss": 1.5921,
"step": 11
},
{
"epoch": 0.0015255651344801557,
"grad_norm": 0.2073744535446167,
"learning_rate": 0.00019999328066483865,
"loss": 1.4335,
"step": 12
},
{
"epoch": 0.0016526955623535021,
"grad_norm": 0.22661754488945007,
"learning_rate": 0.0001999848817075267,
"loss": 1.477,
"step": 13
},
{
"epoch": 0.0017798259902268483,
"grad_norm": 0.3149760663509369,
"learning_rate": 0.00019997312356234386,
"loss": 1.5713,
"step": 14
},
{
"epoch": 0.0019069564181001947,
"grad_norm": 0.31392771005630493,
"learning_rate": 0.00019995800662432798,
"loss": 1.5414,
"step": 15
},
{
"epoch": 0.002034086845973541,
"grad_norm": 0.2748984396457672,
"learning_rate": 0.0001999395314013622,
"loss": 1.5452,
"step": 16
},
{
"epoch": 0.0021612172738468874,
"grad_norm": 0.19064395129680634,
"learning_rate": 0.00019991769851415781,
"loss": 1.5742,
"step": 17
},
{
"epoch": 0.0022883477017202336,
"grad_norm": 0.1578415036201477,
"learning_rate": 0.00019989250869623343,
"loss": 1.5214,
"step": 18
},
{
"epoch": 0.0024154781295935798,
"grad_norm": 0.20229928195476532,
"learning_rate": 0.0001998639627938903,
"loss": 1.3921,
"step": 19
},
{
"epoch": 0.0025426085574669264,
"grad_norm": 0.2705669403076172,
"learning_rate": 0.00019983206176618388,
"loss": 1.4712,
"step": 20
},
{
"epoch": 0.0026697389853402725,
"grad_norm": 0.27215054631233215,
"learning_rate": 0.00019979680668489165,
"loss": 1.4969,
"step": 21
},
{
"epoch": 0.0027968694132136187,
"grad_norm": 0.20431619882583618,
"learning_rate": 0.00019975819873447717,
"loss": 1.431,
"step": 22
},
{
"epoch": 0.0029239998410869653,
"grad_norm": 0.14068549871444702,
"learning_rate": 0.00019971623921205005,
"loss": 1.4543,
"step": 23
},
{
"epoch": 0.0030511302689603115,
"grad_norm": 0.1594925820827484,
"learning_rate": 0.00019967092952732264,
"loss": 1.364,
"step": 24
},
{
"epoch": 0.0031782606968336576,
"grad_norm": 0.17738410830497742,
"learning_rate": 0.00019962227120256252,
"loss": 1.4377,
"step": 25
},
{
"epoch": 0.0033053911247070042,
"grad_norm": 0.1752498745918274,
"learning_rate": 0.00019957026587254134,
"loss": 1.3827,
"step": 26
},
{
"epoch": 0.0034325215525803504,
"grad_norm": 0.18004052340984344,
"learning_rate": 0.00019951491528448004,
"loss": 1.3867,
"step": 27
},
{
"epoch": 0.0035596519804536966,
"grad_norm": 0.1525774598121643,
"learning_rate": 0.00019945622129799,
"loss": 1.4164,
"step": 28
},
{
"epoch": 0.003686782408327043,
"grad_norm": 0.1710219830274582,
"learning_rate": 0.00019939418588501057,
"loss": 1.4155,
"step": 29
},
{
"epoch": 0.0038139128362003893,
"grad_norm": 0.20903100073337555,
"learning_rate": 0.000199328811129743,
"loss": 1.5239,
"step": 30
},
{
"epoch": 0.003941043264073736,
"grad_norm": 0.18399456143379211,
"learning_rate": 0.00019926009922858006,
"loss": 1.3889,
"step": 31
},
{
"epoch": 0.004068173691947082,
"grad_norm": 0.13330113887786865,
"learning_rate": 0.0001991880524900327,
"loss": 1.3587,
"step": 32
},
{
"epoch": 0.004195304119820428,
"grad_norm": 0.13807597756385803,
"learning_rate": 0.00019911267333465218,
"loss": 1.4211,
"step": 33
},
{
"epoch": 0.004322434547693775,
"grad_norm": 0.1550034135580063,
"learning_rate": 0.0001990339642949488,
"loss": 1.4317,
"step": 34
},
{
"epoch": 0.004449564975567121,
"grad_norm": 0.1827971190214157,
"learning_rate": 0.00019895192801530685,
"loss": 1.4176,
"step": 35
},
{
"epoch": 0.004576695403440467,
"grad_norm": 0.16028065979480743,
"learning_rate": 0.00019886656725189575,
"loss": 1.4122,
"step": 36
},
{
"epoch": 0.004703825831313814,
"grad_norm": 0.14322146773338318,
"learning_rate": 0.00019877788487257753,
"loss": 1.423,
"step": 37
},
{
"epoch": 0.0048309562591871595,
"grad_norm": 0.1685505211353302,
"learning_rate": 0.00019868588385681032,
"loss": 1.3702,
"step": 38
},
{
"epoch": 0.004958086687060506,
"grad_norm": 0.1632552444934845,
"learning_rate": 0.00019859056729554844,
"loss": 1.3164,
"step": 39
},
{
"epoch": 0.005085217114933853,
"grad_norm": 0.17149530351161957,
"learning_rate": 0.00019849193839113833,
"loss": 1.2799,
"step": 40
},
{
"epoch": 0.0052123475428071985,
"grad_norm": 0.14993086457252502,
"learning_rate": 0.00019839000045721118,
"loss": 1.3412,
"step": 41
},
{
"epoch": 0.005339477970680545,
"grad_norm": 0.15981823205947876,
"learning_rate": 0.00019828475691857145,
"loss": 1.3698,
"step": 42
},
{
"epoch": 0.005466608398553892,
"grad_norm": 0.15311439335346222,
"learning_rate": 0.00019817621131108196,
"loss": 1.3792,
"step": 43
},
{
"epoch": 0.005593738826427237,
"grad_norm": 0.19757720828056335,
"learning_rate": 0.00019806436728154485,
"loss": 1.4082,
"step": 44
},
{
"epoch": 0.005720869254300584,
"grad_norm": 0.15765132009983063,
"learning_rate": 0.00019794922858757928,
"loss": 1.282,
"step": 45
},
{
"epoch": 0.005847999682173931,
"grad_norm": 0.15940701961517334,
"learning_rate": 0.00019783079909749515,
"loss": 1.4016,
"step": 46
},
{
"epoch": 0.005975130110047276,
"grad_norm": 0.1622331291437149,
"learning_rate": 0.00019770908279016309,
"loss": 1.3624,
"step": 47
},
{
"epoch": 0.006102260537920623,
"grad_norm": 0.14866457879543304,
"learning_rate": 0.00019758408375488071,
"loss": 1.2807,
"step": 48
},
{
"epoch": 0.0062293909657939696,
"grad_norm": 0.16648173332214355,
"learning_rate": 0.00019745580619123535,
"loss": 1.3617,
"step": 49
},
{
"epoch": 0.006356521393667315,
"grad_norm": 0.16083629429340363,
"learning_rate": 0.00019732425440896297,
"loss": 1.3903,
"step": 50
},
{
"epoch": 0.006483651821540662,
"grad_norm": 0.18012581765651703,
"learning_rate": 0.00019718943282780323,
"loss": 1.3472,
"step": 51
},
{
"epoch": 0.0066107822494140085,
"grad_norm": 0.16914351284503937,
"learning_rate": 0.00019705134597735113,
"loss": 1.3765,
"step": 52
},
{
"epoch": 0.006737912677287354,
"grad_norm": 0.15967325866222382,
"learning_rate": 0.00019690999849690484,
"loss": 1.3312,
"step": 53
},
{
"epoch": 0.006865043105160701,
"grad_norm": 0.15996071696281433,
"learning_rate": 0.00019676539513530968,
"loss": 1.4227,
"step": 54
},
{
"epoch": 0.006992173533034047,
"grad_norm": 0.1715613752603531,
"learning_rate": 0.0001966175407507987,
"loss": 1.3634,
"step": 55
},
{
"epoch": 0.007119303960907393,
"grad_norm": 0.16633041203022003,
"learning_rate": 0.00019646644031082948,
"loss": 1.3279,
"step": 56
},
{
"epoch": 0.00724643438878074,
"grad_norm": 0.16247525811195374,
"learning_rate": 0.00019631209889191712,
"loss": 1.3721,
"step": 57
},
{
"epoch": 0.007373564816654086,
"grad_norm": 0.1603326052427292,
"learning_rate": 0.00019615452167946385,
"loss": 1.3212,
"step": 58
},
{
"epoch": 0.007500695244527432,
"grad_norm": 0.16569744050502777,
"learning_rate": 0.00019599371396758456,
"loss": 1.3224,
"step": 59
},
{
"epoch": 0.007627825672400779,
"grad_norm": 0.16010916233062744,
"learning_rate": 0.0001958296811589293,
"loss": 1.3022,
"step": 60
},
{
"epoch": 0.007754956100274125,
"grad_norm": 0.1680569052696228,
"learning_rate": 0.00019566242876450137,
"loss": 1.3197,
"step": 61
},
{
"epoch": 0.007882086528147472,
"grad_norm": 0.16432645916938782,
"learning_rate": 0.00019549196240347248,
"loss": 1.3167,
"step": 62
},
{
"epoch": 0.008009216956020818,
"grad_norm": 0.17412547767162323,
"learning_rate": 0.00019531828780299383,
"loss": 1.3196,
"step": 63
},
{
"epoch": 0.008136347383894163,
"grad_norm": 0.1736118346452713,
"learning_rate": 0.0001951414107980036,
"loss": 1.2966,
"step": 64
},
{
"epoch": 0.00826347781176751,
"grad_norm": 0.16254910826683044,
"learning_rate": 0.00019496133733103112,
"loss": 1.3416,
"step": 65
},
{
"epoch": 0.008390608239640857,
"grad_norm": 0.16831637918949127,
"learning_rate": 0.00019477807345199714,
"loss": 1.3396,
"step": 66
},
{
"epoch": 0.008517738667514202,
"grad_norm": 0.1759282648563385,
"learning_rate": 0.00019459162531801046,
"loss": 1.3101,
"step": 67
},
{
"epoch": 0.00864486909538755,
"grad_norm": 0.17314572632312775,
"learning_rate": 0.00019440199919316123,
"loss": 1.4026,
"step": 68
},
{
"epoch": 0.008771999523260895,
"grad_norm": 0.17074303328990936,
"learning_rate": 0.00019420920144831044,
"loss": 1.3088,
"step": 69
},
{
"epoch": 0.008899129951134241,
"grad_norm": 0.17773644626140594,
"learning_rate": 0.0001940132385608757,
"loss": 1.32,
"step": 70
},
{
"epoch": 0.009026260379007589,
"grad_norm": 0.1736891269683838,
"learning_rate": 0.0001938141171146141,
"loss": 1.2865,
"step": 71
},
{
"epoch": 0.009153390806880934,
"grad_norm": 0.17593072354793549,
"learning_rate": 0.0001936118437994003,
"loss": 1.3276,
"step": 72
},
{
"epoch": 0.00928052123475428,
"grad_norm": 0.16799978911876678,
"learning_rate": 0.00019340642541100248,
"loss": 1.2585,
"step": 73
},
{
"epoch": 0.009407651662627628,
"grad_norm": 0.17271657288074493,
"learning_rate": 0.00019319786885085364,
"loss": 1.3838,
"step": 74
},
{
"epoch": 0.009534782090500973,
"grad_norm": 0.18318656086921692,
"learning_rate": 0.0001929861811258197,
"loss": 1.3857,
"step": 75
},
{
"epoch": 0.009661912518374319,
"grad_norm": 0.1850346177816391,
"learning_rate": 0.0001927713693479643,
"loss": 1.3884,
"step": 76
},
{
"epoch": 0.009789042946247667,
"grad_norm": 0.1707659363746643,
"learning_rate": 0.0001925534407343097,
"loss": 1.2674,
"step": 77
},
{
"epoch": 0.009916173374121012,
"grad_norm": 0.1803124099969864,
"learning_rate": 0.0001923324026065944,
"loss": 1.2899,
"step": 78
},
{
"epoch": 0.010043303801994358,
"grad_norm": 0.18143856525421143,
"learning_rate": 0.0001921082623910271,
"loss": 1.2967,
"step": 79
},
{
"epoch": 0.010170434229867705,
"grad_norm": 0.17903882265090942,
"learning_rate": 0.00019188102761803717,
"loss": 1.2913,
"step": 80
},
{
"epoch": 0.010297564657741051,
"grad_norm": 0.181906595826149,
"learning_rate": 0.00019165070592202173,
"loss": 1.2568,
"step": 81
},
{
"epoch": 0.010424695085614397,
"grad_norm": 0.1655733585357666,
"learning_rate": 0.00019141730504108922,
"loss": 1.2758,
"step": 82
},
{
"epoch": 0.010551825513487744,
"grad_norm": 0.17457562685012817,
"learning_rate": 0.00019118083281679913,
"loss": 1.2506,
"step": 83
},
{
"epoch": 0.01067895594136109,
"grad_norm": 0.18457446992397308,
"learning_rate": 0.00019094129719389886,
"loss": 1.3701,
"step": 84
},
{
"epoch": 0.010806086369234436,
"grad_norm": 0.17702506482601166,
"learning_rate": 0.0001906987062200567,
"loss": 1.3071,
"step": 85
},
{
"epoch": 0.010933216797107783,
"grad_norm": 0.1763213723897934,
"learning_rate": 0.0001904530680455914,
"loss": 1.2996,
"step": 86
},
{
"epoch": 0.011060347224981129,
"grad_norm": 0.17649979889392853,
"learning_rate": 0.0001902043909231984,
"loss": 1.314,
"step": 87
},
{
"epoch": 0.011187477652854475,
"grad_norm": 0.1817278414964676,
"learning_rate": 0.00018995268320767252,
"loss": 1.315,
"step": 88
},
{
"epoch": 0.011314608080727822,
"grad_norm": 0.17668454349040985,
"learning_rate": 0.0001896979533556273,
"loss": 1.2914,
"step": 89
},
{
"epoch": 0.011441738508601168,
"grad_norm": 0.17107272148132324,
"learning_rate": 0.0001894402099252109,
"loss": 1.2884,
"step": 90
},
{
"epoch": 0.011568868936474514,
"grad_norm": 0.18352022767066956,
"learning_rate": 0.0001891794615758185,
"loss": 1.3404,
"step": 91
},
{
"epoch": 0.011695999364347861,
"grad_norm": 0.17999856173992157,
"learning_rate": 0.00018891571706780146,
"loss": 1.3001,
"step": 92
},
{
"epoch": 0.011823129792221207,
"grad_norm": 0.17374040186405182,
"learning_rate": 0.00018864898526217293,
"loss": 1.266,
"step": 93
},
{
"epoch": 0.011950260220094553,
"grad_norm": 0.18675245344638824,
"learning_rate": 0.0001883792751203102,
"loss": 1.3347,
"step": 94
},
{
"epoch": 0.0120773906479679,
"grad_norm": 0.18314674496650696,
"learning_rate": 0.0001881065957036536,
"loss": 1.3224,
"step": 95
},
{
"epoch": 0.012204521075841246,
"grad_norm": 0.17483913898468018,
"learning_rate": 0.00018783095617340193,
"loss": 1.2926,
"step": 96
},
{
"epoch": 0.012331651503714592,
"grad_norm": 0.19491428136825562,
"learning_rate": 0.00018755236579020502,
"loss": 1.2636,
"step": 97
},
{
"epoch": 0.012458781931587939,
"grad_norm": 0.17128659784793854,
"learning_rate": 0.0001872708339138522,
"loss": 1.2653,
"step": 98
},
{
"epoch": 0.012585912359461285,
"grad_norm": 0.172018900513649,
"learning_rate": 0.00018698637000295816,
"loss": 1.2686,
"step": 99
},
{
"epoch": 0.01271304278733463,
"grad_norm": 0.18891726434230804,
"learning_rate": 0.0001866989836146449,
"loss": 1.4058,
"step": 100
},
{
"epoch": 0.01271304278733463,
"eval_loss": 1.2906723022460938,
"eval_runtime": 1258.4463,
"eval_samples_per_second": 3.973,
"eval_steps_per_second": 0.993,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 552,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 2,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0364000227295232e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}