{
"best_metric": 0.7234218120574951,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.009289363678588018,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 4.644681839294008e-05,
"grad_norm": 3.092576503753662,
"learning_rate": 1.018e-05,
"loss": 0.997,
"step": 1
},
{
"epoch": 4.644681839294008e-05,
"eval_loss": 1.5409296751022339,
"eval_runtime": 162.8209,
"eval_samples_per_second": 55.681,
"eval_steps_per_second": 13.923,
"step": 1
},
{
"epoch": 9.289363678588016e-05,
"grad_norm": 21.47955322265625,
"learning_rate": 2.036e-05,
"loss": 1.9273,
"step": 2
},
{
"epoch": 0.00013934045517882026,
"grad_norm": 9.852048873901367,
"learning_rate": 3.0539999999999996e-05,
"loss": 1.6379,
"step": 3
},
{
"epoch": 0.00018578727357176033,
"grad_norm": 17.509506225585938,
"learning_rate": 4.072e-05,
"loss": 1.4906,
"step": 4
},
{
"epoch": 0.00023223409196470042,
"grad_norm": 14.546015739440918,
"learning_rate": 5.09e-05,
"loss": 1.4437,
"step": 5
},
{
"epoch": 0.0002786809103576405,
"grad_norm": 5.5904154777526855,
"learning_rate": 6.107999999999999e-05,
"loss": 1.6272,
"step": 6
},
{
"epoch": 0.0003251277287505806,
"grad_norm": 4.112157821655273,
"learning_rate": 7.125999999999999e-05,
"loss": 1.4437,
"step": 7
},
{
"epoch": 0.00037157454714352065,
"grad_norm": 4.384551048278809,
"learning_rate": 8.144e-05,
"loss": 1.3646,
"step": 8
},
{
"epoch": 0.0004180213655364608,
"grad_norm": 8.227641105651855,
"learning_rate": 9.162e-05,
"loss": 1.1622,
"step": 9
},
{
"epoch": 0.00046446818392940084,
"grad_norm": 5.1519856452941895,
"learning_rate": 0.0001018,
"loss": 1.4487,
"step": 10
},
{
"epoch": 0.0005109150023223409,
"grad_norm": 9.391843795776367,
"learning_rate": 0.00010126421052631578,
"loss": 1.2691,
"step": 11
},
{
"epoch": 0.000557361820715281,
"grad_norm": 5.215498924255371,
"learning_rate": 0.00010072842105263156,
"loss": 0.9428,
"step": 12
},
{
"epoch": 0.000603808639108221,
"grad_norm": 2.952871084213257,
"learning_rate": 0.00010019263157894736,
"loss": 1.0689,
"step": 13
},
{
"epoch": 0.0006502554575011612,
"grad_norm": 2.8391802310943604,
"learning_rate": 9.965684210526316e-05,
"loss": 0.9291,
"step": 14
},
{
"epoch": 0.0006967022758941013,
"grad_norm": 2.461745023727417,
"learning_rate": 9.912105263157895e-05,
"loss": 0.8318,
"step": 15
},
{
"epoch": 0.0007431490942870413,
"grad_norm": 2.3736178874969482,
"learning_rate": 9.858526315789473e-05,
"loss": 0.5747,
"step": 16
},
{
"epoch": 0.0007895959126799814,
"grad_norm": 3.788600444793701,
"learning_rate": 9.804947368421052e-05,
"loss": 0.9676,
"step": 17
},
{
"epoch": 0.0008360427310729215,
"grad_norm": 3.9328246116638184,
"learning_rate": 9.75136842105263e-05,
"loss": 1.0193,
"step": 18
},
{
"epoch": 0.0008824895494658616,
"grad_norm": 2.693528652191162,
"learning_rate": 9.69778947368421e-05,
"loss": 0.7466,
"step": 19
},
{
"epoch": 0.0009289363678588017,
"grad_norm": 3.4937636852264404,
"learning_rate": 9.644210526315789e-05,
"loss": 0.8632,
"step": 20
},
{
"epoch": 0.0009753831862517418,
"grad_norm": 2.1122593879699707,
"learning_rate": 9.590631578947369e-05,
"loss": 0.7247,
"step": 21
},
{
"epoch": 0.0010218300046446818,
"grad_norm": 3.186523675918579,
"learning_rate": 9.537052631578947e-05,
"loss": 0.6886,
"step": 22
},
{
"epoch": 0.001068276823037622,
"grad_norm": 2.758934497833252,
"learning_rate": 9.483473684210526e-05,
"loss": 0.6994,
"step": 23
},
{
"epoch": 0.001114723641430562,
"grad_norm": 2.430767774581909,
"learning_rate": 9.429894736842104e-05,
"loss": 0.6421,
"step": 24
},
{
"epoch": 0.0011611704598235022,
"grad_norm": 3.4728782176971436,
"learning_rate": 9.376315789473684e-05,
"loss": 0.8879,
"step": 25
},
{
"epoch": 0.001207617278216442,
"grad_norm": 3.511162519454956,
"learning_rate": 9.322736842105262e-05,
"loss": 0.9696,
"step": 26
},
{
"epoch": 0.0012540640966093822,
"grad_norm": 3.0230774879455566,
"learning_rate": 9.269157894736842e-05,
"loss": 0.9589,
"step": 27
},
{
"epoch": 0.0013005109150023223,
"grad_norm": 2.1474082469940186,
"learning_rate": 9.215578947368421e-05,
"loss": 0.7707,
"step": 28
},
{
"epoch": 0.0013469577333952625,
"grad_norm": 2.3117761611938477,
"learning_rate": 9.162e-05,
"loss": 0.7724,
"step": 29
},
{
"epoch": 0.0013934045517882026,
"grad_norm": 1.9512385129928589,
"learning_rate": 9.108421052631578e-05,
"loss": 0.603,
"step": 30
},
{
"epoch": 0.0014398513701811425,
"grad_norm": 6.22908878326416,
"learning_rate": 9.054842105263158e-05,
"loss": 1.0569,
"step": 31
},
{
"epoch": 0.0014862981885740826,
"grad_norm": 2.2944276332855225,
"learning_rate": 9.001263157894736e-05,
"loss": 0.6613,
"step": 32
},
{
"epoch": 0.0015327450069670227,
"grad_norm": 2.312437057495117,
"learning_rate": 8.947684210526315e-05,
"loss": 0.8758,
"step": 33
},
{
"epoch": 0.0015791918253599629,
"grad_norm": 2.5488150119781494,
"learning_rate": 8.894105263157895e-05,
"loss": 0.635,
"step": 34
},
{
"epoch": 0.001625638643752903,
"grad_norm": 3.658079147338867,
"learning_rate": 8.840526315789473e-05,
"loss": 1.0831,
"step": 35
},
{
"epoch": 0.001672085462145843,
"grad_norm": 2.474161386489868,
"learning_rate": 8.786947368421052e-05,
"loss": 0.7918,
"step": 36
},
{
"epoch": 0.001718532280538783,
"grad_norm": 2.5074217319488525,
"learning_rate": 8.733368421052632e-05,
"loss": 0.8514,
"step": 37
},
{
"epoch": 0.0017649790989317231,
"grad_norm": 2.0377755165100098,
"learning_rate": 8.67978947368421e-05,
"loss": 0.6768,
"step": 38
},
{
"epoch": 0.0018114259173246632,
"grad_norm": 2.2819857597351074,
"learning_rate": 8.626210526315789e-05,
"loss": 0.6528,
"step": 39
},
{
"epoch": 0.0018578727357176034,
"grad_norm": 2.3973352909088135,
"learning_rate": 8.572631578947367e-05,
"loss": 0.8542,
"step": 40
},
{
"epoch": 0.0019043195541105435,
"grad_norm": 2.628427267074585,
"learning_rate": 8.519052631578947e-05,
"loss": 0.8507,
"step": 41
},
{
"epoch": 0.0019507663725034836,
"grad_norm": 3.116105556488037,
"learning_rate": 8.465473684210527e-05,
"loss": 0.9283,
"step": 42
},
{
"epoch": 0.0019972131908964235,
"grad_norm": 3.5683062076568604,
"learning_rate": 8.411894736842105e-05,
"loss": 1.1937,
"step": 43
},
{
"epoch": 0.0020436600092893636,
"grad_norm": 2.569610118865967,
"learning_rate": 8.358315789473684e-05,
"loss": 0.9237,
"step": 44
},
{
"epoch": 0.0020901068276823038,
"grad_norm": 3.7425827980041504,
"learning_rate": 8.304736842105262e-05,
"loss": 1.3234,
"step": 45
},
{
"epoch": 0.002136553646075244,
"grad_norm": 3.7030258178710938,
"learning_rate": 8.251157894736841e-05,
"loss": 0.7639,
"step": 46
},
{
"epoch": 0.002183000464468184,
"grad_norm": 3.188816785812378,
"learning_rate": 8.197578947368421e-05,
"loss": 1.1931,
"step": 47
},
{
"epoch": 0.002229447282861124,
"grad_norm": 5.445688724517822,
"learning_rate": 8.144e-05,
"loss": 0.8938,
"step": 48
},
{
"epoch": 0.0022758941012540643,
"grad_norm": 2.3576200008392334,
"learning_rate": 8.090421052631579e-05,
"loss": 0.836,
"step": 49
},
{
"epoch": 0.0023223409196470044,
"grad_norm": 22.260164260864258,
"learning_rate": 8.036842105263158e-05,
"loss": 1.0266,
"step": 50
},
{
"epoch": 0.0023223409196470044,
"eval_loss": 0.8269708752632141,
"eval_runtime": 162.7994,
"eval_samples_per_second": 55.688,
"eval_steps_per_second": 13.925,
"step": 50
},
{
"epoch": 0.002368787738039944,
"grad_norm": 2.7304725646972656,
"learning_rate": 7.983263157894736e-05,
"loss": 0.7953,
"step": 51
},
{
"epoch": 0.002415234556432884,
"grad_norm": 2.282052516937256,
"learning_rate": 7.929684210526315e-05,
"loss": 0.8493,
"step": 52
},
{
"epoch": 0.0024616813748258243,
"grad_norm": 2.524898052215576,
"learning_rate": 7.876105263157895e-05,
"loss": 1.0108,
"step": 53
},
{
"epoch": 0.0025081281932187644,
"grad_norm": 2.6745500564575195,
"learning_rate": 7.822526315789473e-05,
"loss": 1.0531,
"step": 54
},
{
"epoch": 0.0025545750116117046,
"grad_norm": 3.1213998794555664,
"learning_rate": 7.768947368421053e-05,
"loss": 1.1068,
"step": 55
},
{
"epoch": 0.0026010218300046447,
"grad_norm": 3.499636173248291,
"learning_rate": 7.715368421052631e-05,
"loss": 1.0808,
"step": 56
},
{
"epoch": 0.002647468648397585,
"grad_norm": 3.125180721282959,
"learning_rate": 7.66178947368421e-05,
"loss": 0.8485,
"step": 57
},
{
"epoch": 0.002693915466790525,
"grad_norm": 2.547490119934082,
"learning_rate": 7.608210526315788e-05,
"loss": 0.7604,
"step": 58
},
{
"epoch": 0.002740362285183465,
"grad_norm": 2.058576822280884,
"learning_rate": 7.554631578947368e-05,
"loss": 0.665,
"step": 59
},
{
"epoch": 0.002786809103576405,
"grad_norm": 2.624077320098877,
"learning_rate": 7.501052631578947e-05,
"loss": 0.6822,
"step": 60
},
{
"epoch": 0.0028332559219693453,
"grad_norm": 1.4456878900527954,
"learning_rate": 7.447473684210527e-05,
"loss": 0.554,
"step": 61
},
{
"epoch": 0.002879702740362285,
"grad_norm": 1.9477028846740723,
"learning_rate": 7.393894736842105e-05,
"loss": 0.712,
"step": 62
},
{
"epoch": 0.002926149558755225,
"grad_norm": 2.0057406425476074,
"learning_rate": 7.340315789473684e-05,
"loss": 0.7368,
"step": 63
},
{
"epoch": 0.0029725963771481652,
"grad_norm": 2.041309356689453,
"learning_rate": 7.286736842105262e-05,
"loss": 0.9478,
"step": 64
},
{
"epoch": 0.0030190431955411053,
"grad_norm": 1.8585742712020874,
"learning_rate": 7.233157894736842e-05,
"loss": 0.5883,
"step": 65
},
{
"epoch": 0.0030654900139340455,
"grad_norm": 2.3726940155029297,
"learning_rate": 7.179578947368421e-05,
"loss": 0.8448,
"step": 66
},
{
"epoch": 0.0031119368323269856,
"grad_norm": 4.274697303771973,
"learning_rate": 7.125999999999999e-05,
"loss": 0.693,
"step": 67
},
{
"epoch": 0.0031583836507199257,
"grad_norm": 1.7456036806106567,
"learning_rate": 7.072421052631579e-05,
"loss": 0.6139,
"step": 68
},
{
"epoch": 0.003204830469112866,
"grad_norm": 2.077462673187256,
"learning_rate": 7.018842105263158e-05,
"loss": 0.6687,
"step": 69
},
{
"epoch": 0.003251277287505806,
"grad_norm": 1.6591328382492065,
"learning_rate": 6.965263157894736e-05,
"loss": 0.6602,
"step": 70
},
{
"epoch": 0.003297724105898746,
"grad_norm": 2.040104866027832,
"learning_rate": 6.911684210526316e-05,
"loss": 0.6718,
"step": 71
},
{
"epoch": 0.003344170924291686,
"grad_norm": 2.031517505645752,
"learning_rate": 6.858105263157894e-05,
"loss": 0.6965,
"step": 72
},
{
"epoch": 0.003390617742684626,
"grad_norm": 1.677396297454834,
"learning_rate": 6.804526315789473e-05,
"loss": 0.6022,
"step": 73
},
{
"epoch": 0.003437064561077566,
"grad_norm": 2.6971142292022705,
"learning_rate": 6.750947368421052e-05,
"loss": 0.7803,
"step": 74
},
{
"epoch": 0.003483511379470506,
"grad_norm": 1.2442923784255981,
"learning_rate": 6.697368421052631e-05,
"loss": 0.4904,
"step": 75
},
{
"epoch": 0.0035299581978634463,
"grad_norm": 1.520882487297058,
"learning_rate": 6.64378947368421e-05,
"loss": 0.5449,
"step": 76
},
{
"epoch": 0.0035764050162563864,
"grad_norm": 2.945136070251465,
"learning_rate": 6.59021052631579e-05,
"loss": 0.5945,
"step": 77
},
{
"epoch": 0.0036228518346493265,
"grad_norm": 2.225796937942505,
"learning_rate": 6.536631578947368e-05,
"loss": 0.8414,
"step": 78
},
{
"epoch": 0.0036692986530422666,
"grad_norm": 3.5419042110443115,
"learning_rate": 6.483052631578947e-05,
"loss": 0.951,
"step": 79
},
{
"epoch": 0.0037157454714352067,
"grad_norm": 2.4470789432525635,
"learning_rate": 6.429473684210525e-05,
"loss": 0.6766,
"step": 80
},
{
"epoch": 0.003762192289828147,
"grad_norm": 1.8150739669799805,
"learning_rate": 6.375894736842104e-05,
"loss": 0.6212,
"step": 81
},
{
"epoch": 0.003808639108221087,
"grad_norm": 2.2378828525543213,
"learning_rate": 6.322315789473684e-05,
"loss": 0.912,
"step": 82
},
{
"epoch": 0.003855085926614027,
"grad_norm": 2.66448974609375,
"learning_rate": 6.268736842105264e-05,
"loss": 0.7284,
"step": 83
},
{
"epoch": 0.0039015327450069672,
"grad_norm": 2.0171289443969727,
"learning_rate": 6.215157894736842e-05,
"loss": 0.5339,
"step": 84
},
{
"epoch": 0.003947979563399907,
"grad_norm": 1.829827070236206,
"learning_rate": 6.16157894736842e-05,
"loss": 0.6982,
"step": 85
},
{
"epoch": 0.003994426381792847,
"grad_norm": 1.3786966800689697,
"learning_rate": 6.107999999999999e-05,
"loss": 0.4433,
"step": 86
},
{
"epoch": 0.004040873200185788,
"grad_norm": 2.0562403202056885,
"learning_rate": 6.054421052631578e-05,
"loss": 0.791,
"step": 87
},
{
"epoch": 0.004087320018578727,
"grad_norm": 1.8710417747497559,
"learning_rate": 6.000842105263157e-05,
"loss": 0.5487,
"step": 88
},
{
"epoch": 0.004133766836971668,
"grad_norm": 2.46244215965271,
"learning_rate": 5.947263157894737e-05,
"loss": 0.9045,
"step": 89
},
{
"epoch": 0.0041802136553646075,
"grad_norm": 1.9283982515335083,
"learning_rate": 5.893684210526316e-05,
"loss": 0.6472,
"step": 90
},
{
"epoch": 0.004226660473757547,
"grad_norm": 1.794073462486267,
"learning_rate": 5.8401052631578944e-05,
"loss": 0.7332,
"step": 91
},
{
"epoch": 0.004273107292150488,
"grad_norm": 2.4211764335632324,
"learning_rate": 5.7865263157894736e-05,
"loss": 0.9175,
"step": 92
},
{
"epoch": 0.0043195541105434275,
"grad_norm": 2.131087064743042,
"learning_rate": 5.732947368421052e-05,
"loss": 0.846,
"step": 93
},
{
"epoch": 0.004366000928936368,
"grad_norm": 3.606595993041992,
"learning_rate": 5.6793684210526306e-05,
"loss": 1.023,
"step": 94
},
{
"epoch": 0.004412447747329308,
"grad_norm": 2.0817458629608154,
"learning_rate": 5.6257894736842105e-05,
"loss": 0.7585,
"step": 95
},
{
"epoch": 0.004458894565722248,
"grad_norm": 2.736661672592163,
"learning_rate": 5.57221052631579e-05,
"loss": 0.9443,
"step": 96
},
{
"epoch": 0.004505341384115188,
"grad_norm": 1.7814656496047974,
"learning_rate": 5.518631578947368e-05,
"loss": 0.9056,
"step": 97
},
{
"epoch": 0.0045517882025081285,
"grad_norm": 2.098845958709717,
"learning_rate": 5.4650526315789474e-05,
"loss": 0.7014,
"step": 98
},
{
"epoch": 0.004598235020901068,
"grad_norm": 2.316159963607788,
"learning_rate": 5.411473684210526e-05,
"loss": 0.7147,
"step": 99
},
{
"epoch": 0.004644681839294009,
"grad_norm": 2.182925224304199,
"learning_rate": 5.3578947368421044e-05,
"loss": 0.7637,
"step": 100
},
{
"epoch": 0.004644681839294009,
"eval_loss": 0.7575440406799316,
"eval_runtime": 162.5731,
"eval_samples_per_second": 55.766,
"eval_steps_per_second": 13.944,
"step": 100
},
{
"epoch": 0.0046911286576869484,
"grad_norm": 1.547819972038269,
"learning_rate": 5.3043157894736836e-05,
"loss": 0.7768,
"step": 101
},
{
"epoch": 0.004737575476079888,
"grad_norm": 2.0550365447998047,
"learning_rate": 5.2507368421052635e-05,
"loss": 0.8542,
"step": 102
},
{
"epoch": 0.004784022294472829,
"grad_norm": 1.7644928693771362,
"learning_rate": 5.197157894736842e-05,
"loss": 0.8406,
"step": 103
},
{
"epoch": 0.004830469112865768,
"grad_norm": 2.784821033477783,
"learning_rate": 5.143578947368421e-05,
"loss": 1.0809,
"step": 104
},
{
"epoch": 0.004876915931258709,
"grad_norm": 2.643968105316162,
"learning_rate": 5.09e-05,
"loss": 1.1358,
"step": 105
},
{
"epoch": 0.004923362749651649,
"grad_norm": 2.6479332447052,
"learning_rate": 5.036421052631578e-05,
"loss": 0.9534,
"step": 106
},
{
"epoch": 0.004969809568044589,
"grad_norm": 1.5139284133911133,
"learning_rate": 4.982842105263158e-05,
"loss": 0.6037,
"step": 107
},
{
"epoch": 0.005016256386437529,
"grad_norm": 2.2001686096191406,
"learning_rate": 4.9292631578947366e-05,
"loss": 1.0875,
"step": 108
},
{
"epoch": 0.005062703204830469,
"grad_norm": 1.906663417816162,
"learning_rate": 4.875684210526315e-05,
"loss": 0.8251,
"step": 109
},
{
"epoch": 0.005109150023223409,
"grad_norm": 1.6133707761764526,
"learning_rate": 4.822105263157894e-05,
"loss": 0.7804,
"step": 110
},
{
"epoch": 0.00515559684161635,
"grad_norm": 1.6872289180755615,
"learning_rate": 4.7685263157894735e-05,
"loss": 0.5731,
"step": 111
},
{
"epoch": 0.005202043660009289,
"grad_norm": 1.2829549312591553,
"learning_rate": 4.714947368421052e-05,
"loss": 0.4865,
"step": 112
},
{
"epoch": 0.005248490478402229,
"grad_norm": 1.8299009799957275,
"learning_rate": 4.661368421052631e-05,
"loss": 0.7806,
"step": 113
},
{
"epoch": 0.00529493729679517,
"grad_norm": 1.3792545795440674,
"learning_rate": 4.6077894736842104e-05,
"loss": 0.5824,
"step": 114
},
{
"epoch": 0.005341384115188109,
"grad_norm": 1.554002046585083,
"learning_rate": 4.554210526315789e-05,
"loss": 0.7399,
"step": 115
},
{
"epoch": 0.00538783093358105,
"grad_norm": 1.8911974430084229,
"learning_rate": 4.500631578947368e-05,
"loss": 0.8756,
"step": 116
},
{
"epoch": 0.0054342777519739895,
"grad_norm": 2.071706771850586,
"learning_rate": 4.447052631578947e-05,
"loss": 0.8007,
"step": 117
},
{
"epoch": 0.00548072457036693,
"grad_norm": 2.202437162399292,
"learning_rate": 4.393473684210526e-05,
"loss": 0.8207,
"step": 118
},
{
"epoch": 0.00552717138875987,
"grad_norm": 1.33773672580719,
"learning_rate": 4.339894736842105e-05,
"loss": 0.5947,
"step": 119
},
{
"epoch": 0.00557361820715281,
"grad_norm": 1.8306225538253784,
"learning_rate": 4.2863157894736835e-05,
"loss": 0.7513,
"step": 120
},
{
"epoch": 0.00562006502554575,
"grad_norm": 1.6813061237335205,
"learning_rate": 4.2327368421052634e-05,
"loss": 0.6929,
"step": 121
},
{
"epoch": 0.005666511843938691,
"grad_norm": 1.5658451318740845,
"learning_rate": 4.179157894736842e-05,
"loss": 0.5594,
"step": 122
},
{
"epoch": 0.00571295866233163,
"grad_norm": 1.4536268711090088,
"learning_rate": 4.1255789473684204e-05,
"loss": 0.6208,
"step": 123
},
{
"epoch": 0.00575940548072457,
"grad_norm": 1.9043149948120117,
"learning_rate": 4.072e-05,
"loss": 0.6332,
"step": 124
},
{
"epoch": 0.0058058522991175105,
"grad_norm": 2.0733814239501953,
"learning_rate": 4.018421052631579e-05,
"loss": 0.6764,
"step": 125
},
{
"epoch": 0.00585229911751045,
"grad_norm": 1.7627897262573242,
"learning_rate": 3.9648421052631573e-05,
"loss": 0.7384,
"step": 126
},
{
"epoch": 0.005898745935903391,
"grad_norm": 1.6006054878234863,
"learning_rate": 3.9112631578947365e-05,
"loss": 0.6752,
"step": 127
},
{
"epoch": 0.0059451927542963304,
"grad_norm": 1.4541168212890625,
"learning_rate": 3.857684210526316e-05,
"loss": 0.6692,
"step": 128
},
{
"epoch": 0.005991639572689271,
"grad_norm": 1.3292078971862793,
"learning_rate": 3.804105263157894e-05,
"loss": 0.5353,
"step": 129
},
{
"epoch": 0.006038086391082211,
"grad_norm": 1.6884562969207764,
"learning_rate": 3.7505263157894734e-05,
"loss": 0.7562,
"step": 130
},
{
"epoch": 0.006084533209475151,
"grad_norm": 1.0477324724197388,
"learning_rate": 3.6969473684210526e-05,
"loss": 0.3243,
"step": 131
},
{
"epoch": 0.006130980027868091,
"grad_norm": 1.4753937721252441,
"learning_rate": 3.643368421052631e-05,
"loss": 0.5291,
"step": 132
},
{
"epoch": 0.0061774268462610315,
"grad_norm": 1.7509891986846924,
"learning_rate": 3.5897894736842103e-05,
"loss": 0.6364,
"step": 133
},
{
"epoch": 0.006223873664653971,
"grad_norm": 2.055713653564453,
"learning_rate": 3.5362105263157895e-05,
"loss": 0.782,
"step": 134
},
{
"epoch": 0.006270320483046911,
"grad_norm": 2.0711967945098877,
"learning_rate": 3.482631578947368e-05,
"loss": 0.7677,
"step": 135
},
{
"epoch": 0.006316767301439851,
"grad_norm": 1.3271763324737549,
"learning_rate": 3.429052631578947e-05,
"loss": 0.5314,
"step": 136
},
{
"epoch": 0.006363214119832791,
"grad_norm": 1.7668476104736328,
"learning_rate": 3.375473684210526e-05,
"loss": 0.8441,
"step": 137
},
{
"epoch": 0.006409660938225732,
"grad_norm": 1.773807168006897,
"learning_rate": 3.321894736842105e-05,
"loss": 0.7551,
"step": 138
},
{
"epoch": 0.006456107756618671,
"grad_norm": 1.6312812566757202,
"learning_rate": 3.268315789473684e-05,
"loss": 0.8111,
"step": 139
},
{
"epoch": 0.006502554575011612,
"grad_norm": 1.6187984943389893,
"learning_rate": 3.2147368421052627e-05,
"loss": 0.6781,
"step": 140
},
{
"epoch": 0.006549001393404552,
"grad_norm": 1.6448986530303955,
"learning_rate": 3.161157894736842e-05,
"loss": 0.5815,
"step": 141
},
{
"epoch": 0.006595448211797492,
"grad_norm": 1.9651342630386353,
"learning_rate": 3.107578947368421e-05,
"loss": 0.7199,
"step": 142
},
{
"epoch": 0.006641895030190432,
"grad_norm": 2.4397366046905518,
"learning_rate": 3.0539999999999996e-05,
"loss": 0.7959,
"step": 143
},
{
"epoch": 0.006688341848583372,
"grad_norm": 1.7463246583938599,
"learning_rate": 3.0004210526315784e-05,
"loss": 0.7066,
"step": 144
},
{
"epoch": 0.006734788666976312,
"grad_norm": 1.6383179426193237,
"learning_rate": 2.946842105263158e-05,
"loss": 0.5844,
"step": 145
},
{
"epoch": 0.006781235485369252,
"grad_norm": 2.03802752494812,
"learning_rate": 2.8932631578947368e-05,
"loss": 0.7851,
"step": 146
},
{
"epoch": 0.006827682303762192,
"grad_norm": 1.5965886116027832,
"learning_rate": 2.8396842105263153e-05,
"loss": 0.7421,
"step": 147
},
{
"epoch": 0.006874129122155132,
"grad_norm": 1.6589584350585938,
"learning_rate": 2.786105263157895e-05,
"loss": 0.7362,
"step": 148
},
{
"epoch": 0.006920575940548073,
"grad_norm": 1.904215693473816,
"learning_rate": 2.7325263157894737e-05,
"loss": 0.842,
"step": 149
},
{
"epoch": 0.006967022758941012,
"grad_norm": 1.954518437385559,
"learning_rate": 2.6789473684210522e-05,
"loss": 0.8687,
"step": 150
},
{
"epoch": 0.006967022758941012,
"eval_loss": 0.7381066679954529,
"eval_runtime": 163.587,
"eval_samples_per_second": 55.42,
"eval_steps_per_second": 13.858,
"step": 150
},
{
"epoch": 0.007013469577333953,
"grad_norm": 1.321113109588623,
"learning_rate": 2.6253684210526317e-05,
"loss": 0.6275,
"step": 151
},
{
"epoch": 0.0070599163957268925,
"grad_norm": 1.9313557147979736,
"learning_rate": 2.5717894736842106e-05,
"loss": 0.8083,
"step": 152
},
{
"epoch": 0.007106363214119833,
"grad_norm": 2.39707350730896,
"learning_rate": 2.518210526315789e-05,
"loss": 1.1125,
"step": 153
},
{
"epoch": 0.007152810032512773,
"grad_norm": 2.2258388996124268,
"learning_rate": 2.4646315789473683e-05,
"loss": 0.9885,
"step": 154
},
{
"epoch": 0.007199256850905713,
"grad_norm": 2.207796096801758,
"learning_rate": 2.411052631578947e-05,
"loss": 0.8165,
"step": 155
},
{
"epoch": 0.007245703669298653,
"grad_norm": 2.068021774291992,
"learning_rate": 2.357473684210526e-05,
"loss": 0.9621,
"step": 156
},
{
"epoch": 0.0072921504876915936,
"grad_norm": 3.123298168182373,
"learning_rate": 2.3038947368421052e-05,
"loss": 0.9623,
"step": 157
},
{
"epoch": 0.007338597306084533,
"grad_norm": 1.7516857385635376,
"learning_rate": 2.250315789473684e-05,
"loss": 0.7126,
"step": 158
},
{
"epoch": 0.007385044124477473,
"grad_norm": 1.756352424621582,
"learning_rate": 2.196736842105263e-05,
"loss": 0.6112,
"step": 159
},
{
"epoch": 0.0074314909428704135,
"grad_norm": 1.324313998222351,
"learning_rate": 2.1431578947368418e-05,
"loss": 0.4837,
"step": 160
},
{
"epoch": 0.007477937761263353,
"grad_norm": 1.6090558767318726,
"learning_rate": 2.089578947368421e-05,
"loss": 0.5255,
"step": 161
},
{
"epoch": 0.007524384579656294,
"grad_norm": 1.3804148435592651,
"learning_rate": 2.036e-05,
"loss": 0.4674,
"step": 162
},
{
"epoch": 0.007570831398049233,
"grad_norm": 1.3651041984558105,
"learning_rate": 1.9824210526315787e-05,
"loss": 0.7306,
"step": 163
},
{
"epoch": 0.007617278216442174,
"grad_norm": 1.8530007600784302,
"learning_rate": 1.928842105263158e-05,
"loss": 0.7491,
"step": 164
},
{
"epoch": 0.007663725034835114,
"grad_norm": 1.493820309638977,
"learning_rate": 1.8752631578947367e-05,
"loss": 0.6576,
"step": 165
},
{
"epoch": 0.007710171853228054,
"grad_norm": 1.199458360671997,
"learning_rate": 1.8216842105263156e-05,
"loss": 0.3963,
"step": 166
},
{
"epoch": 0.007756618671620994,
"grad_norm": 1.6788829565048218,
"learning_rate": 1.7681052631578948e-05,
"loss": 0.7574,
"step": 167
},
{
"epoch": 0.0078030654900139345,
"grad_norm": 1.2864102125167847,
"learning_rate": 1.7145263157894736e-05,
"loss": 0.5522,
"step": 168
},
{
"epoch": 0.007849512308406874,
"grad_norm": 1.8316515684127808,
"learning_rate": 1.6609473684210525e-05,
"loss": 0.5285,
"step": 169
},
{
"epoch": 0.007895959126799815,
"grad_norm": 1.116195559501648,
"learning_rate": 1.6073684210526313e-05,
"loss": 0.5598,
"step": 170
},
{
"epoch": 0.007942405945192754,
"grad_norm": 1.7328448295593262,
"learning_rate": 1.5537894736842105e-05,
"loss": 0.5905,
"step": 171
},
{
"epoch": 0.007988852763585694,
"grad_norm": 1.5131913423538208,
"learning_rate": 1.5002105263157892e-05,
"loss": 0.5292,
"step": 172
},
{
"epoch": 0.008035299581978635,
"grad_norm": 1.8316717147827148,
"learning_rate": 1.4466315789473684e-05,
"loss": 0.547,
"step": 173
},
{
"epoch": 0.008081746400371575,
"grad_norm": 1.4963319301605225,
"learning_rate": 1.3930526315789474e-05,
"loss": 0.6936,
"step": 174
},
{
"epoch": 0.008128193218764514,
"grad_norm": 1.6800758838653564,
"learning_rate": 1.3394736842105261e-05,
"loss": 0.6586,
"step": 175
},
{
"epoch": 0.008174640037157455,
"grad_norm": 1.0843589305877686,
"learning_rate": 1.2858947368421053e-05,
"loss": 0.4544,
"step": 176
},
{
"epoch": 0.008221086855550395,
"grad_norm": 1.6353403329849243,
"learning_rate": 1.2323157894736842e-05,
"loss": 0.6454,
"step": 177
},
{
"epoch": 0.008267533673943336,
"grad_norm": 1.6226987838745117,
"learning_rate": 1.178736842105263e-05,
"loss": 0.7083,
"step": 178
},
{
"epoch": 0.008313980492336275,
"grad_norm": 1.2014755010604858,
"learning_rate": 1.125157894736842e-05,
"loss": 0.518,
"step": 179
},
{
"epoch": 0.008360427310729215,
"grad_norm": 0.9961537718772888,
"learning_rate": 1.0715789473684209e-05,
"loss": 0.5034,
"step": 180
},
{
"epoch": 0.008406874129122156,
"grad_norm": 1.2505362033843994,
"learning_rate": 1.018e-05,
"loss": 0.3744,
"step": 181
},
{
"epoch": 0.008453320947515094,
"grad_norm": 1.209218144416809,
"learning_rate": 9.64421052631579e-06,
"loss": 0.5659,
"step": 182
},
{
"epoch": 0.008499767765908035,
"grad_norm": 1.5287011861801147,
"learning_rate": 9.108421052631578e-06,
"loss": 0.6864,
"step": 183
},
{
"epoch": 0.008546214584300976,
"grad_norm": 1.5412635803222656,
"learning_rate": 8.572631578947368e-06,
"loss": 0.784,
"step": 184
},
{
"epoch": 0.008592661402693916,
"grad_norm": 1.2702871561050415,
"learning_rate": 8.036842105263157e-06,
"loss": 0.6373,
"step": 185
},
{
"epoch": 0.008639108221086855,
"grad_norm": 1.2671583890914917,
"learning_rate": 7.501052631578946e-06,
"loss": 0.4503,
"step": 186
},
{
"epoch": 0.008685555039479795,
"grad_norm": 1.6440976858139038,
"learning_rate": 6.965263157894737e-06,
"loss": 0.6182,
"step": 187
},
{
"epoch": 0.008732001857872736,
"grad_norm": 1.6860370635986328,
"learning_rate": 6.4294736842105265e-06,
"loss": 0.6629,
"step": 188
},
{
"epoch": 0.008778448676265677,
"grad_norm": 1.778744101524353,
"learning_rate": 5.893684210526315e-06,
"loss": 0.707,
"step": 189
},
{
"epoch": 0.008824895494658615,
"grad_norm": 2.259239673614502,
"learning_rate": 5.3578947368421044e-06,
"loss": 0.8088,
"step": 190
},
{
"epoch": 0.008871342313051556,
"grad_norm": 1.5541491508483887,
"learning_rate": 4.822105263157895e-06,
"loss": 0.6756,
"step": 191
},
{
"epoch": 0.008917789131444497,
"grad_norm": 1.634876012802124,
"learning_rate": 4.286315789473684e-06,
"loss": 0.8733,
"step": 192
},
{
"epoch": 0.008964235949837435,
"grad_norm": 1.7315068244934082,
"learning_rate": 3.750526315789473e-06,
"loss": 0.6555,
"step": 193
},
{
"epoch": 0.009010682768230376,
"grad_norm": 1.7454522848129272,
"learning_rate": 3.2147368421052633e-06,
"loss": 0.754,
"step": 194
},
{
"epoch": 0.009057129586623316,
"grad_norm": 1.7474086284637451,
"learning_rate": 2.6789473684210522e-06,
"loss": 0.8278,
"step": 195
},
{
"epoch": 0.009103576405016257,
"grad_norm": 1.9185843467712402,
"learning_rate": 2.143157894736842e-06,
"loss": 0.7852,
"step": 196
},
{
"epoch": 0.009150023223409196,
"grad_norm": 1.923701286315918,
"learning_rate": 1.6073684210526316e-06,
"loss": 0.6713,
"step": 197
},
{
"epoch": 0.009196470041802136,
"grad_norm": 1.524601697921753,
"learning_rate": 1.071578947368421e-06,
"loss": 0.7535,
"step": 198
},
{
"epoch": 0.009242916860195077,
"grad_norm": 2.139697313308716,
"learning_rate": 5.357894736842105e-07,
"loss": 0.9147,
"step": 199
},
{
"epoch": 0.009289363678588018,
"grad_norm": 1.9575085639953613,
"learning_rate": 0.0,
"loss": 0.8268,
"step": 200
},
{
"epoch": 0.009289363678588018,
"eval_loss": 0.7234218120574951,
"eval_runtime": 163.2305,
"eval_samples_per_second": 55.541,
"eval_steps_per_second": 13.888,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2158942759092224e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}