{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.95971351835273,
"eval_steps": 500,
"global_step": 834,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007162041181736795,
"grad_norm": 1.5651538372039795,
"learning_rate": 9.98800959232614e-06,
"loss": 1.1028,
"step": 1
},
{
"epoch": 0.01432408236347359,
"grad_norm": 2.358640193939209,
"learning_rate": 9.976019184652279e-06,
"loss": 1.1257,
"step": 2
},
{
"epoch": 0.021486123545210387,
"grad_norm": 1.7103272676467896,
"learning_rate": 9.964028776978418e-06,
"loss": 1.1098,
"step": 3
},
{
"epoch": 0.02864816472694718,
"grad_norm": 1.783992886543274,
"learning_rate": 9.952038369304557e-06,
"loss": 1.0858,
"step": 4
},
{
"epoch": 0.03581020590868397,
"grad_norm": 1.0596697330474854,
"learning_rate": 9.940047961630696e-06,
"loss": 1.0924,
"step": 5
},
{
"epoch": 0.04297224709042077,
"grad_norm": 1.694056510925293,
"learning_rate": 9.928057553956835e-06,
"loss": 1.0912,
"step": 6
},
{
"epoch": 0.050134288272157566,
"grad_norm": 1.4265025854110718,
"learning_rate": 9.916067146282976e-06,
"loss": 1.0833,
"step": 7
},
{
"epoch": 0.05729632945389436,
"grad_norm": 2.3417539596557617,
"learning_rate": 9.904076738609113e-06,
"loss": 1.1031,
"step": 8
},
{
"epoch": 0.06445837063563116,
"grad_norm": 1.6366913318634033,
"learning_rate": 9.892086330935252e-06,
"loss": 1.1186,
"step": 9
},
{
"epoch": 0.07162041181736795,
"grad_norm": 2.713798761367798,
"learning_rate": 9.880095923261391e-06,
"loss": 1.119,
"step": 10
},
{
"epoch": 0.07878245299910475,
"grad_norm": 1.0447508096694946,
"learning_rate": 9.86810551558753e-06,
"loss": 1.0836,
"step": 11
},
{
"epoch": 0.08594449418084155,
"grad_norm": 1.4873104095458984,
"learning_rate": 9.85611510791367e-06,
"loss": 1.1093,
"step": 12
},
{
"epoch": 0.09310653536257833,
"grad_norm": 2.0511322021484375,
"learning_rate": 9.844124700239808e-06,
"loss": 1.1048,
"step": 13
},
{
"epoch": 0.10026857654431513,
"grad_norm": 1.8400065898895264,
"learning_rate": 9.832134292565947e-06,
"loss": 1.1084,
"step": 14
},
{
"epoch": 0.10743061772605192,
"grad_norm": 1.743239402770996,
"learning_rate": 9.820143884892086e-06,
"loss": 1.0893,
"step": 15
},
{
"epoch": 0.11459265890778872,
"grad_norm": 0.8387613296508789,
"learning_rate": 9.808153477218227e-06,
"loss": 1.101,
"step": 16
},
{
"epoch": 0.12175470008952552,
"grad_norm": 2.1584479808807373,
"learning_rate": 9.796163069544366e-06,
"loss": 1.1003,
"step": 17
},
{
"epoch": 0.12891674127126232,
"grad_norm": 1.1021759510040283,
"learning_rate": 9.784172661870505e-06,
"loss": 1.0921,
"step": 18
},
{
"epoch": 0.1360787824529991,
"grad_norm": 1.9304773807525635,
"learning_rate": 9.772182254196644e-06,
"loss": 1.0968,
"step": 19
},
{
"epoch": 0.1432408236347359,
"grad_norm": 1.6461104154586792,
"learning_rate": 9.760191846522783e-06,
"loss": 1.0937,
"step": 20
},
{
"epoch": 0.1504028648164727,
"grad_norm": 1.5581350326538086,
"learning_rate": 9.748201438848922e-06,
"loss": 1.0716,
"step": 21
},
{
"epoch": 0.1575649059982095,
"grad_norm": 1.9193130731582642,
"learning_rate": 9.736211031175061e-06,
"loss": 1.0634,
"step": 22
},
{
"epoch": 0.1647269471799463,
"grad_norm": 1.8347828388214111,
"learning_rate": 9.724220623501199e-06,
"loss": 1.0665,
"step": 23
},
{
"epoch": 0.1718889883616831,
"grad_norm": 2.1090774536132812,
"learning_rate": 9.712230215827338e-06,
"loss": 1.0967,
"step": 24
},
{
"epoch": 0.17905102954341987,
"grad_norm": 1.6594020128250122,
"learning_rate": 9.700239808153478e-06,
"loss": 1.0695,
"step": 25
},
{
"epoch": 0.18621307072515667,
"grad_norm": 1.6360571384429932,
"learning_rate": 9.688249400479617e-06,
"loss": 1.0808,
"step": 26
},
{
"epoch": 0.19337511190689347,
"grad_norm": 1.7902008295059204,
"learning_rate": 9.676258992805757e-06,
"loss": 1.0693,
"step": 27
},
{
"epoch": 0.20053715308863027,
"grad_norm": 1.9148868322372437,
"learning_rate": 9.664268585131896e-06,
"loss": 1.0656,
"step": 28
},
{
"epoch": 0.20769919427036707,
"grad_norm": 1.5940035581588745,
"learning_rate": 9.652278177458035e-06,
"loss": 1.0617,
"step": 29
},
{
"epoch": 0.21486123545210384,
"grad_norm": 2.1188247203826904,
"learning_rate": 9.640287769784174e-06,
"loss": 1.0571,
"step": 30
},
{
"epoch": 0.22202327663384064,
"grad_norm": 2.0343472957611084,
"learning_rate": 9.628297362110313e-06,
"loss": 1.0601,
"step": 31
},
{
"epoch": 0.22918531781557744,
"grad_norm": 2.536809206008911,
"learning_rate": 9.616306954436452e-06,
"loss": 1.059,
"step": 32
},
{
"epoch": 0.23634735899731424,
"grad_norm": 2.986196279525757,
"learning_rate": 9.60431654676259e-06,
"loss": 0.9905,
"step": 33
},
{
"epoch": 0.24350940017905104,
"grad_norm": 2.9638538360595703,
"learning_rate": 9.59232613908873e-06,
"loss": 1.0369,
"step": 34
},
{
"epoch": 0.25067144136078784,
"grad_norm": 3.4822328090667725,
"learning_rate": 9.580335731414869e-06,
"loss": 1.0549,
"step": 35
},
{
"epoch": 0.25783348254252464,
"grad_norm": 2.860246181488037,
"learning_rate": 9.568345323741008e-06,
"loss": 1.0131,
"step": 36
},
{
"epoch": 0.26499552372426144,
"grad_norm": 3.4336676597595215,
"learning_rate": 9.556354916067147e-06,
"loss": 1.032,
"step": 37
},
{
"epoch": 0.2721575649059982,
"grad_norm": 3.2346091270446777,
"learning_rate": 9.544364508393286e-06,
"loss": 1.0287,
"step": 38
},
{
"epoch": 0.279319606087735,
"grad_norm": 2.7058355808258057,
"learning_rate": 9.532374100719425e-06,
"loss": 1.0313,
"step": 39
},
{
"epoch": 0.2864816472694718,
"grad_norm": 3.560464382171631,
"learning_rate": 9.520383693045564e-06,
"loss": 1.013,
"step": 40
},
{
"epoch": 0.2936436884512086,
"grad_norm": 3.6023006439208984,
"learning_rate": 9.508393285371703e-06,
"loss": 0.9168,
"step": 41
},
{
"epoch": 0.3008057296329454,
"grad_norm": 3.6482856273651123,
"learning_rate": 9.496402877697842e-06,
"loss": 0.973,
"step": 42
},
{
"epoch": 0.3079677708146822,
"grad_norm": 3.4008662700653076,
"learning_rate": 9.484412470023981e-06,
"loss": 1.0305,
"step": 43
},
{
"epoch": 0.315129811996419,
"grad_norm": 3.068812847137451,
"learning_rate": 9.47242206235012e-06,
"loss": 0.9169,
"step": 44
},
{
"epoch": 0.3222918531781558,
"grad_norm": 3.736107349395752,
"learning_rate": 9.46043165467626e-06,
"loss": 1.0063,
"step": 45
},
{
"epoch": 0.3294538943598926,
"grad_norm": 4.98853063583374,
"learning_rate": 9.448441247002398e-06,
"loss": 0.9408,
"step": 46
},
{
"epoch": 0.3366159355416294,
"grad_norm": 3.7207694053649902,
"learning_rate": 9.436450839328539e-06,
"loss": 0.9363,
"step": 47
},
{
"epoch": 0.3437779767233662,
"grad_norm": 4.021739959716797,
"learning_rate": 9.424460431654678e-06,
"loss": 0.9955,
"step": 48
},
{
"epoch": 0.35094001790510293,
"grad_norm": 3.676600694656372,
"learning_rate": 9.412470023980817e-06,
"loss": 1.0505,
"step": 49
},
{
"epoch": 0.35810205908683973,
"grad_norm": 3.9010024070739746,
"learning_rate": 9.400479616306956e-06,
"loss": 0.9115,
"step": 50
},
{
"epoch": 0.36526410026857653,
"grad_norm": 3.856921672821045,
"learning_rate": 9.388489208633095e-06,
"loss": 0.9625,
"step": 51
},
{
"epoch": 0.37242614145031333,
"grad_norm": 3.6618595123291016,
"learning_rate": 9.376498800959234e-06,
"loss": 0.9107,
"step": 52
},
{
"epoch": 0.37958818263205013,
"grad_norm": 3.955465793609619,
"learning_rate": 9.364508393285371e-06,
"loss": 0.9036,
"step": 53
},
{
"epoch": 0.38675022381378693,
"grad_norm": 4.451611518859863,
"learning_rate": 9.35251798561151e-06,
"loss": 0.8745,
"step": 54
},
{
"epoch": 0.39391226499552373,
"grad_norm": 3.950402021408081,
"learning_rate": 9.34052757793765e-06,
"loss": 0.8967,
"step": 55
},
{
"epoch": 0.40107430617726053,
"grad_norm": 4.007294654846191,
"learning_rate": 9.32853717026379e-06,
"loss": 0.9382,
"step": 56
},
{
"epoch": 0.40823634735899733,
"grad_norm": 4.601572036743164,
"learning_rate": 9.31654676258993e-06,
"loss": 0.892,
"step": 57
},
{
"epoch": 0.41539838854073413,
"grad_norm": 3.6578915119171143,
"learning_rate": 9.304556354916068e-06,
"loss": 0.8066,
"step": 58
},
{
"epoch": 0.42256042972247093,
"grad_norm": 4.2530293464660645,
"learning_rate": 9.292565947242207e-06,
"loss": 0.7893,
"step": 59
},
{
"epoch": 0.4297224709042077,
"grad_norm": 4.652935028076172,
"learning_rate": 9.280575539568346e-06,
"loss": 0.805,
"step": 60
},
{
"epoch": 0.4368845120859445,
"grad_norm": 4.985301494598389,
"learning_rate": 9.268585131894485e-06,
"loss": 0.8248,
"step": 61
},
{
"epoch": 0.4440465532676813,
"grad_norm": 5.5606513023376465,
"learning_rate": 9.256594724220624e-06,
"loss": 0.8412,
"step": 62
},
{
"epoch": 0.4512085944494181,
"grad_norm": 4.6756110191345215,
"learning_rate": 9.244604316546764e-06,
"loss": 0.8119,
"step": 63
},
{
"epoch": 0.4583706356311549,
"grad_norm": 4.607848644256592,
"learning_rate": 9.232613908872903e-06,
"loss": 0.7553,
"step": 64
},
{
"epoch": 0.4655326768128917,
"grad_norm": 5.126153945922852,
"learning_rate": 9.220623501199042e-06,
"loss": 0.7668,
"step": 65
},
{
"epoch": 0.4726947179946285,
"grad_norm": 4.498857498168945,
"learning_rate": 9.20863309352518e-06,
"loss": 0.727,
"step": 66
},
{
"epoch": 0.4798567591763653,
"grad_norm": 5.598169326782227,
"learning_rate": 9.19664268585132e-06,
"loss": 0.8671,
"step": 67
},
{
"epoch": 0.4870188003581021,
"grad_norm": 5.176477432250977,
"learning_rate": 9.184652278177459e-06,
"loss": 0.8697,
"step": 68
},
{
"epoch": 0.4941808415398389,
"grad_norm": 4.874248504638672,
"learning_rate": 9.172661870503598e-06,
"loss": 0.7736,
"step": 69
},
{
"epoch": 0.5013428827215757,
"grad_norm": 4.082653999328613,
"learning_rate": 9.160671462829737e-06,
"loss": 0.7047,
"step": 70
},
{
"epoch": 0.5085049239033125,
"grad_norm": 5.192132949829102,
"learning_rate": 9.148681055155876e-06,
"loss": 0.7359,
"step": 71
},
{
"epoch": 0.5156669650850493,
"grad_norm": 5.804781436920166,
"learning_rate": 9.136690647482015e-06,
"loss": 0.6859,
"step": 72
},
{
"epoch": 0.5228290062667861,
"grad_norm": 4.250144004821777,
"learning_rate": 9.124700239808154e-06,
"loss": 0.7178,
"step": 73
},
{
"epoch": 0.5299910474485229,
"grad_norm": 4.898803234100342,
"learning_rate": 9.112709832134293e-06,
"loss": 0.8012,
"step": 74
},
{
"epoch": 0.5371530886302597,
"grad_norm": 4.092855453491211,
"learning_rate": 9.100719424460432e-06,
"loss": 0.595,
"step": 75
},
{
"epoch": 0.5443151298119964,
"grad_norm": 4.206191062927246,
"learning_rate": 9.088729016786571e-06,
"loss": 0.6574,
"step": 76
},
{
"epoch": 0.5514771709937332,
"grad_norm": 5.5786967277526855,
"learning_rate": 9.07673860911271e-06,
"loss": 0.6096,
"step": 77
},
{
"epoch": 0.55863921217547,
"grad_norm": 4.80825138092041,
"learning_rate": 9.064748201438849e-06,
"loss": 0.6351,
"step": 78
},
{
"epoch": 0.5658012533572068,
"grad_norm": 5.1291608810424805,
"learning_rate": 9.05275779376499e-06,
"loss": 0.6163,
"step": 79
},
{
"epoch": 0.5729632945389436,
"grad_norm": 4.304827690124512,
"learning_rate": 9.040767386091129e-06,
"loss": 0.6089,
"step": 80
},
{
"epoch": 0.5801253357206804,
"grad_norm": 3.9486804008483887,
"learning_rate": 9.028776978417268e-06,
"loss": 0.5804,
"step": 81
},
{
"epoch": 0.5872873769024172,
"grad_norm": 6.8340163230896,
"learning_rate": 9.016786570743405e-06,
"loss": 0.6102,
"step": 82
},
{
"epoch": 0.594449418084154,
"grad_norm": 5.0892133712768555,
"learning_rate": 9.004796163069544e-06,
"loss": 0.6376,
"step": 83
},
{
"epoch": 0.6016114592658908,
"grad_norm": 4.589208126068115,
"learning_rate": 8.992805755395683e-06,
"loss": 0.6618,
"step": 84
},
{
"epoch": 0.6087735004476276,
"grad_norm": 4.036871433258057,
"learning_rate": 8.980815347721822e-06,
"loss": 0.6268,
"step": 85
},
{
"epoch": 0.6159355416293644,
"grad_norm": 5.117236614227295,
"learning_rate": 8.968824940047961e-06,
"loss": 0.5782,
"step": 86
},
{
"epoch": 0.6230975828111012,
"grad_norm": 5.529454708099365,
"learning_rate": 8.956834532374102e-06,
"loss": 0.6378,
"step": 87
},
{
"epoch": 0.630259623992838,
"grad_norm": 4.290615558624268,
"learning_rate": 8.944844124700241e-06,
"loss": 0.4607,
"step": 88
},
{
"epoch": 0.6374216651745748,
"grad_norm": 4.275355815887451,
"learning_rate": 8.93285371702638e-06,
"loss": 0.5444,
"step": 89
},
{
"epoch": 0.6445837063563116,
"grad_norm": 5.064560890197754,
"learning_rate": 8.92086330935252e-06,
"loss": 0.7254,
"step": 90
},
{
"epoch": 0.6517457475380484,
"grad_norm": 4.527130126953125,
"learning_rate": 8.908872901678658e-06,
"loss": 0.5581,
"step": 91
},
{
"epoch": 0.6589077887197852,
"grad_norm": 5.649165630340576,
"learning_rate": 8.896882494004797e-06,
"loss": 0.5265,
"step": 92
},
{
"epoch": 0.666069829901522,
"grad_norm": 4.565707206726074,
"learning_rate": 8.884892086330936e-06,
"loss": 0.5161,
"step": 93
},
{
"epoch": 0.6732318710832588,
"grad_norm": 6.295898914337158,
"learning_rate": 8.872901678657075e-06,
"loss": 0.5414,
"step": 94
},
{
"epoch": 0.6803939122649956,
"grad_norm": 8.986715316772461,
"learning_rate": 8.860911270983214e-06,
"loss": 0.5894,
"step": 95
},
{
"epoch": 0.6875559534467324,
"grad_norm": 6.694461822509766,
"learning_rate": 8.848920863309353e-06,
"loss": 0.5679,
"step": 96
},
{
"epoch": 0.6947179946284691,
"grad_norm": 5.961154460906982,
"learning_rate": 8.836930455635492e-06,
"loss": 0.6379,
"step": 97
},
{
"epoch": 0.7018800358102059,
"grad_norm": 4.6056671142578125,
"learning_rate": 8.824940047961632e-06,
"loss": 0.5691,
"step": 98
},
{
"epoch": 0.7090420769919427,
"grad_norm": 5.036160945892334,
"learning_rate": 8.81294964028777e-06,
"loss": 0.5234,
"step": 99
},
{
"epoch": 0.7162041181736795,
"grad_norm": 4.869359016418457,
"learning_rate": 8.80095923261391e-06,
"loss": 0.4494,
"step": 100
},
{
"epoch": 0.7233661593554163,
"grad_norm": 6.883158206939697,
"learning_rate": 8.788968824940049e-06,
"loss": 0.5863,
"step": 101
},
{
"epoch": 0.7305282005371531,
"grad_norm": 4.8142805099487305,
"learning_rate": 8.776978417266188e-06,
"loss": 0.4585,
"step": 102
},
{
"epoch": 0.7376902417188899,
"grad_norm": 4.156213760375977,
"learning_rate": 8.764988009592327e-06,
"loss": 0.4224,
"step": 103
},
{
"epoch": 0.7448522829006267,
"grad_norm": 5.524331569671631,
"learning_rate": 8.752997601918466e-06,
"loss": 0.5855,
"step": 104
},
{
"epoch": 0.7520143240823635,
"grad_norm": 4.5275468826293945,
"learning_rate": 8.741007194244605e-06,
"loss": 0.3134,
"step": 105
},
{
"epoch": 0.7591763652641003,
"grad_norm": 6.391297340393066,
"learning_rate": 8.729016786570744e-06,
"loss": 0.4662,
"step": 106
},
{
"epoch": 0.7663384064458371,
"grad_norm": 4.844995498657227,
"learning_rate": 8.717026378896883e-06,
"loss": 0.5037,
"step": 107
},
{
"epoch": 0.7735004476275739,
"grad_norm": 5.861647129058838,
"learning_rate": 8.705035971223022e-06,
"loss": 0.4134,
"step": 108
},
{
"epoch": 0.7806624888093107,
"grad_norm": 4.5392889976501465,
"learning_rate": 8.693045563549161e-06,
"loss": 0.4099,
"step": 109
},
{
"epoch": 0.7878245299910475,
"grad_norm": 4.010335922241211,
"learning_rate": 8.681055155875302e-06,
"loss": 0.3897,
"step": 110
},
{
"epoch": 0.7949865711727843,
"grad_norm": 5.2261433601379395,
"learning_rate": 8.66906474820144e-06,
"loss": 0.3422,
"step": 111
},
{
"epoch": 0.8021486123545211,
"grad_norm": 5.837971210479736,
"learning_rate": 8.657074340527578e-06,
"loss": 0.4188,
"step": 112
},
{
"epoch": 0.8093106535362579,
"grad_norm": 7.071847915649414,
"learning_rate": 8.645083932853717e-06,
"loss": 0.5138,
"step": 113
},
{
"epoch": 0.8164726947179947,
"grad_norm": 3.631950855255127,
"learning_rate": 8.633093525179856e-06,
"loss": 0.2924,
"step": 114
},
{
"epoch": 0.8236347358997315,
"grad_norm": 5.4959797859191895,
"learning_rate": 8.621103117505995e-06,
"loss": 0.4857,
"step": 115
},
{
"epoch": 0.8307967770814683,
"grad_norm": 6.527896404266357,
"learning_rate": 8.609112709832134e-06,
"loss": 0.4211,
"step": 116
},
{
"epoch": 0.8379588182632051,
"grad_norm": 7.07539176940918,
"learning_rate": 8.597122302158273e-06,
"loss": 0.481,
"step": 117
},
{
"epoch": 0.8451208594449419,
"grad_norm": 5.752196311950684,
"learning_rate": 8.585131894484412e-06,
"loss": 0.4774,
"step": 118
},
{
"epoch": 0.8522829006266786,
"grad_norm": 3.1388776302337646,
"learning_rate": 8.573141486810553e-06,
"loss": 0.2974,
"step": 119
},
{
"epoch": 0.8594449418084154,
"grad_norm": 5.351109504699707,
"learning_rate": 8.561151079136692e-06,
"loss": 0.333,
"step": 120
},
{
"epoch": 0.8666069829901522,
"grad_norm": 4.641998767852783,
"learning_rate": 8.549160671462831e-06,
"loss": 0.4377,
"step": 121
},
{
"epoch": 0.873769024171889,
"grad_norm": 9.292861938476562,
"learning_rate": 8.53717026378897e-06,
"loss": 0.497,
"step": 122
},
{
"epoch": 0.8809310653536258,
"grad_norm": 5.21453857421875,
"learning_rate": 8.525179856115109e-06,
"loss": 0.4271,
"step": 123
},
{
"epoch": 0.8880931065353626,
"grad_norm": 6.3802618980407715,
"learning_rate": 8.513189448441248e-06,
"loss": 0.4672,
"step": 124
},
{
"epoch": 0.8952551477170994,
"grad_norm": 5.154406547546387,
"learning_rate": 8.501199040767387e-06,
"loss": 0.2783,
"step": 125
},
{
"epoch": 0.9024171888988362,
"grad_norm": 3.9693143367767334,
"learning_rate": 8.489208633093526e-06,
"loss": 0.3932,
"step": 126
},
{
"epoch": 0.909579230080573,
"grad_norm": 2.942033529281616,
"learning_rate": 8.477218225419664e-06,
"loss": 0.3193,
"step": 127
},
{
"epoch": 0.9167412712623098,
"grad_norm": 4.29665994644165,
"learning_rate": 8.465227817745804e-06,
"loss": 0.4377,
"step": 128
},
{
"epoch": 0.9239033124440466,
"grad_norm": 5.550212860107422,
"learning_rate": 8.453237410071943e-06,
"loss": 0.3315,
"step": 129
},
{
"epoch": 0.9310653536257834,
"grad_norm": 6.654735565185547,
"learning_rate": 8.441247002398082e-06,
"loss": 0.4388,
"step": 130
},
{
"epoch": 0.9382273948075202,
"grad_norm": 5.80216121673584,
"learning_rate": 8.429256594724221e-06,
"loss": 0.3865,
"step": 131
},
{
"epoch": 0.945389435989257,
"grad_norm": 6.76437520980835,
"learning_rate": 8.41726618705036e-06,
"loss": 0.5202,
"step": 132
},
{
"epoch": 0.9525514771709938,
"grad_norm": 9.009016990661621,
"learning_rate": 8.4052757793765e-06,
"loss": 0.4008,
"step": 133
},
{
"epoch": 0.9597135183527306,
"grad_norm": 9.987932205200195,
"learning_rate": 8.393285371702639e-06,
"loss": 0.4824,
"step": 134
},
{
"epoch": 0.9668755595344674,
"grad_norm": 10.408086776733398,
"learning_rate": 8.381294964028778e-06,
"loss": 0.7183,
"step": 135
},
{
"epoch": 0.9740376007162042,
"grad_norm": 4.125233173370361,
"learning_rate": 8.369304556354917e-06,
"loss": 0.331,
"step": 136
},
{
"epoch": 0.981199641897941,
"grad_norm": 5.206197738647461,
"learning_rate": 8.357314148681056e-06,
"loss": 0.451,
"step": 137
},
{
"epoch": 0.9883616830796778,
"grad_norm": 6.510553359985352,
"learning_rate": 8.345323741007195e-06,
"loss": 0.2887,
"step": 138
},
{
"epoch": 0.9955237242614146,
"grad_norm": 5.558812141418457,
"learning_rate": 8.333333333333334e-06,
"loss": 0.2659,
"step": 139
},
{
"epoch": 1.0,
"grad_norm": 5.024322509765625,
"learning_rate": 8.321342925659473e-06,
"loss": 0.2857,
"step": 140
},
{
"epoch": 1.0,
"eval_accuracy": 0.8551307847082495,
"eval_loss": 0.34348970651626587,
"eval_runtime": 12.772,
"eval_samples_per_second": 38.913,
"eval_steps_per_second": 38.913,
"step": 140
},
{
"epoch": 1.0071620411817368,
"grad_norm": 5.62790584564209,
"learning_rate": 8.309352517985614e-06,
"loss": 0.3689,
"step": 141
},
{
"epoch": 1.0143240823634736,
"grad_norm": 6.509753704071045,
"learning_rate": 8.29736211031175e-06,
"loss": 0.3419,
"step": 142
},
{
"epoch": 1.0214861235452104,
"grad_norm": 7.264653205871582,
"learning_rate": 8.28537170263789e-06,
"loss": 0.2551,
"step": 143
},
{
"epoch": 1.0286481647269472,
"grad_norm": 6.89661169052124,
"learning_rate": 8.273381294964029e-06,
"loss": 0.4026,
"step": 144
},
{
"epoch": 1.035810205908684,
"grad_norm": 6.0390238761901855,
"learning_rate": 8.261390887290168e-06,
"loss": 0.2978,
"step": 145
},
{
"epoch": 1.0429722470904208,
"grad_norm": 6.132388591766357,
"learning_rate": 8.249400479616307e-06,
"loss": 0.4529,
"step": 146
},
{
"epoch": 1.0501342882721576,
"grad_norm": 6.270555019378662,
"learning_rate": 8.237410071942446e-06,
"loss": 0.4341,
"step": 147
},
{
"epoch": 1.0572963294538944,
"grad_norm": 8.636746406555176,
"learning_rate": 8.225419664268585e-06,
"loss": 0.3452,
"step": 148
},
{
"epoch": 1.0644583706356312,
"grad_norm": 5.3527021408081055,
"learning_rate": 8.213429256594724e-06,
"loss": 0.2689,
"step": 149
},
{
"epoch": 1.071620411817368,
"grad_norm": 5.665110111236572,
"learning_rate": 8.201438848920865e-06,
"loss": 0.5618,
"step": 150
},
{
"epoch": 1.0787824529991048,
"grad_norm": 9.846869468688965,
"learning_rate": 8.189448441247004e-06,
"loss": 0.3931,
"step": 151
},
{
"epoch": 1.0859444941808416,
"grad_norm": 8.280915260314941,
"learning_rate": 8.177458033573143e-06,
"loss": 0.3301,
"step": 152
},
{
"epoch": 1.0931065353625784,
"grad_norm": 5.660332202911377,
"learning_rate": 8.165467625899282e-06,
"loss": 0.372,
"step": 153
},
{
"epoch": 1.1002685765443152,
"grad_norm": 3.366448402404785,
"learning_rate": 8.153477218225421e-06,
"loss": 0.3427,
"step": 154
},
{
"epoch": 1.107430617726052,
"grad_norm": 6.918087959289551,
"learning_rate": 8.14148681055156e-06,
"loss": 0.5082,
"step": 155
},
{
"epoch": 1.1145926589077888,
"grad_norm": 7.009018898010254,
"learning_rate": 8.129496402877699e-06,
"loss": 0.3534,
"step": 156
},
{
"epoch": 1.1217547000895256,
"grad_norm": 5.730655193328857,
"learning_rate": 8.117505995203836e-06,
"loss": 0.2806,
"step": 157
},
{
"epoch": 1.1289167412712624,
"grad_norm": 3.503355026245117,
"learning_rate": 8.105515587529975e-06,
"loss": 0.2374,
"step": 158
},
{
"epoch": 1.1360787824529992,
"grad_norm": 3.6845366954803467,
"learning_rate": 8.093525179856116e-06,
"loss": 0.2822,
"step": 159
},
{
"epoch": 1.143240823634736,
"grad_norm": 8.738545417785645,
"learning_rate": 8.081534772182255e-06,
"loss": 0.3553,
"step": 160
},
{
"epoch": 1.1504028648164728,
"grad_norm": 12.152175903320312,
"learning_rate": 8.069544364508394e-06,
"loss": 0.264,
"step": 161
},
{
"epoch": 1.1575649059982096,
"grad_norm": 6.637088775634766,
"learning_rate": 8.057553956834533e-06,
"loss": 0.3602,
"step": 162
},
{
"epoch": 1.1647269471799464,
"grad_norm": 11.342058181762695,
"learning_rate": 8.045563549160672e-06,
"loss": 0.3529,
"step": 163
},
{
"epoch": 1.1718889883616832,
"grad_norm": 3.130880355834961,
"learning_rate": 8.033573141486811e-06,
"loss": 0.2449,
"step": 164
},
{
"epoch": 1.1790510295434198,
"grad_norm": 4.657078266143799,
"learning_rate": 8.02158273381295e-06,
"loss": 0.2095,
"step": 165
},
{
"epoch": 1.1862130707251566,
"grad_norm": 11.053173065185547,
"learning_rate": 8.00959232613909e-06,
"loss": 0.3824,
"step": 166
},
{
"epoch": 1.1933751119068934,
"grad_norm": 8.9373779296875,
"learning_rate": 7.997601918465228e-06,
"loss": 0.5074,
"step": 167
},
{
"epoch": 1.2005371530886302,
"grad_norm": 7.14840030670166,
"learning_rate": 7.985611510791367e-06,
"loss": 0.3061,
"step": 168
},
{
"epoch": 1.207699194270367,
"grad_norm": 6.889973163604736,
"learning_rate": 7.973621103117507e-06,
"loss": 0.3045,
"step": 169
},
{
"epoch": 1.2148612354521038,
"grad_norm": 10.003790855407715,
"learning_rate": 7.961630695443646e-06,
"loss": 0.3446,
"step": 170
},
{
"epoch": 1.2220232766338406,
"grad_norm": 6.85793924331665,
"learning_rate": 7.949640287769785e-06,
"loss": 0.3411,
"step": 171
},
{
"epoch": 1.2291853178155774,
"grad_norm": 7.919402122497559,
"learning_rate": 7.937649880095924e-06,
"loss": 0.4896,
"step": 172
},
{
"epoch": 1.2363473589973142,
"grad_norm": 3.570951461791992,
"learning_rate": 7.925659472422063e-06,
"loss": 0.211,
"step": 173
},
{
"epoch": 1.243509400179051,
"grad_norm": 4.916582107543945,
"learning_rate": 7.913669064748202e-06,
"loss": 0.2846,
"step": 174
},
{
"epoch": 1.2506714413607878,
"grad_norm": 8.822362899780273,
"learning_rate": 7.90167865707434e-06,
"loss": 0.6171,
"step": 175
},
{
"epoch": 1.2578334825425246,
"grad_norm": 6.692116737365723,
"learning_rate": 7.88968824940048e-06,
"loss": 0.3357,
"step": 176
},
{
"epoch": 1.2649955237242614,
"grad_norm": 8.720771789550781,
"learning_rate": 7.877697841726619e-06,
"loss": 0.3968,
"step": 177
},
{
"epoch": 1.2721575649059982,
"grad_norm": 3.4842636585235596,
"learning_rate": 7.865707434052758e-06,
"loss": 0.2225,
"step": 178
},
{
"epoch": 1.279319606087735,
"grad_norm": 5.311177730560303,
"learning_rate": 7.853717026378897e-06,
"loss": 0.2241,
"step": 179
},
{
"epoch": 1.2864816472694718,
"grad_norm": 7.102256774902344,
"learning_rate": 7.841726618705036e-06,
"loss": 0.4413,
"step": 180
},
{
"epoch": 1.2936436884512086,
"grad_norm": 9.19848346710205,
"learning_rate": 7.829736211031177e-06,
"loss": 0.4979,
"step": 181
},
{
"epoch": 1.3008057296329454,
"grad_norm": 6.935247421264648,
"learning_rate": 7.817745803357316e-06,
"loss": 0.3895,
"step": 182
},
{
"epoch": 1.3079677708146822,
"grad_norm": 6.123559951782227,
"learning_rate": 7.805755395683455e-06,
"loss": 0.6249,
"step": 183
},
{
"epoch": 1.315129811996419,
"grad_norm": 4.8054609298706055,
"learning_rate": 7.793764988009594e-06,
"loss": 0.3903,
"step": 184
},
{
"epoch": 1.3222918531781558,
"grad_norm": 3.220245361328125,
"learning_rate": 7.781774580335733e-06,
"loss": 0.2484,
"step": 185
},
{
"epoch": 1.3294538943598926,
"grad_norm": 7.8549346923828125,
"learning_rate": 7.769784172661872e-06,
"loss": 0.2953,
"step": 186
},
{
"epoch": 1.3366159355416294,
"grad_norm": 4.862519264221191,
"learning_rate": 7.75779376498801e-06,
"loss": 0.3309,
"step": 187
},
{
"epoch": 1.3437779767233662,
"grad_norm": 7.397862434387207,
"learning_rate": 7.745803357314148e-06,
"loss": 0.3559,
"step": 188
},
{
"epoch": 1.350940017905103,
"grad_norm": 6.308946132659912,
"learning_rate": 7.733812949640287e-06,
"loss": 0.2876,
"step": 189
},
{
"epoch": 1.3581020590868398,
"grad_norm": 6.779823303222656,
"learning_rate": 7.721822541966428e-06,
"loss": 0.1981,
"step": 190
},
{
"epoch": 1.3652641002685766,
"grad_norm": 7.278501033782959,
"learning_rate": 7.709832134292567e-06,
"loss": 0.424,
"step": 191
},
{
"epoch": 1.3724261414503134,
"grad_norm": 7.412106037139893,
"learning_rate": 7.697841726618706e-06,
"loss": 0.2568,
"step": 192
},
{
"epoch": 1.3795881826320502,
"grad_norm": 4.081076145172119,
"learning_rate": 7.685851318944845e-06,
"loss": 0.227,
"step": 193
},
{
"epoch": 1.386750223813787,
"grad_norm": 10.722938537597656,
"learning_rate": 7.673860911270984e-06,
"loss": 0.4147,
"step": 194
},
{
"epoch": 1.3939122649955238,
"grad_norm": 4.308574676513672,
"learning_rate": 7.661870503597123e-06,
"loss": 0.2981,
"step": 195
},
{
"epoch": 1.4010743061772606,
"grad_norm": 6.543905735015869,
"learning_rate": 7.649880095923262e-06,
"loss": 0.146,
"step": 196
},
{
"epoch": 1.4082363473589974,
"grad_norm": 7.446622371673584,
"learning_rate": 7.637889688249401e-06,
"loss": 0.3292,
"step": 197
},
{
"epoch": 1.4153983885407342,
"grad_norm": 6.272434234619141,
"learning_rate": 7.62589928057554e-06,
"loss": 0.3738,
"step": 198
},
{
"epoch": 1.422560429722471,
"grad_norm": 6.094548225402832,
"learning_rate": 7.613908872901679e-06,
"loss": 0.2389,
"step": 199
},
{
"epoch": 1.4297224709042076,
"grad_norm": 6.979650020599365,
"learning_rate": 7.601918465227819e-06,
"loss": 0.2491,
"step": 200
},
{
"epoch": 1.4368845120859444,
"grad_norm": 7.520836353302002,
"learning_rate": 7.589928057553958e-06,
"loss": 0.2936,
"step": 201
},
{
"epoch": 1.4440465532676812,
"grad_norm": 9.974089622497559,
"learning_rate": 7.5779376498800964e-06,
"loss": 0.372,
"step": 202
},
{
"epoch": 1.451208594449418,
"grad_norm": 8.374258995056152,
"learning_rate": 7.5659472422062355e-06,
"loss": 0.2795,
"step": 203
},
{
"epoch": 1.4583706356311548,
"grad_norm": 6.120067119598389,
"learning_rate": 7.5539568345323745e-06,
"loss": 0.2502,
"step": 204
},
{
"epoch": 1.4655326768128916,
"grad_norm": 2.661911725997925,
"learning_rate": 7.5419664268585136e-06,
"loss": 0.2178,
"step": 205
},
{
"epoch": 1.4726947179946284,
"grad_norm": 5.33140754699707,
"learning_rate": 7.529976019184653e-06,
"loss": 0.2776,
"step": 206
},
{
"epoch": 1.4798567591763652,
"grad_norm": 6.527672290802002,
"learning_rate": 7.517985611510792e-06,
"loss": 0.2005,
"step": 207
},
{
"epoch": 1.487018800358102,
"grad_norm": 3.9632420539855957,
"learning_rate": 7.505995203836931e-06,
"loss": 0.1922,
"step": 208
},
{
"epoch": 1.4941808415398388,
"grad_norm": 5.8358330726623535,
"learning_rate": 7.4940047961630706e-06,
"loss": 0.327,
"step": 209
},
{
"epoch": 1.5013428827215756,
"grad_norm": 5.2152557373046875,
"learning_rate": 7.48201438848921e-06,
"loss": 0.3027,
"step": 210
},
{
"epoch": 1.5085049239033124,
"grad_norm": 8.100699424743652,
"learning_rate": 7.470023980815349e-06,
"loss": 0.2879,
"step": 211
},
{
"epoch": 1.5156669650850492,
"grad_norm": 7.577643871307373,
"learning_rate": 7.458033573141488e-06,
"loss": 0.3307,
"step": 212
},
{
"epoch": 1.522829006266786,
"grad_norm": 7.359758377075195,
"learning_rate": 7.446043165467627e-06,
"loss": 0.2822,
"step": 213
},
{
"epoch": 1.5299910474485228,
"grad_norm": 5.6610612869262695,
"learning_rate": 7.434052757793766e-06,
"loss": 0.4894,
"step": 214
},
{
"epoch": 1.5371530886302596,
"grad_norm": 3.8541088104248047,
"learning_rate": 7.422062350119905e-06,
"loss": 0.2774,
"step": 215
},
{
"epoch": 1.5443151298119964,
"grad_norm": 6.980274200439453,
"learning_rate": 7.410071942446043e-06,
"loss": 0.4373,
"step": 216
},
{
"epoch": 1.5514771709937332,
"grad_norm": 6.333699703216553,
"learning_rate": 7.398081534772182e-06,
"loss": 0.4068,
"step": 217
},
{
"epoch": 1.55863921217547,
"grad_norm": 8.193052291870117,
"learning_rate": 7.386091127098322e-06,
"loss": 0.2872,
"step": 218
},
{
"epoch": 1.5658012533572068,
"grad_norm": 8.756412506103516,
"learning_rate": 7.374100719424461e-06,
"loss": 0.3259,
"step": 219
},
{
"epoch": 1.5729632945389436,
"grad_norm": 5.721433639526367,
"learning_rate": 7.3621103117506e-06,
"loss": 0.2389,
"step": 220
},
{
"epoch": 1.5801253357206804,
"grad_norm": 4.392519474029541,
"learning_rate": 7.350119904076739e-06,
"loss": 0.2862,
"step": 221
},
{
"epoch": 1.5872873769024172,
"grad_norm": 7.019931316375732,
"learning_rate": 7.338129496402878e-06,
"loss": 0.4388,
"step": 222
},
{
"epoch": 1.594449418084154,
"grad_norm": 6.2576470375061035,
"learning_rate": 7.326139088729017e-06,
"loss": 0.1563,
"step": 223
},
{
"epoch": 1.6016114592658908,
"grad_norm": 5.387588024139404,
"learning_rate": 7.314148681055156e-06,
"loss": 0.23,
"step": 224
},
{
"epoch": 1.6087735004476276,
"grad_norm": 4.647127151489258,
"learning_rate": 7.302158273381296e-06,
"loss": 0.2865,
"step": 225
},
{
"epoch": 1.6159355416293644,
"grad_norm": 4.853245735168457,
"learning_rate": 7.290167865707435e-06,
"loss": 0.2604,
"step": 226
},
{
"epoch": 1.6230975828111012,
"grad_norm": 7.052027225494385,
"learning_rate": 7.278177458033574e-06,
"loss": 0.3141,
"step": 227
},
{
"epoch": 1.630259623992838,
"grad_norm": 4.379312515258789,
"learning_rate": 7.266187050359713e-06,
"loss": 0.2006,
"step": 228
},
{
"epoch": 1.6374216651745748,
"grad_norm": 8.26744556427002,
"learning_rate": 7.254196642685852e-06,
"loss": 0.2165,
"step": 229
},
{
"epoch": 1.6445837063563116,
"grad_norm": 5.053855895996094,
"learning_rate": 7.242206235011991e-06,
"loss": 0.3284,
"step": 230
},
{
"epoch": 1.6517457475380484,
"grad_norm": 9.649201393127441,
"learning_rate": 7.230215827338129e-06,
"loss": 0.3844,
"step": 231
},
{
"epoch": 1.6589077887197852,
"grad_norm": 4.344612121582031,
"learning_rate": 7.218225419664268e-06,
"loss": 0.2531,
"step": 232
},
{
"epoch": 1.666069829901522,
"grad_norm": 3.039994955062866,
"learning_rate": 7.206235011990408e-06,
"loss": 0.1779,
"step": 233
},
{
"epoch": 1.6732318710832588,
"grad_norm": 15.308979034423828,
"learning_rate": 7.194244604316547e-06,
"loss": 0.2409,
"step": 234
},
{
"epoch": 1.6803939122649956,
"grad_norm": 6.968617916107178,
"learning_rate": 7.182254196642686e-06,
"loss": 0.2493,
"step": 235
},
{
"epoch": 1.6875559534467324,
"grad_norm": 5.991739749908447,
"learning_rate": 7.170263788968825e-06,
"loss": 0.2102,
"step": 236
},
{
"epoch": 1.6947179946284692,
"grad_norm": 7.561839580535889,
"learning_rate": 7.1582733812949644e-06,
"loss": 0.5371,
"step": 237
},
{
"epoch": 1.701880035810206,
"grad_norm": 7.78062105178833,
"learning_rate": 7.1462829736211035e-06,
"loss": 0.3472,
"step": 238
},
{
"epoch": 1.7090420769919428,
"grad_norm": 9.596529960632324,
"learning_rate": 7.1342925659472425e-06,
"loss": 0.4896,
"step": 239
},
{
"epoch": 1.7162041181736796,
"grad_norm": 4.697896480560303,
"learning_rate": 7.122302158273382e-06,
"loss": 0.2315,
"step": 240
},
{
"epoch": 1.7233661593554164,
"grad_norm": 7.397878646850586,
"learning_rate": 7.1103117505995214e-06,
"loss": 0.273,
"step": 241
},
{
"epoch": 1.7305282005371532,
"grad_norm": 8.375514030456543,
"learning_rate": 7.0983213429256605e-06,
"loss": 0.5812,
"step": 242
},
{
"epoch": 1.73769024171889,
"grad_norm": 3.8642501831054688,
"learning_rate": 7.0863309352517995e-06,
"loss": 0.233,
"step": 243
},
{
"epoch": 1.7448522829006268,
"grad_norm": 11.222294807434082,
"learning_rate": 7.0743405275779385e-06,
"loss": 0.4143,
"step": 244
},
{
"epoch": 1.7520143240823636,
"grad_norm": 4.19779634475708,
"learning_rate": 7.062350119904078e-06,
"loss": 0.216,
"step": 245
},
{
"epoch": 1.7591763652641004,
"grad_norm": 8.382499694824219,
"learning_rate": 7.050359712230216e-06,
"loss": 0.2655,
"step": 246
},
{
"epoch": 1.7663384064458372,
"grad_norm": 4.434614658355713,
"learning_rate": 7.038369304556355e-06,
"loss": 0.1881,
"step": 247
},
{
"epoch": 1.773500447627574,
"grad_norm": 9.419454574584961,
"learning_rate": 7.026378896882494e-06,
"loss": 0.4074,
"step": 248
},
{
"epoch": 1.7806624888093108,
"grad_norm": 5.077871799468994,
"learning_rate": 7.014388489208634e-06,
"loss": 0.1809,
"step": 249
},
{
"epoch": 1.7878245299910476,
"grad_norm": 8.029293060302734,
"learning_rate": 7.002398081534773e-06,
"loss": 0.3477,
"step": 250
},
{
"epoch": 1.7949865711727844,
"grad_norm": 7.078944206237793,
"learning_rate": 6.990407673860912e-06,
"loss": 0.2644,
"step": 251
},
{
"epoch": 1.8021486123545212,
"grad_norm": 4.196276664733887,
"learning_rate": 6.978417266187051e-06,
"loss": 0.3626,
"step": 252
},
{
"epoch": 1.809310653536258,
"grad_norm": 4.02357816696167,
"learning_rate": 6.96642685851319e-06,
"loss": 0.219,
"step": 253
},
{
"epoch": 1.8164726947179948,
"grad_norm": 6.769551753997803,
"learning_rate": 6.954436450839329e-06,
"loss": 0.2301,
"step": 254
},
{
"epoch": 1.8236347358997316,
"grad_norm": 4.0238118171691895,
"learning_rate": 6.942446043165468e-06,
"loss": 0.2041,
"step": 255
},
{
"epoch": 1.8307967770814684,
"grad_norm": 4.813575744628906,
"learning_rate": 6.930455635491608e-06,
"loss": 0.3333,
"step": 256
},
{
"epoch": 1.8379588182632052,
"grad_norm": 7.067838668823242,
"learning_rate": 6.918465227817747e-06,
"loss": 0.2188,
"step": 257
},
{
"epoch": 1.845120859444942,
"grad_norm": 5.983901023864746,
"learning_rate": 6.906474820143886e-06,
"loss": 0.1423,
"step": 258
},
{
"epoch": 1.8522829006266786,
"grad_norm": 8.536234855651855,
"learning_rate": 6.894484412470025e-06,
"loss": 0.1956,
"step": 259
},
{
"epoch": 1.8594449418084154,
"grad_norm": 6.614276885986328,
"learning_rate": 6.882494004796164e-06,
"loss": 0.3024,
"step": 260
},
{
"epoch": 1.8666069829901522,
"grad_norm": 7.686590671539307,
"learning_rate": 6.870503597122302e-06,
"loss": 0.2499,
"step": 261
},
{
"epoch": 1.873769024171889,
"grad_norm": 2.95554256439209,
"learning_rate": 6.858513189448441e-06,
"loss": 0.1194,
"step": 262
},
{
"epoch": 1.8809310653536258,
"grad_norm": 12.062966346740723,
"learning_rate": 6.84652278177458e-06,
"loss": 0.5564,
"step": 263
},
{
"epoch": 1.8880931065353626,
"grad_norm": 6.321432590484619,
"learning_rate": 6.834532374100719e-06,
"loss": 0.3792,
"step": 264
},
{
"epoch": 1.8952551477170994,
"grad_norm": 4.885496139526367,
"learning_rate": 6.822541966426859e-06,
"loss": 0.1913,
"step": 265
},
{
"epoch": 1.9024171888988362,
"grad_norm": 8.380147933959961,
"learning_rate": 6.810551558752998e-06,
"loss": 0.3116,
"step": 266
},
{
"epoch": 1.909579230080573,
"grad_norm": 10.321130752563477,
"learning_rate": 6.798561151079137e-06,
"loss": 0.3192,
"step": 267
},
{
"epoch": 1.9167412712623098,
"grad_norm": 8.94253158569336,
"learning_rate": 6.786570743405276e-06,
"loss": 0.4641,
"step": 268
},
{
"epoch": 1.9239033124440466,
"grad_norm": 7.346745014190674,
"learning_rate": 6.774580335731415e-06,
"loss": 0.2584,
"step": 269
},
{
"epoch": 1.9310653536257834,
"grad_norm": 25.712692260742188,
"learning_rate": 6.762589928057554e-06,
"loss": 0.4284,
"step": 270
},
{
"epoch": 1.9382273948075202,
"grad_norm": 4.491262912750244,
"learning_rate": 6.750599520383694e-06,
"loss": 0.1754,
"step": 271
},
{
"epoch": 1.945389435989257,
"grad_norm": 14.674046516418457,
"learning_rate": 6.738609112709833e-06,
"loss": 0.3855,
"step": 272
},
{
"epoch": 1.9525514771709938,
"grad_norm": 11.2119722366333,
"learning_rate": 6.726618705035972e-06,
"loss": 0.3796,
"step": 273
},
{
"epoch": 1.9597135183527306,
"grad_norm": 9.643905639648438,
"learning_rate": 6.714628297362111e-06,
"loss": 0.3383,
"step": 274
},
{
"epoch": 1.9668755595344674,
"grad_norm": 9.514199256896973,
"learning_rate": 6.70263788968825e-06,
"loss": 0.2208,
"step": 275
},
{
"epoch": 1.9740376007162042,
"grad_norm": 10.071746826171875,
"learning_rate": 6.6906474820143886e-06,
"loss": 0.2416,
"step": 276
},
{
"epoch": 1.981199641897941,
"grad_norm": 6.356027603149414,
"learning_rate": 6.678657074340528e-06,
"loss": 0.3232,
"step": 277
},
{
"epoch": 1.9883616830796778,
"grad_norm": 1.8917155265808105,
"learning_rate": 6.666666666666667e-06,
"loss": 0.0916,
"step": 278
},
{
"epoch": 1.9955237242614146,
"grad_norm": 10.453146934509277,
"learning_rate": 6.654676258992806e-06,
"loss": 0.3523,
"step": 279
},
{
"epoch": 2.0,
"grad_norm": 7.3855509757995605,
"learning_rate": 6.6426858513189456e-06,
"loss": 0.2806,
"step": 280
},
{
"epoch": 2.0,
"eval_accuracy": 0.8752515090543259,
"eval_loss": 0.3062034249305725,
"eval_runtime": 13.2652,
"eval_samples_per_second": 37.466,
"eval_steps_per_second": 37.466,
"step": 280
},
{
"epoch": 2.007162041181737,
"grad_norm": 12.951683044433594,
"learning_rate": 6.630695443645085e-06,
"loss": 0.6262,
"step": 281
},
{
"epoch": 2.0143240823634736,
"grad_norm": 6.761510372161865,
"learning_rate": 6.618705035971224e-06,
"loss": 0.202,
"step": 282
},
{
"epoch": 2.0214861235452104,
"grad_norm": 9.198619842529297,
"learning_rate": 6.606714628297363e-06,
"loss": 0.271,
"step": 283
},
{
"epoch": 2.028648164726947,
"grad_norm": 11.052300453186035,
"learning_rate": 6.594724220623502e-06,
"loss": 0.3322,
"step": 284
},
{
"epoch": 2.035810205908684,
"grad_norm": 5.700530529022217,
"learning_rate": 6.582733812949641e-06,
"loss": 0.1473,
"step": 285
},
{
"epoch": 2.042972247090421,
"grad_norm": 3.494630813598633,
"learning_rate": 6.57074340527578e-06,
"loss": 0.1127,
"step": 286
},
{
"epoch": 2.0501342882721576,
"grad_norm": 4.829661846160889,
"learning_rate": 6.55875299760192e-06,
"loss": 0.1499,
"step": 287
},
{
"epoch": 2.0572963294538944,
"grad_norm": 2.1077747344970703,
"learning_rate": 6.546762589928059e-06,
"loss": 0.1221,
"step": 288
},
{
"epoch": 2.064458370635631,
"grad_norm": 8.28569221496582,
"learning_rate": 6.534772182254198e-06,
"loss": 0.3386,
"step": 289
},
{
"epoch": 2.071620411817368,
"grad_norm": 6.121078968048096,
"learning_rate": 6.522781774580337e-06,
"loss": 0.2158,
"step": 290
},
{
"epoch": 2.078782452999105,
"grad_norm": 7.616806507110596,
"learning_rate": 6.510791366906475e-06,
"loss": 0.241,
"step": 291
},
{
"epoch": 2.0859444941808416,
"grad_norm": 10.66478443145752,
"learning_rate": 6.498800959232614e-06,
"loss": 0.2952,
"step": 292
},
{
"epoch": 2.0931065353625784,
"grad_norm": 4.883520126342773,
"learning_rate": 6.486810551558753e-06,
"loss": 0.1603,
"step": 293
},
{
"epoch": 2.100268576544315,
"grad_norm": 2.091519594192505,
"learning_rate": 6.474820143884892e-06,
"loss": 0.0847,
"step": 294
},
{
"epoch": 2.107430617726052,
"grad_norm": 6.887018203735352,
"learning_rate": 6.462829736211031e-06,
"loss": 0.2551,
"step": 295
},
{
"epoch": 2.114592658907789,
"grad_norm": 12.55301284790039,
"learning_rate": 6.450839328537171e-06,
"loss": 0.305,
"step": 296
},
{
"epoch": 2.1217547000895256,
"grad_norm": 5.844974517822266,
"learning_rate": 6.43884892086331e-06,
"loss": 0.3444,
"step": 297
},
{
"epoch": 2.1289167412712624,
"grad_norm": 10.359457969665527,
"learning_rate": 6.426858513189449e-06,
"loss": 0.3414,
"step": 298
},
{
"epoch": 2.136078782452999,
"grad_norm": 5.8697686195373535,
"learning_rate": 6.414868105515588e-06,
"loss": 0.2207,
"step": 299
},
{
"epoch": 2.143240823634736,
"grad_norm": 8.140408515930176,
"learning_rate": 6.402877697841727e-06,
"loss": 0.1734,
"step": 300
},
{
"epoch": 2.150402864816473,
"grad_norm": 11.230256080627441,
"learning_rate": 6.390887290167866e-06,
"loss": 0.2641,
"step": 301
},
{
"epoch": 2.1575649059982096,
"grad_norm": 7.876967430114746,
"learning_rate": 6.378896882494005e-06,
"loss": 0.5413,
"step": 302
},
{
"epoch": 2.1647269471799464,
"grad_norm": 4.870555400848389,
"learning_rate": 6.366906474820145e-06,
"loss": 0.1618,
"step": 303
},
{
"epoch": 2.171888988361683,
"grad_norm": 6.328149795532227,
"learning_rate": 6.354916067146284e-06,
"loss": 0.1336,
"step": 304
},
{
"epoch": 2.17905102954342,
"grad_norm": 11.720122337341309,
"learning_rate": 6.342925659472423e-06,
"loss": 0.3316,
"step": 305
},
{
"epoch": 2.186213070725157,
"grad_norm": 4.192595958709717,
"learning_rate": 6.330935251798561e-06,
"loss": 0.2262,
"step": 306
},
{
"epoch": 2.1933751119068936,
"grad_norm": 7.089945316314697,
"learning_rate": 6.3189448441247e-06,
"loss": 0.3939,
"step": 307
},
{
"epoch": 2.2005371530886304,
"grad_norm": 3.7617053985595703,
"learning_rate": 6.3069544364508395e-06,
"loss": 0.129,
"step": 308
},
{
"epoch": 2.207699194270367,
"grad_norm": 2.709459066390991,
"learning_rate": 6.2949640287769785e-06,
"loss": 0.106,
"step": 309
},
{
"epoch": 2.214861235452104,
"grad_norm": 3.1630465984344482,
"learning_rate": 6.2829736211031175e-06,
"loss": 0.1199,
"step": 310
},
{
"epoch": 2.222023276633841,
"grad_norm": 8.380224227905273,
"learning_rate": 6.2709832134292566e-06,
"loss": 0.1654,
"step": 311
},
{
"epoch": 2.2291853178155776,
"grad_norm": 6.432971000671387,
"learning_rate": 6.2589928057553964e-06,
"loss": 0.2642,
"step": 312
},
{
"epoch": 2.2363473589973144,
"grad_norm": 8.790579795837402,
"learning_rate": 6.2470023980815355e-06,
"loss": 0.2575,
"step": 313
},
{
"epoch": 2.243509400179051,
"grad_norm": 6.955909252166748,
"learning_rate": 6.2350119904076745e-06,
"loss": 0.1253,
"step": 314
},
{
"epoch": 2.250671441360788,
"grad_norm": 4.945182800292969,
"learning_rate": 6.2230215827338136e-06,
"loss": 0.23,
"step": 315
},
{
"epoch": 2.257833482542525,
"grad_norm": 6.262834072113037,
"learning_rate": 6.211031175059953e-06,
"loss": 0.2261,
"step": 316
},
{
"epoch": 2.2649955237242616,
"grad_norm": 6.618960380554199,
"learning_rate": 6.199040767386092e-06,
"loss": 0.2217,
"step": 317
},
{
"epoch": 2.2721575649059984,
"grad_norm": 16.147371292114258,
"learning_rate": 6.1870503597122315e-06,
"loss": 0.3444,
"step": 318
},
{
"epoch": 2.2793196060877348,
"grad_norm": 9.18713092803955,
"learning_rate": 6.1750599520383706e-06,
"loss": 0.1846,
"step": 319
},
{
"epoch": 2.286481647269472,
"grad_norm": 4.649369716644287,
"learning_rate": 6.16306954436451e-06,
"loss": 0.2223,
"step": 320
},
{
"epoch": 2.2936436884512084,
"grad_norm": 4.171594619750977,
"learning_rate": 6.151079136690648e-06,
"loss": 0.1455,
"step": 321
},
{
"epoch": 2.3008057296329456,
"grad_norm": 9.076400756835938,
"learning_rate": 6.139088729016787e-06,
"loss": 0.2683,
"step": 322
},
{
"epoch": 2.307967770814682,
"grad_norm": 8.380135536193848,
"learning_rate": 6.127098321342926e-06,
"loss": 0.2054,
"step": 323
},
{
"epoch": 2.315129811996419,
"grad_norm": 8.534207344055176,
"learning_rate": 6.115107913669065e-06,
"loss": 0.1961,
"step": 324
},
{
"epoch": 2.3222918531781556,
"grad_norm": 5.501217365264893,
"learning_rate": 6.103117505995204e-06,
"loss": 0.1083,
"step": 325
},
{
"epoch": 2.329453894359893,
"grad_norm": 3.4403066635131836,
"learning_rate": 6.091127098321343e-06,
"loss": 0.0808,
"step": 326
},
{
"epoch": 2.336615935541629,
"grad_norm": 9.280027389526367,
"learning_rate": 6.079136690647483e-06,
"loss": 0.2427,
"step": 327
},
{
"epoch": 2.3437779767233664,
"grad_norm": 6.337725639343262,
"learning_rate": 6.067146282973622e-06,
"loss": 0.2028,
"step": 328
},
{
"epoch": 2.3509400179051028,
"grad_norm": 4.871727466583252,
"learning_rate": 6.055155875299761e-06,
"loss": 0.2069,
"step": 329
},
{
"epoch": 2.3581020590868396,
"grad_norm": 1.3199633359909058,
"learning_rate": 6.0431654676259e-06,
"loss": 0.0468,
"step": 330
},
{
"epoch": 2.3652641002685764,
"grad_norm": 7.1639790534973145,
"learning_rate": 6.031175059952039e-06,
"loss": 0.271,
"step": 331
},
{
"epoch": 2.372426141450313,
"grad_norm": 4.194430351257324,
"learning_rate": 6.019184652278178e-06,
"loss": 0.2205,
"step": 332
},
{
"epoch": 2.37958818263205,
"grad_norm": 5.920024871826172,
"learning_rate": 6.007194244604317e-06,
"loss": 0.3546,
"step": 333
},
{
"epoch": 2.3867502238137868,
"grad_norm": 7.114736557006836,
"learning_rate": 5.995203836930457e-06,
"loss": 0.234,
"step": 334
},
{
"epoch": 2.3939122649955236,
"grad_norm": 6.038070201873779,
"learning_rate": 5.983213429256596e-06,
"loss": 0.2333,
"step": 335
},
{
"epoch": 2.4010743061772604,
"grad_norm": 7.507596969604492,
"learning_rate": 5.971223021582734e-06,
"loss": 0.1927,
"step": 336
},
{
"epoch": 2.408236347358997,
"grad_norm": 5.830931663513184,
"learning_rate": 5.959232613908873e-06,
"loss": 0.123,
"step": 337
},
{
"epoch": 2.415398388540734,
"grad_norm": 5.003695964813232,
"learning_rate": 5.947242206235012e-06,
"loss": 0.1651,
"step": 338
},
{
"epoch": 2.4225604297224708,
"grad_norm": 5.298439979553223,
"learning_rate": 5.935251798561151e-06,
"loss": 0.2354,
"step": 339
},
{
"epoch": 2.4297224709042076,
"grad_norm": 8.523574829101562,
"learning_rate": 5.92326139088729e-06,
"loss": 0.206,
"step": 340
},
{
"epoch": 2.4368845120859444,
"grad_norm": 6.604531764984131,
"learning_rate": 5.911270983213429e-06,
"loss": 0.2668,
"step": 341
},
{
"epoch": 2.444046553267681,
"grad_norm": 5.022377014160156,
"learning_rate": 5.899280575539568e-06,
"loss": 0.111,
"step": 342
},
{
"epoch": 2.451208594449418,
"grad_norm": 5.974075794219971,
"learning_rate": 5.887290167865708e-06,
"loss": 0.2463,
"step": 343
},
{
"epoch": 2.4583706356311548,
"grad_norm": 6.426002025604248,
"learning_rate": 5.875299760191847e-06,
"loss": 0.1471,
"step": 344
},
{
"epoch": 2.4655326768128916,
"grad_norm": 4.997922420501709,
"learning_rate": 5.863309352517986e-06,
"loss": 0.2057,
"step": 345
},
{
"epoch": 2.4726947179946284,
"grad_norm": 9.264660835266113,
"learning_rate": 5.851318944844125e-06,
"loss": 0.3078,
"step": 346
},
{
"epoch": 2.479856759176365,
"grad_norm": 4.701541423797607,
"learning_rate": 5.8393285371702644e-06,
"loss": 0.2177,
"step": 347
},
{
"epoch": 2.487018800358102,
"grad_norm": 11.523097038269043,
"learning_rate": 5.8273381294964035e-06,
"loss": 0.2449,
"step": 348
},
{
"epoch": 2.4941808415398388,
"grad_norm": 3.427780866622925,
"learning_rate": 5.8153477218225425e-06,
"loss": 0.2195,
"step": 349
},
{
"epoch": 2.5013428827215756,
"grad_norm": 10.555095672607422,
"learning_rate": 5.803357314148681e-06,
"loss": 0.2723,
"step": 350
},
{
"epoch": 2.5085049239033124,
"grad_norm": 8.235042572021484,
"learning_rate": 5.79136690647482e-06,
"loss": 0.2281,
"step": 351
},
{
"epoch": 2.515666965085049,
"grad_norm": 6.958029270172119,
"learning_rate": 5.77937649880096e-06,
"loss": 0.3334,
"step": 352
},
{
"epoch": 2.522829006266786,
"grad_norm": 4.380754470825195,
"learning_rate": 5.767386091127099e-06,
"loss": 0.1564,
"step": 353
},
{
"epoch": 2.5299910474485228,
"grad_norm": 3.2662181854248047,
"learning_rate": 5.755395683453238e-06,
"loss": 0.1156,
"step": 354
},
{
"epoch": 2.5371530886302596,
"grad_norm": 5.031314849853516,
"learning_rate": 5.743405275779377e-06,
"loss": 0.122,
"step": 355
},
{
"epoch": 2.5443151298119964,
"grad_norm": 5.606161117553711,
"learning_rate": 5.731414868105516e-06,
"loss": 0.2256,
"step": 356
},
{
"epoch": 2.551477170993733,
"grad_norm": 1.6429898738861084,
"learning_rate": 5.719424460431655e-06,
"loss": 0.0744,
"step": 357
},
{
"epoch": 2.55863921217547,
"grad_norm": 4.822231769561768,
"learning_rate": 5.707434052757795e-06,
"loss": 0.198,
"step": 358
},
{
"epoch": 2.5658012533572068,
"grad_norm": 7.1267991065979,
"learning_rate": 5.695443645083934e-06,
"loss": 0.3007,
"step": 359
},
{
"epoch": 2.5729632945389436,
"grad_norm": 7.829804420471191,
"learning_rate": 5.683453237410073e-06,
"loss": 0.3531,
"step": 360
},
{
"epoch": 2.5801253357206804,
"grad_norm": 10.854474067687988,
"learning_rate": 5.671462829736212e-06,
"loss": 0.21,
"step": 361
},
{
"epoch": 2.587287376902417,
"grad_norm": 2.879049777984619,
"learning_rate": 5.659472422062351e-06,
"loss": 0.158,
"step": 362
},
{
"epoch": 2.594449418084154,
"grad_norm": 9.582825660705566,
"learning_rate": 5.64748201438849e-06,
"loss": 0.1221,
"step": 363
},
{
"epoch": 2.6016114592658908,
"grad_norm": 13.478569984436035,
"learning_rate": 5.635491606714629e-06,
"loss": 0.3543,
"step": 364
},
{
"epoch": 2.6087735004476276,
"grad_norm": 13.520654678344727,
"learning_rate": 5.623501199040767e-06,
"loss": 0.2749,
"step": 365
},
{
"epoch": 2.6159355416293644,
"grad_norm": 9.655123710632324,
"learning_rate": 5.611510791366906e-06,
"loss": 0.3296,
"step": 366
},
{
"epoch": 2.623097582811101,
"grad_norm": 8.334728240966797,
"learning_rate": 5.599520383693046e-06,
"loss": 0.201,
"step": 367
},
{
"epoch": 2.630259623992838,
"grad_norm": 9.50122356414795,
"learning_rate": 5.587529976019185e-06,
"loss": 0.1588,
"step": 368
},
{
"epoch": 2.6374216651745748,
"grad_norm": 6.593572616577148,
"learning_rate": 5.575539568345324e-06,
"loss": 0.1869,
"step": 369
},
{
"epoch": 2.6445837063563116,
"grad_norm": 5.8361735343933105,
"learning_rate": 5.563549160671463e-06,
"loss": 0.1161,
"step": 370
},
{
"epoch": 2.6517457475380484,
"grad_norm": 6.58205509185791,
"learning_rate": 5.551558752997602e-06,
"loss": 0.1315,
"step": 371
},
{
"epoch": 2.658907788719785,
"grad_norm": 10.432963371276855,
"learning_rate": 5.539568345323741e-06,
"loss": 0.4753,
"step": 372
},
{
"epoch": 2.666069829901522,
"grad_norm": 10.057262420654297,
"learning_rate": 5.52757793764988e-06,
"loss": 0.2506,
"step": 373
},
{
"epoch": 2.6732318710832588,
"grad_norm": 6.905503749847412,
"learning_rate": 5.51558752997602e-06,
"loss": 0.2124,
"step": 374
},
{
"epoch": 2.6803939122649956,
"grad_norm": 7.519169807434082,
"learning_rate": 5.503597122302159e-06,
"loss": 0.307,
"step": 375
},
{
"epoch": 2.6875559534467324,
"grad_norm": 5.79476261138916,
"learning_rate": 5.491606714628298e-06,
"loss": 0.0908,
"step": 376
},
{
"epoch": 2.694717994628469,
"grad_norm": 10.047815322875977,
"learning_rate": 5.479616306954437e-06,
"loss": 0.2494,
"step": 377
},
{
"epoch": 2.701880035810206,
"grad_norm": 3.0948004722595215,
"learning_rate": 5.467625899280576e-06,
"loss": 0.0827,
"step": 378
},
{
"epoch": 2.7090420769919428,
"grad_norm": 6.911858558654785,
"learning_rate": 5.455635491606715e-06,
"loss": 0.2022,
"step": 379
},
{
"epoch": 2.7162041181736796,
"grad_norm": 13.745474815368652,
"learning_rate": 5.4436450839328535e-06,
"loss": 0.2462,
"step": 380
},
{
"epoch": 2.7233661593554164,
"grad_norm": 9.535720825195312,
"learning_rate": 5.4316546762589925e-06,
"loss": 0.442,
"step": 381
},
{
"epoch": 2.730528200537153,
"grad_norm": 4.10536527633667,
"learning_rate": 5.4196642685851316e-06,
"loss": 0.3275,
"step": 382
},
{
"epoch": 2.73769024171889,
"grad_norm": 5.548096656799316,
"learning_rate": 5.4076738609112715e-06,
"loss": 0.1592,
"step": 383
},
{
"epoch": 2.7448522829006268,
"grad_norm": 5.5715179443359375,
"learning_rate": 5.3956834532374105e-06,
"loss": 0.3444,
"step": 384
},
{
"epoch": 2.7520143240823636,
"grad_norm": 5.943526744842529,
"learning_rate": 5.3836930455635495e-06,
"loss": 0.2461,
"step": 385
},
{
"epoch": 2.7591763652641004,
"grad_norm": 4.913482189178467,
"learning_rate": 5.3717026378896886e-06,
"loss": 0.162,
"step": 386
},
{
"epoch": 2.766338406445837,
"grad_norm": 5.035208702087402,
"learning_rate": 5.359712230215828e-06,
"loss": 0.1334,
"step": 387
},
{
"epoch": 2.773500447627574,
"grad_norm": 7.2981390953063965,
"learning_rate": 5.347721822541967e-06,
"loss": 0.1701,
"step": 388
},
{
"epoch": 2.7806624888093108,
"grad_norm": 7.505518913269043,
"learning_rate": 5.335731414868106e-06,
"loss": 0.1757,
"step": 389
},
{
"epoch": 2.7878245299910476,
"grad_norm": 3.761885404586792,
"learning_rate": 5.3237410071942456e-06,
"loss": 0.2144,
"step": 390
},
{
"epoch": 2.7949865711727844,
"grad_norm": 3.8127267360687256,
"learning_rate": 5.311750599520385e-06,
"loss": 0.0838,
"step": 391
},
{
"epoch": 2.802148612354521,
"grad_norm": 6.290581226348877,
"learning_rate": 5.299760191846524e-06,
"loss": 0.1923,
"step": 392
},
{
"epoch": 2.809310653536258,
"grad_norm": 9.498542785644531,
"learning_rate": 5.287769784172663e-06,
"loss": 0.2806,
"step": 393
},
{
"epoch": 2.8164726947179948,
"grad_norm": 5.22800874710083,
"learning_rate": 5.275779376498802e-06,
"loss": 0.2014,
"step": 394
},
{
"epoch": 2.8236347358997316,
"grad_norm": 3.5994250774383545,
"learning_rate": 5.26378896882494e-06,
"loss": 0.1525,
"step": 395
},
{
"epoch": 2.8307967770814684,
"grad_norm": 8.245004653930664,
"learning_rate": 5.251798561151079e-06,
"loss": 0.2496,
"step": 396
},
{
"epoch": 2.837958818263205,
"grad_norm": 13.4406156539917,
"learning_rate": 5.239808153477218e-06,
"loss": 0.4056,
"step": 397
},
{
"epoch": 2.845120859444942,
"grad_norm": 5.844360828399658,
"learning_rate": 5.227817745803357e-06,
"loss": 0.1751,
"step": 398
},
{
"epoch": 2.8522829006266788,
"grad_norm": 9.290830612182617,
"learning_rate": 5.215827338129497e-06,
"loss": 0.3863,
"step": 399
},
{
"epoch": 2.859444941808415,
"grad_norm": 6.542262554168701,
"learning_rate": 5.203836930455636e-06,
"loss": 0.1952,
"step": 400
},
{
"epoch": 2.8666069829901524,
"grad_norm": 7.181087493896484,
"learning_rate": 5.191846522781775e-06,
"loss": 0.3431,
"step": 401
},
{
"epoch": 2.8737690241718887,
"grad_norm": 4.160923957824707,
"learning_rate": 5.179856115107914e-06,
"loss": 0.0841,
"step": 402
},
{
"epoch": 2.880931065353626,
"grad_norm": 10.504855155944824,
"learning_rate": 5.167865707434053e-06,
"loss": 0.2541,
"step": 403
},
{
"epoch": 2.8880931065353623,
"grad_norm": 2.3804681301116943,
"learning_rate": 5.155875299760192e-06,
"loss": 0.0668,
"step": 404
},
{
"epoch": 2.8952551477170996,
"grad_norm": 6.530309200286865,
"learning_rate": 5.143884892086332e-06,
"loss": 0.2772,
"step": 405
},
{
"epoch": 2.902417188898836,
"grad_norm": 9.804174423217773,
"learning_rate": 5.131894484412471e-06,
"loss": 0.2987,
"step": 406
},
{
"epoch": 2.909579230080573,
"grad_norm": 13.136397361755371,
"learning_rate": 5.11990407673861e-06,
"loss": 0.308,
"step": 407
},
{
"epoch": 2.9167412712623095,
"grad_norm": 6.506509304046631,
"learning_rate": 5.107913669064749e-06,
"loss": 0.1547,
"step": 408
},
{
"epoch": 2.9239033124440468,
"grad_norm": 7.714542865753174,
"learning_rate": 5.095923261390888e-06,
"loss": 0.2585,
"step": 409
},
{
"epoch": 2.931065353625783,
"grad_norm": 4.894972801208496,
"learning_rate": 5.083932853717026e-06,
"loss": 0.0756,
"step": 410
},
{
"epoch": 2.9382273948075204,
"grad_norm": 12.52349853515625,
"learning_rate": 5.071942446043165e-06,
"loss": 0.2471,
"step": 411
},
{
"epoch": 2.9453894359892567,
"grad_norm": 9.225778579711914,
"learning_rate": 5.059952038369304e-06,
"loss": 0.2481,
"step": 412
},
{
"epoch": 2.952551477170994,
"grad_norm": 6.006895065307617,
"learning_rate": 5.047961630695443e-06,
"loss": 0.2832,
"step": 413
},
{
"epoch": 2.9597135183527303,
"grad_norm": 5.98204231262207,
"learning_rate": 5.035971223021583e-06,
"loss": 0.2155,
"step": 414
},
{
"epoch": 2.9668755595344676,
"grad_norm": 3.6064493656158447,
"learning_rate": 5.023980815347722e-06,
"loss": 0.1033,
"step": 415
},
{
"epoch": 2.974037600716204,
"grad_norm": 7.625882148742676,
"learning_rate": 5.011990407673861e-06,
"loss": 0.1979,
"step": 416
},
{
"epoch": 2.981199641897941,
"grad_norm": 9.841728210449219,
"learning_rate": 5e-06,
"loss": 0.3448,
"step": 417
},
{
"epoch": 2.9883616830796775,
"grad_norm": 10.841019630432129,
"learning_rate": 4.9880095923261394e-06,
"loss": 0.289,
"step": 418
},
{
"epoch": 2.9955237242614148,
"grad_norm": 10.639505386352539,
"learning_rate": 4.9760191846522785e-06,
"loss": 0.2548,
"step": 419
},
{
"epoch": 3.0,
"grad_norm": 7.099693298339844,
"learning_rate": 4.9640287769784175e-06,
"loss": 0.2351,
"step": 420
},
{
"epoch": 3.0,
"eval_accuracy": 0.8812877263581489,
"eval_loss": 0.2769426703453064,
"eval_runtime": 12.7975,
"eval_samples_per_second": 38.836,
"eval_steps_per_second": 38.836,
"step": 420
},
{
"epoch": 3.007162041181737,
"grad_norm": 8.117796897888184,
"learning_rate": 4.9520383693045566e-06,
"loss": 0.2663,
"step": 421
},
{
"epoch": 3.0143240823634736,
"grad_norm": 8.604570388793945,
"learning_rate": 4.940047961630696e-06,
"loss": 0.1473,
"step": 422
},
{
"epoch": 3.0214861235452104,
"grad_norm": 3.096344232559204,
"learning_rate": 4.928057553956835e-06,
"loss": 0.0984,
"step": 423
},
{
"epoch": 3.028648164726947,
"grad_norm": 6.029706001281738,
"learning_rate": 4.916067146282974e-06,
"loss": 0.3858,
"step": 424
},
{
"epoch": 3.035810205908684,
"grad_norm": 4.804579734802246,
"learning_rate": 4.9040767386091136e-06,
"loss": 0.0763,
"step": 425
},
{
"epoch": 3.042972247090421,
"grad_norm": 5.308372497558594,
"learning_rate": 4.892086330935253e-06,
"loss": 0.2166,
"step": 426
},
{
"epoch": 3.0501342882721576,
"grad_norm": 2.092256784439087,
"learning_rate": 4.880095923261392e-06,
"loss": 0.0637,
"step": 427
},
{
"epoch": 3.0572963294538944,
"grad_norm": 8.963829040527344,
"learning_rate": 4.868105515587531e-06,
"loss": 0.1555,
"step": 428
},
{
"epoch": 3.064458370635631,
"grad_norm": 3.819185256958008,
"learning_rate": 4.856115107913669e-06,
"loss": 0.1032,
"step": 429
},
{
"epoch": 3.071620411817368,
"grad_norm": 5.126814365386963,
"learning_rate": 4.844124700239809e-06,
"loss": 0.1586,
"step": 430
},
{
"epoch": 3.078782452999105,
"grad_norm": 5.945977687835693,
"learning_rate": 4.832134292565948e-06,
"loss": 0.2215,
"step": 431
},
{
"epoch": 3.0859444941808416,
"grad_norm": 7.399024963378906,
"learning_rate": 4.820143884892087e-06,
"loss": 0.1983,
"step": 432
},
{
"epoch": 3.0931065353625784,
"grad_norm": 3.2025091648101807,
"learning_rate": 4.808153477218226e-06,
"loss": 0.0709,
"step": 433
},
{
"epoch": 3.100268576544315,
"grad_norm": 7.664062976837158,
"learning_rate": 4.796163069544365e-06,
"loss": 0.2531,
"step": 434
},
{
"epoch": 3.107430617726052,
"grad_norm": 8.884867668151855,
"learning_rate": 4.784172661870504e-06,
"loss": 0.156,
"step": 435
},
{
"epoch": 3.114592658907789,
"grad_norm": 8.288426399230957,
"learning_rate": 4.772182254196643e-06,
"loss": 0.1259,
"step": 436
},
{
"epoch": 3.1217547000895256,
"grad_norm": 4.304559230804443,
"learning_rate": 4.760191846522782e-06,
"loss": 0.0615,
"step": 437
},
{
"epoch": 3.1289167412712624,
"grad_norm": 8.041988372802734,
"learning_rate": 4.748201438848921e-06,
"loss": 0.2884,
"step": 438
},
{
"epoch": 3.136078782452999,
"grad_norm": 2.3492345809936523,
"learning_rate": 4.73621103117506e-06,
"loss": 0.0363,
"step": 439
},
{
"epoch": 3.143240823634736,
"grad_norm": 7.380918502807617,
"learning_rate": 4.724220623501199e-06,
"loss": 0.1686,
"step": 440
},
{
"epoch": 3.150402864816473,
"grad_norm": 7.309660911560059,
"learning_rate": 4.712230215827339e-06,
"loss": 0.3719,
"step": 441
},
{
"epoch": 3.1575649059982096,
"grad_norm": 8.47828197479248,
"learning_rate": 4.700239808153478e-06,
"loss": 0.1023,
"step": 442
},
{
"epoch": 3.1647269471799464,
"grad_norm": 10.671316146850586,
"learning_rate": 4.688249400479617e-06,
"loss": 0.1549,
"step": 443
},
{
"epoch": 3.171888988361683,
"grad_norm": 8.016914367675781,
"learning_rate": 4.676258992805755e-06,
"loss": 0.1275,
"step": 444
},
{
"epoch": 3.17905102954342,
"grad_norm": 6.72282075881958,
"learning_rate": 4.664268585131895e-06,
"loss": 0.1603,
"step": 445
},
{
"epoch": 3.186213070725157,
"grad_norm": 11.111018180847168,
"learning_rate": 4.652278177458034e-06,
"loss": 0.2358,
"step": 446
},
{
"epoch": 3.1933751119068936,
"grad_norm": 7.229121685028076,
"learning_rate": 4.640287769784173e-06,
"loss": 0.1746,
"step": 447
},
{
"epoch": 3.2005371530886304,
"grad_norm": 4.709452152252197,
"learning_rate": 4.628297362110312e-06,
"loss": 0.0628,
"step": 448
},
{
"epoch": 3.207699194270367,
"grad_norm": 3.247600793838501,
"learning_rate": 4.616306954436451e-06,
"loss": 0.0916,
"step": 449
},
{
"epoch": 3.214861235452104,
"grad_norm": 5.488199710845947,
"learning_rate": 4.60431654676259e-06,
"loss": 0.2426,
"step": 450
},
{
"epoch": 3.222023276633841,
"grad_norm": 4.127665996551514,
"learning_rate": 4.592326139088729e-06,
"loss": 0.1155,
"step": 451
},
{
"epoch": 3.2291853178155776,
"grad_norm": 6.826393127441406,
"learning_rate": 4.580335731414868e-06,
"loss": 0.2058,
"step": 452
},
{
"epoch": 3.2363473589973144,
"grad_norm": 7.5438151359558105,
"learning_rate": 4.5683453237410074e-06,
"loss": 0.0832,
"step": 453
},
{
"epoch": 3.243509400179051,
"grad_norm": 4.956328392028809,
"learning_rate": 4.5563549160671465e-06,
"loss": 0.1414,
"step": 454
},
{
"epoch": 3.250671441360788,
"grad_norm": 5.9591779708862305,
"learning_rate": 4.5443645083932855e-06,
"loss": 0.2873,
"step": 455
},
{
"epoch": 3.257833482542525,
"grad_norm": 7.277362823486328,
"learning_rate": 4.5323741007194245e-06,
"loss": 0.2129,
"step": 456
},
{
"epoch": 3.2649955237242616,
"grad_norm": 4.973333835601807,
"learning_rate": 4.5203836930455644e-06,
"loss": 0.1075,
"step": 457
},
{
"epoch": 3.2721575649059984,
"grad_norm": 6.4229912757873535,
"learning_rate": 4.508393285371703e-06,
"loss": 0.1338,
"step": 458
},
{
"epoch": 3.2793196060877348,
"grad_norm": 7.826179504394531,
"learning_rate": 4.496402877697842e-06,
"loss": 0.0805,
"step": 459
},
{
"epoch": 3.286481647269472,
"grad_norm": 5.4623894691467285,
"learning_rate": 4.484412470023981e-06,
"loss": 0.2125,
"step": 460
},
{
"epoch": 3.2936436884512084,
"grad_norm": 3.689229965209961,
"learning_rate": 4.472422062350121e-06,
"loss": 0.094,
"step": 461
},
{
"epoch": 3.3008057296329456,
"grad_norm": 5.583723068237305,
"learning_rate": 4.46043165467626e-06,
"loss": 0.1595,
"step": 462
},
{
"epoch": 3.307967770814682,
"grad_norm": 6.716935634613037,
"learning_rate": 4.448441247002399e-06,
"loss": 0.2875,
"step": 463
},
{
"epoch": 3.315129811996419,
"grad_norm": 5.648532390594482,
"learning_rate": 4.436450839328538e-06,
"loss": 0.2079,
"step": 464
},
{
"epoch": 3.3222918531781556,
"grad_norm": 9.457056999206543,
"learning_rate": 4.424460431654677e-06,
"loss": 0.3384,
"step": 465
},
{
"epoch": 3.329453894359893,
"grad_norm": 2.3951103687286377,
"learning_rate": 4.412470023980816e-06,
"loss": 0.0539,
"step": 466
},
{
"epoch": 3.336615935541629,
"grad_norm": 4.021335124969482,
"learning_rate": 4.400479616306955e-06,
"loss": 0.067,
"step": 467
},
{
"epoch": 3.3437779767233664,
"grad_norm": 7.070362091064453,
"learning_rate": 4.388489208633094e-06,
"loss": 0.2575,
"step": 468
},
{
"epoch": 3.3509400179051028,
"grad_norm": 4.526058197021484,
"learning_rate": 4.376498800959233e-06,
"loss": 0.1562,
"step": 469
},
{
"epoch": 3.3581020590868396,
"grad_norm": 10.771688461303711,
"learning_rate": 4.364508393285372e-06,
"loss": 0.2787,
"step": 470
},
{
"epoch": 3.3652641002685764,
"grad_norm": 13.02984619140625,
"learning_rate": 4.352517985611511e-06,
"loss": 0.3604,
"step": 471
},
{
"epoch": 3.372426141450313,
"grad_norm": 11.359794616699219,
"learning_rate": 4.340527577937651e-06,
"loss": 0.2501,
"step": 472
},
{
"epoch": 3.37958818263205,
"grad_norm": 3.5190794467926025,
"learning_rate": 4.328537170263789e-06,
"loss": 0.0857,
"step": 473
},
{
"epoch": 3.3867502238137868,
"grad_norm": 5.3390374183654785,
"learning_rate": 4.316546762589928e-06,
"loss": 0.2399,
"step": 474
},
{
"epoch": 3.3939122649955236,
"grad_norm": 4.161365509033203,
"learning_rate": 4.304556354916067e-06,
"loss": 0.1279,
"step": 475
},
{
"epoch": 3.4010743061772604,
"grad_norm": 5.861349105834961,
"learning_rate": 4.292565947242206e-06,
"loss": 0.1378,
"step": 476
},
{
"epoch": 3.408236347358997,
"grad_norm": 5.710063457489014,
"learning_rate": 4.280575539568346e-06,
"loss": 0.1279,
"step": 477
},
{
"epoch": 3.415398388540734,
"grad_norm": 2.9877302646636963,
"learning_rate": 4.268585131894485e-06,
"loss": 0.0612,
"step": 478
},
{
"epoch": 3.4225604297224708,
"grad_norm": 5.60707426071167,
"learning_rate": 4.256594724220624e-06,
"loss": 0.0963,
"step": 479
},
{
"epoch": 3.4297224709042076,
"grad_norm": 4.96213960647583,
"learning_rate": 4.244604316546763e-06,
"loss": 0.1391,
"step": 480
},
{
"epoch": 3.4368845120859444,
"grad_norm": 8.810404777526855,
"learning_rate": 4.232613908872902e-06,
"loss": 0.2233,
"step": 481
},
{
"epoch": 3.444046553267681,
"grad_norm": 9.261563301086426,
"learning_rate": 4.220623501199041e-06,
"loss": 0.1481,
"step": 482
},
{
"epoch": 3.451208594449418,
"grad_norm": 5.470502853393555,
"learning_rate": 4.20863309352518e-06,
"loss": 0.096,
"step": 483
},
{
"epoch": 3.4583706356311548,
"grad_norm": 4.597263336181641,
"learning_rate": 4.196642685851319e-06,
"loss": 0.0983,
"step": 484
},
{
"epoch": 3.4655326768128916,
"grad_norm": 15.841909408569336,
"learning_rate": 4.184652278177458e-06,
"loss": 0.4426,
"step": 485
},
{
"epoch": 3.4726947179946284,
"grad_norm": 6.602854251861572,
"learning_rate": 4.172661870503597e-06,
"loss": 0.0847,
"step": 486
},
{
"epoch": 3.479856759176365,
"grad_norm": 2.0710320472717285,
"learning_rate": 4.160671462829736e-06,
"loss": 0.1633,
"step": 487
},
{
"epoch": 3.487018800358102,
"grad_norm": 5.60425329208374,
"learning_rate": 4.148681055155875e-06,
"loss": 0.1899,
"step": 488
},
{
"epoch": 3.4941808415398388,
"grad_norm": 4.323896408081055,
"learning_rate": 4.1366906474820145e-06,
"loss": 0.083,
"step": 489
},
{
"epoch": 3.5013428827215756,
"grad_norm": 3.02128529548645,
"learning_rate": 4.1247002398081535e-06,
"loss": 0.0691,
"step": 490
},
{
"epoch": 3.5085049239033124,
"grad_norm": 9.449849128723145,
"learning_rate": 4.1127098321342925e-06,
"loss": 0.1293,
"step": 491
},
{
"epoch": 3.515666965085049,
"grad_norm": 7.267805576324463,
"learning_rate": 4.100719424460432e-06,
"loss": 0.3555,
"step": 492
},
{
"epoch": 3.522829006266786,
"grad_norm": 9.87209701538086,
"learning_rate": 4.0887290167865715e-06,
"loss": 0.3113,
"step": 493
},
{
"epoch": 3.5299910474485228,
"grad_norm": 11.590337753295898,
"learning_rate": 4.0767386091127105e-06,
"loss": 0.1438,
"step": 494
},
{
"epoch": 3.5371530886302596,
"grad_norm": 7.622054576873779,
"learning_rate": 4.0647482014388495e-06,
"loss": 0.1367,
"step": 495
},
{
"epoch": 3.5443151298119964,
"grad_norm": 21.82820701599121,
"learning_rate": 4.052757793764988e-06,
"loss": 0.164,
"step": 496
},
{
"epoch": 3.551477170993733,
"grad_norm": 7.063591003417969,
"learning_rate": 4.040767386091128e-06,
"loss": 0.1362,
"step": 497
},
{
"epoch": 3.55863921217547,
"grad_norm": 13.529502868652344,
"learning_rate": 4.028776978417267e-06,
"loss": 0.2081,
"step": 498
},
{
"epoch": 3.5658012533572068,
"grad_norm": 6.6618547439575195,
"learning_rate": 4.016786570743406e-06,
"loss": 0.0999,
"step": 499
},
{
"epoch": 3.5729632945389436,
"grad_norm": 2.7395083904266357,
"learning_rate": 4.004796163069545e-06,
"loss": 0.1659,
"step": 500
},
{
"epoch": 3.5801253357206804,
"grad_norm": 6.758154392242432,
"learning_rate": 3.992805755395684e-06,
"loss": 0.1001,
"step": 501
},
{
"epoch": 3.587287376902417,
"grad_norm": 6.874615669250488,
"learning_rate": 3.980815347721823e-06,
"loss": 0.3106,
"step": 502
},
{
"epoch": 3.594449418084154,
"grad_norm": 6.3880228996276855,
"learning_rate": 3.968824940047962e-06,
"loss": 0.2233,
"step": 503
},
{
"epoch": 3.6016114592658908,
"grad_norm": 5.770565032958984,
"learning_rate": 3.956834532374101e-06,
"loss": 0.1179,
"step": 504
},
{
"epoch": 3.6087735004476276,
"grad_norm": 2.633235454559326,
"learning_rate": 3.94484412470024e-06,
"loss": 0.0658,
"step": 505
},
{
"epoch": 3.6159355416293644,
"grad_norm": 6.661257743835449,
"learning_rate": 3.932853717026379e-06,
"loss": 0.1756,
"step": 506
},
{
"epoch": 3.623097582811101,
"grad_norm": 3.2861239910125732,
"learning_rate": 3.920863309352518e-06,
"loss": 0.0856,
"step": 507
},
{
"epoch": 3.630259623992838,
"grad_norm": 8.988011360168457,
"learning_rate": 3.908872901678658e-06,
"loss": 0.2723,
"step": 508
},
{
"epoch": 3.6374216651745748,
"grad_norm": 8.593001365661621,
"learning_rate": 3.896882494004797e-06,
"loss": 0.1671,
"step": 509
},
{
"epoch": 3.6445837063563116,
"grad_norm": 4.522592067718506,
"learning_rate": 3.884892086330936e-06,
"loss": 0.1685,
"step": 510
},
{
"epoch": 3.6517457475380484,
"grad_norm": 8.32598876953125,
"learning_rate": 3.872901678657074e-06,
"loss": 0.0991,
"step": 511
},
{
"epoch": 3.658907788719785,
"grad_norm": 6.495327949523926,
"learning_rate": 3.860911270983214e-06,
"loss": 0.1693,
"step": 512
},
{
"epoch": 3.666069829901522,
"grad_norm": 10.446044921875,
"learning_rate": 3.848920863309353e-06,
"loss": 0.2238,
"step": 513
},
{
"epoch": 3.6732318710832588,
"grad_norm": 6.772848606109619,
"learning_rate": 3.836930455635492e-06,
"loss": 0.2505,
"step": 514
},
{
"epoch": 3.6803939122649956,
"grad_norm": 7.457878112792969,
"learning_rate": 3.824940047961631e-06,
"loss": 0.1456,
"step": 515
},
{
"epoch": 3.6875559534467324,
"grad_norm": 7.897626876831055,
"learning_rate": 3.81294964028777e-06,
"loss": 0.2099,
"step": 516
},
{
"epoch": 3.694717994628469,
"grad_norm": 3.0930118560791016,
"learning_rate": 3.8009592326139096e-06,
"loss": 0.0628,
"step": 517
},
{
"epoch": 3.701880035810206,
"grad_norm": 5.182916641235352,
"learning_rate": 3.7889688249400482e-06,
"loss": 0.2316,
"step": 518
},
{
"epoch": 3.7090420769919428,
"grad_norm": 7.7203874588012695,
"learning_rate": 3.7769784172661873e-06,
"loss": 0.1151,
"step": 519
},
{
"epoch": 3.7162041181736796,
"grad_norm": 3.8697283267974854,
"learning_rate": 3.7649880095923263e-06,
"loss": 0.1911,
"step": 520
},
{
"epoch": 3.7233661593554164,
"grad_norm": 6.677131175994873,
"learning_rate": 3.7529976019184653e-06,
"loss": 0.1403,
"step": 521
},
{
"epoch": 3.730528200537153,
"grad_norm": 10.677637100219727,
"learning_rate": 3.741007194244605e-06,
"loss": 0.2475,
"step": 522
},
{
"epoch": 3.73769024171889,
"grad_norm": 8.453975677490234,
"learning_rate": 3.729016786570744e-06,
"loss": 0.2107,
"step": 523
},
{
"epoch": 3.7448522829006268,
"grad_norm": 7.9489054679870605,
"learning_rate": 3.717026378896883e-06,
"loss": 0.2636,
"step": 524
},
{
"epoch": 3.7520143240823636,
"grad_norm": 5.376030445098877,
"learning_rate": 3.7050359712230215e-06,
"loss": 0.3502,
"step": 525
},
{
"epoch": 3.7591763652641004,
"grad_norm": 7.20943021774292,
"learning_rate": 3.693045563549161e-06,
"loss": 0.0799,
"step": 526
},
{
"epoch": 3.766338406445837,
"grad_norm": 9.997661590576172,
"learning_rate": 3.6810551558753e-06,
"loss": 0.206,
"step": 527
},
{
"epoch": 3.773500447627574,
"grad_norm": 13.116365432739258,
"learning_rate": 3.669064748201439e-06,
"loss": 0.2156,
"step": 528
},
{
"epoch": 3.7806624888093108,
"grad_norm": 8.536945343017578,
"learning_rate": 3.657074340527578e-06,
"loss": 0.2,
"step": 529
},
{
"epoch": 3.7878245299910476,
"grad_norm": 9.58375358581543,
"learning_rate": 3.6450839328537175e-06,
"loss": 0.2438,
"step": 530
},
{
"epoch": 3.7949865711727844,
"grad_norm": 4.94558572769165,
"learning_rate": 3.6330935251798566e-06,
"loss": 0.0869,
"step": 531
},
{
"epoch": 3.802148612354521,
"grad_norm": 13.01669979095459,
"learning_rate": 3.6211031175059956e-06,
"loss": 0.2684,
"step": 532
},
{
"epoch": 3.809310653536258,
"grad_norm": 7.679543972015381,
"learning_rate": 3.609112709832134e-06,
"loss": 0.1146,
"step": 533
},
{
"epoch": 3.8164726947179948,
"grad_norm": 12.043442726135254,
"learning_rate": 3.5971223021582737e-06,
"loss": 0.2482,
"step": 534
},
{
"epoch": 3.8236347358997316,
"grad_norm": 8.951347351074219,
"learning_rate": 3.5851318944844127e-06,
"loss": 0.2051,
"step": 535
},
{
"epoch": 3.8307967770814684,
"grad_norm": 11.88425350189209,
"learning_rate": 3.5731414868105517e-06,
"loss": 0.3122,
"step": 536
},
{
"epoch": 3.837958818263205,
"grad_norm": 5.419406414031982,
"learning_rate": 3.561151079136691e-06,
"loss": 0.1697,
"step": 537
},
{
"epoch": 3.845120859444942,
"grad_norm": 10.882148742675781,
"learning_rate": 3.5491606714628302e-06,
"loss": 0.1324,
"step": 538
},
{
"epoch": 3.8522829006266788,
"grad_norm": 5.880361557006836,
"learning_rate": 3.5371702637889693e-06,
"loss": 0.1096,
"step": 539
},
{
"epoch": 3.859444941808415,
"grad_norm": 7.374398708343506,
"learning_rate": 3.525179856115108e-06,
"loss": 0.1297,
"step": 540
},
{
"epoch": 3.8666069829901524,
"grad_norm": 5.992897987365723,
"learning_rate": 3.513189448441247e-06,
"loss": 0.2659,
"step": 541
},
{
"epoch": 3.8737690241718887,
"grad_norm": 3.3240163326263428,
"learning_rate": 3.5011990407673864e-06,
"loss": 0.0718,
"step": 542
},
{
"epoch": 3.880931065353626,
"grad_norm": 14.215323448181152,
"learning_rate": 3.4892086330935254e-06,
"loss": 0.3404,
"step": 543
},
{
"epoch": 3.8880931065353623,
"grad_norm": 8.490641593933105,
"learning_rate": 3.4772182254196645e-06,
"loss": 0.191,
"step": 544
},
{
"epoch": 3.8952551477170996,
"grad_norm": 8.60212516784668,
"learning_rate": 3.465227817745804e-06,
"loss": 0.3062,
"step": 545
},
{
"epoch": 3.902417188898836,
"grad_norm": 9.142696380615234,
"learning_rate": 3.453237410071943e-06,
"loss": 0.3727,
"step": 546
},
{
"epoch": 3.909579230080573,
"grad_norm": 8.537508010864258,
"learning_rate": 3.441247002398082e-06,
"loss": 0.213,
"step": 547
},
{
"epoch": 3.9167412712623095,
"grad_norm": 4.520055294036865,
"learning_rate": 3.4292565947242206e-06,
"loss": 0.1401,
"step": 548
},
{
"epoch": 3.9239033124440468,
"grad_norm": 10.023722648620605,
"learning_rate": 3.4172661870503596e-06,
"loss": 0.16,
"step": 549
},
{
"epoch": 3.931065353625783,
"grad_norm": 6.136500835418701,
"learning_rate": 3.405275779376499e-06,
"loss": 0.0564,
"step": 550
},
{
"epoch": 3.9382273948075204,
"grad_norm": 6.584155559539795,
"learning_rate": 3.393285371702638e-06,
"loss": 0.1132,
"step": 551
},
{
"epoch": 3.9453894359892567,
"grad_norm": 7.42992639541626,
"learning_rate": 3.381294964028777e-06,
"loss": 0.1504,
"step": 552
},
{
"epoch": 3.952551477170994,
"grad_norm": 4.1336798667907715,
"learning_rate": 3.3693045563549166e-06,
"loss": 0.2082,
"step": 553
},
{
"epoch": 3.9597135183527303,
"grad_norm": 8.42125415802002,
"learning_rate": 3.3573141486810557e-06,
"loss": 0.1697,
"step": 554
},
{
"epoch": 3.9668755595344676,
"grad_norm": 9.528365135192871,
"learning_rate": 3.3453237410071943e-06,
"loss": 0.1943,
"step": 555
},
{
"epoch": 3.974037600716204,
"grad_norm": 8.06141471862793,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.1819,
"step": 556
},
{
"epoch": 3.981199641897941,
"grad_norm": 5.582054138183594,
"learning_rate": 3.3213429256594728e-06,
"loss": 0.1164,
"step": 557
},
{
"epoch": 3.9883616830796775,
"grad_norm": 4.734123706817627,
"learning_rate": 3.309352517985612e-06,
"loss": 0.1066,
"step": 558
},
{
"epoch": 3.9955237242614148,
"grad_norm": 9.022132873535156,
"learning_rate": 3.297362110311751e-06,
"loss": 0.2624,
"step": 559
},
{
"epoch": 4.0,
"grad_norm": 3.551133155822754,
"learning_rate": 3.28537170263789e-06,
"loss": 0.1409,
"step": 560
},
{
"epoch": 4.0,
"eval_accuracy": 0.8893360160965795,
"eval_loss": 0.2713683247566223,
"eval_runtime": 12.7906,
"eval_samples_per_second": 38.857,
"eval_steps_per_second": 38.857,
"step": 560
},
{
"epoch": 4.007162041181736,
"grad_norm": 6.001535892486572,
"learning_rate": 3.2733812949640294e-06,
"loss": 0.1727,
"step": 561
},
{
"epoch": 4.014324082363474,
"grad_norm": 7.856527805328369,
"learning_rate": 3.2613908872901684e-06,
"loss": 0.2077,
"step": 562
},
{
"epoch": 4.02148612354521,
"grad_norm": 12.504968643188477,
"learning_rate": 3.249400479616307e-06,
"loss": 0.1911,
"step": 563
},
{
"epoch": 4.028648164726947,
"grad_norm": 6.551493167877197,
"learning_rate": 3.237410071942446e-06,
"loss": 0.1348,
"step": 564
},
{
"epoch": 4.035810205908684,
"grad_norm": 6.259852886199951,
"learning_rate": 3.2254196642685855e-06,
"loss": 0.0678,
"step": 565
},
{
"epoch": 4.042972247090421,
"grad_norm": 11.477996826171875,
"learning_rate": 3.2134292565947245e-06,
"loss": 0.187,
"step": 566
},
{
"epoch": 4.050134288272157,
"grad_norm": 8.440922737121582,
"learning_rate": 3.2014388489208636e-06,
"loss": 0.1693,
"step": 567
},
{
"epoch": 4.057296329453894,
"grad_norm": 2.096426486968994,
"learning_rate": 3.1894484412470026e-06,
"loss": 0.0708,
"step": 568
},
{
"epoch": 4.064458370635631,
"grad_norm": 1.9437880516052246,
"learning_rate": 3.177458033573142e-06,
"loss": 0.0456,
"step": 569
},
{
"epoch": 4.071620411817368,
"grad_norm": 6.8768792152404785,
"learning_rate": 3.1654676258992807e-06,
"loss": 0.1657,
"step": 570
},
{
"epoch": 4.078782452999104,
"grad_norm": 7.917454242706299,
"learning_rate": 3.1534772182254197e-06,
"loss": 0.2116,
"step": 571
},
{
"epoch": 4.085944494180842,
"grad_norm": 6.435678005218506,
"learning_rate": 3.1414868105515588e-06,
"loss": 0.1276,
"step": 572
},
{
"epoch": 4.093106535362578,
"grad_norm": 7.636709213256836,
"learning_rate": 3.1294964028776982e-06,
"loss": 0.2434,
"step": 573
},
{
"epoch": 4.100268576544315,
"grad_norm": 7.988414287567139,
"learning_rate": 3.1175059952038373e-06,
"loss": 0.1653,
"step": 574
},
{
"epoch": 4.107430617726052,
"grad_norm": 8.184667587280273,
"learning_rate": 3.1055155875299763e-06,
"loss": 0.1045,
"step": 575
},
{
"epoch": 4.114592658907789,
"grad_norm": 4.961910247802734,
"learning_rate": 3.0935251798561158e-06,
"loss": 0.0968,
"step": 576
},
{
"epoch": 4.121754700089525,
"grad_norm": 5.732913494110107,
"learning_rate": 3.081534772182255e-06,
"loss": 0.147,
"step": 577
},
{
"epoch": 4.128916741271262,
"grad_norm": 3.643427848815918,
"learning_rate": 3.0695443645083934e-06,
"loss": 0.2073,
"step": 578
},
{
"epoch": 4.136078782452999,
"grad_norm": 7.914638519287109,
"learning_rate": 3.0575539568345324e-06,
"loss": 0.3222,
"step": 579
},
{
"epoch": 4.143240823634736,
"grad_norm": 2.1673357486724854,
"learning_rate": 3.0455635491606715e-06,
"loss": 0.0498,
"step": 580
},
{
"epoch": 4.150402864816472,
"grad_norm": 5.7334513664245605,
"learning_rate": 3.033573141486811e-06,
"loss": 0.1663,
"step": 581
},
{
"epoch": 4.15756490599821,
"grad_norm": 4.769826412200928,
"learning_rate": 3.02158273381295e-06,
"loss": 0.202,
"step": 582
},
{
"epoch": 4.164726947179946,
"grad_norm": 5.7501540184021,
"learning_rate": 3.009592326139089e-06,
"loss": 0.2056,
"step": 583
},
{
"epoch": 4.171888988361683,
"grad_norm": 11.669347763061523,
"learning_rate": 2.9976019184652285e-06,
"loss": 0.2501,
"step": 584
},
{
"epoch": 4.17905102954342,
"grad_norm": 5.45068359375,
"learning_rate": 2.985611510791367e-06,
"loss": 0.0684,
"step": 585
},
{
"epoch": 4.186213070725157,
"grad_norm": 1.9370661973953247,
"learning_rate": 2.973621103117506e-06,
"loss": 0.0478,
"step": 586
},
{
"epoch": 4.193375111906893,
"grad_norm": 2.850411891937256,
"learning_rate": 2.961630695443645e-06,
"loss": 0.0768,
"step": 587
},
{
"epoch": 4.20053715308863,
"grad_norm": 2.566312313079834,
"learning_rate": 2.949640287769784e-06,
"loss": 0.0937,
"step": 588
},
{
"epoch": 4.207699194270367,
"grad_norm": 4.700612545013428,
"learning_rate": 2.9376498800959237e-06,
"loss": 0.0683,
"step": 589
},
{
"epoch": 4.214861235452104,
"grad_norm": 9.014172554016113,
"learning_rate": 2.9256594724220627e-06,
"loss": 0.1393,
"step": 590
},
{
"epoch": 4.22202327663384,
"grad_norm": 7.445233345031738,
"learning_rate": 2.9136690647482017e-06,
"loss": 0.1373,
"step": 591
},
{
"epoch": 4.229185317815578,
"grad_norm": 5.060126781463623,
"learning_rate": 2.9016786570743403e-06,
"loss": 0.098,
"step": 592
},
{
"epoch": 4.236347358997314,
"grad_norm": 5.344047546386719,
"learning_rate": 2.88968824940048e-06,
"loss": 0.1017,
"step": 593
},
{
"epoch": 4.243509400179051,
"grad_norm": 13.688530921936035,
"learning_rate": 2.877697841726619e-06,
"loss": 0.2884,
"step": 594
},
{
"epoch": 4.250671441360788,
"grad_norm": 9.703275680541992,
"learning_rate": 2.865707434052758e-06,
"loss": 0.1058,
"step": 595
},
{
"epoch": 4.257833482542525,
"grad_norm": 8.223041534423828,
"learning_rate": 2.8537170263788973e-06,
"loss": 0.28,
"step": 596
},
{
"epoch": 4.264995523724261,
"grad_norm": 5.328052520751953,
"learning_rate": 2.8417266187050364e-06,
"loss": 0.0894,
"step": 597
},
{
"epoch": 4.272157564905998,
"grad_norm": 4.801018238067627,
"learning_rate": 2.8297362110311754e-06,
"loss": 0.0728,
"step": 598
},
{
"epoch": 4.279319606087735,
"grad_norm": 8.624038696289062,
"learning_rate": 2.8177458033573145e-06,
"loss": 0.2589,
"step": 599
},
{
"epoch": 4.286481647269472,
"grad_norm": 1.4504754543304443,
"learning_rate": 2.805755395683453e-06,
"loss": 0.0433,
"step": 600
},
{
"epoch": 4.293643688451208,
"grad_norm": 3.4955124855041504,
"learning_rate": 2.7937649880095925e-06,
"loss": 0.08,
"step": 601
},
{
"epoch": 4.300805729632946,
"grad_norm": 8.949994087219238,
"learning_rate": 2.7817745803357316e-06,
"loss": 0.094,
"step": 602
},
{
"epoch": 4.307967770814682,
"grad_norm": 7.984180450439453,
"learning_rate": 2.7697841726618706e-06,
"loss": 0.1631,
"step": 603
},
{
"epoch": 4.315129811996419,
"grad_norm": 2.0268032550811768,
"learning_rate": 2.75779376498801e-06,
"loss": 0.0622,
"step": 604
},
{
"epoch": 4.322291853178156,
"grad_norm": 6.695650577545166,
"learning_rate": 2.745803357314149e-06,
"loss": 0.0929,
"step": 605
},
{
"epoch": 4.329453894359893,
"grad_norm": 2.7270710468292236,
"learning_rate": 2.733812949640288e-06,
"loss": 0.0688,
"step": 606
},
{
"epoch": 4.336615935541629,
"grad_norm": 9.790903091430664,
"learning_rate": 2.7218225419664268e-06,
"loss": 0.1525,
"step": 607
},
{
"epoch": 4.343777976723366,
"grad_norm": 6.341964244842529,
"learning_rate": 2.7098321342925658e-06,
"loss": 0.0756,
"step": 608
},
{
"epoch": 4.350940017905103,
"grad_norm": 3.1827850341796875,
"learning_rate": 2.6978417266187052e-06,
"loss": 0.0814,
"step": 609
},
{
"epoch": 4.35810205908684,
"grad_norm": 7.853371620178223,
"learning_rate": 2.6858513189448443e-06,
"loss": 0.072,
"step": 610
},
{
"epoch": 4.365264100268576,
"grad_norm": 2.4876341819763184,
"learning_rate": 2.6738609112709833e-06,
"loss": 0.0512,
"step": 611
},
{
"epoch": 4.372426141450314,
"grad_norm": 2.8959743976593018,
"learning_rate": 2.6618705035971228e-06,
"loss": 0.0398,
"step": 612
},
{
"epoch": 4.37958818263205,
"grad_norm": 10.225565910339355,
"learning_rate": 2.649880095923262e-06,
"loss": 0.2628,
"step": 613
},
{
"epoch": 4.386750223813787,
"grad_norm": 9.800642013549805,
"learning_rate": 2.637889688249401e-06,
"loss": 0.3922,
"step": 614
},
{
"epoch": 4.393912264995524,
"grad_norm": 5.1125617027282715,
"learning_rate": 2.6258992805755395e-06,
"loss": 0.0934,
"step": 615
},
{
"epoch": 4.401074306177261,
"grad_norm": 3.544774293899536,
"learning_rate": 2.6139088729016785e-06,
"loss": 0.0726,
"step": 616
},
{
"epoch": 4.408236347358997,
"grad_norm": 10.818737030029297,
"learning_rate": 2.601918465227818e-06,
"loss": 0.1664,
"step": 617
},
{
"epoch": 4.415398388540734,
"grad_norm": 3.033714771270752,
"learning_rate": 2.589928057553957e-06,
"loss": 0.1875,
"step": 618
},
{
"epoch": 4.422560429722471,
"grad_norm": 3.4782607555389404,
"learning_rate": 2.577937649880096e-06,
"loss": 0.065,
"step": 619
},
{
"epoch": 4.429722470904208,
"grad_norm": 1.43113112449646,
"learning_rate": 2.5659472422062355e-06,
"loss": 0.0302,
"step": 620
},
{
"epoch": 4.436884512085944,
"grad_norm": 6.158276557922363,
"learning_rate": 2.5539568345323745e-06,
"loss": 0.0774,
"step": 621
},
{
"epoch": 4.444046553267682,
"grad_norm": 5.511786937713623,
"learning_rate": 2.541966426858513e-06,
"loss": 0.0645,
"step": 622
},
{
"epoch": 4.451208594449418,
"grad_norm": 5.461114406585693,
"learning_rate": 2.529976019184652e-06,
"loss": 0.1123,
"step": 623
},
{
"epoch": 4.458370635631155,
"grad_norm": 5.242578983306885,
"learning_rate": 2.5179856115107916e-06,
"loss": 0.1413,
"step": 624
},
{
"epoch": 4.465532676812892,
"grad_norm": 4.951175212860107,
"learning_rate": 2.5059952038369307e-06,
"loss": 0.0973,
"step": 625
},
{
"epoch": 4.472694717994629,
"grad_norm": 7.635180950164795,
"learning_rate": 2.4940047961630697e-06,
"loss": 0.1024,
"step": 626
},
{
"epoch": 4.479856759176365,
"grad_norm": 6.080862522125244,
"learning_rate": 2.4820143884892088e-06,
"loss": 0.0925,
"step": 627
},
{
"epoch": 4.487018800358102,
"grad_norm": 12.718398094177246,
"learning_rate": 2.470023980815348e-06,
"loss": 0.1659,
"step": 628
},
{
"epoch": 4.494180841539839,
"grad_norm": 8.396491050720215,
"learning_rate": 2.458033573141487e-06,
"loss": 0.3336,
"step": 629
},
{
"epoch": 4.501342882721576,
"grad_norm": 16.718276977539062,
"learning_rate": 2.4460431654676263e-06,
"loss": 0.3048,
"step": 630
},
{
"epoch": 4.508504923903312,
"grad_norm": 7.400913238525391,
"learning_rate": 2.4340527577937653e-06,
"loss": 0.104,
"step": 631
},
{
"epoch": 4.51566696508505,
"grad_norm": 6.40206241607666,
"learning_rate": 2.4220623501199044e-06,
"loss": 0.118,
"step": 632
},
{
"epoch": 4.522829006266786,
"grad_norm": 11.661595344543457,
"learning_rate": 2.4100719424460434e-06,
"loss": 0.2508,
"step": 633
},
{
"epoch": 4.529991047448523,
"grad_norm": 15.65445613861084,
"learning_rate": 2.3980815347721824e-06,
"loss": 0.1724,
"step": 634
},
{
"epoch": 4.53715308863026,
"grad_norm": 20.107707977294922,
"learning_rate": 2.3860911270983215e-06,
"loss": 0.1529,
"step": 635
},
{
"epoch": 4.544315129811997,
"grad_norm": 1.682045340538025,
"learning_rate": 2.3741007194244605e-06,
"loss": 0.0449,
"step": 636
},
{
"epoch": 4.551477170993733,
"grad_norm": 3.9842913150787354,
"learning_rate": 2.3621103117505996e-06,
"loss": 0.183,
"step": 637
},
{
"epoch": 4.5586392121754695,
"grad_norm": 4.31380558013916,
"learning_rate": 2.350119904076739e-06,
"loss": 0.0579,
"step": 638
},
{
"epoch": 4.565801253357207,
"grad_norm": 7.769341468811035,
"learning_rate": 2.3381294964028776e-06,
"loss": 0.1413,
"step": 639
},
{
"epoch": 4.572963294538944,
"grad_norm": 9.923673629760742,
"learning_rate": 2.326139088729017e-06,
"loss": 0.3786,
"step": 640
},
{
"epoch": 4.58012533572068,
"grad_norm": 10.427968978881836,
"learning_rate": 2.314148681055156e-06,
"loss": 0.1294,
"step": 641
},
{
"epoch": 4.587287376902417,
"grad_norm": 3.505420207977295,
"learning_rate": 2.302158273381295e-06,
"loss": 0.0478,
"step": 642
},
{
"epoch": 4.594449418084154,
"grad_norm": 4.628006458282471,
"learning_rate": 2.290167865707434e-06,
"loss": 0.1185,
"step": 643
},
{
"epoch": 4.601611459265891,
"grad_norm": 6.239457130432129,
"learning_rate": 2.2781774580335732e-06,
"loss": 0.1386,
"step": 644
},
{
"epoch": 4.608773500447628,
"grad_norm": 1.7492996454238892,
"learning_rate": 2.2661870503597123e-06,
"loss": 0.0346,
"step": 645
},
{
"epoch": 4.615935541629364,
"grad_norm": 5.875699520111084,
"learning_rate": 2.2541966426858513e-06,
"loss": 0.1871,
"step": 646
},
{
"epoch": 4.623097582811101,
"grad_norm": 6.977318286895752,
"learning_rate": 2.2422062350119903e-06,
"loss": 0.2546,
"step": 647
},
{
"epoch": 4.630259623992838,
"grad_norm": 12.939038276672363,
"learning_rate": 2.23021582733813e-06,
"loss": 0.121,
"step": 648
},
{
"epoch": 4.637421665174575,
"grad_norm": 8.233115196228027,
"learning_rate": 2.218225419664269e-06,
"loss": 0.1027,
"step": 649
},
{
"epoch": 4.644583706356311,
"grad_norm": 9.395356178283691,
"learning_rate": 2.206235011990408e-06,
"loss": 0.1401,
"step": 650
},
{
"epoch": 4.651745747538048,
"grad_norm": 2.7126667499542236,
"learning_rate": 2.194244604316547e-06,
"loss": 0.0724,
"step": 651
},
{
"epoch": 4.658907788719786,
"grad_norm": 8.101604461669922,
"learning_rate": 2.182254196642686e-06,
"loss": 0.2692,
"step": 652
},
{
"epoch": 4.666069829901522,
"grad_norm": 4.733114719390869,
"learning_rate": 2.1702637889688254e-06,
"loss": 0.1407,
"step": 653
},
{
"epoch": 4.673231871083258,
"grad_norm": 9.878451347351074,
"learning_rate": 2.158273381294964e-06,
"loss": 0.335,
"step": 654
},
{
"epoch": 4.680393912264996,
"grad_norm": 7.976416110992432,
"learning_rate": 2.146282973621103e-06,
"loss": 0.0987,
"step": 655
},
{
"epoch": 4.687555953446733,
"grad_norm": 6.621464252471924,
"learning_rate": 2.1342925659472425e-06,
"loss": 0.1108,
"step": 656
},
{
"epoch": 4.694717994628469,
"grad_norm": 2.5924878120422363,
"learning_rate": 2.1223021582733816e-06,
"loss": 0.0372,
"step": 657
},
{
"epoch": 4.7018800358102055,
"grad_norm": 2.272629737854004,
"learning_rate": 2.1103117505995206e-06,
"loss": 0.0476,
"step": 658
},
{
"epoch": 4.709042076991943,
"grad_norm": 5.590856552124023,
"learning_rate": 2.0983213429256596e-06,
"loss": 0.3509,
"step": 659
},
{
"epoch": 4.716204118173679,
"grad_norm": 5.7683210372924805,
"learning_rate": 2.0863309352517987e-06,
"loss": 0.0917,
"step": 660
},
{
"epoch": 4.723366159355416,
"grad_norm": 7.388215065002441,
"learning_rate": 2.0743405275779377e-06,
"loss": 0.1412,
"step": 661
},
{
"epoch": 4.730528200537153,
"grad_norm": 6.264549732208252,
"learning_rate": 2.0623501199040767e-06,
"loss": 0.1877,
"step": 662
},
{
"epoch": 4.73769024171889,
"grad_norm": 7.518822193145752,
"learning_rate": 2.050359712230216e-06,
"loss": 0.2453,
"step": 663
},
{
"epoch": 4.744852282900626,
"grad_norm": 2.944272041320801,
"learning_rate": 2.0383693045563552e-06,
"loss": 0.0604,
"step": 664
},
{
"epoch": 4.752014324082364,
"grad_norm": 10.040867805480957,
"learning_rate": 2.026378896882494e-06,
"loss": 0.1441,
"step": 665
},
{
"epoch": 4.7591763652641,
"grad_norm": 9.994035720825195,
"learning_rate": 2.0143884892086333e-06,
"loss": 0.2145,
"step": 666
},
{
"epoch": 4.766338406445837,
"grad_norm": 9.281600952148438,
"learning_rate": 2.0023980815347724e-06,
"loss": 0.18,
"step": 667
},
{
"epoch": 4.7735004476275735,
"grad_norm": 5.464154243469238,
"learning_rate": 1.9904076738609114e-06,
"loss": 0.0456,
"step": 668
},
{
"epoch": 4.780662488809311,
"grad_norm": 5.3792524337768555,
"learning_rate": 1.9784172661870504e-06,
"loss": 0.0947,
"step": 669
},
{
"epoch": 4.787824529991047,
"grad_norm": 8.45373821258545,
"learning_rate": 1.9664268585131895e-06,
"loss": 0.1664,
"step": 670
},
{
"epoch": 4.794986571172784,
"grad_norm": 4.617059230804443,
"learning_rate": 1.954436450839329e-06,
"loss": 0.0759,
"step": 671
},
{
"epoch": 4.802148612354521,
"grad_norm": 10.997187614440918,
"learning_rate": 1.942446043165468e-06,
"loss": 0.1521,
"step": 672
},
{
"epoch": 4.809310653536258,
"grad_norm": 5.464894771575928,
"learning_rate": 1.930455635491607e-06,
"loss": 0.0674,
"step": 673
},
{
"epoch": 4.816472694717994,
"grad_norm": 5.334858417510986,
"learning_rate": 1.918465227817746e-06,
"loss": 0.1741,
"step": 674
},
{
"epoch": 4.823634735899732,
"grad_norm": 11.286782264709473,
"learning_rate": 1.906474820143885e-06,
"loss": 0.2467,
"step": 675
},
{
"epoch": 4.830796777081468,
"grad_norm": 7.515106678009033,
"learning_rate": 1.8944844124700241e-06,
"loss": 0.0812,
"step": 676
},
{
"epoch": 4.837958818263205,
"grad_norm": 2.932992935180664,
"learning_rate": 1.8824940047961631e-06,
"loss": 0.0713,
"step": 677
},
{
"epoch": 4.8451208594449415,
"grad_norm": 1.9469927549362183,
"learning_rate": 1.8705035971223024e-06,
"loss": 0.0513,
"step": 678
},
{
"epoch": 4.852282900626679,
"grad_norm": 10.902048110961914,
"learning_rate": 1.8585131894484414e-06,
"loss": 0.2673,
"step": 679
},
{
"epoch": 4.859444941808415,
"grad_norm": 10.670211791992188,
"learning_rate": 1.8465227817745805e-06,
"loss": 0.0831,
"step": 680
},
{
"epoch": 4.866606982990152,
"grad_norm": 15.201674461364746,
"learning_rate": 1.8345323741007195e-06,
"loss": 0.0817,
"step": 681
},
{
"epoch": 4.873769024171889,
"grad_norm": 7.9115681648254395,
"learning_rate": 1.8225419664268588e-06,
"loss": 0.2511,
"step": 682
},
{
"epoch": 4.880931065353626,
"grad_norm": 8.73487663269043,
"learning_rate": 1.8105515587529978e-06,
"loss": 0.1621,
"step": 683
},
{
"epoch": 4.888093106535362,
"grad_norm": 9.163458824157715,
"learning_rate": 1.7985611510791368e-06,
"loss": 0.2148,
"step": 684
},
{
"epoch": 4.8952551477171,
"grad_norm": 3.965367317199707,
"learning_rate": 1.7865707434052759e-06,
"loss": 0.0611,
"step": 685
},
{
"epoch": 4.902417188898836,
"grad_norm": 3.5464630126953125,
"learning_rate": 1.7745803357314151e-06,
"loss": 0.05,
"step": 686
},
{
"epoch": 4.909579230080573,
"grad_norm": 6.229908466339111,
"learning_rate": 1.762589928057554e-06,
"loss": 0.335,
"step": 687
},
{
"epoch": 4.9167412712623095,
"grad_norm": 7.500967025756836,
"learning_rate": 1.7505995203836932e-06,
"loss": 0.1911,
"step": 688
},
{
"epoch": 4.923903312444047,
"grad_norm": 4.766892433166504,
"learning_rate": 1.7386091127098322e-06,
"loss": 0.0773,
"step": 689
},
{
"epoch": 4.931065353625783,
"grad_norm": 7.059886932373047,
"learning_rate": 1.7266187050359715e-06,
"loss": 0.1107,
"step": 690
},
{
"epoch": 4.93822739480752,
"grad_norm": 3.412309408187866,
"learning_rate": 1.7146282973621103e-06,
"loss": 0.0494,
"step": 691
},
{
"epoch": 4.945389435989257,
"grad_norm": 12.20711898803711,
"learning_rate": 1.7026378896882496e-06,
"loss": 0.2606,
"step": 692
},
{
"epoch": 4.952551477170994,
"grad_norm": 12.949819564819336,
"learning_rate": 1.6906474820143886e-06,
"loss": 0.1445,
"step": 693
},
{
"epoch": 4.95971351835273,
"grad_norm": 5.260175704956055,
"learning_rate": 1.6786570743405278e-06,
"loss": 0.0488,
"step": 694
},
{
"epoch": 4.966875559534468,
"grad_norm": 5.705483436584473,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.1724,
"step": 695
},
{
"epoch": 4.974037600716204,
"grad_norm": 9.60573959350586,
"learning_rate": 1.654676258992806e-06,
"loss": 0.2245,
"step": 696
},
{
"epoch": 4.981199641897941,
"grad_norm": 3.508620500564575,
"learning_rate": 1.642685851318945e-06,
"loss": 0.0552,
"step": 697
},
{
"epoch": 4.9883616830796775,
"grad_norm": 3.118699789047241,
"learning_rate": 1.6306954436450842e-06,
"loss": 0.0483,
"step": 698
},
{
"epoch": 4.995523724261415,
"grad_norm": 5.063241958618164,
"learning_rate": 1.618705035971223e-06,
"loss": 0.0576,
"step": 699
},
{
"epoch": 5.0,
"grad_norm": 1.9771332740783691,
"learning_rate": 1.6067146282973623e-06,
"loss": 0.0227,
"step": 700
},
{
"epoch": 5.0,
"eval_accuracy": 0.8893360160965795,
"eval_loss": 0.2797456681728363,
"eval_runtime": 12.7936,
"eval_samples_per_second": 38.847,
"eval_steps_per_second": 38.847,
"step": 700
},
{
"epoch": 5.007162041181736,
"grad_norm": 9.170907974243164,
"learning_rate": 1.5947242206235013e-06,
"loss": 0.0874,
"step": 701
},
{
"epoch": 5.014324082363474,
"grad_norm": 13.936522483825684,
"learning_rate": 1.5827338129496403e-06,
"loss": 0.1382,
"step": 702
},
{
"epoch": 5.02148612354521,
"grad_norm": 5.094358921051025,
"learning_rate": 1.5707434052757794e-06,
"loss": 0.1265,
"step": 703
},
{
"epoch": 5.028648164726947,
"grad_norm": 2.9558494091033936,
"learning_rate": 1.5587529976019186e-06,
"loss": 0.1431,
"step": 704
},
{
"epoch": 5.035810205908684,
"grad_norm": 3.9764819145202637,
"learning_rate": 1.5467625899280579e-06,
"loss": 0.057,
"step": 705
},
{
"epoch": 5.042972247090421,
"grad_norm": 10.071995735168457,
"learning_rate": 1.5347721822541967e-06,
"loss": 0.0821,
"step": 706
},
{
"epoch": 5.050134288272157,
"grad_norm": 7.149175643920898,
"learning_rate": 1.5227817745803357e-06,
"loss": 0.0672,
"step": 707
},
{
"epoch": 5.057296329453894,
"grad_norm": 7.112942218780518,
"learning_rate": 1.510791366906475e-06,
"loss": 0.1492,
"step": 708
},
{
"epoch": 5.064458370635631,
"grad_norm": 8.392587661743164,
"learning_rate": 1.4988009592326142e-06,
"loss": 0.2037,
"step": 709
},
{
"epoch": 5.071620411817368,
"grad_norm": 10.566859245300293,
"learning_rate": 1.486810551558753e-06,
"loss": 0.1623,
"step": 710
},
{
"epoch": 5.078782452999104,
"grad_norm": 7.337512016296387,
"learning_rate": 1.474820143884892e-06,
"loss": 0.2304,
"step": 711
},
{
"epoch": 5.085944494180842,
"grad_norm": 5.944802284240723,
"learning_rate": 1.4628297362110313e-06,
"loss": 0.3046,
"step": 712
},
{
"epoch": 5.093106535362578,
"grad_norm": 7.068042755126953,
"learning_rate": 1.4508393285371702e-06,
"loss": 0.1634,
"step": 713
},
{
"epoch": 5.100268576544315,
"grad_norm": 6.807985305786133,
"learning_rate": 1.4388489208633094e-06,
"loss": 0.0956,
"step": 714
},
{
"epoch": 5.107430617726052,
"grad_norm": 1.3011399507522583,
"learning_rate": 1.4268585131894487e-06,
"loss": 0.0329,
"step": 715
},
{
"epoch": 5.114592658907789,
"grad_norm": 4.0383782386779785,
"learning_rate": 1.4148681055155877e-06,
"loss": 0.1312,
"step": 716
},
{
"epoch": 5.121754700089525,
"grad_norm": 9.570232391357422,
"learning_rate": 1.4028776978417265e-06,
"loss": 0.1369,
"step": 717
},
{
"epoch": 5.128916741271262,
"grad_norm": 9.754796981811523,
"learning_rate": 1.3908872901678658e-06,
"loss": 0.2643,
"step": 718
},
{
"epoch": 5.136078782452999,
"grad_norm": 2.8878424167633057,
"learning_rate": 1.378896882494005e-06,
"loss": 0.0438,
"step": 719
},
{
"epoch": 5.143240823634736,
"grad_norm": 6.284052848815918,
"learning_rate": 1.366906474820144e-06,
"loss": 0.1165,
"step": 720
},
{
"epoch": 5.150402864816472,
"grad_norm": 7.874203681945801,
"learning_rate": 1.3549160671462829e-06,
"loss": 0.1494,
"step": 721
},
{
"epoch": 5.15756490599821,
"grad_norm": 5.88767147064209,
"learning_rate": 1.3429256594724221e-06,
"loss": 0.0979,
"step": 722
},
{
"epoch": 5.164726947179946,
"grad_norm": 9.50105094909668,
"learning_rate": 1.3309352517985614e-06,
"loss": 0.3674,
"step": 723
},
{
"epoch": 5.171888988361683,
"grad_norm": 6.08318567276001,
"learning_rate": 1.3189448441247004e-06,
"loss": 0.1707,
"step": 724
},
{
"epoch": 5.17905102954342,
"grad_norm": 3.7768077850341797,
"learning_rate": 1.3069544364508393e-06,
"loss": 0.1428,
"step": 725
},
{
"epoch": 5.186213070725157,
"grad_norm": 5.646254062652588,
"learning_rate": 1.2949640287769785e-06,
"loss": 0.0879,
"step": 726
},
{
"epoch": 5.193375111906893,
"grad_norm": 13.744306564331055,
"learning_rate": 1.2829736211031178e-06,
"loss": 0.1985,
"step": 727
},
{
"epoch": 5.20053715308863,
"grad_norm": 3.692706346511841,
"learning_rate": 1.2709832134292566e-06,
"loss": 0.0749,
"step": 728
},
{
"epoch": 5.207699194270367,
"grad_norm": 7.3681488037109375,
"learning_rate": 1.2589928057553958e-06,
"loss": 0.1026,
"step": 729
},
{
"epoch": 5.214861235452104,
"grad_norm": 5.465104579925537,
"learning_rate": 1.2470023980815349e-06,
"loss": 0.085,
"step": 730
},
{
"epoch": 5.22202327663384,
"grad_norm": 7.1183037757873535,
"learning_rate": 1.235011990407674e-06,
"loss": 0.0392,
"step": 731
},
{
"epoch": 5.229185317815578,
"grad_norm": 3.9316046237945557,
"learning_rate": 1.2230215827338131e-06,
"loss": 0.0543,
"step": 732
},
{
"epoch": 5.236347358997314,
"grad_norm": 2.7083520889282227,
"learning_rate": 1.2110311750599522e-06,
"loss": 0.0454,
"step": 733
},
{
"epoch": 5.243509400179051,
"grad_norm": 8.070670127868652,
"learning_rate": 1.1990407673860912e-06,
"loss": 0.2432,
"step": 734
},
{
"epoch": 5.250671441360788,
"grad_norm": 5.080997943878174,
"learning_rate": 1.1870503597122303e-06,
"loss": 0.1378,
"step": 735
},
{
"epoch": 5.257833482542525,
"grad_norm": 5.301459312438965,
"learning_rate": 1.1750599520383695e-06,
"loss": 0.0696,
"step": 736
},
{
"epoch": 5.264995523724261,
"grad_norm": 10.343586921691895,
"learning_rate": 1.1630695443645085e-06,
"loss": 0.2511,
"step": 737
},
{
"epoch": 5.272157564905998,
"grad_norm": 1.9044795036315918,
"learning_rate": 1.1510791366906476e-06,
"loss": 0.0365,
"step": 738
},
{
"epoch": 5.279319606087735,
"grad_norm": 6.329905033111572,
"learning_rate": 1.1390887290167866e-06,
"loss": 0.118,
"step": 739
},
{
"epoch": 5.286481647269472,
"grad_norm": 12.016860961914062,
"learning_rate": 1.1270983213429257e-06,
"loss": 0.1504,
"step": 740
},
{
"epoch": 5.293643688451208,
"grad_norm": 8.04209041595459,
"learning_rate": 1.115107913669065e-06,
"loss": 0.1188,
"step": 741
},
{
"epoch": 5.300805729632946,
"grad_norm": 5.305018424987793,
"learning_rate": 1.103117505995204e-06,
"loss": 0.0588,
"step": 742
},
{
"epoch": 5.307967770814682,
"grad_norm": 5.174502372741699,
"learning_rate": 1.091127098321343e-06,
"loss": 0.1221,
"step": 743
},
{
"epoch": 5.315129811996419,
"grad_norm": 5.626262664794922,
"learning_rate": 1.079136690647482e-06,
"loss": 0.0911,
"step": 744
},
{
"epoch": 5.322291853178156,
"grad_norm": 3.473499059677124,
"learning_rate": 1.0671462829736213e-06,
"loss": 0.0435,
"step": 745
},
{
"epoch": 5.329453894359893,
"grad_norm": 7.590930461883545,
"learning_rate": 1.0551558752997603e-06,
"loss": 0.1276,
"step": 746
},
{
"epoch": 5.336615935541629,
"grad_norm": 6.063522815704346,
"learning_rate": 1.0431654676258993e-06,
"loss": 0.2146,
"step": 747
},
{
"epoch": 5.343777976723366,
"grad_norm": 6.192073345184326,
"learning_rate": 1.0311750599520384e-06,
"loss": 0.2152,
"step": 748
},
{
"epoch": 5.350940017905103,
"grad_norm": 5.46181058883667,
"learning_rate": 1.0191846522781776e-06,
"loss": 0.2623,
"step": 749
},
{
"epoch": 5.35810205908684,
"grad_norm": 5.2007012367248535,
"learning_rate": 1.0071942446043167e-06,
"loss": 0.0852,
"step": 750
},
{
"epoch": 5.365264100268576,
"grad_norm": 2.142305850982666,
"learning_rate": 9.952038369304557e-07,
"loss": 0.0339,
"step": 751
},
{
"epoch": 5.372426141450314,
"grad_norm": 5.1917901039123535,
"learning_rate": 9.832134292565947e-07,
"loss": 0.2234,
"step": 752
},
{
"epoch": 5.37958818263205,
"grad_norm": 1.764793038368225,
"learning_rate": 9.71223021582734e-07,
"loss": 0.0293,
"step": 753
},
{
"epoch": 5.386750223813787,
"grad_norm": 2.4261093139648438,
"learning_rate": 9.59232613908873e-07,
"loss": 0.0506,
"step": 754
},
{
"epoch": 5.393912264995524,
"grad_norm": 7.679261207580566,
"learning_rate": 9.472422062350121e-07,
"loss": 0.2158,
"step": 755
},
{
"epoch": 5.401074306177261,
"grad_norm": 5.182496547698975,
"learning_rate": 9.352517985611512e-07,
"loss": 0.1834,
"step": 756
},
{
"epoch": 5.408236347358997,
"grad_norm": 2.914945125579834,
"learning_rate": 9.232613908872902e-07,
"loss": 0.0413,
"step": 757
},
{
"epoch": 5.415398388540734,
"grad_norm": 7.0388360023498535,
"learning_rate": 9.112709832134294e-07,
"loss": 0.066,
"step": 758
},
{
"epoch": 5.422560429722471,
"grad_norm": 2.673398494720459,
"learning_rate": 8.992805755395684e-07,
"loss": 0.0511,
"step": 759
},
{
"epoch": 5.429722470904208,
"grad_norm": 5.319000244140625,
"learning_rate": 8.872901678657076e-07,
"loss": 0.1015,
"step": 760
},
{
"epoch": 5.436884512085944,
"grad_norm": 2.27585506439209,
"learning_rate": 8.752997601918466e-07,
"loss": 0.0394,
"step": 761
},
{
"epoch": 5.444046553267682,
"grad_norm": 2.567699909210205,
"learning_rate": 8.633093525179857e-07,
"loss": 0.0778,
"step": 762
},
{
"epoch": 5.451208594449418,
"grad_norm": 11.017365455627441,
"learning_rate": 8.513189448441248e-07,
"loss": 0.2491,
"step": 763
},
{
"epoch": 5.458370635631155,
"grad_norm": 3.768150568008423,
"learning_rate": 8.393285371702639e-07,
"loss": 0.0674,
"step": 764
},
{
"epoch": 5.465532676812892,
"grad_norm": 3.6968348026275635,
"learning_rate": 8.27338129496403e-07,
"loss": 0.0515,
"step": 765
},
{
"epoch": 5.472694717994629,
"grad_norm": 3.3715782165527344,
"learning_rate": 8.153477218225421e-07,
"loss": 0.1695,
"step": 766
},
{
"epoch": 5.479856759176365,
"grad_norm": 4.479177951812744,
"learning_rate": 8.033573141486811e-07,
"loss": 0.0891,
"step": 767
},
{
"epoch": 5.487018800358102,
"grad_norm": 3.904632329940796,
"learning_rate": 7.913669064748202e-07,
"loss": 0.0651,
"step": 768
},
{
"epoch": 5.494180841539839,
"grad_norm": 10.056451797485352,
"learning_rate": 7.793764988009593e-07,
"loss": 0.0672,
"step": 769
},
{
"epoch": 5.501342882721576,
"grad_norm": 7.317890644073486,
"learning_rate": 7.673860911270984e-07,
"loss": 0.0775,
"step": 770
},
{
"epoch": 5.508504923903312,
"grad_norm": 4.815303802490234,
"learning_rate": 7.553956834532375e-07,
"loss": 0.1864,
"step": 771
},
{
"epoch": 5.51566696508505,
"grad_norm": 4.484886169433594,
"learning_rate": 7.434052757793765e-07,
"loss": 0.0818,
"step": 772
},
{
"epoch": 5.522829006266786,
"grad_norm": 8.391207695007324,
"learning_rate": 7.314148681055157e-07,
"loss": 0.109,
"step": 773
},
{
"epoch": 5.529991047448523,
"grad_norm": 9.510869026184082,
"learning_rate": 7.194244604316547e-07,
"loss": 0.1167,
"step": 774
},
{
"epoch": 5.53715308863026,
"grad_norm": 1.1796542406082153,
"learning_rate": 7.074340527577939e-07,
"loss": 0.0309,
"step": 775
},
{
"epoch": 5.544315129811997,
"grad_norm": 24.52149772644043,
"learning_rate": 6.954436450839329e-07,
"loss": 0.1257,
"step": 776
},
{
"epoch": 5.551477170993733,
"grad_norm": 10.662769317626953,
"learning_rate": 6.83453237410072e-07,
"loss": 0.3804,
"step": 777
},
{
"epoch": 5.5586392121754695,
"grad_norm": 4.737334728240967,
"learning_rate": 6.714628297362111e-07,
"loss": 0.0736,
"step": 778
},
{
"epoch": 5.565801253357207,
"grad_norm": 10.771794319152832,
"learning_rate": 6.594724220623502e-07,
"loss": 0.1611,
"step": 779
},
{
"epoch": 5.572963294538944,
"grad_norm": 5.774040222167969,
"learning_rate": 6.474820143884893e-07,
"loss": 0.0813,
"step": 780
},
{
"epoch": 5.58012533572068,
"grad_norm": 4.687991142272949,
"learning_rate": 6.354916067146283e-07,
"loss": 0.107,
"step": 781
},
{
"epoch": 5.587287376902417,
"grad_norm": 2.2647032737731934,
"learning_rate": 6.235011990407674e-07,
"loss": 0.0432,
"step": 782
},
{
"epoch": 5.594449418084154,
"grad_norm": 2.810767650604248,
"learning_rate": 6.115107913669066e-07,
"loss": 0.0731,
"step": 783
},
{
"epoch": 5.601611459265891,
"grad_norm": 5.394442558288574,
"learning_rate": 5.995203836930456e-07,
"loss": 0.0826,
"step": 784
},
{
"epoch": 5.608773500447628,
"grad_norm": 7.001992225646973,
"learning_rate": 5.875299760191848e-07,
"loss": 0.077,
"step": 785
},
{
"epoch": 5.615935541629364,
"grad_norm": 5.397133827209473,
"learning_rate": 5.755395683453238e-07,
"loss": 0.1079,
"step": 786
},
{
"epoch": 5.623097582811101,
"grad_norm": 3.6474523544311523,
"learning_rate": 5.635491606714628e-07,
"loss": 0.1313,
"step": 787
},
{
"epoch": 5.630259623992838,
"grad_norm": 10.120123863220215,
"learning_rate": 5.51558752997602e-07,
"loss": 0.2847,
"step": 788
},
{
"epoch": 5.637421665174575,
"grad_norm": 7.304914951324463,
"learning_rate": 5.39568345323741e-07,
"loss": 0.0728,
"step": 789
},
{
"epoch": 5.644583706356311,
"grad_norm": 6.6555399894714355,
"learning_rate": 5.275779376498801e-07,
"loss": 0.064,
"step": 790
},
{
"epoch": 5.651745747538048,
"grad_norm": 2.417214870452881,
"learning_rate": 5.155875299760192e-07,
"loss": 0.0618,
"step": 791
},
{
"epoch": 5.658907788719786,
"grad_norm": 4.194424152374268,
"learning_rate": 5.035971223021583e-07,
"loss": 0.0489,
"step": 792
},
{
"epoch": 5.666069829901522,
"grad_norm": 12.794995307922363,
"learning_rate": 4.916067146282974e-07,
"loss": 0.1404,
"step": 793
},
{
"epoch": 5.673231871083258,
"grad_norm": 4.588229656219482,
"learning_rate": 4.796163069544365e-07,
"loss": 0.1745,
"step": 794
},
{
"epoch": 5.680393912264996,
"grad_norm": 12.849517822265625,
"learning_rate": 4.676258992805756e-07,
"loss": 0.1715,
"step": 795
},
{
"epoch": 5.687555953446733,
"grad_norm": 7.121222496032715,
"learning_rate": 4.556354916067147e-07,
"loss": 0.0975,
"step": 796
},
{
"epoch": 5.694717994628469,
"grad_norm": 4.74954080581665,
"learning_rate": 4.436450839328538e-07,
"loss": 0.0625,
"step": 797
},
{
"epoch": 5.7018800358102055,
"grad_norm": 11.487862586975098,
"learning_rate": 4.3165467625899287e-07,
"loss": 0.2519,
"step": 798
},
{
"epoch": 5.709042076991943,
"grad_norm": 14.895743370056152,
"learning_rate": 4.1966426858513196e-07,
"loss": 0.1979,
"step": 799
},
{
"epoch": 5.716204118173679,
"grad_norm": 5.849491119384766,
"learning_rate": 4.0767386091127105e-07,
"loss": 0.0798,
"step": 800
},
{
"epoch": 5.723366159355416,
"grad_norm": 9.15971851348877,
"learning_rate": 3.956834532374101e-07,
"loss": 0.1698,
"step": 801
},
{
"epoch": 5.730528200537153,
"grad_norm": 14.715744972229004,
"learning_rate": 3.836930455635492e-07,
"loss": 0.1073,
"step": 802
},
{
"epoch": 5.73769024171889,
"grad_norm": 3.501526355743408,
"learning_rate": 3.7170263788968827e-07,
"loss": 0.0403,
"step": 803
},
{
"epoch": 5.744852282900626,
"grad_norm": 4.736666202545166,
"learning_rate": 3.5971223021582736e-07,
"loss": 0.0978,
"step": 804
},
{
"epoch": 5.752014324082364,
"grad_norm": 6.315278053283691,
"learning_rate": 3.4772182254196645e-07,
"loss": 0.0792,
"step": 805
},
{
"epoch": 5.7591763652641,
"grad_norm": 7.229060649871826,
"learning_rate": 3.3573141486810554e-07,
"loss": 0.1235,
"step": 806
},
{
"epoch": 5.766338406445837,
"grad_norm": 5.501567840576172,
"learning_rate": 3.237410071942446e-07,
"loss": 0.0607,
"step": 807
},
{
"epoch": 5.7735004476275735,
"grad_norm": 3.9031436443328857,
"learning_rate": 3.117505995203837e-07,
"loss": 0.0455,
"step": 808
},
{
"epoch": 5.780662488809311,
"grad_norm": 1.905010461807251,
"learning_rate": 2.997601918465228e-07,
"loss": 0.0362,
"step": 809
},
{
"epoch": 5.787824529991047,
"grad_norm": 20.247007369995117,
"learning_rate": 2.877697841726619e-07,
"loss": 0.1452,
"step": 810
},
{
"epoch": 5.794986571172784,
"grad_norm": 9.428034782409668,
"learning_rate": 2.75779376498801e-07,
"loss": 0.1267,
"step": 811
},
{
"epoch": 5.802148612354521,
"grad_norm": 2.8392276763916016,
"learning_rate": 2.637889688249401e-07,
"loss": 0.2061,
"step": 812
},
{
"epoch": 5.809310653536258,
"grad_norm": 4.5366926193237305,
"learning_rate": 2.5179856115107916e-07,
"loss": 0.0542,
"step": 813
},
{
"epoch": 5.816472694717994,
"grad_norm": 5.767791271209717,
"learning_rate": 2.3980815347721825e-07,
"loss": 0.0548,
"step": 814
},
{
"epoch": 5.823634735899732,
"grad_norm": 3.0440096855163574,
"learning_rate": 2.2781774580335734e-07,
"loss": 0.0408,
"step": 815
},
{
"epoch": 5.830796777081468,
"grad_norm": 8.914772033691406,
"learning_rate": 2.1582733812949643e-07,
"loss": 0.0981,
"step": 816
},
{
"epoch": 5.837958818263205,
"grad_norm": 6.404951572418213,
"learning_rate": 2.0383693045563552e-07,
"loss": 0.2039,
"step": 817
},
{
"epoch": 5.8451208594449415,
"grad_norm": 7.629164695739746,
"learning_rate": 1.918465227817746e-07,
"loss": 0.2741,
"step": 818
},
{
"epoch": 5.852282900626679,
"grad_norm": 8.382116317749023,
"learning_rate": 1.7985611510791368e-07,
"loss": 0.2178,
"step": 819
},
{
"epoch": 5.859444941808415,
"grad_norm": 12.157092094421387,
"learning_rate": 1.6786570743405277e-07,
"loss": 0.2596,
"step": 820
},
{
"epoch": 5.866606982990152,
"grad_norm": 7.726680755615234,
"learning_rate": 1.5587529976019186e-07,
"loss": 0.1078,
"step": 821
},
{
"epoch": 5.873769024171889,
"grad_norm": 1.8861736059188843,
"learning_rate": 1.4388489208633095e-07,
"loss": 0.054,
"step": 822
},
{
"epoch": 5.880931065353626,
"grad_norm": 8.069757461547852,
"learning_rate": 1.3189448441247004e-07,
"loss": 0.1343,
"step": 823
},
{
"epoch": 5.888093106535362,
"grad_norm": 1.1135824918746948,
"learning_rate": 1.1990407673860913e-07,
"loss": 0.0295,
"step": 824
},
{
"epoch": 5.8952551477171,
"grad_norm": 5.592498302459717,
"learning_rate": 1.0791366906474822e-07,
"loss": 0.0817,
"step": 825
},
{
"epoch": 5.902417188898836,
"grad_norm": 11.28853702545166,
"learning_rate": 9.59232613908873e-08,
"loss": 0.1309,
"step": 826
},
{
"epoch": 5.909579230080573,
"grad_norm": 8.105303764343262,
"learning_rate": 8.393285371702638e-08,
"loss": 0.1098,
"step": 827
},
{
"epoch": 5.9167412712623095,
"grad_norm": 2.810290575027466,
"learning_rate": 7.194244604316547e-08,
"loss": 0.0679,
"step": 828
},
{
"epoch": 5.923903312444047,
"grad_norm": 4.570773124694824,
"learning_rate": 5.995203836930456e-08,
"loss": 0.0602,
"step": 829
},
{
"epoch": 5.931065353625783,
"grad_norm": 4.349383354187012,
"learning_rate": 4.796163069544365e-08,
"loss": 0.0524,
"step": 830
},
{
"epoch": 5.93822739480752,
"grad_norm": 3.1582112312316895,
"learning_rate": 3.597122302158274e-08,
"loss": 0.0263,
"step": 831
},
{
"epoch": 5.945389435989257,
"grad_norm": 7.5621867179870605,
"learning_rate": 2.3980815347721823e-08,
"loss": 0.1276,
"step": 832
},
{
"epoch": 5.952551477170994,
"grad_norm": 6.433109760284424,
"learning_rate": 1.1990407673860912e-08,
"loss": 0.1642,
"step": 833
},
{
"epoch": 5.95971351835273,
"grad_norm": 13.487061500549316,
"learning_rate": 0.0,
"loss": 0.3048,
"step": 834
},
{
"epoch": 5.95971351835273,
"eval_accuracy": 0.8893360160965795,
"eval_loss": 0.280468225479126,
"eval_runtime": 12.771,
"eval_samples_per_second": 38.916,
"eval_steps_per_second": 38.916,
"step": 834
}
],
"logging_steps": 1,
"max_steps": 834,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7006309733928960.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}