{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.95971351835273,
  "eval_steps": 500,
  "global_step": 834,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007162041181736795,
      "grad_norm": 1.5651538372039795,
      "learning_rate": 9.98800959232614e-06,
      "loss": 1.1028,
      "step": 1
    },
    {
      "epoch": 0.01432408236347359,
      "grad_norm": 2.358640193939209,
      "learning_rate": 9.976019184652279e-06,
      "loss": 1.1257,
      "step": 2
    },
    {
      "epoch": 0.021486123545210387,
      "grad_norm": 1.7103272676467896,
      "learning_rate": 9.964028776978418e-06,
      "loss": 1.1098,
      "step": 3
    },
    {
      "epoch": 0.02864816472694718,
      "grad_norm": 1.783992886543274,
      "learning_rate": 9.952038369304557e-06,
      "loss": 1.0858,
      "step": 4
    },
    {
      "epoch": 0.03581020590868397,
      "grad_norm": 1.0596697330474854,
      "learning_rate": 9.940047961630696e-06,
      "loss": 1.0924,
      "step": 5
    },
    {
      "epoch": 0.04297224709042077,
      "grad_norm": 1.694056510925293,
      "learning_rate": 9.928057553956835e-06,
      "loss": 1.0912,
      "step": 6
    },
    {
      "epoch": 0.050134288272157566,
      "grad_norm": 1.4265025854110718,
      "learning_rate": 9.916067146282976e-06,
      "loss": 1.0833,
      "step": 7
    },
    {
      "epoch": 0.05729632945389436,
      "grad_norm": 2.3417539596557617,
      "learning_rate": 9.904076738609113e-06,
      "loss": 1.1031,
      "step": 8
    },
    {
      "epoch": 0.06445837063563116,
      "grad_norm": 1.6366913318634033,
      "learning_rate": 9.892086330935252e-06,
      "loss": 1.1186,
      "step": 9
    },
    {
      "epoch": 0.07162041181736795,
      "grad_norm": 2.713798761367798,
      "learning_rate": 9.880095923261391e-06,
      "loss": 1.119,
      "step": 10
    },
    {
      "epoch": 0.07878245299910475,
      "grad_norm": 1.0447508096694946,
      "learning_rate": 9.86810551558753e-06,
      "loss": 1.0836,
      "step": 11
    },
    {
      "epoch": 0.08594449418084155,
      "grad_norm": 1.4873104095458984,
      "learning_rate": 9.85611510791367e-06,
      "loss": 1.1093,
      "step": 12
    },
    {
      "epoch": 0.09310653536257833,
      "grad_norm": 2.0511322021484375,
      "learning_rate": 9.844124700239808e-06,
      "loss": 1.1048,
      "step": 13
    },
    {
      "epoch": 0.10026857654431513,
      "grad_norm": 1.8400065898895264,
      "learning_rate": 9.832134292565947e-06,
      "loss": 1.1084,
      "step": 14
    },
    {
      "epoch": 0.10743061772605192,
      "grad_norm": 1.743239402770996,
      "learning_rate": 9.820143884892086e-06,
      "loss": 1.0893,
      "step": 15
    },
    {
      "epoch": 0.11459265890778872,
      "grad_norm": 0.8387613296508789,
      "learning_rate": 9.808153477218227e-06,
      "loss": 1.101,
      "step": 16
    },
    {
      "epoch": 0.12175470008952552,
      "grad_norm": 2.1584479808807373,
      "learning_rate": 9.796163069544366e-06,
      "loss": 1.1003,
      "step": 17
    },
    {
      "epoch": 0.12891674127126232,
      "grad_norm": 1.1021759510040283,
      "learning_rate": 9.784172661870505e-06,
      "loss": 1.0921,
      "step": 18
    },
    {
      "epoch": 0.1360787824529991,
      "grad_norm": 1.9304773807525635,
      "learning_rate": 9.772182254196644e-06,
      "loss": 1.0968,
      "step": 19
    },
    {
      "epoch": 0.1432408236347359,
      "grad_norm": 1.6461104154586792,
      "learning_rate": 9.760191846522783e-06,
      "loss": 1.0937,
      "step": 20
    },
    {
      "epoch": 0.1504028648164727,
      "grad_norm": 1.5581350326538086,
      "learning_rate": 9.748201438848922e-06,
      "loss": 1.0716,
      "step": 21
    },
    {
      "epoch": 0.1575649059982095,
      "grad_norm": 1.9193130731582642,
      "learning_rate": 9.736211031175061e-06,
      "loss": 1.0634,
      "step": 22
    },
    {
      "epoch": 0.1647269471799463,
      "grad_norm": 1.8347828388214111,
      "learning_rate": 9.724220623501199e-06,
      "loss": 1.0665,
      "step": 23
    },
    {
      "epoch": 0.1718889883616831,
      "grad_norm": 2.1090774536132812,
      "learning_rate": 9.712230215827338e-06,
      "loss": 1.0967,
      "step": 24
    },
    {
      "epoch": 0.17905102954341987,
      "grad_norm": 1.6594020128250122,
      "learning_rate": 9.700239808153478e-06,
      "loss": 1.0695,
      "step": 25
    },
    {
      "epoch": 0.18621307072515667,
      "grad_norm": 1.6360571384429932,
      "learning_rate": 9.688249400479617e-06,
      "loss": 1.0808,
      "step": 26
    },
    {
      "epoch": 0.19337511190689347,
      "grad_norm": 1.7902008295059204,
      "learning_rate": 9.676258992805757e-06,
      "loss": 1.0693,
      "step": 27
    },
    {
      "epoch": 0.20053715308863027,
      "grad_norm": 1.9148868322372437,
      "learning_rate": 9.664268585131896e-06,
      "loss": 1.0656,
      "step": 28
    },
    {
      "epoch": 0.20769919427036707,
      "grad_norm": 1.5940035581588745,
      "learning_rate": 9.652278177458035e-06,
      "loss": 1.0617,
      "step": 29
    },
    {
      "epoch": 0.21486123545210384,
      "grad_norm": 2.1188247203826904,
      "learning_rate": 9.640287769784174e-06,
      "loss": 1.0571,
      "step": 30
    },
    {
      "epoch": 0.22202327663384064,
      "grad_norm": 2.0343472957611084,
      "learning_rate": 9.628297362110313e-06,
      "loss": 1.0601,
      "step": 31
    },
    {
      "epoch": 0.22918531781557744,
      "grad_norm": 2.536809206008911,
      "learning_rate": 9.616306954436452e-06,
      "loss": 1.059,
      "step": 32
    },
    {
      "epoch": 0.23634735899731424,
      "grad_norm": 2.986196279525757,
      "learning_rate": 9.60431654676259e-06,
      "loss": 0.9905,
      "step": 33
    },
    {
      "epoch": 0.24350940017905104,
      "grad_norm": 2.9638538360595703,
      "learning_rate": 9.59232613908873e-06,
      "loss": 1.0369,
      "step": 34
    },
    {
      "epoch": 0.25067144136078784,
      "grad_norm": 3.4822328090667725,
      "learning_rate": 9.580335731414869e-06,
      "loss": 1.0549,
      "step": 35
    },
    {
      "epoch": 0.25783348254252464,
      "grad_norm": 2.860246181488037,
      "learning_rate": 9.568345323741008e-06,
      "loss": 1.0131,
      "step": 36
    },
    {
      "epoch": 0.26499552372426144,
      "grad_norm": 3.4336676597595215,
      "learning_rate": 9.556354916067147e-06,
      "loss": 1.032,
      "step": 37
    },
    {
      "epoch": 0.2721575649059982,
      "grad_norm": 3.2346091270446777,
      "learning_rate": 9.544364508393286e-06,
      "loss": 1.0287,
      "step": 38
    },
    {
      "epoch": 0.279319606087735,
      "grad_norm": 2.7058355808258057,
      "learning_rate": 9.532374100719425e-06,
      "loss": 1.0313,
      "step": 39
    },
    {
      "epoch": 0.2864816472694718,
      "grad_norm": 3.560464382171631,
      "learning_rate": 9.520383693045564e-06,
      "loss": 1.013,
      "step": 40
    },
    {
      "epoch": 0.2936436884512086,
      "grad_norm": 3.6023006439208984,
      "learning_rate": 9.508393285371703e-06,
      "loss": 0.9168,
      "step": 41
    },
    {
      "epoch": 0.3008057296329454,
      "grad_norm": 3.6482856273651123,
      "learning_rate": 9.496402877697842e-06,
      "loss": 0.973,
      "step": 42
    },
    {
      "epoch": 0.3079677708146822,
      "grad_norm": 3.4008662700653076,
      "learning_rate": 9.484412470023981e-06,
      "loss": 1.0305,
      "step": 43
    },
    {
      "epoch": 0.315129811996419,
      "grad_norm": 3.068812847137451,
      "learning_rate": 9.47242206235012e-06,
      "loss": 0.9169,
      "step": 44
    },
    {
      "epoch": 0.3222918531781558,
      "grad_norm": 3.736107349395752,
      "learning_rate": 9.46043165467626e-06,
      "loss": 1.0063,
      "step": 45
    },
    {
      "epoch": 0.3294538943598926,
      "grad_norm": 4.98853063583374,
      "learning_rate": 9.448441247002398e-06,
      "loss": 0.9408,
      "step": 46
    },
    {
      "epoch": 0.3366159355416294,
      "grad_norm": 3.7207694053649902,
      "learning_rate": 9.436450839328539e-06,
      "loss": 0.9363,
      "step": 47
    },
    {
      "epoch": 0.3437779767233662,
      "grad_norm": 4.021739959716797,
      "learning_rate": 9.424460431654678e-06,
      "loss": 0.9955,
      "step": 48
    },
    {
      "epoch": 0.35094001790510293,
      "grad_norm": 3.676600694656372,
      "learning_rate": 9.412470023980817e-06,
      "loss": 1.0505,
      "step": 49
    },
    {
      "epoch": 0.35810205908683973,
      "grad_norm": 3.9010024070739746,
      "learning_rate": 9.400479616306956e-06,
      "loss": 0.9115,
      "step": 50
    },
    {
      "epoch": 0.36526410026857653,
      "grad_norm": 3.856921672821045,
      "learning_rate": 9.388489208633095e-06,
      "loss": 0.9625,
      "step": 51
    },
    {
      "epoch": 0.37242614145031333,
      "grad_norm": 3.6618595123291016,
      "learning_rate": 9.376498800959234e-06,
      "loss": 0.9107,
      "step": 52
    },
    {
      "epoch": 0.37958818263205013,
      "grad_norm": 3.955465793609619,
      "learning_rate": 9.364508393285371e-06,
      "loss": 0.9036,
      "step": 53
    },
    {
      "epoch": 0.38675022381378693,
      "grad_norm": 4.451611518859863,
      "learning_rate": 9.35251798561151e-06,
      "loss": 0.8745,
      "step": 54
    },
    {
      "epoch": 0.39391226499552373,
      "grad_norm": 3.950402021408081,
      "learning_rate": 9.34052757793765e-06,
      "loss": 0.8967,
      "step": 55
    },
    {
      "epoch": 0.40107430617726053,
      "grad_norm": 4.007294654846191,
      "learning_rate": 9.32853717026379e-06,
      "loss": 0.9382,
      "step": 56
    },
    {
      "epoch": 0.40823634735899733,
      "grad_norm": 4.601572036743164,
      "learning_rate": 9.31654676258993e-06,
      "loss": 0.892,
      "step": 57
    },
    {
      "epoch": 0.41539838854073413,
      "grad_norm": 3.6578915119171143,
      "learning_rate": 9.304556354916068e-06,
      "loss": 0.8066,
      "step": 58
    },
    {
      "epoch": 0.42256042972247093,
      "grad_norm": 4.2530293464660645,
      "learning_rate": 9.292565947242207e-06,
      "loss": 0.7893,
      "step": 59
    },
    {
      "epoch": 0.4297224709042077,
      "grad_norm": 4.652935028076172,
      "learning_rate": 9.280575539568346e-06,
      "loss": 0.805,
      "step": 60
    },
    {
      "epoch": 0.4368845120859445,
      "grad_norm": 4.985301494598389,
      "learning_rate": 9.268585131894485e-06,
      "loss": 0.8248,
      "step": 61
    },
    {
      "epoch": 0.4440465532676813,
      "grad_norm": 5.5606513023376465,
      "learning_rate": 9.256594724220624e-06,
      "loss": 0.8412,
      "step": 62
    },
    {
      "epoch": 0.4512085944494181,
      "grad_norm": 4.6756110191345215,
      "learning_rate": 9.244604316546764e-06,
      "loss": 0.8119,
      "step": 63
    },
    {
      "epoch": 0.4583706356311549,
      "grad_norm": 4.607848644256592,
      "learning_rate": 9.232613908872903e-06,
      "loss": 0.7553,
      "step": 64
    },
    {
      "epoch": 0.4655326768128917,
      "grad_norm": 5.126153945922852,
      "learning_rate": 9.220623501199042e-06,
      "loss": 0.7668,
      "step": 65
    },
    {
      "epoch": 0.4726947179946285,
      "grad_norm": 4.498857498168945,
      "learning_rate": 9.20863309352518e-06,
      "loss": 0.727,
      "step": 66
    },
    {
      "epoch": 0.4798567591763653,
      "grad_norm": 5.598169326782227,
      "learning_rate": 9.19664268585132e-06,
      "loss": 0.8671,
      "step": 67
    },
    {
      "epoch": 0.4870188003581021,
      "grad_norm": 5.176477432250977,
      "learning_rate": 9.184652278177459e-06,
      "loss": 0.8697,
      "step": 68
    },
    {
      "epoch": 0.4941808415398389,
      "grad_norm": 4.874248504638672,
      "learning_rate": 9.172661870503598e-06,
      "loss": 0.7736,
      "step": 69
    },
    {
      "epoch": 0.5013428827215757,
      "grad_norm": 4.082653999328613,
      "learning_rate": 9.160671462829737e-06,
      "loss": 0.7047,
      "step": 70
    },
    {
      "epoch": 0.5085049239033125,
      "grad_norm": 5.192132949829102,
      "learning_rate": 9.148681055155876e-06,
      "loss": 0.7359,
      "step": 71
    },
    {
      "epoch": 0.5156669650850493,
      "grad_norm": 5.804781436920166,
      "learning_rate": 9.136690647482015e-06,
      "loss": 0.6859,
      "step": 72
    },
    {
      "epoch": 0.5228290062667861,
      "grad_norm": 4.250144004821777,
      "learning_rate": 9.124700239808154e-06,
      "loss": 0.7178,
      "step": 73
    },
    {
      "epoch": 0.5299910474485229,
      "grad_norm": 4.898803234100342,
      "learning_rate": 9.112709832134293e-06,
      "loss": 0.8012,
      "step": 74
    },
    {
      "epoch": 0.5371530886302597,
      "grad_norm": 4.092855453491211,
      "learning_rate": 9.100719424460432e-06,
      "loss": 0.595,
      "step": 75
    },
    {
      "epoch": 0.5443151298119964,
      "grad_norm": 4.206191062927246,
      "learning_rate": 9.088729016786571e-06,
      "loss": 0.6574,
      "step": 76
    },
    {
      "epoch": 0.5514771709937332,
      "grad_norm": 5.5786967277526855,
      "learning_rate": 9.07673860911271e-06,
      "loss": 0.6096,
      "step": 77
    },
    {
      "epoch": 0.55863921217547,
      "grad_norm": 4.80825138092041,
      "learning_rate": 9.064748201438849e-06,
      "loss": 0.6351,
      "step": 78
    },
    {
      "epoch": 0.5658012533572068,
      "grad_norm": 5.1291608810424805,
      "learning_rate": 9.05275779376499e-06,
      "loss": 0.6163,
      "step": 79
    },
    {
      "epoch": 0.5729632945389436,
      "grad_norm": 4.304827690124512,
      "learning_rate": 9.040767386091129e-06,
      "loss": 0.6089,
      "step": 80
    },
    {
      "epoch": 0.5801253357206804,
      "grad_norm": 3.9486804008483887,
      "learning_rate": 9.028776978417268e-06,
      "loss": 0.5804,
      "step": 81
    },
    {
      "epoch": 0.5872873769024172,
      "grad_norm": 6.8340163230896,
      "learning_rate": 9.016786570743405e-06,
      "loss": 0.6102,
      "step": 82
    },
    {
      "epoch": 0.594449418084154,
      "grad_norm": 5.0892133712768555,
      "learning_rate": 9.004796163069544e-06,
      "loss": 0.6376,
      "step": 83
    },
    {
      "epoch": 0.6016114592658908,
      "grad_norm": 4.589208126068115,
      "learning_rate": 8.992805755395683e-06,
      "loss": 0.6618,
      "step": 84
    },
    {
      "epoch": 0.6087735004476276,
      "grad_norm": 4.036871433258057,
      "learning_rate": 8.980815347721822e-06,
      "loss": 0.6268,
      "step": 85
    },
    {
      "epoch": 0.6159355416293644,
      "grad_norm": 5.117236614227295,
      "learning_rate": 8.968824940047961e-06,
      "loss": 0.5782,
      "step": 86
    },
    {
      "epoch": 0.6230975828111012,
      "grad_norm": 5.529454708099365,
      "learning_rate": 8.956834532374102e-06,
      "loss": 0.6378,
      "step": 87
    },
    {
      "epoch": 0.630259623992838,
      "grad_norm": 4.290615558624268,
      "learning_rate": 8.944844124700241e-06,
      "loss": 0.4607,
      "step": 88
    },
    {
      "epoch": 0.6374216651745748,
      "grad_norm": 4.275355815887451,
      "learning_rate": 8.93285371702638e-06,
      "loss": 0.5444,
      "step": 89
    },
    {
      "epoch": 0.6445837063563116,
      "grad_norm": 5.064560890197754,
      "learning_rate": 8.92086330935252e-06,
      "loss": 0.7254,
      "step": 90
    },
    {
      "epoch": 0.6517457475380484,
      "grad_norm": 4.527130126953125,
      "learning_rate": 8.908872901678658e-06,
      "loss": 0.5581,
      "step": 91
    },
    {
      "epoch": 0.6589077887197852,
      "grad_norm": 5.649165630340576,
      "learning_rate": 8.896882494004797e-06,
      "loss": 0.5265,
      "step": 92
    },
    {
      "epoch": 0.666069829901522,
      "grad_norm": 4.565707206726074,
      "learning_rate": 8.884892086330936e-06,
      "loss": 0.5161,
      "step": 93
    },
    {
      "epoch": 0.6732318710832588,
      "grad_norm": 6.295898914337158,
      "learning_rate": 8.872901678657075e-06,
      "loss": 0.5414,
      "step": 94
    },
    {
      "epoch": 0.6803939122649956,
      "grad_norm": 8.986715316772461,
      "learning_rate": 8.860911270983214e-06,
      "loss": 0.5894,
      "step": 95
    },
    {
      "epoch": 0.6875559534467324,
      "grad_norm": 6.694461822509766,
      "learning_rate": 8.848920863309353e-06,
      "loss": 0.5679,
      "step": 96
    },
    {
      "epoch": 0.6947179946284691,
      "grad_norm": 5.961154460906982,
      "learning_rate": 8.836930455635492e-06,
      "loss": 0.6379,
      "step": 97
    },
    {
      "epoch": 0.7018800358102059,
      "grad_norm": 4.6056671142578125,
      "learning_rate": 8.824940047961632e-06,
      "loss": 0.5691,
      "step": 98
    },
    {
      "epoch": 0.7090420769919427,
      "grad_norm": 5.036160945892334,
      "learning_rate": 8.81294964028777e-06,
      "loss": 0.5234,
      "step": 99
    },
    {
      "epoch": 0.7162041181736795,
      "grad_norm": 4.869359016418457,
      "learning_rate": 8.80095923261391e-06,
      "loss": 0.4494,
      "step": 100
    },
    {
      "epoch": 0.7233661593554163,
      "grad_norm": 6.883158206939697,
      "learning_rate": 8.788968824940049e-06,
      "loss": 0.5863,
      "step": 101
    },
    {
      "epoch": 0.7305282005371531,
      "grad_norm": 4.8142805099487305,
      "learning_rate": 8.776978417266188e-06,
      "loss": 0.4585,
      "step": 102
    },
    {
      "epoch": 0.7376902417188899,
      "grad_norm": 4.156213760375977,
      "learning_rate": 8.764988009592327e-06,
      "loss": 0.4224,
      "step": 103
    },
    {
      "epoch": 0.7448522829006267,
      "grad_norm": 5.524331569671631,
      "learning_rate": 8.752997601918466e-06,
      "loss": 0.5855,
      "step": 104
    },
    {
      "epoch": 0.7520143240823635,
      "grad_norm": 4.5275468826293945,
      "learning_rate": 8.741007194244605e-06,
      "loss": 0.3134,
      "step": 105
    },
    {
      "epoch": 0.7591763652641003,
      "grad_norm": 6.391297340393066,
      "learning_rate": 8.729016786570744e-06,
      "loss": 0.4662,
      "step": 106
    },
    {
      "epoch": 0.7663384064458371,
      "grad_norm": 4.844995498657227,
      "learning_rate": 8.717026378896883e-06,
      "loss": 0.5037,
      "step": 107
    },
    {
      "epoch": 0.7735004476275739,
      "grad_norm": 5.861647129058838,
      "learning_rate": 8.705035971223022e-06,
      "loss": 0.4134,
      "step": 108
    },
    {
      "epoch": 0.7806624888093107,
      "grad_norm": 4.5392889976501465,
      "learning_rate": 8.693045563549161e-06,
      "loss": 0.4099,
      "step": 109
    },
    {
      "epoch": 0.7878245299910475,
      "grad_norm": 4.010335922241211,
      "learning_rate": 8.681055155875302e-06,
      "loss": 0.3897,
      "step": 110
    },
    {
      "epoch": 0.7949865711727843,
      "grad_norm": 5.2261433601379395,
      "learning_rate": 8.66906474820144e-06,
      "loss": 0.3422,
      "step": 111
    },
    {
      "epoch": 0.8021486123545211,
      "grad_norm": 5.837971210479736,
      "learning_rate": 8.657074340527578e-06,
      "loss": 0.4188,
      "step": 112
    },
    {
      "epoch": 0.8093106535362579,
      "grad_norm": 7.071847915649414,
      "learning_rate": 8.645083932853717e-06,
      "loss": 0.5138,
      "step": 113
    },
    {
      "epoch": 0.8164726947179947,
      "grad_norm": 3.631950855255127,
      "learning_rate": 8.633093525179856e-06,
      "loss": 0.2924,
      "step": 114
    },
    {
      "epoch": 0.8236347358997315,
      "grad_norm": 5.4959797859191895,
      "learning_rate": 8.621103117505995e-06,
      "loss": 0.4857,
      "step": 115
    },
    {
      "epoch": 0.8307967770814683,
      "grad_norm": 6.527896404266357,
      "learning_rate": 8.609112709832134e-06,
      "loss": 0.4211,
      "step": 116
    },
    {
      "epoch": 0.8379588182632051,
      "grad_norm": 7.07539176940918,
      "learning_rate": 8.597122302158273e-06,
      "loss": 0.481,
      "step": 117
    },
    {
      "epoch": 0.8451208594449419,
      "grad_norm": 5.752196311950684,
      "learning_rate": 8.585131894484412e-06,
      "loss": 0.4774,
      "step": 118
    },
    {
      "epoch": 0.8522829006266786,
      "grad_norm": 3.1388776302337646,
      "learning_rate": 8.573141486810553e-06,
      "loss": 0.2974,
      "step": 119
    },
    {
      "epoch": 0.8594449418084154,
      "grad_norm": 5.351109504699707,
      "learning_rate": 8.561151079136692e-06,
      "loss": 0.333,
      "step": 120
    },
    {
      "epoch": 0.8666069829901522,
      "grad_norm": 4.641998767852783,
      "learning_rate": 8.549160671462831e-06,
      "loss": 0.4377,
      "step": 121
    },
    {
      "epoch": 0.873769024171889,
      "grad_norm": 9.292861938476562,
      "learning_rate": 8.53717026378897e-06,
      "loss": 0.497,
      "step": 122
    },
    {
      "epoch": 0.8809310653536258,
      "grad_norm": 5.21453857421875,
      "learning_rate": 8.525179856115109e-06,
      "loss": 0.4271,
      "step": 123
    },
    {
      "epoch": 0.8880931065353626,
      "grad_norm": 6.3802618980407715,
      "learning_rate": 8.513189448441248e-06,
      "loss": 0.4672,
      "step": 124
    },
    {
      "epoch": 0.8952551477170994,
      "grad_norm": 5.154406547546387,
      "learning_rate": 8.501199040767387e-06,
      "loss": 0.2783,
      "step": 125
    },
    {
      "epoch": 0.9024171888988362,
      "grad_norm": 3.9693143367767334,
      "learning_rate": 8.489208633093526e-06,
      "loss": 0.3932,
      "step": 126
    },
    {
      "epoch": 0.909579230080573,
      "grad_norm": 2.942033529281616,
      "learning_rate": 8.477218225419664e-06,
      "loss": 0.3193,
      "step": 127
    },
    {
      "epoch": 0.9167412712623098,
      "grad_norm": 4.29665994644165,
      "learning_rate": 8.465227817745804e-06,
      "loss": 0.4377,
      "step": 128
    },
    {
      "epoch": 0.9239033124440466,
      "grad_norm": 5.550212860107422,
      "learning_rate": 8.453237410071943e-06,
      "loss": 0.3315,
      "step": 129
    },
    {
      "epoch": 0.9310653536257834,
      "grad_norm": 6.654735565185547,
      "learning_rate": 8.441247002398082e-06,
      "loss": 0.4388,
      "step": 130
    },
    {
      "epoch": 0.9382273948075202,
      "grad_norm": 5.80216121673584,
      "learning_rate": 8.429256594724221e-06,
      "loss": 0.3865,
      "step": 131
    },
    {
      "epoch": 0.945389435989257,
      "grad_norm": 6.76437520980835,
      "learning_rate": 8.41726618705036e-06,
      "loss": 0.5202,
      "step": 132
    },
    {
      "epoch": 0.9525514771709938,
      "grad_norm": 9.009016990661621,
      "learning_rate": 8.4052757793765e-06,
      "loss": 0.4008,
      "step": 133
    },
    {
      "epoch": 0.9597135183527306,
      "grad_norm": 9.987932205200195,
      "learning_rate": 8.393285371702639e-06,
      "loss": 0.4824,
      "step": 134
    },
    {
      "epoch": 0.9668755595344674,
      "grad_norm": 10.408086776733398,
      "learning_rate": 8.381294964028778e-06,
      "loss": 0.7183,
      "step": 135
    },
    {
      "epoch": 0.9740376007162042,
      "grad_norm": 4.125233173370361,
      "learning_rate": 8.369304556354917e-06,
      "loss": 0.331,
      "step": 136
    },
    {
      "epoch": 0.981199641897941,
      "grad_norm": 5.206197738647461,
      "learning_rate": 8.357314148681056e-06,
      "loss": 0.451,
      "step": 137
    },
    {
      "epoch": 0.9883616830796778,
      "grad_norm": 6.510553359985352,
      "learning_rate": 8.345323741007195e-06,
      "loss": 0.2887,
      "step": 138
    },
    {
      "epoch": 0.9955237242614146,
      "grad_norm": 5.558812141418457,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.2659,
      "step": 139
    },
    {
      "epoch": 1.0,
      "grad_norm": 5.024322509765625,
      "learning_rate": 8.321342925659473e-06,
      "loss": 0.2857,
      "step": 140
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.8551307847082495,
      "eval_loss": 0.34348970651626587,
      "eval_runtime": 12.772,
      "eval_samples_per_second": 38.913,
      "eval_steps_per_second": 38.913,
      "step": 140
    },
    {
      "epoch": 1.0071620411817368,
      "grad_norm": 5.62790584564209,
      "learning_rate": 8.309352517985614e-06,
      "loss": 0.3689,
      "step": 141
    },
    {
      "epoch": 1.0143240823634736,
      "grad_norm": 6.509753704071045,
      "learning_rate": 8.29736211031175e-06,
      "loss": 0.3419,
      "step": 142
    },
    {
      "epoch": 1.0214861235452104,
      "grad_norm": 7.264653205871582,
      "learning_rate": 8.28537170263789e-06,
      "loss": 0.2551,
      "step": 143
    },
    {
      "epoch": 1.0286481647269472,
      "grad_norm": 6.89661169052124,
      "learning_rate": 8.273381294964029e-06,
      "loss": 0.4026,
      "step": 144
    },
    {
      "epoch": 1.035810205908684,
      "grad_norm": 6.0390238761901855,
      "learning_rate": 8.261390887290168e-06,
      "loss": 0.2978,
      "step": 145
    },
    {
      "epoch": 1.0429722470904208,
      "grad_norm": 6.132388591766357,
      "learning_rate": 8.249400479616307e-06,
      "loss": 0.4529,
      "step": 146
    },
    {
      "epoch": 1.0501342882721576,
      "grad_norm": 6.270555019378662,
      "learning_rate": 8.237410071942446e-06,
      "loss": 0.4341,
      "step": 147
    },
    {
      "epoch": 1.0572963294538944,
      "grad_norm": 8.636746406555176,
      "learning_rate": 8.225419664268585e-06,
      "loss": 0.3452,
      "step": 148
    },
    {
      "epoch": 1.0644583706356312,
      "grad_norm": 5.3527021408081055,
      "learning_rate": 8.213429256594724e-06,
      "loss": 0.2689,
      "step": 149
    },
    {
      "epoch": 1.071620411817368,
      "grad_norm": 5.665110111236572,
      "learning_rate": 8.201438848920865e-06,
      "loss": 0.5618,
      "step": 150
    },
    {
      "epoch": 1.0787824529991048,
      "grad_norm": 9.846869468688965,
      "learning_rate": 8.189448441247004e-06,
      "loss": 0.3931,
      "step": 151
    },
    {
      "epoch": 1.0859444941808416,
      "grad_norm": 8.280915260314941,
      "learning_rate": 8.177458033573143e-06,
      "loss": 0.3301,
      "step": 152
    },
    {
      "epoch": 1.0931065353625784,
      "grad_norm": 5.660332202911377,
      "learning_rate": 8.165467625899282e-06,
      "loss": 0.372,
      "step": 153
    },
    {
      "epoch": 1.1002685765443152,
      "grad_norm": 3.366448402404785,
      "learning_rate": 8.153477218225421e-06,
      "loss": 0.3427,
      "step": 154
    },
    {
      "epoch": 1.107430617726052,
      "grad_norm": 6.918087959289551,
      "learning_rate": 8.14148681055156e-06,
      "loss": 0.5082,
      "step": 155
    },
    {
      "epoch": 1.1145926589077888,
      "grad_norm": 7.009018898010254,
      "learning_rate": 8.129496402877699e-06,
      "loss": 0.3534,
      "step": 156
    },
    {
      "epoch": 1.1217547000895256,
      "grad_norm": 5.730655193328857,
      "learning_rate": 8.117505995203836e-06,
      "loss": 0.2806,
      "step": 157
    },
    {
      "epoch": 1.1289167412712624,
      "grad_norm": 3.503355026245117,
      "learning_rate": 8.105515587529975e-06,
      "loss": 0.2374,
      "step": 158
    },
    {
      "epoch": 1.1360787824529992,
      "grad_norm": 3.6845366954803467,
      "learning_rate": 8.093525179856116e-06,
      "loss": 0.2822,
      "step": 159
    },
    {
      "epoch": 1.143240823634736,
      "grad_norm": 8.738545417785645,
      "learning_rate": 8.081534772182255e-06,
      "loss": 0.3553,
      "step": 160
    },
    {
      "epoch": 1.1504028648164728,
      "grad_norm": 12.152175903320312,
      "learning_rate": 8.069544364508394e-06,
      "loss": 0.264,
      "step": 161
    },
    {
      "epoch": 1.1575649059982096,
      "grad_norm": 6.637088775634766,
      "learning_rate": 8.057553956834533e-06,
      "loss": 0.3602,
      "step": 162
    },
    {
      "epoch": 1.1647269471799464,
      "grad_norm": 11.342058181762695,
      "learning_rate": 8.045563549160672e-06,
      "loss": 0.3529,
      "step": 163
    },
    {
      "epoch": 1.1718889883616832,
      "grad_norm": 3.130880355834961,
      "learning_rate": 8.033573141486811e-06,
      "loss": 0.2449,
      "step": 164
    },
    {
      "epoch": 1.1790510295434198,
      "grad_norm": 4.657078266143799,
      "learning_rate": 8.02158273381295e-06,
      "loss": 0.2095,
      "step": 165
    },
    {
      "epoch": 1.1862130707251566,
      "grad_norm": 11.053173065185547,
      "learning_rate": 8.00959232613909e-06,
      "loss": 0.3824,
      "step": 166
    },
    {
      "epoch": 1.1933751119068934,
      "grad_norm": 8.9373779296875,
      "learning_rate": 7.997601918465228e-06,
      "loss": 0.5074,
      "step": 167
    },
    {
      "epoch": 1.2005371530886302,
      "grad_norm": 7.14840030670166,
      "learning_rate": 7.985611510791367e-06,
      "loss": 0.3061,
      "step": 168
    },
    {
      "epoch": 1.207699194270367,
      "grad_norm": 6.889973163604736,
      "learning_rate": 7.973621103117507e-06,
      "loss": 0.3045,
      "step": 169
    },
    {
      "epoch": 1.2148612354521038,
      "grad_norm": 10.003790855407715,
      "learning_rate": 7.961630695443646e-06,
      "loss": 0.3446,
      "step": 170
    },
    {
      "epoch": 1.2220232766338406,
      "grad_norm": 6.85793924331665,
      "learning_rate": 7.949640287769785e-06,
      "loss": 0.3411,
      "step": 171
    },
    {
      "epoch": 1.2291853178155774,
      "grad_norm": 7.919402122497559,
      "learning_rate": 7.937649880095924e-06,
      "loss": 0.4896,
      "step": 172
    },
    {
      "epoch": 1.2363473589973142,
      "grad_norm": 3.570951461791992,
      "learning_rate": 7.925659472422063e-06,
      "loss": 0.211,
      "step": 173
    },
    {
      "epoch": 1.243509400179051,
      "grad_norm": 4.916582107543945,
      "learning_rate": 7.913669064748202e-06,
      "loss": 0.2846,
      "step": 174
    },
    {
      "epoch": 1.2506714413607878,
      "grad_norm": 8.822362899780273,
      "learning_rate": 7.90167865707434e-06,
      "loss": 0.6171,
      "step": 175
    },
    {
      "epoch": 1.2578334825425246,
      "grad_norm": 6.692116737365723,
      "learning_rate": 7.88968824940048e-06,
      "loss": 0.3357,
      "step": 176
    },
    {
      "epoch": 1.2649955237242614,
      "grad_norm": 8.720771789550781,
      "learning_rate": 7.877697841726619e-06,
      "loss": 0.3968,
      "step": 177
    },
    {
      "epoch": 1.2721575649059982,
      "grad_norm": 3.4842636585235596,
      "learning_rate": 7.865707434052758e-06,
      "loss": 0.2225,
      "step": 178
    },
    {
      "epoch": 1.279319606087735,
      "grad_norm": 5.311177730560303,
      "learning_rate": 7.853717026378897e-06,
      "loss": 0.2241,
      "step": 179
    },
    {
      "epoch": 1.2864816472694718,
      "grad_norm": 7.102256774902344,
      "learning_rate": 7.841726618705036e-06,
      "loss": 0.4413,
      "step": 180
    },
    {
      "epoch": 1.2936436884512086,
      "grad_norm": 9.19848346710205,
      "learning_rate": 7.829736211031177e-06,
      "loss": 0.4979,
      "step": 181
    },
    {
      "epoch": 1.3008057296329454,
      "grad_norm": 6.935247421264648,
      "learning_rate": 7.817745803357316e-06,
      "loss": 0.3895,
      "step": 182
    },
    {
      "epoch": 1.3079677708146822,
      "grad_norm": 6.123559951782227,
      "learning_rate": 7.805755395683455e-06,
      "loss": 0.6249,
      "step": 183
    },
    {
      "epoch": 1.315129811996419,
      "grad_norm": 4.8054609298706055,
      "learning_rate": 7.793764988009594e-06,
      "loss": 0.3903,
      "step": 184
    },
    {
      "epoch": 1.3222918531781558,
      "grad_norm": 3.220245361328125,
      "learning_rate": 7.781774580335733e-06,
      "loss": 0.2484,
      "step": 185
    },
    {
      "epoch": 1.3294538943598926,
      "grad_norm": 7.8549346923828125,
      "learning_rate": 7.769784172661872e-06,
      "loss": 0.2953,
      "step": 186
    },
    {
      "epoch": 1.3366159355416294,
      "grad_norm": 4.862519264221191,
      "learning_rate": 7.75779376498801e-06,
      "loss": 0.3309,
      "step": 187
    },
    {
      "epoch": 1.3437779767233662,
      "grad_norm": 7.397862434387207,
      "learning_rate": 7.745803357314148e-06,
      "loss": 0.3559,
      "step": 188
    },
    {
      "epoch": 1.350940017905103,
      "grad_norm": 6.308946132659912,
      "learning_rate": 7.733812949640287e-06,
      "loss": 0.2876,
      "step": 189
    },
    {
      "epoch": 1.3581020590868398,
      "grad_norm": 6.779823303222656,
      "learning_rate": 7.721822541966428e-06,
      "loss": 0.1981,
      "step": 190
    },
    {
      "epoch": 1.3652641002685766,
      "grad_norm": 7.278501033782959,
      "learning_rate": 7.709832134292567e-06,
      "loss": 0.424,
      "step": 191
    },
    {
      "epoch": 1.3724261414503134,
      "grad_norm": 7.412106037139893,
      "learning_rate": 7.697841726618706e-06,
      "loss": 0.2568,
      "step": 192
    },
    {
      "epoch": 1.3795881826320502,
      "grad_norm": 4.081076145172119,
      "learning_rate": 7.685851318944845e-06,
      "loss": 0.227,
      "step": 193
    },
    {
      "epoch": 1.386750223813787,
      "grad_norm": 10.722938537597656,
      "learning_rate": 7.673860911270984e-06,
      "loss": 0.4147,
      "step": 194
    },
    {
      "epoch": 1.3939122649955238,
      "grad_norm": 4.308574676513672,
      "learning_rate": 7.661870503597123e-06,
      "loss": 0.2981,
      "step": 195
    },
    {
      "epoch": 1.4010743061772606,
      "grad_norm": 6.543905735015869,
      "learning_rate": 7.649880095923262e-06,
      "loss": 0.146,
      "step": 196
    },
    {
      "epoch": 1.4082363473589974,
      "grad_norm": 7.446622371673584,
      "learning_rate": 7.637889688249401e-06,
      "loss": 0.3292,
      "step": 197
    },
    {
      "epoch": 1.4153983885407342,
      "grad_norm": 6.272434234619141,
      "learning_rate": 7.62589928057554e-06,
      "loss": 0.3738,
      "step": 198
    },
    {
      "epoch": 1.422560429722471,
      "grad_norm": 6.094548225402832,
      "learning_rate": 7.613908872901679e-06,
      "loss": 0.2389,
      "step": 199
    },
    {
      "epoch": 1.4297224709042076,
      "grad_norm": 6.979650020599365,
      "learning_rate": 7.601918465227819e-06,
      "loss": 0.2491,
      "step": 200
    },
    {
      "epoch": 1.4368845120859444,
      "grad_norm": 7.520836353302002,
      "learning_rate": 7.589928057553958e-06,
      "loss": 0.2936,
      "step": 201
    },
    {
      "epoch": 1.4440465532676812,
      "grad_norm": 9.974089622497559,
      "learning_rate": 7.5779376498800964e-06,
      "loss": 0.372,
      "step": 202
    },
    {
      "epoch": 1.451208594449418,
      "grad_norm": 8.374258995056152,
      "learning_rate": 7.5659472422062355e-06,
      "loss": 0.2795,
      "step": 203
    },
    {
      "epoch": 1.4583706356311548,
      "grad_norm": 6.120067119598389,
      "learning_rate": 7.5539568345323745e-06,
      "loss": 0.2502,
      "step": 204
    },
    {
      "epoch": 1.4655326768128916,
      "grad_norm": 2.661911725997925,
      "learning_rate": 7.5419664268585136e-06,
      "loss": 0.2178,
      "step": 205
    },
    {
      "epoch": 1.4726947179946284,
      "grad_norm": 5.33140754699707,
      "learning_rate": 7.529976019184653e-06,
      "loss": 0.2776,
      "step": 206
    },
    {
      "epoch": 1.4798567591763652,
      "grad_norm": 6.527672290802002,
      "learning_rate": 7.517985611510792e-06,
      "loss": 0.2005,
      "step": 207
    },
    {
      "epoch": 1.487018800358102,
      "grad_norm": 3.9632420539855957,
      "learning_rate": 7.505995203836931e-06,
      "loss": 0.1922,
      "step": 208
    },
    {
      "epoch": 1.4941808415398388,
      "grad_norm": 5.8358330726623535,
      "learning_rate": 7.4940047961630706e-06,
      "loss": 0.327,
      "step": 209
    },
    {
      "epoch": 1.5013428827215756,
      "grad_norm": 5.2152557373046875,
      "learning_rate": 7.48201438848921e-06,
      "loss": 0.3027,
      "step": 210
    },
    {
      "epoch": 1.5085049239033124,
      "grad_norm": 8.100699424743652,
      "learning_rate": 7.470023980815349e-06,
      "loss": 0.2879,
      "step": 211
    },
    {
      "epoch": 1.5156669650850492,
      "grad_norm": 7.577643871307373,
      "learning_rate": 7.458033573141488e-06,
      "loss": 0.3307,
      "step": 212
    },
    {
      "epoch": 1.522829006266786,
      "grad_norm": 7.359758377075195,
      "learning_rate": 7.446043165467627e-06,
      "loss": 0.2822,
      "step": 213
    },
    {
      "epoch": 1.5299910474485228,
      "grad_norm": 5.6610612869262695,
      "learning_rate": 7.434052757793766e-06,
      "loss": 0.4894,
      "step": 214
    },
    {
      "epoch": 1.5371530886302596,
      "grad_norm": 3.8541088104248047,
      "learning_rate": 7.422062350119905e-06,
      "loss": 0.2774,
      "step": 215
    },
    {
      "epoch": 1.5443151298119964,
      "grad_norm": 6.980274200439453,
      "learning_rate": 7.410071942446043e-06,
      "loss": 0.4373,
      "step": 216
    },
    {
      "epoch": 1.5514771709937332,
      "grad_norm": 6.333699703216553,
      "learning_rate": 7.398081534772182e-06,
      "loss": 0.4068,
      "step": 217
    },
    {
      "epoch": 1.55863921217547,
      "grad_norm": 8.193052291870117,
      "learning_rate": 7.386091127098322e-06,
      "loss": 0.2872,
      "step": 218
    },
    {
      "epoch": 1.5658012533572068,
      "grad_norm": 8.756412506103516,
      "learning_rate": 7.374100719424461e-06,
      "loss": 0.3259,
      "step": 219
    },
    {
      "epoch": 1.5729632945389436,
      "grad_norm": 5.721433639526367,
      "learning_rate": 7.3621103117506e-06,
      "loss": 0.2389,
      "step": 220
    },
    {
      "epoch": 1.5801253357206804,
      "grad_norm": 4.392519474029541,
      "learning_rate": 7.350119904076739e-06,
      "loss": 0.2862,
      "step": 221
    },
    {
      "epoch": 1.5872873769024172,
      "grad_norm": 7.019931316375732,
      "learning_rate": 7.338129496402878e-06,
      "loss": 0.4388,
      "step": 222
    },
    {
      "epoch": 1.594449418084154,
      "grad_norm": 6.2576470375061035,
      "learning_rate": 7.326139088729017e-06,
      "loss": 0.1563,
      "step": 223
    },
    {
      "epoch": 1.6016114592658908,
      "grad_norm": 5.387588024139404,
      "learning_rate": 7.314148681055156e-06,
      "loss": 0.23,
      "step": 224
    },
    {
      "epoch": 1.6087735004476276,
      "grad_norm": 4.647127151489258,
      "learning_rate": 7.302158273381296e-06,
      "loss": 0.2865,
      "step": 225
    },
    {
      "epoch": 1.6159355416293644,
      "grad_norm": 4.853245735168457,
      "learning_rate": 7.290167865707435e-06,
      "loss": 0.2604,
      "step": 226
    },
    {
      "epoch": 1.6230975828111012,
      "grad_norm": 7.052027225494385,
      "learning_rate": 7.278177458033574e-06,
      "loss": 0.3141,
      "step": 227
    },
    {
      "epoch": 1.630259623992838,
      "grad_norm": 4.379312515258789,
      "learning_rate": 7.266187050359713e-06,
      "loss": 0.2006,
      "step": 228
    },
    {
      "epoch": 1.6374216651745748,
      "grad_norm": 8.26744556427002,
      "learning_rate": 7.254196642685852e-06,
      "loss": 0.2165,
      "step": 229
    },
    {
      "epoch": 1.6445837063563116,
      "grad_norm": 5.053855895996094,
      "learning_rate": 7.242206235011991e-06,
      "loss": 0.3284,
      "step": 230
    },
    {
      "epoch": 1.6517457475380484,
      "grad_norm": 9.649201393127441,
      "learning_rate": 7.230215827338129e-06,
      "loss": 0.3844,
      "step": 231
    },
    {
      "epoch": 1.6589077887197852,
      "grad_norm": 4.344612121582031,
      "learning_rate": 7.218225419664268e-06,
      "loss": 0.2531,
      "step": 232
    },
    {
      "epoch": 1.666069829901522,
      "grad_norm": 3.039994955062866,
      "learning_rate": 7.206235011990408e-06,
      "loss": 0.1779,
      "step": 233
    },
    {
      "epoch": 1.6732318710832588,
      "grad_norm": 15.308979034423828,
      "learning_rate": 7.194244604316547e-06,
      "loss": 0.2409,
      "step": 234
    },
    {
      "epoch": 1.6803939122649956,
      "grad_norm": 6.968617916107178,
      "learning_rate": 7.182254196642686e-06,
      "loss": 0.2493,
      "step": 235
    },
    {
      "epoch": 1.6875559534467324,
      "grad_norm": 5.991739749908447,
      "learning_rate": 7.170263788968825e-06,
      "loss": 0.2102,
      "step": 236
    },
    {
      "epoch": 1.6947179946284692,
      "grad_norm": 7.561839580535889,
      "learning_rate": 7.1582733812949644e-06,
      "loss": 0.5371,
      "step": 237
    },
    {
      "epoch": 1.701880035810206,
      "grad_norm": 7.78062105178833,
      "learning_rate": 7.1462829736211035e-06,
      "loss": 0.3472,
      "step": 238
    },
    {
      "epoch": 1.7090420769919428,
      "grad_norm": 9.596529960632324,
      "learning_rate": 7.1342925659472425e-06,
      "loss": 0.4896,
      "step": 239
    },
    {
      "epoch": 1.7162041181736796,
      "grad_norm": 4.697896480560303,
      "learning_rate": 7.122302158273382e-06,
      "loss": 0.2315,
      "step": 240
    },
    {
      "epoch": 1.7233661593554164,
      "grad_norm": 7.397878646850586,
      "learning_rate": 7.1103117505995214e-06,
      "loss": 0.273,
      "step": 241
    },
    {
      "epoch": 1.7305282005371532,
      "grad_norm": 8.375514030456543,
      "learning_rate": 7.0983213429256605e-06,
      "loss": 0.5812,
      "step": 242
    },
    {
      "epoch": 1.73769024171889,
      "grad_norm": 3.8642501831054688,
      "learning_rate": 7.0863309352517995e-06,
      "loss": 0.233,
      "step": 243
    },
    {
      "epoch": 1.7448522829006268,
      "grad_norm": 11.222294807434082,
      "learning_rate": 7.0743405275779385e-06,
      "loss": 0.4143,
      "step": 244
    },
    {
      "epoch": 1.7520143240823636,
      "grad_norm": 4.19779634475708,
      "learning_rate": 7.062350119904078e-06,
      "loss": 0.216,
      "step": 245
    },
    {
      "epoch": 1.7591763652641004,
      "grad_norm": 8.382499694824219,
      "learning_rate": 7.050359712230216e-06,
      "loss": 0.2655,
      "step": 246
    },
    {
      "epoch": 1.7663384064458372,
      "grad_norm": 4.434614658355713,
      "learning_rate": 7.038369304556355e-06,
      "loss": 0.1881,
      "step": 247
    },
    {
      "epoch": 1.773500447627574,
      "grad_norm": 9.419454574584961,
      "learning_rate": 7.026378896882494e-06,
      "loss": 0.4074,
      "step": 248
    },
    {
      "epoch": 1.7806624888093108,
      "grad_norm": 5.077871799468994,
      "learning_rate": 7.014388489208634e-06,
      "loss": 0.1809,
      "step": 249
    },
    {
      "epoch": 1.7878245299910476,
      "grad_norm": 8.029293060302734,
      "learning_rate": 7.002398081534773e-06,
      "loss": 0.3477,
      "step": 250
    },
    {
      "epoch": 1.7949865711727844,
      "grad_norm": 7.078944206237793,
      "learning_rate": 6.990407673860912e-06,
      "loss": 0.2644,
      "step": 251
    },
    {
      "epoch": 1.8021486123545212,
      "grad_norm": 4.196276664733887,
      "learning_rate": 6.978417266187051e-06,
      "loss": 0.3626,
      "step": 252
    },
    {
      "epoch": 1.809310653536258,
      "grad_norm": 4.02357816696167,
      "learning_rate": 6.96642685851319e-06,
      "loss": 0.219,
      "step": 253
    },
    {
      "epoch": 1.8164726947179948,
      "grad_norm": 6.769551753997803,
      "learning_rate": 6.954436450839329e-06,
      "loss": 0.2301,
      "step": 254
    },
    {
      "epoch": 1.8236347358997316,
      "grad_norm": 4.0238118171691895,
      "learning_rate": 6.942446043165468e-06,
      "loss": 0.2041,
      "step": 255
    },
    {
      "epoch": 1.8307967770814684,
      "grad_norm": 4.813575744628906,
      "learning_rate": 6.930455635491608e-06,
      "loss": 0.3333,
      "step": 256
    },
    {
      "epoch": 1.8379588182632052,
      "grad_norm": 7.067838668823242,
      "learning_rate": 6.918465227817747e-06,
      "loss": 0.2188,
      "step": 257
    },
    {
      "epoch": 1.845120859444942,
      "grad_norm": 5.983901023864746,
      "learning_rate": 6.906474820143886e-06,
      "loss": 0.1423,
      "step": 258
    },
    {
      "epoch": 1.8522829006266786,
      "grad_norm": 8.536234855651855,
      "learning_rate": 6.894484412470025e-06,
      "loss": 0.1956,
      "step": 259
    },
    {
      "epoch": 1.8594449418084154,
      "grad_norm": 6.614276885986328,
      "learning_rate": 6.882494004796164e-06,
      "loss": 0.3024,
      "step": 260
    },
    {
      "epoch": 1.8666069829901522,
      "grad_norm": 7.686590671539307,
      "learning_rate": 6.870503597122302e-06,
      "loss": 0.2499,
      "step": 261
    },
    {
      "epoch": 1.873769024171889,
      "grad_norm": 2.95554256439209,
      "learning_rate": 6.858513189448441e-06,
      "loss": 0.1194,
      "step": 262
    },
    {
      "epoch": 1.8809310653536258,
      "grad_norm": 12.062966346740723,
      "learning_rate": 6.84652278177458e-06,
      "loss": 0.5564,
      "step": 263
    },
    {
      "epoch": 1.8880931065353626,
      "grad_norm": 6.321432590484619,
      "learning_rate": 6.834532374100719e-06,
      "loss": 0.3792,
      "step": 264
    },
    {
      "epoch": 1.8952551477170994,
      "grad_norm": 4.885496139526367,
      "learning_rate": 6.822541966426859e-06,
      "loss": 0.1913,
      "step": 265
    },
    {
      "epoch": 1.9024171888988362,
      "grad_norm": 8.380147933959961,
      "learning_rate": 6.810551558752998e-06,
      "loss": 0.3116,
      "step": 266
    },
    {
      "epoch": 1.909579230080573,
      "grad_norm": 10.321130752563477,
      "learning_rate": 6.798561151079137e-06,
      "loss": 0.3192,
      "step": 267
    },
    {
      "epoch": 1.9167412712623098,
      "grad_norm": 8.94253158569336,
      "learning_rate": 6.786570743405276e-06,
      "loss": 0.4641,
      "step": 268
    },
    {
      "epoch": 1.9239033124440466,
      "grad_norm": 7.346745014190674,
      "learning_rate": 6.774580335731415e-06,
      "loss": 0.2584,
      "step": 269
    },
    {
      "epoch": 1.9310653536257834,
      "grad_norm": 25.712692260742188,
      "learning_rate": 6.762589928057554e-06,
      "loss": 0.4284,
      "step": 270
    },
    {
      "epoch": 1.9382273948075202,
      "grad_norm": 4.491262912750244,
      "learning_rate": 6.750599520383694e-06,
      "loss": 0.1754,
      "step": 271
    },
    {
      "epoch": 1.945389435989257,
      "grad_norm": 14.674046516418457,
      "learning_rate": 6.738609112709833e-06,
      "loss": 0.3855,
      "step": 272
    },
    {
      "epoch": 1.9525514771709938,
      "grad_norm": 11.2119722366333,
      "learning_rate": 6.726618705035972e-06,
      "loss": 0.3796,
      "step": 273
    },
    {
      "epoch": 1.9597135183527306,
      "grad_norm": 9.643905639648438,
      "learning_rate": 6.714628297362111e-06,
      "loss": 0.3383,
      "step": 274
    },
    {
      "epoch": 1.9668755595344674,
      "grad_norm": 9.514199256896973,
      "learning_rate": 6.70263788968825e-06,
      "loss": 0.2208,
      "step": 275
    },
    {
      "epoch": 1.9740376007162042,
      "grad_norm": 10.071746826171875,
      "learning_rate": 6.6906474820143886e-06,
      "loss": 0.2416,
      "step": 276
    },
    {
      "epoch": 1.981199641897941,
      "grad_norm": 6.356027603149414,
      "learning_rate": 6.678657074340528e-06,
      "loss": 0.3232,
      "step": 277
    },
    {
      "epoch": 1.9883616830796778,
      "grad_norm": 1.8917155265808105,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.0916,
      "step": 278
    },
    {
      "epoch": 1.9955237242614146,
      "grad_norm": 10.453146934509277,
      "learning_rate": 6.654676258992806e-06,
      "loss": 0.3523,
      "step": 279
    },
    {
      "epoch": 2.0,
      "grad_norm": 7.3855509757995605,
      "learning_rate": 6.6426858513189456e-06,
      "loss": 0.2806,
      "step": 280
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.8752515090543259,
      "eval_loss": 0.3062034249305725,
      "eval_runtime": 13.2652,
      "eval_samples_per_second": 37.466,
      "eval_steps_per_second": 37.466,
      "step": 280
    },
    {
      "epoch": 2.007162041181737,
      "grad_norm": 12.951683044433594,
      "learning_rate": 6.630695443645085e-06,
      "loss": 0.6262,
      "step": 281
    },
    {
      "epoch": 2.0143240823634736,
      "grad_norm": 6.761510372161865,
      "learning_rate": 6.618705035971224e-06,
      "loss": 0.202,
      "step": 282
    },
    {
      "epoch": 2.0214861235452104,
      "grad_norm": 9.198619842529297,
      "learning_rate": 6.606714628297363e-06,
      "loss": 0.271,
      "step": 283
    },
    {
      "epoch": 2.028648164726947,
      "grad_norm": 11.052300453186035,
      "learning_rate": 6.594724220623502e-06,
      "loss": 0.3322,
      "step": 284
    },
    {
      "epoch": 2.035810205908684,
      "grad_norm": 5.700530529022217,
      "learning_rate": 6.582733812949641e-06,
      "loss": 0.1473,
      "step": 285
    },
    {
      "epoch": 2.042972247090421,
      "grad_norm": 3.494630813598633,
      "learning_rate": 6.57074340527578e-06,
      "loss": 0.1127,
      "step": 286
    },
    {
      "epoch": 2.0501342882721576,
      "grad_norm": 4.829661846160889,
      "learning_rate": 6.55875299760192e-06,
      "loss": 0.1499,
      "step": 287
    },
    {
      "epoch": 2.0572963294538944,
      "grad_norm": 2.1077747344970703,
      "learning_rate": 6.546762589928059e-06,
      "loss": 0.1221,
      "step": 288
    },
    {
      "epoch": 2.064458370635631,
      "grad_norm": 8.28569221496582,
      "learning_rate": 6.534772182254198e-06,
      "loss": 0.3386,
      "step": 289
    },
    {
      "epoch": 2.071620411817368,
      "grad_norm": 6.121078968048096,
      "learning_rate": 6.522781774580337e-06,
      "loss": 0.2158,
      "step": 290
    },
    {
      "epoch": 2.078782452999105,
      "grad_norm": 7.616806507110596,
      "learning_rate": 6.510791366906475e-06,
      "loss": 0.241,
      "step": 291
    },
    {
      "epoch": 2.0859444941808416,
      "grad_norm": 10.66478443145752,
      "learning_rate": 6.498800959232614e-06,
      "loss": 0.2952,
      "step": 292
    },
    {
      "epoch": 2.0931065353625784,
      "grad_norm": 4.883520126342773,
      "learning_rate": 6.486810551558753e-06,
      "loss": 0.1603,
      "step": 293
    },
    {
      "epoch": 2.100268576544315,
      "grad_norm": 2.091519594192505,
      "learning_rate": 6.474820143884892e-06,
      "loss": 0.0847,
      "step": 294
    },
    {
      "epoch": 2.107430617726052,
      "grad_norm": 6.887018203735352,
      "learning_rate": 6.462829736211031e-06,
      "loss": 0.2551,
      "step": 295
    },
    {
      "epoch": 2.114592658907789,
      "grad_norm": 12.55301284790039,
      "learning_rate": 6.450839328537171e-06,
      "loss": 0.305,
      "step": 296
    },
    {
      "epoch": 2.1217547000895256,
      "grad_norm": 5.844974517822266,
      "learning_rate": 6.43884892086331e-06,
      "loss": 0.3444,
      "step": 297
    },
    {
      "epoch": 2.1289167412712624,
      "grad_norm": 10.359457969665527,
      "learning_rate": 6.426858513189449e-06,
      "loss": 0.3414,
      "step": 298
    },
    {
      "epoch": 2.136078782452999,
      "grad_norm": 5.8697686195373535,
      "learning_rate": 6.414868105515588e-06,
      "loss": 0.2207,
      "step": 299
    },
    {
      "epoch": 2.143240823634736,
      "grad_norm": 8.140408515930176,
      "learning_rate": 6.402877697841727e-06,
      "loss": 0.1734,
      "step": 300
    },
    {
      "epoch": 2.150402864816473,
      "grad_norm": 11.230256080627441,
      "learning_rate": 6.390887290167866e-06,
      "loss": 0.2641,
      "step": 301
    },
    {
      "epoch": 2.1575649059982096,
      "grad_norm": 7.876967430114746,
      "learning_rate": 6.378896882494005e-06,
      "loss": 0.5413,
      "step": 302
    },
    {
      "epoch": 2.1647269471799464,
      "grad_norm": 4.870555400848389,
      "learning_rate": 6.366906474820145e-06,
      "loss": 0.1618,
      "step": 303
    },
    {
      "epoch": 2.171888988361683,
      "grad_norm": 6.328149795532227,
      "learning_rate": 6.354916067146284e-06,
      "loss": 0.1336,
      "step": 304
    },
    {
      "epoch": 2.17905102954342,
      "grad_norm": 11.720122337341309,
      "learning_rate": 6.342925659472423e-06,
      "loss": 0.3316,
      "step": 305
    },
    {
      "epoch": 2.186213070725157,
      "grad_norm": 4.192595958709717,
      "learning_rate": 6.330935251798561e-06,
      "loss": 0.2262,
      "step": 306
    },
    {
      "epoch": 2.1933751119068936,
      "grad_norm": 7.089945316314697,
      "learning_rate": 6.3189448441247e-06,
      "loss": 0.3939,
      "step": 307
    },
    {
      "epoch": 2.2005371530886304,
      "grad_norm": 3.7617053985595703,
      "learning_rate": 6.3069544364508395e-06,
      "loss": 0.129,
      "step": 308
    },
    {
      "epoch": 2.207699194270367,
      "grad_norm": 2.709459066390991,
      "learning_rate": 6.2949640287769785e-06,
      "loss": 0.106,
      "step": 309
    },
    {
      "epoch": 2.214861235452104,
      "grad_norm": 3.1630465984344482,
      "learning_rate": 6.2829736211031175e-06,
      "loss": 0.1199,
      "step": 310
    },
    {
      "epoch": 2.222023276633841,
      "grad_norm": 8.380224227905273,
      "learning_rate": 6.2709832134292566e-06,
      "loss": 0.1654,
      "step": 311
    },
    {
      "epoch": 2.2291853178155776,
      "grad_norm": 6.432971000671387,
      "learning_rate": 6.2589928057553964e-06,
      "loss": 0.2642,
      "step": 312
    },
    {
      "epoch": 2.2363473589973144,
      "grad_norm": 8.790579795837402,
      "learning_rate": 6.2470023980815355e-06,
      "loss": 0.2575,
      "step": 313
    },
    {
      "epoch": 2.243509400179051,
      "grad_norm": 6.955909252166748,
      "learning_rate": 6.2350119904076745e-06,
      "loss": 0.1253,
      "step": 314
    },
    {
      "epoch": 2.250671441360788,
      "grad_norm": 4.945182800292969,
      "learning_rate": 6.2230215827338136e-06,
      "loss": 0.23,
      "step": 315
    },
    {
      "epoch": 2.257833482542525,
      "grad_norm": 6.262834072113037,
      "learning_rate": 6.211031175059953e-06,
      "loss": 0.2261,
      "step": 316
    },
    {
      "epoch": 2.2649955237242616,
      "grad_norm": 6.618960380554199,
      "learning_rate": 6.199040767386092e-06,
      "loss": 0.2217,
      "step": 317
    },
    {
      "epoch": 2.2721575649059984,
      "grad_norm": 16.147371292114258,
      "learning_rate": 6.1870503597122315e-06,
      "loss": 0.3444,
      "step": 318
    },
    {
      "epoch": 2.2793196060877348,
      "grad_norm": 9.18713092803955,
      "learning_rate": 6.1750599520383706e-06,
      "loss": 0.1846,
      "step": 319
    },
    {
      "epoch": 2.286481647269472,
      "grad_norm": 4.649369716644287,
      "learning_rate": 6.16306954436451e-06,
      "loss": 0.2223,
      "step": 320
    },
    {
      "epoch": 2.2936436884512084,
      "grad_norm": 4.171594619750977,
      "learning_rate": 6.151079136690648e-06,
      "loss": 0.1455,
      "step": 321
    },
    {
      "epoch": 2.3008057296329456,
      "grad_norm": 9.076400756835938,
      "learning_rate": 6.139088729016787e-06,
      "loss": 0.2683,
      "step": 322
    },
    {
      "epoch": 2.307967770814682,
      "grad_norm": 8.380135536193848,
      "learning_rate": 6.127098321342926e-06,
      "loss": 0.2054,
      "step": 323
    },
    {
      "epoch": 2.315129811996419,
      "grad_norm": 8.534207344055176,
      "learning_rate": 6.115107913669065e-06,
      "loss": 0.1961,
      "step": 324
    },
    {
      "epoch": 2.3222918531781556,
      "grad_norm": 5.501217365264893,
      "learning_rate": 6.103117505995204e-06,
      "loss": 0.1083,
      "step": 325
    },
    {
      "epoch": 2.329453894359893,
      "grad_norm": 3.4403066635131836,
      "learning_rate": 6.091127098321343e-06,
      "loss": 0.0808,
      "step": 326
    },
    {
      "epoch": 2.336615935541629,
      "grad_norm": 9.280027389526367,
      "learning_rate": 6.079136690647483e-06,
      "loss": 0.2427,
      "step": 327
    },
    {
      "epoch": 2.3437779767233664,
      "grad_norm": 6.337725639343262,
      "learning_rate": 6.067146282973622e-06,
      "loss": 0.2028,
      "step": 328
    },
    {
      "epoch": 2.3509400179051028,
      "grad_norm": 4.871727466583252,
      "learning_rate": 6.055155875299761e-06,
      "loss": 0.2069,
      "step": 329
    },
    {
      "epoch": 2.3581020590868396,
      "grad_norm": 1.3199633359909058,
      "learning_rate": 6.0431654676259e-06,
      "loss": 0.0468,
      "step": 330
    },
    {
      "epoch": 2.3652641002685764,
      "grad_norm": 7.1639790534973145,
      "learning_rate": 6.031175059952039e-06,
      "loss": 0.271,
      "step": 331
    },
    {
      "epoch": 2.372426141450313,
      "grad_norm": 4.194430351257324,
      "learning_rate": 6.019184652278178e-06,
      "loss": 0.2205,
      "step": 332
    },
    {
      "epoch": 2.37958818263205,
      "grad_norm": 5.920024871826172,
      "learning_rate": 6.007194244604317e-06,
      "loss": 0.3546,
      "step": 333
    },
    {
      "epoch": 2.3867502238137868,
      "grad_norm": 7.114736557006836,
      "learning_rate": 5.995203836930457e-06,
      "loss": 0.234,
      "step": 334
    },
    {
      "epoch": 2.3939122649955236,
      "grad_norm": 6.038070201873779,
      "learning_rate": 5.983213429256596e-06,
      "loss": 0.2333,
      "step": 335
    },
    {
      "epoch": 2.4010743061772604,
      "grad_norm": 7.507596969604492,
      "learning_rate": 5.971223021582734e-06,
      "loss": 0.1927,
      "step": 336
    },
    {
      "epoch": 2.408236347358997,
      "grad_norm": 5.830931663513184,
      "learning_rate": 5.959232613908873e-06,
      "loss": 0.123,
      "step": 337
    },
    {
      "epoch": 2.415398388540734,
      "grad_norm": 5.003695964813232,
      "learning_rate": 5.947242206235012e-06,
      "loss": 0.1651,
      "step": 338
    },
    {
      "epoch": 2.4225604297224708,
      "grad_norm": 5.298439979553223,
      "learning_rate": 5.935251798561151e-06,
      "loss": 0.2354,
      "step": 339
    },
    {
      "epoch": 2.4297224709042076,
      "grad_norm": 8.523574829101562,
      "learning_rate": 5.92326139088729e-06,
      "loss": 0.206,
      "step": 340
    },
    {
      "epoch": 2.4368845120859444,
      "grad_norm": 6.604531764984131,
      "learning_rate": 5.911270983213429e-06,
      "loss": 0.2668,
      "step": 341
    },
    {
      "epoch": 2.444046553267681,
      "grad_norm": 5.022377014160156,
      "learning_rate": 5.899280575539568e-06,
      "loss": 0.111,
      "step": 342
    },
    {
      "epoch": 2.451208594449418,
      "grad_norm": 5.974075794219971,
      "learning_rate": 5.887290167865708e-06,
      "loss": 0.2463,
      "step": 343
    },
    {
      "epoch": 2.4583706356311548,
      "grad_norm": 6.426002025604248,
      "learning_rate": 5.875299760191847e-06,
      "loss": 0.1471,
      "step": 344
    },
    {
      "epoch": 2.4655326768128916,
      "grad_norm": 4.997922420501709,
      "learning_rate": 5.863309352517986e-06,
      "loss": 0.2057,
      "step": 345
    },
    {
      "epoch": 2.4726947179946284,
      "grad_norm": 9.264660835266113,
      "learning_rate": 5.851318944844125e-06,
      "loss": 0.3078,
      "step": 346
    },
    {
      "epoch": 2.479856759176365,
      "grad_norm": 4.701541423797607,
      "learning_rate": 5.8393285371702644e-06,
      "loss": 0.2177,
      "step": 347
    },
    {
      "epoch": 2.487018800358102,
      "grad_norm": 11.523097038269043,
      "learning_rate": 5.8273381294964035e-06,
      "loss": 0.2449,
      "step": 348
    },
    {
      "epoch": 2.4941808415398388,
      "grad_norm": 3.427780866622925,
      "learning_rate": 5.8153477218225425e-06,
      "loss": 0.2195,
      "step": 349
    },
    {
      "epoch": 2.5013428827215756,
      "grad_norm": 10.555095672607422,
      "learning_rate": 5.803357314148681e-06,
      "loss": 0.2723,
      "step": 350
    },
    {
      "epoch": 2.5085049239033124,
      "grad_norm": 8.235042572021484,
      "learning_rate": 5.79136690647482e-06,
      "loss": 0.2281,
      "step": 351
    },
    {
      "epoch": 2.515666965085049,
      "grad_norm": 6.958029270172119,
      "learning_rate": 5.77937649880096e-06,
      "loss": 0.3334,
      "step": 352
    },
    {
      "epoch": 2.522829006266786,
      "grad_norm": 4.380754470825195,
      "learning_rate": 5.767386091127099e-06,
      "loss": 0.1564,
      "step": 353
    },
    {
      "epoch": 2.5299910474485228,
      "grad_norm": 3.2662181854248047,
      "learning_rate": 5.755395683453238e-06,
      "loss": 0.1156,
      "step": 354
    },
    {
      "epoch": 2.5371530886302596,
      "grad_norm": 5.031314849853516,
      "learning_rate": 5.743405275779377e-06,
      "loss": 0.122,
      "step": 355
    },
    {
      "epoch": 2.5443151298119964,
      "grad_norm": 5.606161117553711,
      "learning_rate": 5.731414868105516e-06,
      "loss": 0.2256,
      "step": 356
    },
    {
      "epoch": 2.551477170993733,
      "grad_norm": 1.6429898738861084,
      "learning_rate": 5.719424460431655e-06,
      "loss": 0.0744,
      "step": 357
    },
    {
      "epoch": 2.55863921217547,
      "grad_norm": 4.822231769561768,
      "learning_rate": 5.707434052757795e-06,
      "loss": 0.198,
      "step": 358
    },
    {
      "epoch": 2.5658012533572068,
      "grad_norm": 7.1267991065979,
      "learning_rate": 5.695443645083934e-06,
      "loss": 0.3007,
      "step": 359
    },
    {
      "epoch": 2.5729632945389436,
      "grad_norm": 7.829804420471191,
      "learning_rate": 5.683453237410073e-06,
      "loss": 0.3531,
      "step": 360
    },
    {
      "epoch": 2.5801253357206804,
      "grad_norm": 10.854474067687988,
      "learning_rate": 5.671462829736212e-06,
      "loss": 0.21,
      "step": 361
    },
    {
      "epoch": 2.587287376902417,
      "grad_norm": 2.879049777984619,
      "learning_rate": 5.659472422062351e-06,
      "loss": 0.158,
      "step": 362
    },
    {
      "epoch": 2.594449418084154,
      "grad_norm": 9.582825660705566,
      "learning_rate": 5.64748201438849e-06,
      "loss": 0.1221,
      "step": 363
    },
    {
      "epoch": 2.6016114592658908,
      "grad_norm": 13.478569984436035,
      "learning_rate": 5.635491606714629e-06,
      "loss": 0.3543,
      "step": 364
    },
    {
      "epoch": 2.6087735004476276,
      "grad_norm": 13.520654678344727,
      "learning_rate": 5.623501199040767e-06,
      "loss": 0.2749,
      "step": 365
    },
    {
      "epoch": 2.6159355416293644,
      "grad_norm": 9.655123710632324,
      "learning_rate": 5.611510791366906e-06,
      "loss": 0.3296,
      "step": 366
    },
    {
      "epoch": 2.623097582811101,
      "grad_norm": 8.334728240966797,
      "learning_rate": 5.599520383693046e-06,
      "loss": 0.201,
      "step": 367
    },
    {
      "epoch": 2.630259623992838,
      "grad_norm": 9.50122356414795,
      "learning_rate": 5.587529976019185e-06,
      "loss": 0.1588,
      "step": 368
    },
    {
      "epoch": 2.6374216651745748,
      "grad_norm": 6.593572616577148,
      "learning_rate": 5.575539568345324e-06,
      "loss": 0.1869,
      "step": 369
    },
    {
      "epoch": 2.6445837063563116,
      "grad_norm": 5.8361735343933105,
      "learning_rate": 5.563549160671463e-06,
      "loss": 0.1161,
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.6517457475380484, |
|
"grad_norm": 6.58205509185791, |
|
"learning_rate": 5.551558752997602e-06, |
|
"loss": 0.1315, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 2.658907788719785, |
|
"grad_norm": 10.432963371276855, |
|
"learning_rate": 5.539568345323741e-06, |
|
"loss": 0.4753, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 2.666069829901522, |
|
"grad_norm": 10.057262420654297, |
|
"learning_rate": 5.52757793764988e-06, |
|
"loss": 0.2506, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 2.6732318710832588, |
|
"grad_norm": 6.905503749847412, |
|
"learning_rate": 5.51558752997602e-06, |
|
"loss": 0.2124, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 2.6803939122649956, |
|
"grad_norm": 7.519169807434082, |
|
"learning_rate": 5.503597122302159e-06, |
|
"loss": 0.307, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.6875559534467324, |
|
"grad_norm": 5.79476261138916, |
|
"learning_rate": 5.491606714628298e-06, |
|
"loss": 0.0908, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 2.694717994628469, |
|
"grad_norm": 10.047815322875977, |
|
"learning_rate": 5.479616306954437e-06, |
|
"loss": 0.2494, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 2.701880035810206, |
|
"grad_norm": 3.0948004722595215, |
|
"learning_rate": 5.467625899280576e-06, |
|
"loss": 0.0827, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 2.7090420769919428, |
|
"grad_norm": 6.911858558654785, |
|
"learning_rate": 5.455635491606715e-06, |
|
"loss": 0.2022, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 2.7162041181736796, |
|
"grad_norm": 13.745474815368652, |
|
"learning_rate": 5.4436450839328535e-06, |
|
"loss": 0.2462, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.7233661593554164, |
|
"grad_norm": 9.535720825195312, |
|
"learning_rate": 5.4316546762589925e-06, |
|
"loss": 0.442, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 2.730528200537153, |
|
"grad_norm": 4.10536527633667, |
|
"learning_rate": 5.4196642685851316e-06, |
|
"loss": 0.3275, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 2.73769024171889, |
|
"grad_norm": 5.548096656799316, |
|
"learning_rate": 5.4076738609112715e-06, |
|
"loss": 0.1592, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 2.7448522829006268, |
|
"grad_norm": 5.5715179443359375, |
|
"learning_rate": 5.3956834532374105e-06, |
|
"loss": 0.3444, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 2.7520143240823636, |
|
"grad_norm": 5.943526744842529, |
|
"learning_rate": 5.3836930455635495e-06, |
|
"loss": 0.2461, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.7591763652641004, |
|
"grad_norm": 4.913482189178467, |
|
"learning_rate": 5.3717026378896886e-06, |
|
"loss": 0.162, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 2.766338406445837, |
|
"grad_norm": 5.035208702087402, |
|
"learning_rate": 5.359712230215828e-06, |
|
"loss": 0.1334, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 2.773500447627574, |
|
"grad_norm": 7.2981390953063965, |
|
"learning_rate": 5.347721822541967e-06, |
|
"loss": 0.1701, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 2.7806624888093108, |
|
"grad_norm": 7.505518913269043, |
|
"learning_rate": 5.335731414868106e-06, |
|
"loss": 0.1757, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 2.7878245299910476, |
|
"grad_norm": 3.761885404586792, |
|
"learning_rate": 5.3237410071942456e-06, |
|
"loss": 0.2144, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.7949865711727844, |
|
"grad_norm": 3.8127267360687256, |
|
"learning_rate": 5.311750599520385e-06, |
|
"loss": 0.0838, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 2.802148612354521, |
|
"grad_norm": 6.290581226348877, |
|
"learning_rate": 5.299760191846524e-06, |
|
"loss": 0.1923, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 2.809310653536258, |
|
"grad_norm": 9.498542785644531, |
|
"learning_rate": 5.287769784172663e-06, |
|
"loss": 0.2806, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 2.8164726947179948, |
|
"grad_norm": 5.22800874710083, |
|
"learning_rate": 5.275779376498802e-06, |
|
"loss": 0.2014, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 2.8236347358997316, |
|
"grad_norm": 3.5994250774383545, |
|
"learning_rate": 5.26378896882494e-06, |
|
"loss": 0.1525, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.8307967770814684, |
|
"grad_norm": 8.245004653930664, |
|
"learning_rate": 5.251798561151079e-06, |
|
"loss": 0.2496, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 2.837958818263205, |
|
"grad_norm": 13.4406156539917, |
|
"learning_rate": 5.239808153477218e-06, |
|
"loss": 0.4056, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 2.845120859444942, |
|
"grad_norm": 5.844360828399658, |
|
"learning_rate": 5.227817745803357e-06, |
|
"loss": 0.1751, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 2.8522829006266788, |
|
"grad_norm": 9.290830612182617, |
|
"learning_rate": 5.215827338129497e-06, |
|
"loss": 0.3863, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 2.859444941808415, |
|
"grad_norm": 6.542262554168701, |
|
"learning_rate": 5.203836930455636e-06, |
|
"loss": 0.1952, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.8666069829901524, |
|
"grad_norm": 7.181087493896484, |
|
"learning_rate": 5.191846522781775e-06, |
|
"loss": 0.3431, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 2.8737690241718887, |
|
"grad_norm": 4.160923957824707, |
|
"learning_rate": 5.179856115107914e-06, |
|
"loss": 0.0841, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 2.880931065353626, |
|
"grad_norm": 10.504855155944824, |
|
"learning_rate": 5.167865707434053e-06, |
|
"loss": 0.2541, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 2.8880931065353623, |
|
"grad_norm": 2.3804681301116943, |
|
"learning_rate": 5.155875299760192e-06, |
|
"loss": 0.0668, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 2.8952551477170996, |
|
"grad_norm": 6.530309200286865, |
|
"learning_rate": 5.143884892086332e-06, |
|
"loss": 0.2772, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.902417188898836, |
|
"grad_norm": 9.804174423217773, |
|
"learning_rate": 5.131894484412471e-06, |
|
"loss": 0.2987, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 2.909579230080573, |
|
"grad_norm": 13.136397361755371, |
|
"learning_rate": 5.11990407673861e-06, |
|
"loss": 0.308, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 2.9167412712623095, |
|
"grad_norm": 6.506509304046631, |
|
"learning_rate": 5.107913669064749e-06, |
|
"loss": 0.1547, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 2.9239033124440468, |
|
"grad_norm": 7.714542865753174, |
|
"learning_rate": 5.095923261390888e-06, |
|
"loss": 0.2585, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 2.931065353625783, |
|
"grad_norm": 4.894972801208496, |
|
"learning_rate": 5.083932853717026e-06, |
|
"loss": 0.0756, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.9382273948075204, |
|
"grad_norm": 12.52349853515625, |
|
"learning_rate": 5.071942446043165e-06, |
|
"loss": 0.2471, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 2.9453894359892567, |
|
"grad_norm": 9.225778579711914, |
|
"learning_rate": 5.059952038369304e-06, |
|
"loss": 0.2481, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 2.952551477170994, |
|
"grad_norm": 6.006895065307617, |
|
"learning_rate": 5.047961630695443e-06, |
|
"loss": 0.2832, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 2.9597135183527303, |
|
"grad_norm": 5.98204231262207, |
|
"learning_rate": 5.035971223021583e-06, |
|
"loss": 0.2155, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 2.9668755595344676, |
|
"grad_norm": 3.6064493656158447, |
|
"learning_rate": 5.023980815347722e-06, |
|
"loss": 0.1033, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.974037600716204, |
|
"grad_norm": 7.625882148742676, |
|
"learning_rate": 5.011990407673861e-06, |
|
"loss": 0.1979, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 2.981199641897941, |
|
"grad_norm": 9.841728210449219, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3448, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 2.9883616830796775, |
|
"grad_norm": 10.841019630432129, |
|
"learning_rate": 4.9880095923261394e-06, |
|
"loss": 0.289, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 2.9955237242614148, |
|
"grad_norm": 10.639505386352539, |
|
"learning_rate": 4.9760191846522785e-06, |
|
"loss": 0.2548, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 7.099693298339844, |
|
"learning_rate": 4.9640287769784175e-06, |
|
"loss": 0.2351, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.8812877263581489, |
|
"eval_loss": 0.2769426703453064, |
|
"eval_runtime": 12.7975, |
|
"eval_samples_per_second": 38.836, |
|
"eval_steps_per_second": 38.836, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.007162041181737, |
|
"grad_norm": 8.117796897888184, |
|
"learning_rate": 4.9520383693045566e-06, |
|
"loss": 0.2663, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 3.0143240823634736, |
|
"grad_norm": 8.604570388793945, |
|
"learning_rate": 4.940047961630696e-06, |
|
"loss": 0.1473, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 3.0214861235452104, |
|
"grad_norm": 3.096344232559204, |
|
"learning_rate": 4.928057553956835e-06, |
|
"loss": 0.0984, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 3.028648164726947, |
|
"grad_norm": 6.029706001281738, |
|
"learning_rate": 4.916067146282974e-06, |
|
"loss": 0.3858, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 3.035810205908684, |
|
"grad_norm": 4.804579734802246, |
|
"learning_rate": 4.9040767386091136e-06, |
|
"loss": 0.0763, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 3.042972247090421, |
|
"grad_norm": 5.308372497558594, |
|
"learning_rate": 4.892086330935253e-06, |
|
"loss": 0.2166, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 3.0501342882721576, |
|
"grad_norm": 2.092256784439087, |
|
"learning_rate": 4.880095923261392e-06, |
|
"loss": 0.0637, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 3.0572963294538944, |
|
"grad_norm": 8.963829040527344, |
|
"learning_rate": 4.868105515587531e-06, |
|
"loss": 0.1555, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 3.064458370635631, |
|
"grad_norm": 3.819185256958008, |
|
"learning_rate": 4.856115107913669e-06, |
|
"loss": 0.1032, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 3.071620411817368, |
|
"grad_norm": 5.126814365386963, |
|
"learning_rate": 4.844124700239809e-06, |
|
"loss": 0.1586, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.078782452999105, |
|
"grad_norm": 5.945977687835693, |
|
"learning_rate": 4.832134292565948e-06, |
|
"loss": 0.2215, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 3.0859444941808416, |
|
"grad_norm": 7.399024963378906, |
|
"learning_rate": 4.820143884892087e-06, |
|
"loss": 0.1983, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 3.0931065353625784, |
|
"grad_norm": 3.2025091648101807, |
|
"learning_rate": 4.808153477218226e-06, |
|
"loss": 0.0709, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 3.100268576544315, |
|
"grad_norm": 7.664062976837158, |
|
"learning_rate": 4.796163069544365e-06, |
|
"loss": 0.2531, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 3.107430617726052, |
|
"grad_norm": 8.884867668151855, |
|
"learning_rate": 4.784172661870504e-06, |
|
"loss": 0.156, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 3.114592658907789, |
|
"grad_norm": 8.288426399230957, |
|
"learning_rate": 4.772182254196643e-06, |
|
"loss": 0.1259, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 3.1217547000895256, |
|
"grad_norm": 4.304559230804443, |
|
"learning_rate": 4.760191846522782e-06, |
|
"loss": 0.0615, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 3.1289167412712624, |
|
"grad_norm": 8.041988372802734, |
|
"learning_rate": 4.748201438848921e-06, |
|
"loss": 0.2884, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 3.136078782452999, |
|
"grad_norm": 2.3492345809936523, |
|
"learning_rate": 4.73621103117506e-06, |
|
"loss": 0.0363, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 3.143240823634736, |
|
"grad_norm": 7.380918502807617, |
|
"learning_rate": 4.724220623501199e-06, |
|
"loss": 0.1686, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.150402864816473, |
|
"grad_norm": 7.309660911560059, |
|
"learning_rate": 4.712230215827339e-06, |
|
"loss": 0.3719, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 3.1575649059982096, |
|
"grad_norm": 8.47828197479248, |
|
"learning_rate": 4.700239808153478e-06, |
|
"loss": 0.1023, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 3.1647269471799464, |
|
"grad_norm": 10.671316146850586, |
|
"learning_rate": 4.688249400479617e-06, |
|
"loss": 0.1549, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 3.171888988361683, |
|
"grad_norm": 8.016914367675781, |
|
"learning_rate": 4.676258992805755e-06, |
|
"loss": 0.1275, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 3.17905102954342, |
|
"grad_norm": 6.72282075881958, |
|
"learning_rate": 4.664268585131895e-06, |
|
"loss": 0.1603, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 3.186213070725157, |
|
"grad_norm": 11.111018180847168, |
|
"learning_rate": 4.652278177458034e-06, |
|
"loss": 0.2358, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 3.1933751119068936, |
|
"grad_norm": 7.229121685028076, |
|
"learning_rate": 4.640287769784173e-06, |
|
"loss": 0.1746, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 3.2005371530886304, |
|
"grad_norm": 4.709452152252197, |
|
"learning_rate": 4.628297362110312e-06, |
|
"loss": 0.0628, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 3.207699194270367, |
|
"grad_norm": 3.247600793838501, |
|
"learning_rate": 4.616306954436451e-06, |
|
"loss": 0.0916, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 3.214861235452104, |
|
"grad_norm": 5.488199710845947, |
|
"learning_rate": 4.60431654676259e-06, |
|
"loss": 0.2426, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.222023276633841, |
|
"grad_norm": 4.127665996551514, |
|
"learning_rate": 4.592326139088729e-06, |
|
"loss": 0.1155, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 3.2291853178155776, |
|
"grad_norm": 6.826393127441406, |
|
"learning_rate": 4.580335731414868e-06, |
|
"loss": 0.2058, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 3.2363473589973144, |
|
"grad_norm": 7.5438151359558105, |
|
"learning_rate": 4.5683453237410074e-06, |
|
"loss": 0.0832, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 3.243509400179051, |
|
"grad_norm": 4.956328392028809, |
|
"learning_rate": 4.5563549160671465e-06, |
|
"loss": 0.1414, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 3.250671441360788, |
|
"grad_norm": 5.9591779708862305, |
|
"learning_rate": 4.5443645083932855e-06, |
|
"loss": 0.2873, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 3.257833482542525, |
|
"grad_norm": 7.277362823486328, |
|
"learning_rate": 4.5323741007194245e-06, |
|
"loss": 0.2129, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 3.2649955237242616, |
|
"grad_norm": 4.973333835601807, |
|
"learning_rate": 4.5203836930455644e-06, |
|
"loss": 0.1075, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 3.2721575649059984, |
|
"grad_norm": 6.4229912757873535, |
|
"learning_rate": 4.508393285371703e-06, |
|
"loss": 0.1338, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 3.2793196060877348, |
|
"grad_norm": 7.826179504394531, |
|
"learning_rate": 4.496402877697842e-06, |
|
"loss": 0.0805, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 3.286481647269472, |
|
"grad_norm": 5.4623894691467285, |
|
"learning_rate": 4.484412470023981e-06, |
|
"loss": 0.2125, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.2936436884512084, |
|
"grad_norm": 3.689229965209961, |
|
"learning_rate": 4.472422062350121e-06, |
|
"loss": 0.094, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 3.3008057296329456, |
|
"grad_norm": 5.583723068237305, |
|
"learning_rate": 4.46043165467626e-06, |
|
"loss": 0.1595, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 3.307967770814682, |
|
"grad_norm": 6.716935634613037, |
|
"learning_rate": 4.448441247002399e-06, |
|
"loss": 0.2875, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 3.315129811996419, |
|
"grad_norm": 5.648532390594482, |
|
"learning_rate": 4.436450839328538e-06, |
|
"loss": 0.2079, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 3.3222918531781556, |
|
"grad_norm": 9.457056999206543, |
|
"learning_rate": 4.424460431654677e-06, |
|
"loss": 0.3384, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 3.329453894359893, |
|
"grad_norm": 2.3951103687286377, |
|
"learning_rate": 4.412470023980816e-06, |
|
"loss": 0.0539, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 3.336615935541629, |
|
"grad_norm": 4.021335124969482, |
|
"learning_rate": 4.400479616306955e-06, |
|
"loss": 0.067, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 3.3437779767233664, |
|
"grad_norm": 7.070362091064453, |
|
"learning_rate": 4.388489208633094e-06, |
|
"loss": 0.2575, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 3.3509400179051028, |
|
"grad_norm": 4.526058197021484, |
|
"learning_rate": 4.376498800959233e-06, |
|
"loss": 0.1562, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 3.3581020590868396, |
|
"grad_norm": 10.771688461303711, |
|
"learning_rate": 4.364508393285372e-06, |
|
"loss": 0.2787, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.3652641002685764, |
|
"grad_norm": 13.02984619140625, |
|
"learning_rate": 4.352517985611511e-06, |
|
"loss": 0.3604, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 3.372426141450313, |
|
"grad_norm": 11.359794616699219, |
|
"learning_rate": 4.340527577937651e-06, |
|
"loss": 0.2501, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 3.37958818263205, |
|
"grad_norm": 3.5190794467926025, |
|
"learning_rate": 4.328537170263789e-06, |
|
"loss": 0.0857, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 3.3867502238137868, |
|
"grad_norm": 5.3390374183654785, |
|
"learning_rate": 4.316546762589928e-06, |
|
"loss": 0.2399, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 3.3939122649955236, |
|
"grad_norm": 4.161365509033203, |
|
"learning_rate": 4.304556354916067e-06, |
|
"loss": 0.1279, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 3.4010743061772604, |
|
"grad_norm": 5.861349105834961, |
|
"learning_rate": 4.292565947242206e-06, |
|
"loss": 0.1378, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 3.408236347358997, |
|
"grad_norm": 5.710063457489014, |
|
"learning_rate": 4.280575539568346e-06, |
|
"loss": 0.1279, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 3.415398388540734, |
|
"grad_norm": 2.9877302646636963, |
|
"learning_rate": 4.268585131894485e-06, |
|
"loss": 0.0612, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 3.4225604297224708, |
|
"grad_norm": 5.60707426071167, |
|
"learning_rate": 4.256594724220624e-06, |
|
"loss": 0.0963, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 3.4297224709042076, |
|
"grad_norm": 4.96213960647583, |
|
"learning_rate": 4.244604316546763e-06, |
|
"loss": 0.1391, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.4368845120859444, |
|
"grad_norm": 8.810404777526855, |
|
"learning_rate": 4.232613908872902e-06, |
|
"loss": 0.2233, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 3.444046553267681, |
|
"grad_norm": 9.261563301086426, |
|
"learning_rate": 4.220623501199041e-06, |
|
"loss": 0.1481, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 3.451208594449418, |
|
"grad_norm": 5.470502853393555, |
|
"learning_rate": 4.20863309352518e-06, |
|
"loss": 0.096, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 3.4583706356311548, |
|
"grad_norm": 4.597263336181641, |
|
"learning_rate": 4.196642685851319e-06, |
|
"loss": 0.0983, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 3.4655326768128916, |
|
"grad_norm": 15.841909408569336, |
|
"learning_rate": 4.184652278177458e-06, |
|
"loss": 0.4426, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 3.4726947179946284, |
|
"grad_norm": 6.602854251861572, |
|
"learning_rate": 4.172661870503597e-06, |
|
"loss": 0.0847, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 3.479856759176365, |
|
"grad_norm": 2.0710320472717285, |
|
"learning_rate": 4.160671462829736e-06, |
|
"loss": 0.1633, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 3.487018800358102, |
|
"grad_norm": 5.60425329208374, |
|
"learning_rate": 4.148681055155875e-06, |
|
"loss": 0.1899, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 3.4941808415398388, |
|
"grad_norm": 4.323896408081055, |
|
"learning_rate": 4.1366906474820145e-06, |
|
"loss": 0.083, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 3.5013428827215756, |
|
"grad_norm": 3.02128529548645, |
|
"learning_rate": 4.1247002398081535e-06, |
|
"loss": 0.0691, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.5085049239033124, |
|
"grad_norm": 9.449849128723145, |
|
"learning_rate": 4.1127098321342925e-06, |
|
"loss": 0.1293, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 3.515666965085049, |
|
"grad_norm": 7.267805576324463, |
|
"learning_rate": 4.100719424460432e-06, |
|
"loss": 0.3555, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 3.522829006266786, |
|
"grad_norm": 9.87209701538086, |
|
"learning_rate": 4.0887290167865715e-06, |
|
"loss": 0.3113, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 3.5299910474485228, |
|
"grad_norm": 11.590337753295898, |
|
"learning_rate": 4.0767386091127105e-06, |
|
"loss": 0.1438, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 3.5371530886302596, |
|
"grad_norm": 7.622054576873779, |
|
"learning_rate": 4.0647482014388495e-06, |
|
"loss": 0.1367, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 3.5443151298119964, |
|
"grad_norm": 21.82820701599121, |
|
"learning_rate": 4.052757793764988e-06, |
|
"loss": 0.164, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 3.551477170993733, |
|
"grad_norm": 7.063591003417969, |
|
"learning_rate": 4.040767386091128e-06, |
|
"loss": 0.1362, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 3.55863921217547, |
|
"grad_norm": 13.529502868652344, |
|
"learning_rate": 4.028776978417267e-06, |
|
"loss": 0.2081, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 3.5658012533572068, |
|
"grad_norm": 6.6618547439575195, |
|
"learning_rate": 4.016786570743406e-06, |
|
"loss": 0.0999, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 3.5729632945389436, |
|
"grad_norm": 2.7395083904266357, |
|
"learning_rate": 4.004796163069545e-06, |
|
"loss": 0.1659, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.5801253357206804, |
|
"grad_norm": 6.758154392242432, |
|
"learning_rate": 3.992805755395684e-06, |
|
"loss": 0.1001, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 3.587287376902417, |
|
"grad_norm": 6.874615669250488, |
|
"learning_rate": 3.980815347721823e-06, |
|
"loss": 0.3106, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 3.594449418084154, |
|
"grad_norm": 6.3880228996276855, |
|
"learning_rate": 3.968824940047962e-06, |
|
"loss": 0.2233, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 3.6016114592658908, |
|
"grad_norm": 5.770565032958984, |
|
"learning_rate": 3.956834532374101e-06, |
|
"loss": 0.1179, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 3.6087735004476276, |
|
"grad_norm": 2.633235454559326, |
|
"learning_rate": 3.94484412470024e-06, |
|
"loss": 0.0658, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 3.6159355416293644, |
|
"grad_norm": 6.661257743835449, |
|
"learning_rate": 3.932853717026379e-06, |
|
"loss": 0.1756, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 3.623097582811101, |
|
"grad_norm": 3.2861239910125732, |
|
"learning_rate": 3.920863309352518e-06, |
|
"loss": 0.0856, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 3.630259623992838, |
|
"grad_norm": 8.988011360168457, |
|
"learning_rate": 3.908872901678658e-06, |
|
"loss": 0.2723, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 3.6374216651745748, |
|
"grad_norm": 8.593001365661621, |
|
"learning_rate": 3.896882494004797e-06, |
|
"loss": 0.1671, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 3.6445837063563116, |
|
"grad_norm": 4.522592067718506, |
|
"learning_rate": 3.884892086330936e-06, |
|
"loss": 0.1685, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.6517457475380484, |
|
"grad_norm": 8.32598876953125, |
|
"learning_rate": 3.872901678657074e-06, |
|
"loss": 0.0991, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 3.658907788719785, |
|
"grad_norm": 6.495327949523926, |
|
"learning_rate": 3.860911270983214e-06, |
|
"loss": 0.1693, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 3.666069829901522, |
|
"grad_norm": 10.446044921875, |
|
"learning_rate": 3.848920863309353e-06, |
|
"loss": 0.2238, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 3.6732318710832588, |
|
"grad_norm": 6.772848606109619, |
|
"learning_rate": 3.836930455635492e-06, |
|
"loss": 0.2505, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 3.6803939122649956, |
|
"grad_norm": 7.457878112792969, |
|
"learning_rate": 3.824940047961631e-06, |
|
"loss": 0.1456, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 3.6875559534467324, |
|
"grad_norm": 7.897626876831055, |
|
"learning_rate": 3.81294964028777e-06, |
|
"loss": 0.2099, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 3.694717994628469, |
|
"grad_norm": 3.0930118560791016, |
|
"learning_rate": 3.8009592326139096e-06, |
|
"loss": 0.0628, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 3.701880035810206, |
|
"grad_norm": 5.182916641235352, |
|
"learning_rate": 3.7889688249400482e-06, |
|
"loss": 0.2316, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 3.7090420769919428, |
|
"grad_norm": 7.7203874588012695, |
|
"learning_rate": 3.7769784172661873e-06, |
|
"loss": 0.1151, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 3.7162041181736796, |
|
"grad_norm": 3.8697283267974854, |
|
"learning_rate": 3.7649880095923263e-06, |
|
"loss": 0.1911, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.7233661593554164, |
|
"grad_norm": 6.677131175994873, |
|
"learning_rate": 3.7529976019184653e-06, |
|
"loss": 0.1403, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 3.730528200537153, |
|
"grad_norm": 10.677637100219727, |
|
"learning_rate": 3.741007194244605e-06, |
|
"loss": 0.2475, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 3.73769024171889, |
|
"grad_norm": 8.453975677490234, |
|
"learning_rate": 3.729016786570744e-06, |
|
"loss": 0.2107, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 3.7448522829006268, |
|
"grad_norm": 7.9489054679870605, |
|
"learning_rate": 3.717026378896883e-06, |
|
"loss": 0.2636, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 3.7520143240823636, |
|
"grad_norm": 5.376030445098877, |
|
"learning_rate": 3.7050359712230215e-06, |
|
"loss": 0.3502, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 3.7591763652641004, |
|
"grad_norm": 7.20943021774292, |
|
"learning_rate": 3.693045563549161e-06, |
|
"loss": 0.0799, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 3.766338406445837, |
|
"grad_norm": 9.997661590576172, |
|
"learning_rate": 3.6810551558753e-06, |
|
"loss": 0.206, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 3.773500447627574, |
|
"grad_norm": 13.116365432739258, |
|
"learning_rate": 3.669064748201439e-06, |
|
"loss": 0.2156, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 3.7806624888093108, |
|
"grad_norm": 8.536945343017578, |
|
"learning_rate": 3.657074340527578e-06, |
|
"loss": 0.2, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 3.7878245299910476, |
|
"grad_norm": 9.58375358581543, |
|
"learning_rate": 3.6450839328537175e-06, |
|
"loss": 0.2438, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.7949865711727844, |
|
"grad_norm": 4.94558572769165, |
|
"learning_rate": 3.6330935251798566e-06, |
|
"loss": 0.0869, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 3.802148612354521, |
|
"grad_norm": 13.01669979095459, |
|
"learning_rate": 3.6211031175059956e-06, |
|
"loss": 0.2684, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 3.809310653536258, |
|
"grad_norm": 7.679543972015381, |
|
"learning_rate": 3.609112709832134e-06, |
|
"loss": 0.1146, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 3.8164726947179948, |
|
"grad_norm": 12.043442726135254, |
|
"learning_rate": 3.5971223021582737e-06, |
|
"loss": 0.2482, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 3.8236347358997316, |
|
"grad_norm": 8.951347351074219, |
|
"learning_rate": 3.5851318944844127e-06, |
|
"loss": 0.2051, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 3.8307967770814684, |
|
"grad_norm": 11.88425350189209, |
|
"learning_rate": 3.5731414868105517e-06, |
|
"loss": 0.3122, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 3.837958818263205, |
|
"grad_norm": 5.419406414031982, |
|
"learning_rate": 3.561151079136691e-06, |
|
"loss": 0.1697, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 3.845120859444942, |
|
"grad_norm": 10.882148742675781, |
|
"learning_rate": 3.5491606714628302e-06, |
|
"loss": 0.1324, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 3.8522829006266788, |
|
"grad_norm": 5.880361557006836, |
|
"learning_rate": 3.5371702637889693e-06, |
|
"loss": 0.1096, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 3.859444941808415, |
|
"grad_norm": 7.374398708343506, |
|
"learning_rate": 3.525179856115108e-06, |
|
"loss": 0.1297, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.8666069829901524, |
|
"grad_norm": 5.992897987365723, |
|
"learning_rate": 3.513189448441247e-06, |
|
"loss": 0.2659, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 3.8737690241718887, |
|
"grad_norm": 3.3240163326263428, |
|
"learning_rate": 3.5011990407673864e-06, |
|
"loss": 0.0718, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 3.880931065353626, |
|
"grad_norm": 14.215323448181152, |
|
"learning_rate": 3.4892086330935254e-06, |
|
"loss": 0.3404, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 3.8880931065353623, |
|
"grad_norm": 8.490641593933105, |
|
"learning_rate": 3.4772182254196645e-06, |
|
"loss": 0.191, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 3.8952551477170996, |
|
"grad_norm": 8.60212516784668, |
|
"learning_rate": 3.465227817745804e-06, |
|
"loss": 0.3062, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 3.902417188898836, |
|
"grad_norm": 9.142696380615234, |
|
"learning_rate": 3.453237410071943e-06, |
|
"loss": 0.3727, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 3.909579230080573, |
|
"grad_norm": 8.537508010864258, |
|
"learning_rate": 3.441247002398082e-06, |
|
"loss": 0.213, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 3.9167412712623095, |
|
"grad_norm": 4.520055294036865, |
|
"learning_rate": 3.4292565947242206e-06, |
|
"loss": 0.1401, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 3.9239033124440468, |
|
"grad_norm": 10.023722648620605, |
|
"learning_rate": 3.4172661870503596e-06, |
|
"loss": 0.16, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 3.931065353625783, |
|
"grad_norm": 6.136500835418701, |
|
"learning_rate": 3.405275779376499e-06, |
|
"loss": 0.0564, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.9382273948075204, |
|
"grad_norm": 6.584155559539795, |
|
"learning_rate": 3.393285371702638e-06, |
|
"loss": 0.1132, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 3.9453894359892567, |
|
"grad_norm": 7.42992639541626, |
|
"learning_rate": 3.381294964028777e-06, |
|
"loss": 0.1504, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 3.952551477170994, |
|
"grad_norm": 4.1336798667907715, |
|
"learning_rate": 3.3693045563549166e-06, |
|
"loss": 0.2082, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 3.9597135183527303, |
|
"grad_norm": 8.42125415802002, |
|
"learning_rate": 3.3573141486810557e-06, |
|
"loss": 0.1697, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 3.9668755595344676, |
|
"grad_norm": 9.528365135192871, |
|
"learning_rate": 3.3453237410071943e-06, |
|
"loss": 0.1943, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 3.974037600716204, |
|
"grad_norm": 8.06141471862793, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.1819, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 3.981199641897941, |
|
"grad_norm": 5.582054138183594, |
|
"learning_rate": 3.3213429256594728e-06, |
|
"loss": 0.1164, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 3.9883616830796775, |
|
"grad_norm": 4.734123706817627, |
|
"learning_rate": 3.309352517985612e-06, |
|
"loss": 0.1066, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 3.9955237242614148, |
|
"grad_norm": 9.022132873535156, |
|
"learning_rate": 3.297362110311751e-06, |
|
"loss": 0.2624, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 3.551133155822754, |
|
"learning_rate": 3.28537170263789e-06, |
|
"loss": 0.1409, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.8893360160965795, |
|
"eval_loss": 0.2713683247566223, |
|
"eval_runtime": 12.7906, |
|
"eval_samples_per_second": 38.857, |
|
"eval_steps_per_second": 38.857, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 4.007162041181736, |
|
"grad_norm": 6.001535892486572, |
|
"learning_rate": 3.2733812949640294e-06, |
|
"loss": 0.1727, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 4.014324082363474, |
|
"grad_norm": 7.856527805328369, |
|
"learning_rate": 3.2613908872901684e-06, |
|
"loss": 0.2077, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 4.02148612354521, |
|
"grad_norm": 12.504968643188477, |
|
"learning_rate": 3.249400479616307e-06, |
|
"loss": 0.1911, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 4.028648164726947, |
|
"grad_norm": 6.551493167877197, |
|
"learning_rate": 3.237410071942446e-06, |
|
"loss": 0.1348, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 4.035810205908684, |
|
"grad_norm": 6.259852886199951, |
|
"learning_rate": 3.2254196642685855e-06, |
|
"loss": 0.0678, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 4.042972247090421, |
|
"grad_norm": 11.477996826171875, |
|
"learning_rate": 3.2134292565947245e-06, |
|
"loss": 0.187, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 4.050134288272157, |
|
"grad_norm": 8.440922737121582, |
|
"learning_rate": 3.2014388489208636e-06, |
|
"loss": 0.1693, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 4.057296329453894, |
|
"grad_norm": 2.096426486968994, |
|
"learning_rate": 3.1894484412470026e-06, |
|
"loss": 0.0708, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 4.064458370635631, |
|
"grad_norm": 1.9437880516052246, |
|
"learning_rate": 3.177458033573142e-06, |
|
"loss": 0.0456, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 4.071620411817368, |
|
"grad_norm": 6.8768792152404785, |
|
"learning_rate": 3.1654676258992807e-06, |
|
"loss": 0.1657, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 4.078782452999104, |
|
"grad_norm": 7.917454242706299, |
|
"learning_rate": 3.1534772182254197e-06, |
|
"loss": 0.2116, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 4.085944494180842, |
|
"grad_norm": 6.435678005218506, |
|
"learning_rate": 3.1414868105515588e-06, |
|
"loss": 0.1276, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 4.093106535362578, |
|
"grad_norm": 7.636709213256836, |
|
"learning_rate": 3.1294964028776982e-06, |
|
"loss": 0.2434, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 4.100268576544315, |
|
"grad_norm": 7.988414287567139, |
|
"learning_rate": 3.1175059952038373e-06, |
|
"loss": 0.1653, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 4.107430617726052, |
|
"grad_norm": 8.184667587280273, |
|
"learning_rate": 3.1055155875299763e-06, |
|
"loss": 0.1045, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 4.114592658907789, |
|
"grad_norm": 4.961910247802734, |
|
"learning_rate": 3.0935251798561158e-06, |
|
"loss": 0.0968, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 4.121754700089525, |
|
"grad_norm": 5.732913494110107, |
|
"learning_rate": 3.081534772182255e-06, |
|
"loss": 0.147, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 4.128916741271262, |
|
"grad_norm": 3.643427848815918, |
|
"learning_rate": 3.0695443645083934e-06, |
|
"loss": 0.2073, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 4.136078782452999, |
|
"grad_norm": 7.914638519287109, |
|
"learning_rate": 3.0575539568345324e-06, |
|
"loss": 0.3222, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 4.143240823634736, |
|
"grad_norm": 2.1673357486724854, |
|
"learning_rate": 3.0455635491606715e-06, |
|
"loss": 0.0498, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 4.150402864816472, |
|
"grad_norm": 5.7334513664245605, |
|
"learning_rate": 3.033573141486811e-06, |
|
"loss": 0.1663, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 4.15756490599821, |
|
"grad_norm": 4.769826412200928, |
|
"learning_rate": 3.02158273381295e-06, |
|
"loss": 0.202, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 4.164726947179946, |
|
"grad_norm": 5.7501540184021, |
|
"learning_rate": 3.009592326139089e-06, |
|
"loss": 0.2056, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 4.171888988361683, |
|
"grad_norm": 11.669347763061523, |
|
"learning_rate": 2.9976019184652285e-06, |
|
"loss": 0.2501, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 4.17905102954342, |
|
"grad_norm": 5.45068359375, |
|
"learning_rate": 2.985611510791367e-06, |
|
"loss": 0.0684, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 4.186213070725157, |
|
"grad_norm": 1.9370661973953247, |
|
"learning_rate": 2.973621103117506e-06, |
|
"loss": 0.0478, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 4.193375111906893, |
|
"grad_norm": 2.850411891937256, |
|
"learning_rate": 2.961630695443645e-06, |
|
"loss": 0.0768, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 4.20053715308863, |
|
"grad_norm": 2.566312313079834, |
|
"learning_rate": 2.949640287769784e-06, |
|
"loss": 0.0937, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 4.207699194270367, |
|
"grad_norm": 4.700612545013428, |
|
"learning_rate": 2.9376498800959237e-06, |
|
"loss": 0.0683, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 4.214861235452104, |
|
"grad_norm": 9.014172554016113, |
|
"learning_rate": 2.9256594724220627e-06, |
|
"loss": 0.1393, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 4.22202327663384, |
|
"grad_norm": 7.445233345031738, |
|
"learning_rate": 2.9136690647482017e-06, |
|
"loss": 0.1373, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 4.229185317815578, |
|
"grad_norm": 5.060126781463623, |
|
"learning_rate": 2.9016786570743403e-06, |
|
"loss": 0.098, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 4.236347358997314, |
|
"grad_norm": 5.344047546386719, |
|
"learning_rate": 2.88968824940048e-06, |
|
"loss": 0.1017, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 4.243509400179051, |
|
"grad_norm": 13.688530921936035, |
|
"learning_rate": 2.877697841726619e-06, |
|
"loss": 0.2884, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 4.250671441360788, |
|
"grad_norm": 9.703275680541992, |
|
"learning_rate": 2.865707434052758e-06, |
|
"loss": 0.1058, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 4.257833482542525, |
|
"grad_norm": 8.223041534423828, |
|
"learning_rate": 2.8537170263788973e-06, |
|
"loss": 0.28, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 4.264995523724261, |
|
"grad_norm": 5.328052520751953, |
|
"learning_rate": 2.8417266187050364e-06, |
|
"loss": 0.0894, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 4.272157564905998, |
|
"grad_norm": 4.801018238067627, |
|
"learning_rate": 2.8297362110311754e-06, |
|
"loss": 0.0728, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 4.279319606087735, |
|
"grad_norm": 8.624038696289062, |
|
"learning_rate": 2.8177458033573145e-06, |
|
"loss": 0.2589, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 4.286481647269472, |
|
"grad_norm": 1.4504754543304443, |
|
"learning_rate": 2.805755395683453e-06, |
|
"loss": 0.0433, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.293643688451208, |
|
"grad_norm": 3.4955124855041504, |
|
"learning_rate": 2.7937649880095925e-06, |
|
"loss": 0.08, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 4.300805729632946, |
|
"grad_norm": 8.949994087219238, |
|
"learning_rate": 2.7817745803357316e-06, |
|
"loss": 0.094, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 4.307967770814682, |
|
"grad_norm": 7.984180450439453, |
|
"learning_rate": 2.7697841726618706e-06, |
|
"loss": 0.1631, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 4.315129811996419, |
|
"grad_norm": 2.0268032550811768, |
|
"learning_rate": 2.75779376498801e-06, |
|
"loss": 0.0622, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 4.322291853178156, |
|
"grad_norm": 6.695650577545166, |
|
"learning_rate": 2.745803357314149e-06, |
|
"loss": 0.0929, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 4.329453894359893, |
|
"grad_norm": 2.7270710468292236, |
|
"learning_rate": 2.733812949640288e-06, |
|
"loss": 0.0688, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 4.336615935541629, |
|
"grad_norm": 9.790903091430664, |
|
"learning_rate": 2.7218225419664268e-06, |
|
"loss": 0.1525, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 4.343777976723366, |
|
"grad_norm": 6.341964244842529, |
|
"learning_rate": 2.7098321342925658e-06, |
|
"loss": 0.0756, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 4.350940017905103, |
|
"grad_norm": 3.1827850341796875, |
|
"learning_rate": 2.6978417266187052e-06, |
|
"loss": 0.0814, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 4.35810205908684, |
|
"grad_norm": 7.853371620178223, |
|
"learning_rate": 2.6858513189448443e-06, |
|
"loss": 0.072, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 4.365264100268576, |
|
"grad_norm": 2.4876341819763184, |
|
"learning_rate": 2.6738609112709833e-06, |
|
"loss": 0.0512, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 4.372426141450314, |
|
"grad_norm": 2.8959743976593018, |
|
"learning_rate": 2.6618705035971228e-06, |
|
"loss": 0.0398, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 4.37958818263205, |
|
"grad_norm": 10.225565910339355, |
|
"learning_rate": 2.649880095923262e-06, |
|
"loss": 0.2628, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 4.386750223813787, |
|
"grad_norm": 9.800642013549805, |
|
"learning_rate": 2.637889688249401e-06, |
|
"loss": 0.3922, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 4.393912264995524, |
|
"grad_norm": 5.1125617027282715, |
|
"learning_rate": 2.6258992805755395e-06, |
|
"loss": 0.0934, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 4.401074306177261, |
|
"grad_norm": 3.544774293899536, |
|
"learning_rate": 2.6139088729016785e-06, |
|
"loss": 0.0726, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 4.408236347358997, |
|
"grad_norm": 10.818737030029297, |
|
"learning_rate": 2.601918465227818e-06, |
|
"loss": 0.1664, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 4.415398388540734, |
|
"grad_norm": 3.033714771270752, |
|
"learning_rate": 2.589928057553957e-06, |
|
"loss": 0.1875, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 4.422560429722471, |
|
"grad_norm": 3.4782607555389404, |
|
"learning_rate": 2.577937649880096e-06, |
|
"loss": 0.065, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 4.429722470904208, |
|
"grad_norm": 1.43113112449646, |
|
"learning_rate": 2.5659472422062355e-06, |
|
"loss": 0.0302, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 4.436884512085944, |
|
"grad_norm": 6.158276557922363, |
|
"learning_rate": 2.5539568345323745e-06, |
|
"loss": 0.0774, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 4.444046553267682, |
|
"grad_norm": 5.511786937713623, |
|
"learning_rate": 2.541966426858513e-06, |
|
"loss": 0.0645, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 4.451208594449418, |
|
"grad_norm": 5.461114406585693, |
|
"learning_rate": 2.529976019184652e-06, |
|
"loss": 0.1123, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 4.458370635631155, |
|
"grad_norm": 5.242578983306885, |
|
"learning_rate": 2.5179856115107916e-06, |
|
"loss": 0.1413, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 4.465532676812892, |
|
"grad_norm": 4.951175212860107, |
|
"learning_rate": 2.5059952038369307e-06, |
|
"loss": 0.0973, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 4.472694717994629, |
|
"grad_norm": 7.635180950164795, |
|
"learning_rate": 2.4940047961630697e-06, |
|
"loss": 0.1024, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 4.479856759176365, |
|
"grad_norm": 6.080862522125244, |
|
"learning_rate": 2.4820143884892088e-06, |
|
"loss": 0.0925, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 4.487018800358102, |
|
"grad_norm": 12.718398094177246, |
|
"learning_rate": 2.470023980815348e-06, |
|
"loss": 0.1659, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 4.494180841539839, |
|
"grad_norm": 8.396491050720215, |
|
"learning_rate": 2.458033573141487e-06, |
|
"loss": 0.3336, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 4.501342882721576, |
|
"grad_norm": 16.718276977539062, |
|
"learning_rate": 2.4460431654676263e-06, |
|
"loss": 0.3048, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 4.508504923903312, |
|
"grad_norm": 7.400913238525391, |
|
"learning_rate": 2.4340527577937653e-06, |
|
"loss": 0.104, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 4.51566696508505, |
|
"grad_norm": 6.40206241607666, |
|
"learning_rate": 2.4220623501199044e-06, |
|
"loss": 0.118, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 4.522829006266786, |
|
"grad_norm": 11.661595344543457, |
|
"learning_rate": 2.4100719424460434e-06, |
|
"loss": 0.2508, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 4.529991047448523, |
|
"grad_norm": 15.65445613861084, |
|
"learning_rate": 2.3980815347721824e-06, |
|
"loss": 0.1724, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 4.53715308863026, |
|
"grad_norm": 20.107707977294922, |
|
"learning_rate": 2.3860911270983215e-06, |
|
"loss": 0.1529, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 4.544315129811997, |
|
"grad_norm": 1.682045340538025, |
|
"learning_rate": 2.3741007194244605e-06, |
|
"loss": 0.0449, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 4.551477170993733, |
|
"grad_norm": 3.9842913150787354, |
|
"learning_rate": 2.3621103117505996e-06, |
|
"loss": 0.183, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 4.5586392121754695, |
|
"grad_norm": 4.31380558013916, |
|
"learning_rate": 2.350119904076739e-06, |
|
"loss": 0.0579, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 4.565801253357207, |
|
"grad_norm": 7.769341468811035, |
|
"learning_rate": 2.3381294964028776e-06, |
|
"loss": 0.1413, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 4.572963294538944, |
|
"grad_norm": 9.923673629760742, |
|
"learning_rate": 2.326139088729017e-06, |
|
"loss": 0.3786, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 4.58012533572068, |
|
"grad_norm": 10.427968978881836, |
|
"learning_rate": 2.314148681055156e-06, |
|
"loss": 0.1294, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 4.587287376902417, |
|
"grad_norm": 3.505420207977295, |
|
"learning_rate": 2.302158273381295e-06, |
|
"loss": 0.0478, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 4.594449418084154, |
|
"grad_norm": 4.628006458282471, |
|
"learning_rate": 2.290167865707434e-06, |
|
"loss": 0.1185, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 4.601611459265891, |
|
"grad_norm": 6.239457130432129, |
|
"learning_rate": 2.2781774580335732e-06, |
|
"loss": 0.1386, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 4.608773500447628, |
|
"grad_norm": 1.7492996454238892, |
|
"learning_rate": 2.2661870503597123e-06, |
|
"loss": 0.0346, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 4.615935541629364, |
|
"grad_norm": 5.875699520111084, |
|
"learning_rate": 2.2541966426858513e-06, |
|
"loss": 0.1871, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 4.623097582811101, |
|
"grad_norm": 6.977318286895752, |
|
"learning_rate": 2.2422062350119903e-06, |
|
"loss": 0.2546, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 4.630259623992838, |
|
"grad_norm": 12.939038276672363, |
|
"learning_rate": 2.23021582733813e-06, |
|
"loss": 0.121, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 4.637421665174575, |
|
"grad_norm": 8.233115196228027, |
|
"learning_rate": 2.218225419664269e-06, |
|
"loss": 0.1027, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 4.644583706356311, |
|
"grad_norm": 9.395356178283691, |
|
"learning_rate": 2.206235011990408e-06, |
|
"loss": 0.1401, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 4.651745747538048, |
|
"grad_norm": 2.7126667499542236, |
|
"learning_rate": 2.194244604316547e-06, |
|
"loss": 0.0724, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 4.658907788719786, |
|
"grad_norm": 8.101604461669922, |
|
"learning_rate": 2.182254196642686e-06, |
|
"loss": 0.2692, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 4.666069829901522, |
|
"grad_norm": 4.733114719390869, |
|
"learning_rate": 2.1702637889688254e-06, |
|
"loss": 0.1407, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 4.673231871083258, |
|
"grad_norm": 9.878451347351074, |
|
"learning_rate": 2.158273381294964e-06, |
|
"loss": 0.335, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 4.680393912264996, |
|
"grad_norm": 7.976416110992432, |
|
"learning_rate": 2.146282973621103e-06, |
|
"loss": 0.0987, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 4.687555953446733, |
|
"grad_norm": 6.621464252471924, |
|
"learning_rate": 2.1342925659472425e-06, |
|
"loss": 0.1108, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 4.694717994628469, |
|
"grad_norm": 2.5924878120422363, |
|
"learning_rate": 2.1223021582733816e-06, |
|
"loss": 0.0372, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 4.7018800358102055, |
|
"grad_norm": 2.272629737854004, |
|
"learning_rate": 2.1103117505995206e-06, |
|
"loss": 0.0476, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 4.709042076991943, |
|
"grad_norm": 5.590856552124023, |
|
"learning_rate": 2.0983213429256596e-06, |
|
"loss": 0.3509, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 4.716204118173679, |
|
"grad_norm": 5.7683210372924805, |
|
"learning_rate": 2.0863309352517987e-06, |
|
"loss": 0.0917, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 4.723366159355416, |
|
"grad_norm": 7.388215065002441, |
|
"learning_rate": 2.0743405275779377e-06, |
|
"loss": 0.1412, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 4.730528200537153, |
|
"grad_norm": 6.264549732208252, |
|
"learning_rate": 2.0623501199040767e-06, |
|
"loss": 0.1877, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 4.73769024171889, |
|
"grad_norm": 7.518822193145752, |
|
"learning_rate": 2.050359712230216e-06, |
|
"loss": 0.2453, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 4.744852282900626, |
|
"grad_norm": 2.944272041320801, |
|
"learning_rate": 2.0383693045563552e-06, |
|
"loss": 0.0604, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 4.752014324082364, |
|
"grad_norm": 10.040867805480957, |
|
"learning_rate": 2.026378896882494e-06, |
|
"loss": 0.1441, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 4.7591763652641, |
|
"grad_norm": 9.994035720825195, |
|
"learning_rate": 2.0143884892086333e-06, |
|
"loss": 0.2145, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 4.766338406445837, |
|
"grad_norm": 9.281600952148438, |
|
"learning_rate": 2.0023980815347724e-06, |
|
"loss": 0.18, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 4.7735004476275735, |
|
"grad_norm": 5.464154243469238, |
|
"learning_rate": 1.9904076738609114e-06, |
|
"loss": 0.0456, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 4.780662488809311, |
|
"grad_norm": 5.3792524337768555, |
|
"learning_rate": 1.9784172661870504e-06, |
|
"loss": 0.0947, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 4.787824529991047, |
|
"grad_norm": 8.45373821258545, |
|
"learning_rate": 1.9664268585131895e-06, |
|
"loss": 0.1664, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 4.794986571172784, |
|
"grad_norm": 4.617059230804443, |
|
"learning_rate": 1.954436450839329e-06, |
|
"loss": 0.0759, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 4.802148612354521, |
|
"grad_norm": 10.997187614440918, |
|
"learning_rate": 1.942446043165468e-06, |
|
"loss": 0.1521, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 4.809310653536258, |
|
"grad_norm": 5.464894771575928, |
|
"learning_rate": 1.930455635491607e-06, |
|
"loss": 0.0674, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 4.816472694717994, |
|
"grad_norm": 5.334858417510986, |
|
"learning_rate": 1.918465227817746e-06, |
|
"loss": 0.1741, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 4.823634735899732, |
|
"grad_norm": 11.286782264709473, |
|
"learning_rate": 1.906474820143885e-06, |
|
"loss": 0.2467, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 4.830796777081468, |
|
"grad_norm": 7.515106678009033, |
|
"learning_rate": 1.8944844124700241e-06, |
|
"loss": 0.0812, |
|
"step": 676 |
|
}, |
|
    {
      "epoch": 4.837958818263205,
      "grad_norm": 2.932992935180664,
      "learning_rate": 1.8824940047961631e-06,
      "loss": 0.0713,
      "step": 677
    },
    {
      "epoch": 4.8451208594449415,
      "grad_norm": 1.9469927549362183,
      "learning_rate": 1.8705035971223024e-06,
      "loss": 0.0513,
      "step": 678
    },
    {
      "epoch": 4.852282900626679,
      "grad_norm": 10.902048110961914,
      "learning_rate": 1.8585131894484414e-06,
      "loss": 0.2673,
      "step": 679
    },
    {
      "epoch": 4.859444941808415,
      "grad_norm": 10.670211791992188,
      "learning_rate": 1.8465227817745805e-06,
      "loss": 0.0831,
      "step": 680
    },
    {
      "epoch": 4.866606982990152,
      "grad_norm": 15.201674461364746,
      "learning_rate": 1.8345323741007195e-06,
      "loss": 0.0817,
      "step": 681
    },
    {
      "epoch": 4.873769024171889,
      "grad_norm": 7.9115681648254395,
      "learning_rate": 1.8225419664268588e-06,
      "loss": 0.2511,
      "step": 682
    },
    {
      "epoch": 4.880931065353626,
      "grad_norm": 8.73487663269043,
      "learning_rate": 1.8105515587529978e-06,
      "loss": 0.1621,
      "step": 683
    },
    {
      "epoch": 4.888093106535362,
      "grad_norm": 9.163458824157715,
      "learning_rate": 1.7985611510791368e-06,
      "loss": 0.2148,
      "step": 684
    },
    {
      "epoch": 4.8952551477171,
      "grad_norm": 3.965367317199707,
      "learning_rate": 1.7865707434052759e-06,
      "loss": 0.0611,
      "step": 685
    },
    {
      "epoch": 4.902417188898836,
      "grad_norm": 3.5464630126953125,
      "learning_rate": 1.7745803357314151e-06,
      "loss": 0.05,
      "step": 686
    },
    {
      "epoch": 4.909579230080573,
      "grad_norm": 6.229908466339111,
      "learning_rate": 1.762589928057554e-06,
      "loss": 0.335,
      "step": 687
    },
    {
      "epoch": 4.9167412712623095,
      "grad_norm": 7.500967025756836,
      "learning_rate": 1.7505995203836932e-06,
      "loss": 0.1911,
      "step": 688
    },
    {
      "epoch": 4.923903312444047,
      "grad_norm": 4.766892433166504,
      "learning_rate": 1.7386091127098322e-06,
      "loss": 0.0773,
      "step": 689
    },
    {
      "epoch": 4.931065353625783,
      "grad_norm": 7.059886932373047,
      "learning_rate": 1.7266187050359715e-06,
      "loss": 0.1107,
      "step": 690
    },
    {
      "epoch": 4.93822739480752,
      "grad_norm": 3.412309408187866,
      "learning_rate": 1.7146282973621103e-06,
      "loss": 0.0494,
      "step": 691
    },
    {
      "epoch": 4.945389435989257,
      "grad_norm": 12.20711898803711,
      "learning_rate": 1.7026378896882496e-06,
      "loss": 0.2606,
      "step": 692
    },
    {
      "epoch": 4.952551477170994,
      "grad_norm": 12.949819564819336,
      "learning_rate": 1.6906474820143886e-06,
      "loss": 0.1445,
      "step": 693
    },
    {
      "epoch": 4.95971351835273,
      "grad_norm": 5.260175704956055,
      "learning_rate": 1.6786570743405278e-06,
      "loss": 0.0488,
      "step": 694
    },
    {
      "epoch": 4.966875559534468,
      "grad_norm": 5.705483436584473,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 0.1724,
      "step": 695
    },
    {
      "epoch": 4.974037600716204,
      "grad_norm": 9.60573959350586,
      "learning_rate": 1.654676258992806e-06,
      "loss": 0.2245,
      "step": 696
    },
    {
      "epoch": 4.981199641897941,
      "grad_norm": 3.508620500564575,
      "learning_rate": 1.642685851318945e-06,
      "loss": 0.0552,
      "step": 697
    },
    {
      "epoch": 4.9883616830796775,
      "grad_norm": 3.118699789047241,
      "learning_rate": 1.6306954436450842e-06,
      "loss": 0.0483,
      "step": 698
    },
    {
      "epoch": 4.995523724261415,
      "grad_norm": 5.063241958618164,
      "learning_rate": 1.618705035971223e-06,
      "loss": 0.0576,
      "step": 699
    },
    {
      "epoch": 5.0,
      "grad_norm": 1.9771332740783691,
      "learning_rate": 1.6067146282973623e-06,
      "loss": 0.0227,
      "step": 700
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.8893360160965795,
      "eval_loss": 0.2797456681728363,
      "eval_runtime": 12.7936,
      "eval_samples_per_second": 38.847,
      "eval_steps_per_second": 38.847,
      "step": 700
    },
    {
      "epoch": 5.007162041181736,
      "grad_norm": 9.170907974243164,
      "learning_rate": 1.5947242206235013e-06,
      "loss": 0.0874,
      "step": 701
    },
    {
      "epoch": 5.014324082363474,
      "grad_norm": 13.936522483825684,
      "learning_rate": 1.5827338129496403e-06,
      "loss": 0.1382,
      "step": 702
    },
    {
      "epoch": 5.02148612354521,
      "grad_norm": 5.094358921051025,
      "learning_rate": 1.5707434052757794e-06,
      "loss": 0.1265,
      "step": 703
    },
    {
      "epoch": 5.028648164726947,
      "grad_norm": 2.9558494091033936,
      "learning_rate": 1.5587529976019186e-06,
      "loss": 0.1431,
      "step": 704
    },
    {
      "epoch": 5.035810205908684,
      "grad_norm": 3.9764819145202637,
      "learning_rate": 1.5467625899280579e-06,
      "loss": 0.057,
      "step": 705
    },
    {
      "epoch": 5.042972247090421,
      "grad_norm": 10.071995735168457,
      "learning_rate": 1.5347721822541967e-06,
      "loss": 0.0821,
      "step": 706
    },
    {
      "epoch": 5.050134288272157,
      "grad_norm": 7.149175643920898,
      "learning_rate": 1.5227817745803357e-06,
      "loss": 0.0672,
      "step": 707
    },
    {
      "epoch": 5.057296329453894,
      "grad_norm": 7.112942218780518,
      "learning_rate": 1.510791366906475e-06,
      "loss": 0.1492,
      "step": 708
    },
    {
      "epoch": 5.064458370635631,
      "grad_norm": 8.392587661743164,
      "learning_rate": 1.4988009592326142e-06,
      "loss": 0.2037,
      "step": 709
    },
    {
      "epoch": 5.071620411817368,
      "grad_norm": 10.566859245300293,
      "learning_rate": 1.486810551558753e-06,
      "loss": 0.1623,
      "step": 710
    },
    {
      "epoch": 5.078782452999104,
      "grad_norm": 7.337512016296387,
      "learning_rate": 1.474820143884892e-06,
      "loss": 0.2304,
      "step": 711
    },
    {
      "epoch": 5.085944494180842,
      "grad_norm": 5.944802284240723,
      "learning_rate": 1.4628297362110313e-06,
      "loss": 0.3046,
      "step": 712
    },
    {
      "epoch": 5.093106535362578,
      "grad_norm": 7.068042755126953,
      "learning_rate": 1.4508393285371702e-06,
      "loss": 0.1634,
      "step": 713
    },
    {
      "epoch": 5.100268576544315,
      "grad_norm": 6.807985305786133,
      "learning_rate": 1.4388489208633094e-06,
      "loss": 0.0956,
      "step": 714
    },
    {
      "epoch": 5.107430617726052,
      "grad_norm": 1.3011399507522583,
      "learning_rate": 1.4268585131894487e-06,
      "loss": 0.0329,
      "step": 715
    },
    {
      "epoch": 5.114592658907789,
      "grad_norm": 4.0383782386779785,
      "learning_rate": 1.4148681055155877e-06,
      "loss": 0.1312,
      "step": 716
    },
    {
      "epoch": 5.121754700089525,
      "grad_norm": 9.570232391357422,
      "learning_rate": 1.4028776978417265e-06,
      "loss": 0.1369,
      "step": 717
    },
    {
      "epoch": 5.128916741271262,
      "grad_norm": 9.754796981811523,
      "learning_rate": 1.3908872901678658e-06,
      "loss": 0.2643,
      "step": 718
    },
    {
      "epoch": 5.136078782452999,
      "grad_norm": 2.8878424167633057,
      "learning_rate": 1.378896882494005e-06,
      "loss": 0.0438,
      "step": 719
    },
    {
      "epoch": 5.143240823634736,
      "grad_norm": 6.284052848815918,
      "learning_rate": 1.366906474820144e-06,
      "loss": 0.1165,
      "step": 720
    },
    {
      "epoch": 5.150402864816472,
      "grad_norm": 7.874203681945801,
      "learning_rate": 1.3549160671462829e-06,
      "loss": 0.1494,
      "step": 721
    },
    {
      "epoch": 5.15756490599821,
      "grad_norm": 5.88767147064209,
      "learning_rate": 1.3429256594724221e-06,
      "loss": 0.0979,
      "step": 722
    },
    {
      "epoch": 5.164726947179946,
      "grad_norm": 9.50105094909668,
      "learning_rate": 1.3309352517985614e-06,
      "loss": 0.3674,
      "step": 723
    },
    {
      "epoch": 5.171888988361683,
      "grad_norm": 6.08318567276001,
      "learning_rate": 1.3189448441247004e-06,
      "loss": 0.1707,
      "step": 724
    },
    {
      "epoch": 5.17905102954342,
      "grad_norm": 3.7768077850341797,
      "learning_rate": 1.3069544364508393e-06,
      "loss": 0.1428,
      "step": 725
    },
    {
      "epoch": 5.186213070725157,
      "grad_norm": 5.646254062652588,
      "learning_rate": 1.2949640287769785e-06,
      "loss": 0.0879,
      "step": 726
    },
    {
      "epoch": 5.193375111906893,
      "grad_norm": 13.744306564331055,
      "learning_rate": 1.2829736211031178e-06,
      "loss": 0.1985,
      "step": 727
    },
    {
      "epoch": 5.20053715308863,
      "grad_norm": 3.692706346511841,
      "learning_rate": 1.2709832134292566e-06,
      "loss": 0.0749,
      "step": 728
    },
    {
      "epoch": 5.207699194270367,
      "grad_norm": 7.3681488037109375,
      "learning_rate": 1.2589928057553958e-06,
      "loss": 0.1026,
      "step": 729
    },
    {
      "epoch": 5.214861235452104,
      "grad_norm": 5.465104579925537,
      "learning_rate": 1.2470023980815349e-06,
      "loss": 0.085,
      "step": 730
    },
    {
      "epoch": 5.22202327663384,
      "grad_norm": 7.1183037757873535,
      "learning_rate": 1.235011990407674e-06,
      "loss": 0.0392,
      "step": 731
    },
    {
      "epoch": 5.229185317815578,
      "grad_norm": 3.9316046237945557,
      "learning_rate": 1.2230215827338131e-06,
      "loss": 0.0543,
      "step": 732
    },
    {
      "epoch": 5.236347358997314,
      "grad_norm": 2.7083520889282227,
      "learning_rate": 1.2110311750599522e-06,
      "loss": 0.0454,
      "step": 733
    },
    {
      "epoch": 5.243509400179051,
      "grad_norm": 8.070670127868652,
      "learning_rate": 1.1990407673860912e-06,
      "loss": 0.2432,
      "step": 734
    },
    {
      "epoch": 5.250671441360788,
      "grad_norm": 5.080997943878174,
      "learning_rate": 1.1870503597122303e-06,
      "loss": 0.1378,
      "step": 735
    },
    {
      "epoch": 5.257833482542525,
      "grad_norm": 5.301459312438965,
      "learning_rate": 1.1750599520383695e-06,
      "loss": 0.0696,
      "step": 736
    },
    {
      "epoch": 5.264995523724261,
      "grad_norm": 10.343586921691895,
      "learning_rate": 1.1630695443645085e-06,
      "loss": 0.2511,
      "step": 737
    },
    {
      "epoch": 5.272157564905998,
      "grad_norm": 1.9044795036315918,
      "learning_rate": 1.1510791366906476e-06,
      "loss": 0.0365,
      "step": 738
    },
    {
      "epoch": 5.279319606087735,
      "grad_norm": 6.329905033111572,
      "learning_rate": 1.1390887290167866e-06,
      "loss": 0.118,
      "step": 739
    },
    {
      "epoch": 5.286481647269472,
      "grad_norm": 12.016860961914062,
      "learning_rate": 1.1270983213429257e-06,
      "loss": 0.1504,
      "step": 740
    },
    {
      "epoch": 5.293643688451208,
      "grad_norm": 8.04209041595459,
      "learning_rate": 1.115107913669065e-06,
      "loss": 0.1188,
      "step": 741
    },
    {
      "epoch": 5.300805729632946,
      "grad_norm": 5.305018424987793,
      "learning_rate": 1.103117505995204e-06,
      "loss": 0.0588,
      "step": 742
    },
    {
      "epoch": 5.307967770814682,
      "grad_norm": 5.174502372741699,
      "learning_rate": 1.091127098321343e-06,
      "loss": 0.1221,
      "step": 743
    },
    {
      "epoch": 5.315129811996419,
      "grad_norm": 5.626262664794922,
      "learning_rate": 1.079136690647482e-06,
      "loss": 0.0911,
      "step": 744
    },
    {
      "epoch": 5.322291853178156,
      "grad_norm": 3.473499059677124,
      "learning_rate": 1.0671462829736213e-06,
      "loss": 0.0435,
      "step": 745
    },
    {
      "epoch": 5.329453894359893,
      "grad_norm": 7.590930461883545,
      "learning_rate": 1.0551558752997603e-06,
      "loss": 0.1276,
      "step": 746
    },
    {
      "epoch": 5.336615935541629,
      "grad_norm": 6.063522815704346,
      "learning_rate": 1.0431654676258993e-06,
      "loss": 0.2146,
      "step": 747
    },
    {
      "epoch": 5.343777976723366,
      "grad_norm": 6.192073345184326,
      "learning_rate": 1.0311750599520384e-06,
      "loss": 0.2152,
      "step": 748
    },
    {
      "epoch": 5.350940017905103,
      "grad_norm": 5.46181058883667,
      "learning_rate": 1.0191846522781776e-06,
      "loss": 0.2623,
      "step": 749
    },
    {
      "epoch": 5.35810205908684,
      "grad_norm": 5.2007012367248535,
      "learning_rate": 1.0071942446043167e-06,
      "loss": 0.0852,
      "step": 750
    },
    {
      "epoch": 5.365264100268576,
      "grad_norm": 2.142305850982666,
      "learning_rate": 9.952038369304557e-07,
      "loss": 0.0339,
      "step": 751
    },
    {
      "epoch": 5.372426141450314,
      "grad_norm": 5.1917901039123535,
      "learning_rate": 9.832134292565947e-07,
      "loss": 0.2234,
      "step": 752
    },
    {
      "epoch": 5.37958818263205,
      "grad_norm": 1.764793038368225,
      "learning_rate": 9.71223021582734e-07,
      "loss": 0.0293,
      "step": 753
    },
    {
      "epoch": 5.386750223813787,
      "grad_norm": 2.4261093139648438,
      "learning_rate": 9.59232613908873e-07,
      "loss": 0.0506,
      "step": 754
    },
    {
      "epoch": 5.393912264995524,
      "grad_norm": 7.679261207580566,
      "learning_rate": 9.472422062350121e-07,
      "loss": 0.2158,
      "step": 755
    },
    {
      "epoch": 5.401074306177261,
      "grad_norm": 5.182496547698975,
      "learning_rate": 9.352517985611512e-07,
      "loss": 0.1834,
      "step": 756
    },
    {
      "epoch": 5.408236347358997,
      "grad_norm": 2.914945125579834,
      "learning_rate": 9.232613908872902e-07,
      "loss": 0.0413,
      "step": 757
    },
    {
      "epoch": 5.415398388540734,
      "grad_norm": 7.0388360023498535,
      "learning_rate": 9.112709832134294e-07,
      "loss": 0.066,
      "step": 758
    },
    {
      "epoch": 5.422560429722471,
      "grad_norm": 2.673398494720459,
      "learning_rate": 8.992805755395684e-07,
      "loss": 0.0511,
      "step": 759
    },
    {
      "epoch": 5.429722470904208,
      "grad_norm": 5.319000244140625,
      "learning_rate": 8.872901678657076e-07,
      "loss": 0.1015,
      "step": 760
    },
    {
      "epoch": 5.436884512085944,
      "grad_norm": 2.27585506439209,
      "learning_rate": 8.752997601918466e-07,
      "loss": 0.0394,
      "step": 761
    },
    {
      "epoch": 5.444046553267682,
      "grad_norm": 2.567699909210205,
      "learning_rate": 8.633093525179857e-07,
      "loss": 0.0778,
      "step": 762
    },
    {
      "epoch": 5.451208594449418,
      "grad_norm": 11.017365455627441,
      "learning_rate": 8.513189448441248e-07,
      "loss": 0.2491,
      "step": 763
    },
    {
      "epoch": 5.458370635631155,
      "grad_norm": 3.768150568008423,
      "learning_rate": 8.393285371702639e-07,
      "loss": 0.0674,
      "step": 764
    },
    {
      "epoch": 5.465532676812892,
      "grad_norm": 3.6968348026275635,
      "learning_rate": 8.27338129496403e-07,
      "loss": 0.0515,
      "step": 765
    },
    {
      "epoch": 5.472694717994629,
      "grad_norm": 3.3715782165527344,
      "learning_rate": 8.153477218225421e-07,
      "loss": 0.1695,
      "step": 766
    },
    {
      "epoch": 5.479856759176365,
      "grad_norm": 4.479177951812744,
      "learning_rate": 8.033573141486811e-07,
      "loss": 0.0891,
      "step": 767
    },
    {
      "epoch": 5.487018800358102,
      "grad_norm": 3.904632329940796,
      "learning_rate": 7.913669064748202e-07,
      "loss": 0.0651,
      "step": 768
    },
    {
      "epoch": 5.494180841539839,
      "grad_norm": 10.056451797485352,
      "learning_rate": 7.793764988009593e-07,
      "loss": 0.0672,
      "step": 769
    },
    {
      "epoch": 5.501342882721576,
      "grad_norm": 7.317890644073486,
      "learning_rate": 7.673860911270984e-07,
      "loss": 0.0775,
      "step": 770
    },
    {
      "epoch": 5.508504923903312,
      "grad_norm": 4.815303802490234,
      "learning_rate": 7.553956834532375e-07,
      "loss": 0.1864,
      "step": 771
    },
    {
      "epoch": 5.51566696508505,
      "grad_norm": 4.484886169433594,
      "learning_rate": 7.434052757793765e-07,
      "loss": 0.0818,
      "step": 772
    },
    {
      "epoch": 5.522829006266786,
      "grad_norm": 8.391207695007324,
      "learning_rate": 7.314148681055157e-07,
      "loss": 0.109,
      "step": 773
    },
    {
      "epoch": 5.529991047448523,
      "grad_norm": 9.510869026184082,
      "learning_rate": 7.194244604316547e-07,
      "loss": 0.1167,
      "step": 774
    },
    {
      "epoch": 5.53715308863026,
      "grad_norm": 1.1796542406082153,
      "learning_rate": 7.074340527577939e-07,
      "loss": 0.0309,
      "step": 775
    },
    {
      "epoch": 5.544315129811997,
      "grad_norm": 24.52149772644043,
      "learning_rate": 6.954436450839329e-07,
      "loss": 0.1257,
      "step": 776
    },
    {
      "epoch": 5.551477170993733,
      "grad_norm": 10.662769317626953,
      "learning_rate": 6.83453237410072e-07,
      "loss": 0.3804,
      "step": 777
    },
    {
      "epoch": 5.5586392121754695,
      "grad_norm": 4.737334728240967,
      "learning_rate": 6.714628297362111e-07,
      "loss": 0.0736,
      "step": 778
    },
    {
      "epoch": 5.565801253357207,
      "grad_norm": 10.771794319152832,
      "learning_rate": 6.594724220623502e-07,
      "loss": 0.1611,
      "step": 779
    },
    {
      "epoch": 5.572963294538944,
      "grad_norm": 5.774040222167969,
      "learning_rate": 6.474820143884893e-07,
      "loss": 0.0813,
      "step": 780
    },
    {
      "epoch": 5.58012533572068,
      "grad_norm": 4.687991142272949,
      "learning_rate": 6.354916067146283e-07,
      "loss": 0.107,
      "step": 781
    },
    {
      "epoch": 5.587287376902417,
      "grad_norm": 2.2647032737731934,
      "learning_rate": 6.235011990407674e-07,
      "loss": 0.0432,
      "step": 782
    },
    {
      "epoch": 5.594449418084154,
      "grad_norm": 2.810767650604248,
      "learning_rate": 6.115107913669066e-07,
      "loss": 0.0731,
      "step": 783
    },
    {
      "epoch": 5.601611459265891,
      "grad_norm": 5.394442558288574,
      "learning_rate": 5.995203836930456e-07,
      "loss": 0.0826,
      "step": 784
    },
    {
      "epoch": 5.608773500447628,
      "grad_norm": 7.001992225646973,
      "learning_rate": 5.875299760191848e-07,
      "loss": 0.077,
      "step": 785
    },
    {
      "epoch": 5.615935541629364,
      "grad_norm": 5.397133827209473,
      "learning_rate": 5.755395683453238e-07,
      "loss": 0.1079,
      "step": 786
    },
    {
      "epoch": 5.623097582811101,
      "grad_norm": 3.6474523544311523,
      "learning_rate": 5.635491606714628e-07,
      "loss": 0.1313,
      "step": 787
    },
    {
      "epoch": 5.630259623992838,
      "grad_norm": 10.120123863220215,
      "learning_rate": 5.51558752997602e-07,
      "loss": 0.2847,
      "step": 788
    },
    {
      "epoch": 5.637421665174575,
      "grad_norm": 7.304914951324463,
      "learning_rate": 5.39568345323741e-07,
      "loss": 0.0728,
      "step": 789
    },
    {
      "epoch": 5.644583706356311,
      "grad_norm": 6.6555399894714355,
      "learning_rate": 5.275779376498801e-07,
      "loss": 0.064,
      "step": 790
    },
    {
      "epoch": 5.651745747538048,
      "grad_norm": 2.417214870452881,
      "learning_rate": 5.155875299760192e-07,
      "loss": 0.0618,
      "step": 791
    },
    {
      "epoch": 5.658907788719786,
      "grad_norm": 4.194424152374268,
      "learning_rate": 5.035971223021583e-07,
      "loss": 0.0489,
      "step": 792
    },
    {
      "epoch": 5.666069829901522,
      "grad_norm": 12.794995307922363,
      "learning_rate": 4.916067146282974e-07,
      "loss": 0.1404,
      "step": 793
    },
    {
      "epoch": 5.673231871083258,
      "grad_norm": 4.588229656219482,
      "learning_rate": 4.796163069544365e-07,
      "loss": 0.1745,
      "step": 794
    },
    {
      "epoch": 5.680393912264996,
      "grad_norm": 12.849517822265625,
      "learning_rate": 4.676258992805756e-07,
      "loss": 0.1715,
      "step": 795
    },
    {
      "epoch": 5.687555953446733,
      "grad_norm": 7.121222496032715,
      "learning_rate": 4.556354916067147e-07,
      "loss": 0.0975,
      "step": 796
    },
    {
      "epoch": 5.694717994628469,
      "grad_norm": 4.74954080581665,
      "learning_rate": 4.436450839328538e-07,
      "loss": 0.0625,
      "step": 797
    },
    {
      "epoch": 5.7018800358102055,
      "grad_norm": 11.487862586975098,
      "learning_rate": 4.3165467625899287e-07,
      "loss": 0.2519,
      "step": 798
    },
    {
      "epoch": 5.709042076991943,
      "grad_norm": 14.895743370056152,
      "learning_rate": 4.1966426858513196e-07,
      "loss": 0.1979,
      "step": 799
    },
    {
      "epoch": 5.716204118173679,
      "grad_norm": 5.849491119384766,
      "learning_rate": 4.0767386091127105e-07,
      "loss": 0.0798,
      "step": 800
    },
    {
      "epoch": 5.723366159355416,
      "grad_norm": 9.15971851348877,
      "learning_rate": 3.956834532374101e-07,
      "loss": 0.1698,
      "step": 801
    },
    {
      "epoch": 5.730528200537153,
      "grad_norm": 14.715744972229004,
      "learning_rate": 3.836930455635492e-07,
      "loss": 0.1073,
      "step": 802
    },
    {
      "epoch": 5.73769024171889,
      "grad_norm": 3.501526355743408,
      "learning_rate": 3.7170263788968827e-07,
      "loss": 0.0403,
      "step": 803
    },
    {
      "epoch": 5.744852282900626,
      "grad_norm": 4.736666202545166,
      "learning_rate": 3.5971223021582736e-07,
      "loss": 0.0978,
      "step": 804
    },
    {
      "epoch": 5.752014324082364,
      "grad_norm": 6.315278053283691,
      "learning_rate": 3.4772182254196645e-07,
      "loss": 0.0792,
      "step": 805
    },
    {
      "epoch": 5.7591763652641,
      "grad_norm": 7.229060649871826,
      "learning_rate": 3.3573141486810554e-07,
      "loss": 0.1235,
      "step": 806
    },
    {
      "epoch": 5.766338406445837,
      "grad_norm": 5.501567840576172,
      "learning_rate": 3.237410071942446e-07,
      "loss": 0.0607,
      "step": 807
    },
    {
      "epoch": 5.7735004476275735,
      "grad_norm": 3.9031436443328857,
      "learning_rate": 3.117505995203837e-07,
      "loss": 0.0455,
      "step": 808
    },
    {
      "epoch": 5.780662488809311,
      "grad_norm": 1.905010461807251,
      "learning_rate": 2.997601918465228e-07,
      "loss": 0.0362,
      "step": 809
    },
    {
      "epoch": 5.787824529991047,
      "grad_norm": 20.247007369995117,
      "learning_rate": 2.877697841726619e-07,
      "loss": 0.1452,
      "step": 810
    },
    {
      "epoch": 5.794986571172784,
      "grad_norm": 9.428034782409668,
      "learning_rate": 2.75779376498801e-07,
      "loss": 0.1267,
      "step": 811
    },
    {
      "epoch": 5.802148612354521,
      "grad_norm": 2.8392276763916016,
      "learning_rate": 2.637889688249401e-07,
      "loss": 0.2061,
      "step": 812
    },
    {
      "epoch": 5.809310653536258,
      "grad_norm": 4.5366926193237305,
      "learning_rate": 2.5179856115107916e-07,
      "loss": 0.0542,
      "step": 813
    },
    {
      "epoch": 5.816472694717994,
      "grad_norm": 5.767791271209717,
      "learning_rate": 2.3980815347721825e-07,
      "loss": 0.0548,
      "step": 814
    },
    {
      "epoch": 5.823634735899732,
      "grad_norm": 3.0440096855163574,
      "learning_rate": 2.2781774580335734e-07,
      "loss": 0.0408,
      "step": 815
    },
    {
      "epoch": 5.830796777081468,
      "grad_norm": 8.914772033691406,
      "learning_rate": 2.1582733812949643e-07,
      "loss": 0.0981,
      "step": 816
    },
    {
      "epoch": 5.837958818263205,
      "grad_norm": 6.404951572418213,
      "learning_rate": 2.0383693045563552e-07,
      "loss": 0.2039,
      "step": 817
    },
    {
      "epoch": 5.8451208594449415,
      "grad_norm": 7.629164695739746,
      "learning_rate": 1.918465227817746e-07,
      "loss": 0.2741,
      "step": 818
    },
    {
      "epoch": 5.852282900626679,
      "grad_norm": 8.382116317749023,
      "learning_rate": 1.7985611510791368e-07,
      "loss": 0.2178,
      "step": 819
    },
    {
      "epoch": 5.859444941808415,
      "grad_norm": 12.157092094421387,
      "learning_rate": 1.6786570743405277e-07,
      "loss": 0.2596,
      "step": 820
    },
    {
      "epoch": 5.866606982990152,
      "grad_norm": 7.726680755615234,
      "learning_rate": 1.5587529976019186e-07,
      "loss": 0.1078,
      "step": 821
    },
    {
      "epoch": 5.873769024171889,
      "grad_norm": 1.8861736059188843,
      "learning_rate": 1.4388489208633095e-07,
      "loss": 0.054,
      "step": 822
    },
    {
      "epoch": 5.880931065353626,
      "grad_norm": 8.069757461547852,
      "learning_rate": 1.3189448441247004e-07,
      "loss": 0.1343,
      "step": 823
    },
    {
      "epoch": 5.888093106535362,
      "grad_norm": 1.1135824918746948,
      "learning_rate": 1.1990407673860913e-07,
      "loss": 0.0295,
      "step": 824
    },
    {
      "epoch": 5.8952551477171,
      "grad_norm": 5.592498302459717,
      "learning_rate": 1.0791366906474822e-07,
      "loss": 0.0817,
      "step": 825
    },
    {
      "epoch": 5.902417188898836,
      "grad_norm": 11.28853702545166,
      "learning_rate": 9.59232613908873e-08,
      "loss": 0.1309,
      "step": 826
    },
    {
      "epoch": 5.909579230080573,
      "grad_norm": 8.105303764343262,
      "learning_rate": 8.393285371702638e-08,
      "loss": 0.1098,
      "step": 827
    },
    {
      "epoch": 5.9167412712623095,
      "grad_norm": 2.810290575027466,
      "learning_rate": 7.194244604316547e-08,
      "loss": 0.0679,
      "step": 828
    },
    {
      "epoch": 5.923903312444047,
      "grad_norm": 4.570773124694824,
      "learning_rate": 5.995203836930456e-08,
      "loss": 0.0602,
      "step": 829
    },
    {
      "epoch": 5.931065353625783,
      "grad_norm": 4.349383354187012,
      "learning_rate": 4.796163069544365e-08,
      "loss": 0.0524,
      "step": 830
    },
    {
      "epoch": 5.93822739480752,
      "grad_norm": 3.1582112312316895,
      "learning_rate": 3.597122302158274e-08,
      "loss": 0.0263,
      "step": 831
    },
    {
      "epoch": 5.945389435989257,
      "grad_norm": 7.5621867179870605,
      "learning_rate": 2.3980815347721823e-08,
      "loss": 0.1276,
      "step": 832
    },
    {
      "epoch": 5.952551477170994,
      "grad_norm": 6.433109760284424,
      "learning_rate": 1.1990407673860912e-08,
      "loss": 0.1642,
      "step": 833
    },
    {
      "epoch": 5.95971351835273,
      "grad_norm": 13.487061500549316,
      "learning_rate": 0.0,
      "loss": 0.3048,
      "step": 834
    },
    {
      "epoch": 5.95971351835273,
      "eval_accuracy": 0.8893360160965795,
      "eval_loss": 0.280468225479126,
      "eval_runtime": 12.771,
      "eval_samples_per_second": 38.916,
      "eval_steps_per_second": 38.916,
      "step": 834
    }
  ],
  "logging_steps": 1,
  "max_steps": 834,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7006309733928960.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}