{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.925925925925926,
"eval_steps": 500,
"global_step": 324,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.018518518518518517,
"grad_norm": 11.543508930958351,
"learning_rate": 1.3333333333333334e-07,
"loss": 2.5502,
"step": 1
},
{
"epoch": 0.037037037037037035,
"grad_norm": 11.036573442393484,
"learning_rate": 2.6666666666666667e-07,
"loss": 2.524,
"step": 2
},
{
"epoch": 0.05555555555555555,
"grad_norm": 10.633243381981275,
"learning_rate": 4e-07,
"loss": 2.204,
"step": 3
},
{
"epoch": 0.07407407407407407,
"grad_norm": 10.844156107788931,
"learning_rate": 5.333333333333333e-07,
"loss": 2.6556,
"step": 4
},
{
"epoch": 0.09259259259259259,
"grad_norm": 10.31689604512179,
"learning_rate": 6.666666666666666e-07,
"loss": 2.3083,
"step": 5
},
{
"epoch": 0.1111111111111111,
"grad_norm": 9.817262372273788,
"learning_rate": 8e-07,
"loss": 2.4079,
"step": 6
},
{
"epoch": 0.12962962962962962,
"grad_norm": 9.116167654173315,
"learning_rate": 9.333333333333333e-07,
"loss": 2.3343,
"step": 7
},
{
"epoch": 0.14814814814814814,
"grad_norm": 7.891190287353295,
"learning_rate": 1.0666666666666667e-06,
"loss": 2.3883,
"step": 8
},
{
"epoch": 0.16666666666666666,
"grad_norm": 8.42233222280676,
"learning_rate": 1.2e-06,
"loss": 2.4733,
"step": 9
},
{
"epoch": 0.18518518518518517,
"grad_norm": 6.5902867588718825,
"learning_rate": 1.3333333333333332e-06,
"loss": 2.2598,
"step": 10
},
{
"epoch": 0.2037037037037037,
"grad_norm": 7.468618276890062,
"learning_rate": 1.4666666666666665e-06,
"loss": 2.6818,
"step": 11
},
{
"epoch": 0.2222222222222222,
"grad_norm": 6.524430399848726,
"learning_rate": 1.6e-06,
"loss": 2.0609,
"step": 12
},
{
"epoch": 0.24074074074074073,
"grad_norm": 15.819198637332978,
"learning_rate": 1.7333333333333334e-06,
"loss": 1.8734,
"step": 13
},
{
"epoch": 0.25925925925925924,
"grad_norm": 12.251404296601525,
"learning_rate": 1.8666666666666667e-06,
"loss": 2.3952,
"step": 14
},
{
"epoch": 0.2777777777777778,
"grad_norm": 12.014341658055084,
"learning_rate": 2e-06,
"loss": 2.0763,
"step": 15
},
{
"epoch": 0.2962962962962963,
"grad_norm": 9.119171460936416,
"learning_rate": 1.999948316841124e-06,
"loss": 2.2581,
"step": 16
},
{
"epoch": 0.3148148148148148,
"grad_norm": 7.0075699362300785,
"learning_rate": 1.999793272706794e-06,
"loss": 2.3189,
"step": 17
},
{
"epoch": 0.3333333333333333,
"grad_norm": 8.434551205593468,
"learning_rate": 1.9995348836233515e-06,
"loss": 2.2956,
"step": 18
},
{
"epoch": 0.35185185185185186,
"grad_norm": 9.802253199544783,
"learning_rate": 1.999173176299524e-06,
"loss": 2.1106,
"step": 19
},
{
"epoch": 0.37037037037037035,
"grad_norm": 7.364889431202562,
"learning_rate": 1.9987081881236665e-06,
"loss": 2.4001,
"step": 20
},
{
"epoch": 0.3888888888888889,
"grad_norm": 5.5030313904087995,
"learning_rate": 1.9981399671598938e-06,
"loss": 2.0534,
"step": 21
},
{
"epoch": 0.4074074074074074,
"grad_norm": 4.022498450217217,
"learning_rate": 1.997468572143115e-06,
"loss": 1.9262,
"step": 22
},
{
"epoch": 0.42592592592592593,
"grad_norm": 4.237115597250525,
"learning_rate": 1.9966940724729603e-06,
"loss": 2.2743,
"step": 23
},
{
"epoch": 0.4444444444444444,
"grad_norm": 4.300566273621826,
"learning_rate": 1.995816548206609e-06,
"loss": 2.028,
"step": 24
},
{
"epoch": 0.46296296296296297,
"grad_norm": 4.157703163471443,
"learning_rate": 1.994836090050514e-06,
"loss": 2.2021,
"step": 25
},
{
"epoch": 0.48148148148148145,
"grad_norm": 4.59475590188255,
"learning_rate": 1.993752799351023e-06,
"loss": 2.1409,
"step": 26
},
{
"epoch": 0.5,
"grad_norm": 3.553829762084,
"learning_rate": 1.992566788083908e-06,
"loss": 2.1277,
"step": 27
},
{
"epoch": 0.5185185185185185,
"grad_norm": 2.808767466788676,
"learning_rate": 1.9912781788427856e-06,
"loss": 2.074,
"step": 28
},
{
"epoch": 0.5370370370370371,
"grad_norm": 2.945994143903197,
"learning_rate": 1.989887104826449e-06,
"loss": 1.9894,
"step": 29
},
{
"epoch": 0.5555555555555556,
"grad_norm": 2.9859402190241,
"learning_rate": 1.988393709825096e-06,
"loss": 2.1096,
"step": 30
},
{
"epoch": 0.5740740740740741,
"grad_norm": 2.788646179800959,
"learning_rate": 1.9867981482054697e-06,
"loss": 2.315,
"step": 31
},
{
"epoch": 0.5925925925925926,
"grad_norm": 2.428878990731119,
"learning_rate": 1.9851005848948986e-06,
"loss": 2.1129,
"step": 32
},
{
"epoch": 0.6111111111111112,
"grad_norm": 2.326070514005508,
"learning_rate": 1.983301195364252e-06,
"loss": 2.3507,
"step": 33
},
{
"epoch": 0.6296296296296297,
"grad_norm": 2.2448623338584524,
"learning_rate": 1.9814001656098e-06,
"loss": 2.2176,
"step": 34
},
{
"epoch": 0.6481481481481481,
"grad_norm": 3.345489216172997,
"learning_rate": 1.9793976921339876e-06,
"loss": 2.0352,
"step": 35
},
{
"epoch": 0.6666666666666666,
"grad_norm": 2.454245882780074,
"learning_rate": 1.9772939819251245e-06,
"loss": 1.7644,
"step": 36
},
{
"epoch": 0.6851851851851852,
"grad_norm": 2.2823601110851115,
"learning_rate": 1.9750892524359894e-06,
"loss": 2.0044,
"step": 37
},
{
"epoch": 0.7037037037037037,
"grad_norm": 2.378703420397497,
"learning_rate": 1.9727837315613503e-06,
"loss": 1.9992,
"step": 38
},
{
"epoch": 0.7222222222222222,
"grad_norm": 2.2038000284491392,
"learning_rate": 1.9703776576144106e-06,
"loss": 2.1248,
"step": 39
},
{
"epoch": 0.7407407407407407,
"grad_norm": 1.6625652175528476,
"learning_rate": 1.9678712793021747e-06,
"loss": 1.7908,
"step": 40
},
{
"epoch": 0.7592592592592593,
"grad_norm": 1.649500064167637,
"learning_rate": 1.9652648556997396e-06,
"loss": 2.0346,
"step": 41
},
{
"epoch": 0.7777777777777778,
"grad_norm": 2.127402784391995,
"learning_rate": 1.962558656223516e-06,
"loss": 2.1544,
"step": 42
},
{
"epoch": 0.7962962962962963,
"grad_norm": 2.4572023559040668,
"learning_rate": 1.959752960603378e-06,
"loss": 1.9295,
"step": 43
},
{
"epoch": 0.8148148148148148,
"grad_norm": 1.511188510592738,
"learning_rate": 1.956848058853751e-06,
"loss": 2.1473,
"step": 44
},
{
"epoch": 0.8333333333333334,
"grad_norm": 2.6425186462750276,
"learning_rate": 1.9538442512436325e-06,
"loss": 1.7632,
"step": 45
},
{
"epoch": 0.8518518518518519,
"grad_norm": 2.528104013708182,
"learning_rate": 1.9507418482655546e-06,
"loss": 1.9125,
"step": 46
},
{
"epoch": 0.8703703703703703,
"grad_norm": 2.660072260955662,
"learning_rate": 1.947541170603488e-06,
"loss": 1.9839,
"step": 47
},
{
"epoch": 0.8888888888888888,
"grad_norm": 2.303424321729968,
"learning_rate": 1.9442425490996984e-06,
"loss": 1.8381,
"step": 48
},
{
"epoch": 0.9074074074074074,
"grad_norm": 1.7413263437826438,
"learning_rate": 1.940846324720544e-06,
"loss": 2.2322,
"step": 49
},
{
"epoch": 0.9259259259259259,
"grad_norm": 3.681741007928878,
"learning_rate": 1.9373528485212327e-06,
"loss": 2.1221,
"step": 50
},
{
"epoch": 0.9444444444444444,
"grad_norm": 2.729258330107977,
"learning_rate": 1.9337624816095357e-06,
"loss": 1.8567,
"step": 51
},
{
"epoch": 0.9629629629629629,
"grad_norm": 1.9607649593150183,
"learning_rate": 1.9300755951084592e-06,
"loss": 2.0553,
"step": 52
},
{
"epoch": 0.9814814814814815,
"grad_norm": 2.119362131138027,
"learning_rate": 1.9262925701178863e-06,
"loss": 1.936,
"step": 53
},
{
"epoch": 1.0,
"grad_norm": 2.057082578120893,
"learning_rate": 1.9224137976751793e-06,
"loss": 1.9584,
"step": 54
},
{
"epoch": 1.0185185185185186,
"grad_norm": 2.0207421134902708,
"learning_rate": 1.918439678714763e-06,
"loss": 1.9837,
"step": 55
},
{
"epoch": 1.0185185185185186,
"grad_norm": 1.938684997881939,
"learning_rate": 1.9143706240266807e-06,
"loss": 1.9354,
"step": 56
},
{
"epoch": 1.037037037037037,
"grad_norm": 2.0601195298871398,
"learning_rate": 1.910207054214133e-06,
"loss": 2.0174,
"step": 57
},
{
"epoch": 1.0555555555555556,
"grad_norm": 2.041620934780644,
"learning_rate": 1.9059493996499985e-06,
"loss": 1.7447,
"step": 58
},
{
"epoch": 1.074074074074074,
"grad_norm": 1.5682604954979573,
"learning_rate": 1.9015981004323534e-06,
"loss": 2.0106,
"step": 59
},
{
"epoch": 1.0925925925925926,
"grad_norm": 2.865965004078874,
"learning_rate": 1.8971536063389742e-06,
"loss": 2.2393,
"step": 60
},
{
"epoch": 1.1111111111111112,
"grad_norm": 2.7462581398678787,
"learning_rate": 1.89261637678085e-06,
"loss": 1.7421,
"step": 61
},
{
"epoch": 1.1296296296296295,
"grad_norm": 3.120548437283878,
"learning_rate": 1.8879868807546932e-06,
"loss": 1.9877,
"step": 62
},
{
"epoch": 1.1481481481481481,
"grad_norm": 3.242255359642735,
"learning_rate": 1.8832655967944605e-06,
"loss": 1.9799,
"step": 63
},
{
"epoch": 1.1666666666666667,
"grad_norm": 2.2159733738020275,
"learning_rate": 1.8784530129218907e-06,
"loss": 2.0581,
"step": 64
},
{
"epoch": 1.1851851851851851,
"grad_norm": 2.08989006018966,
"learning_rate": 1.873549626596057e-06,
"loss": 1.8653,
"step": 65
},
{
"epoch": 1.2037037037037037,
"grad_norm": 1.4837874153680628,
"learning_rate": 1.8685559446619487e-06,
"loss": 1.9734,
"step": 66
},
{
"epoch": 1.2222222222222223,
"grad_norm": 2.1071721482630403,
"learning_rate": 1.863472483298079e-06,
"loss": 1.7762,
"step": 67
},
{
"epoch": 1.2407407407407407,
"grad_norm": 2.6554851825477646,
"learning_rate": 1.858299767963131e-06,
"loss": 2.2267,
"step": 68
},
{
"epoch": 1.2592592592592593,
"grad_norm": 2.135758261049139,
"learning_rate": 1.8530383333416415e-06,
"loss": 2.0624,
"step": 69
},
{
"epoch": 1.2777777777777777,
"grad_norm": 2.256153463268274,
"learning_rate": 1.847688723288733e-06,
"loss": 2.0254,
"step": 70
},
{
"epoch": 1.2962962962962963,
"grad_norm": 1.9270711341308566,
"learning_rate": 1.8422514907738986e-06,
"loss": 2.0873,
"step": 71
},
{
"epoch": 1.3148148148148149,
"grad_norm": 1.3698407936967985,
"learning_rate": 1.8367271978238418e-06,
"loss": 1.5655,
"step": 72
},
{
"epoch": 1.3333333333333333,
"grad_norm": 1.7934950271719698,
"learning_rate": 1.8311164154643833e-06,
"loss": 2.1081,
"step": 73
},
{
"epoch": 1.3518518518518519,
"grad_norm": 1.7554770045810462,
"learning_rate": 1.8254197236614353e-06,
"loss": 1.6326,
"step": 74
},
{
"epoch": 1.3703703703703702,
"grad_norm": 1.7910726004582642,
"learning_rate": 1.8196377112610524e-06,
"loss": 1.9896,
"step": 75
},
{
"epoch": 1.3888888888888888,
"grad_norm": 1.43155366985165,
"learning_rate": 1.8137709759285662e-06,
"loss": 1.8557,
"step": 76
},
{
"epoch": 1.4074074074074074,
"grad_norm": 1.816009532890727,
"learning_rate": 1.8078201240868048e-06,
"loss": 1.7878,
"step": 77
},
{
"epoch": 1.425925925925926,
"grad_norm": 1.612331881267257,
"learning_rate": 1.8017857708534106e-06,
"loss": 1.982,
"step": 78
},
{
"epoch": 1.4444444444444444,
"grad_norm": 1.8628647966869196,
"learning_rate": 1.7956685399772576e-06,
"loss": 1.9704,
"step": 79
},
{
"epoch": 1.462962962962963,
"grad_norm": 1.9936817464029801,
"learning_rate": 1.7894690637739762e-06,
"loss": 1.8299,
"step": 80
},
{
"epoch": 1.4814814814814814,
"grad_norm": 2.463393533692339,
"learning_rate": 1.7831879830605936e-06,
"loss": 2.0444,
"step": 81
},
{
"epoch": 1.5,
"grad_norm": 2.4979859149192305,
"learning_rate": 1.776825947089294e-06,
"loss": 2.0278,
"step": 82
},
{
"epoch": 1.5185185185185186,
"grad_norm": 2.7584711281071606,
"learning_rate": 1.7703836134803102e-06,
"loss": 1.8715,
"step": 83
},
{
"epoch": 1.5370370370370372,
"grad_norm": 1.9266117476771798,
"learning_rate": 1.7638616481539448e-06,
"loss": 2.3658,
"step": 84
},
{
"epoch": 1.5555555555555556,
"grad_norm": 2.7609401761288908,
"learning_rate": 1.7572607252617377e-06,
"loss": 1.9736,
"step": 85
},
{
"epoch": 1.574074074074074,
"grad_norm": 2.114937054090088,
"learning_rate": 1.7505815271167822e-06,
"loss": 2.0398,
"step": 86
},
{
"epoch": 1.5925925925925926,
"grad_norm": 2.0664911123203513,
"learning_rate": 1.743824744123196e-06,
"loss": 2.1056,
"step": 87
},
{
"epoch": 1.6111111111111112,
"grad_norm": 2.5153483082090213,
"learning_rate": 1.7369910747047571e-06,
"loss": 1.8765,
"step": 88
},
{
"epoch": 1.6296296296296298,
"grad_norm": 1.8949983903048848,
"learning_rate": 1.7300812252327102e-06,
"loss": 2.1245,
"step": 89
},
{
"epoch": 1.6481481481481481,
"grad_norm": 2.7037983362018565,
"learning_rate": 1.723095909952751e-06,
"loss": 1.5174,
"step": 90
},
{
"epoch": 1.6666666666666665,
"grad_norm": 2.3396050215927673,
"learning_rate": 1.7160358509111989e-06,
"loss": 2.0559,
"step": 91
},
{
"epoch": 1.6851851851851851,
"grad_norm": 2.1357187531056976,
"learning_rate": 1.7089017778803595e-06,
"loss": 1.8264,
"step": 92
},
{
"epoch": 1.7037037037037037,
"grad_norm": 2.5298502653457358,
"learning_rate": 1.701694428283093e-06,
"loss": 2.1282,
"step": 93
},
{
"epoch": 1.7222222222222223,
"grad_norm": 2.0789215851330343,
"learning_rate": 1.6944145471165881e-06,
"loss": 2.1829,
"step": 94
},
{
"epoch": 1.7407407407407407,
"grad_norm": 1.8110067836025452,
"learning_rate": 1.6870628868753545e-06,
"loss": 1.7584,
"step": 95
},
{
"epoch": 1.7592592592592593,
"grad_norm": 2.7069181555694666,
"learning_rate": 1.6796402074734402e-06,
"loss": 1.897,
"step": 96
},
{
"epoch": 1.7777777777777777,
"grad_norm": 2.3956521553142176,
"learning_rate": 1.6721472761658836e-06,
"loss": 1.9119,
"step": 97
},
{
"epoch": 1.7962962962962963,
"grad_norm": 1.3732811625669847,
"learning_rate": 1.664584867469403e-06,
"loss": 1.6848,
"step": 98
},
{
"epoch": 1.8148148148148149,
"grad_norm": 1.9512817035138257,
"learning_rate": 1.6569537630823382e-06,
"loss": 2.0185,
"step": 99
},
{
"epoch": 1.8333333333333335,
"grad_norm": 1.864374052494234,
"learning_rate": 1.6492547518038503e-06,
"loss": 1.925,
"step": 100
},
{
"epoch": 1.8518518518518519,
"grad_norm": 1.7728078338576356,
"learning_rate": 1.6414886294523857e-06,
"loss": 1.8965,
"step": 101
},
{
"epoch": 1.8703703703703702,
"grad_norm": 1.8362690886038369,
"learning_rate": 1.6336561987834151e-06,
"loss": 1.8881,
"step": 102
},
{
"epoch": 1.8888888888888888,
"grad_norm": 3.120191999390615,
"learning_rate": 1.6257582694064556e-06,
"loss": 1.7192,
"step": 103
},
{
"epoch": 1.9074074074074074,
"grad_norm": 2.3586839267066044,
"learning_rate": 1.6177956577013846e-06,
"loss": 1.9387,
"step": 104
},
{
"epoch": 1.925925925925926,
"grad_norm": 2.779686602481001,
"learning_rate": 1.6097691867340543e-06,
"loss": 1.9497,
"step": 105
},
{
"epoch": 1.9444444444444444,
"grad_norm": 2.321935224272705,
"learning_rate": 1.6016796861712125e-06,
"loss": 1.9367,
"step": 106
},
{
"epoch": 1.9629629629629628,
"grad_norm": 2.3211469537338276,
"learning_rate": 1.5935279921947451e-06,
"loss": 1.9765,
"step": 107
},
{
"epoch": 1.9814814814814814,
"grad_norm": 1.8048838385036454,
"learning_rate": 1.585314947415242e-06,
"loss": 2.1524,
"step": 108
},
{
"epoch": 2.0,
"grad_norm": 2.2432536623121866,
"learning_rate": 1.5770414007848994e-06,
"loss": 1.7596,
"step": 109
},
{
"epoch": 2.0185185185185186,
"grad_norm": 2.1527401042322984,
"learning_rate": 1.5687082075097674e-06,
"loss": 2.1903,
"step": 110
},
{
"epoch": 2.0185185185185186,
"grad_norm": 2.822074512897879,
"learning_rate": 1.5603162289613501e-06,
"loss": 2.0324,
"step": 111
},
{
"epoch": 2.037037037037037,
"grad_norm": 1.9685786022400997,
"learning_rate": 1.551866332587568e-06,
"loss": 1.8009,
"step": 112
},
{
"epoch": 2.0555555555555554,
"grad_norm": 2.515751939304619,
"learning_rate": 1.5433593918230955e-06,
"loss": 1.9487,
"step": 113
},
{
"epoch": 2.074074074074074,
"grad_norm": 2.349862710312166,
"learning_rate": 1.5347962859990742e-06,
"loss": 1.9967,
"step": 114
},
{
"epoch": 2.0925925925925926,
"grad_norm": 3.1803776539735233,
"learning_rate": 1.5261779002522216e-06,
"loss": 2.0633,
"step": 115
},
{
"epoch": 2.111111111111111,
"grad_norm": 2.6762686321709372,
"learning_rate": 1.517505125433338e-06,
"loss": 2.1631,
"step": 116
},
{
"epoch": 2.1296296296296298,
"grad_norm": 3.17350275984332,
"learning_rate": 1.5087788580152206e-06,
"loss": 1.7666,
"step": 117
},
{
"epoch": 2.148148148148148,
"grad_norm": 2.7374508335058128,
"learning_rate": 1.5e-06,
"loss": 1.6363,
"step": 118
},
{
"epoch": 2.1666666666666665,
"grad_norm": 2.518836889589819,
"learning_rate": 1.4911694588259037e-06,
"loss": 2.0306,
"step": 119
},
{
"epoch": 2.185185185185185,
"grad_norm": 2.0516490709057438,
"learning_rate": 1.482288147273456e-06,
"loss": 1.7322,
"step": 120
},
{
"epoch": 2.2037037037037037,
"grad_norm": 2.143653181079979,
"learning_rate": 1.4733569833711299e-06,
"loss": 1.9715,
"step": 121
},
{
"epoch": 2.2222222222222223,
"grad_norm": 1.5702663497071736,
"learning_rate": 1.4643768903004504e-06,
"loss": 1.6725,
"step": 122
},
{
"epoch": 2.240740740740741,
"grad_norm": 2.1780515590527045,
"learning_rate": 1.455348796300571e-06,
"loss": 1.8871,
"step": 123
},
{
"epoch": 2.259259259259259,
"grad_norm": 1.7384200856334007,
"learning_rate": 1.4462736345723259e-06,
"loss": 1.8607,
"step": 124
},
{
"epoch": 2.2777777777777777,
"grad_norm": 2.602970978377197,
"learning_rate": 1.437152343181765e-06,
"loss": 2.0933,
"step": 125
},
{
"epoch": 2.2962962962962963,
"grad_norm": 2.2409639030493516,
"learning_rate": 1.4279858649631928e-06,
"loss": 2.1028,
"step": 126
},
{
"epoch": 2.314814814814815,
"grad_norm": 2.083427934167806,
"learning_rate": 1.4187751474217096e-06,
"loss": 1.7588,
"step": 127
},
{
"epoch": 2.3333333333333335,
"grad_norm": 1.710343556502238,
"learning_rate": 1.4095211426352718e-06,
"loss": 1.8985,
"step": 128
},
{
"epoch": 2.351851851851852,
"grad_norm": 2.4282958584597645,
"learning_rate": 1.4002248071562778e-06,
"loss": 1.8267,
"step": 129
},
{
"epoch": 2.3703703703703702,
"grad_norm": 2.2052175185263936,
"learning_rate": 1.3908871019126954e-06,
"loss": 2.254,
"step": 130
},
{
"epoch": 2.388888888888889,
"grad_norm": 2.4962771616425745,
"learning_rate": 1.3815089921087315e-06,
"loss": 1.8375,
"step": 131
},
{
"epoch": 2.4074074074074074,
"grad_norm": 2.420921240604477,
"learning_rate": 1.3720914471250642e-06,
"loss": 1.9705,
"step": 132
},
{
"epoch": 2.425925925925926,
"grad_norm": 1.6871014164962779,
"learning_rate": 1.3626354404186404e-06,
"loss": 1.866,
"step": 133
},
{
"epoch": 2.4444444444444446,
"grad_norm": 1.5220778910671986,
"learning_rate": 1.3531419494220545e-06,
"loss": 2.0116,
"step": 134
},
{
"epoch": 2.462962962962963,
"grad_norm": 1.9736590287767704,
"learning_rate": 1.343611955442513e-06,
"loss": 1.7881,
"step": 135
},
{
"epoch": 2.4814814814814814,
"grad_norm": 1.357453526449638,
"learning_rate": 1.334046443560402e-06,
"loss": 1.7624,
"step": 136
},
{
"epoch": 2.5,
"grad_norm": 1.7906511346102865,
"learning_rate": 1.324446402527462e-06,
"loss": 1.7147,
"step": 137
},
{
"epoch": 2.5185185185185186,
"grad_norm": 2.0256913340352,
"learning_rate": 1.3148128246645848e-06,
"loss": 1.657,
"step": 138
},
{
"epoch": 2.537037037037037,
"grad_norm": 2.4368648915605786,
"learning_rate": 1.3051467057592413e-06,
"loss": 1.848,
"step": 139
},
{
"epoch": 2.5555555555555554,
"grad_norm": 1.7920760208344662,
"learning_rate": 1.2954490449625491e-06,
"loss": 2.2794,
"step": 140
},
{
"epoch": 2.574074074074074,
"grad_norm": 2.5934703428783115,
"learning_rate": 1.2857208446859957e-06,
"loss": 2.1465,
"step": 141
},
{
"epoch": 2.5925925925925926,
"grad_norm": 1.788260906958661,
"learning_rate": 1.2759631104978224e-06,
"loss": 2.067,
"step": 142
},
{
"epoch": 2.611111111111111,
"grad_norm": 2.7522723362234474,
"learning_rate": 1.2661768510190816e-06,
"loss": 1.8176,
"step": 143
},
{
"epoch": 2.6296296296296298,
"grad_norm": 2.43143502900473,
"learning_rate": 1.2563630778193802e-06,
"loss": 2.3366,
"step": 144
},
{
"epoch": 2.648148148148148,
"grad_norm": 1.7241238478367036,
"learning_rate": 1.2465228053123172e-06,
"loss": 1.9895,
"step": 145
},
{
"epoch": 2.6666666666666665,
"grad_norm": 2.0266143160589802,
"learning_rate": 1.2366570506506268e-06,
"loss": 1.7781,
"step": 146
},
{
"epoch": 2.685185185185185,
"grad_norm": 1.9459670874156856,
"learning_rate": 1.226766833621041e-06,
"loss": 2.3116,
"step": 147
},
{
"epoch": 2.7037037037037037,
"grad_norm": 2.248556130449579,
"learning_rate": 1.2168531765388755e-06,
"loss": 1.8032,
"step": 148
},
{
"epoch": 2.7222222222222223,
"grad_norm": 1.711136470727862,
"learning_rate": 1.2069171041423583e-06,
"loss": 1.6228,
"step": 149
},
{
"epoch": 2.7407407407407405,
"grad_norm": 2.4614425382704352,
"learning_rate": 1.1969596434867062e-06,
"loss": 1.9709,
"step": 150
},
{
"epoch": 2.7592592592592595,
"grad_norm": 2.3445742482429788,
"learning_rate": 1.186981823837961e-06,
"loss": 2.0597,
"step": 151
},
{
"epoch": 2.7777777777777777,
"grad_norm": 1.6706837512637804,
"learning_rate": 1.1769846765665992e-06,
"loss": 1.6263,
"step": 152
},
{
"epoch": 2.7962962962962963,
"grad_norm": 1.6603060271536991,
"learning_rate": 1.1669692350409222e-06,
"loss": 1.8723,
"step": 153
},
{
"epoch": 2.814814814814815,
"grad_norm": 1.7552257393882156,
"learning_rate": 1.1569365345202413e-06,
"loss": 2.224,
"step": 154
},
{
"epoch": 2.8333333333333335,
"grad_norm": 1.3677514217091091,
"learning_rate": 1.1468876120478662e-06,
"loss": 1.897,
"step": 155
},
{
"epoch": 2.851851851851852,
"grad_norm": 1.4681588115995392,
"learning_rate": 1.1368235063439102e-06,
"loss": 1.7654,
"step": 156
},
{
"epoch": 2.8703703703703702,
"grad_norm": 1.4166676047405766,
"learning_rate": 1.1267452576979218e-06,
"loss": 1.7817,
"step": 157
},
{
"epoch": 2.888888888888889,
"grad_norm": 1.5999665116208726,
"learning_rate": 1.1166539078613525e-06,
"loss": 1.814,
"step": 158
},
{
"epoch": 2.9074074074074074,
"grad_norm": 1.8734358713251535,
"learning_rate": 1.106550499939876e-06,
"loss": 2.0783,
"step": 159
},
{
"epoch": 2.925925925925926,
"grad_norm": 1.7212322982329384,
"learning_rate": 1.0964360782855666e-06,
"loss": 2.0753,
"step": 160
},
{
"epoch": 2.9444444444444446,
"grad_norm": 2.144799198000555,
"learning_rate": 1.086311688388946e-06,
"loss": 1.8936,
"step": 161
},
{
"epoch": 2.962962962962963,
"grad_norm": 1.578076988317517,
"learning_rate": 1.076178376770918e-06,
"loss": 1.8926,
"step": 162
},
{
"epoch": 2.9814814814814814,
"grad_norm": 2.092387225323448,
"learning_rate": 1.0660371908745908e-06,
"loss": 1.8244,
"step": 163
},
{
"epoch": 3.0,
"grad_norm": 1.91051937209127,
"learning_rate": 1.0558891789570082e-06,
"loss": 1.8447,
"step": 164
},
{
"epoch": 3.0185185185185186,
"grad_norm": 2.011878655711519,
"learning_rate": 1.0457353899807946e-06,
"loss": 1.6429,
"step": 165
},
{
"epoch": 3.0185185185185186,
"grad_norm": 1.501437779159261,
"learning_rate": 1.0355768735057273e-06,
"loss": 1.8726,
"step": 166
},
{
"epoch": 3.037037037037037,
"grad_norm": 2.2762397392089597,
"learning_rate": 1.0254146795802495e-06,
"loss": 1.8501,
"step": 167
},
{
"epoch": 3.0555555555555554,
"grad_norm": 1.711019377794848,
"learning_rate": 1.015249858632926e-06,
"loss": 1.9443,
"step": 168
},
{
"epoch": 3.074074074074074,
"grad_norm": 2.1218173803583733,
"learning_rate": 1.0050834613638694e-06,
"loss": 1.5682,
"step": 169
},
{
"epoch": 3.0925925925925926,
"grad_norm": 2.2421674612074383,
"learning_rate": 9.949165386361303e-07,
"loss": 1.8014,
"step": 170
},
{
"epoch": 3.111111111111111,
"grad_norm": 2.0898372243057706,
"learning_rate": 9.847501413670742e-07,
"loss": 1.8711,
"step": 171
},
{
"epoch": 3.1296296296296298,
"grad_norm": 2.367436693252952,
"learning_rate": 9.745853204197508e-07,
"loss": 1.9004,
"step": 172
},
{
"epoch": 3.148148148148148,
"grad_norm": 1.756679866289546,
"learning_rate": 9.644231264942724e-07,
"loss": 1.8121,
"step": 173
},
{
"epoch": 3.1666666666666665,
"grad_norm": 1.8172318892802939,
"learning_rate": 9.542646100192055e-07,
"loss": 1.9013,
"step": 174
},
{
"epoch": 3.185185185185185,
"grad_norm": 1.7124997061951257,
"learning_rate": 9.441108210429921e-07,
"loss": 1.7851,
"step": 175
},
{
"epoch": 3.2037037037037037,
"grad_norm": 1.4438554381375786,
"learning_rate": 9.339628091254091e-07,
"loss": 1.5955,
"step": 176
},
{
"epoch": 3.2222222222222223,
"grad_norm": 1.4447841103018684,
"learning_rate": 9.238216232290821e-07,
"loss": 2.0907,
"step": 177
},
{
"epoch": 3.240740740740741,
"grad_norm": 1.6937928047736799,
"learning_rate": 9.136883116110541e-07,
"loss": 1.915,
"step": 178
},
{
"epoch": 3.259259259259259,
"grad_norm": 1.306322824987709,
"learning_rate": 9.035639217144334e-07,
"loss": 2.0679,
"step": 179
},
{
"epoch": 3.2777777777777777,
"grad_norm": 1.850877358174252,
"learning_rate": 8.93449500060124e-07,
"loss": 2.024,
"step": 180
},
{
"epoch": 3.2962962962962963,
"grad_norm": 1.5601775038920753,
"learning_rate": 8.833460921386477e-07,
"loss": 1.9335,
"step": 181
},
{
"epoch": 3.314814814814815,
"grad_norm": 2.270038996895677,
"learning_rate": 8.732547423020784e-07,
"loss": 2.3019,
"step": 182
},
{
"epoch": 3.3333333333333335,
"grad_norm": 1.3421300711986788,
"learning_rate": 8.631764936560899e-07,
"loss": 1.8503,
"step": 183
},
{
"epoch": 3.351851851851852,
"grad_norm": 2.071755001265988,
"learning_rate": 8.53112387952134e-07,
"loss": 1.838,
"step": 184
},
{
"epoch": 3.3703703703703702,
"grad_norm": 1.6174575169544287,
"learning_rate": 8.430634654797588e-07,
"loss": 2.2364,
"step": 185
},
{
"epoch": 3.388888888888889,
"grad_norm": 1.801580774474325,
"learning_rate": 8.330307649590779e-07,
"loss": 1.7633,
"step": 186
},
{
"epoch": 3.4074074074074074,
"grad_norm": 2.058657705709402,
"learning_rate": 8.230153234334007e-07,
"loss": 2.2177,
"step": 187
},
{
"epoch": 3.425925925925926,
"grad_norm": 1.5267427939756337,
"learning_rate": 8.130181761620392e-07,
"loss": 1.8588,
"step": 188
},
{
"epoch": 3.4444444444444446,
"grad_norm": 1.8491296560891988,
"learning_rate": 8.030403565132942e-07,
"loss": 2.0561,
"step": 189
},
{
"epoch": 3.462962962962963,
"grad_norm": 1.1987453530026493,
"learning_rate": 7.930828958576417e-07,
"loss": 2.0565,
"step": 190
},
{
"epoch": 3.4814814814814814,
"grad_norm": 1.7195298906541316,
"learning_rate": 7.831468234611247e-07,
"loss": 2.0798,
"step": 191
},
{
"epoch": 3.5,
"grad_norm": 1.20797833272688,
"learning_rate": 7.73233166378959e-07,
"loss": 1.8627,
"step": 192
},
{
"epoch": 3.5185185185185186,
"grad_norm": 1.5640684128902402,
"learning_rate": 7.633429493493729e-07,
"loss": 2.0137,
"step": 193
},
{
"epoch": 3.537037037037037,
"grad_norm": 1.6824510280578688,
"learning_rate": 7.53477194687683e-07,
"loss": 2.1517,
"step": 194
},
{
"epoch": 3.5555555555555554,
"grad_norm": 1.4155640553151332,
"learning_rate": 7.4363692218062e-07,
"loss": 1.9426,
"step": 195
},
{
"epoch": 3.574074074074074,
"grad_norm": 1.3939742232946681,
"learning_rate": 7.338231489809182e-07,
"loss": 1.7207,
"step": 196
},
{
"epoch": 3.5925925925925926,
"grad_norm": 1.4589160544776356,
"learning_rate": 7.240368895021775e-07,
"loss": 1.8217,
"step": 197
},
{
"epoch": 3.611111111111111,
"grad_norm": 1.3991775241667967,
"learning_rate": 7.142791553140044e-07,
"loss": 1.9021,
"step": 198
},
{
"epoch": 3.6296296296296298,
"grad_norm": 1.5300112446112555,
"learning_rate": 7.045509550374509e-07,
"loss": 1.9647,
"step": 199
},
{
"epoch": 3.648148148148148,
"grad_norm": 1.449273309005635,
"learning_rate": 6.948532942407587e-07,
"loss": 1.9613,
"step": 200
},
{
"epoch": 3.6666666666666665,
"grad_norm": 1.069899380500529,
"learning_rate": 6.851871753354153e-07,
"loss": 1.7452,
"step": 201
},
{
"epoch": 3.685185185185185,
"grad_norm": 1.5579308530316032,
"learning_rate": 6.755535974725379e-07,
"loss": 1.9134,
"step": 202
},
{
"epoch": 3.7037037037037037,
"grad_norm": 1.0814459794670248,
"learning_rate": 6.659535564395982e-07,
"loss": 1.6609,
"step": 203
},
{
"epoch": 3.7222222222222223,
"grad_norm": 1.8876967693657951,
"learning_rate": 6.563880445574872e-07,
"loss": 2.0948,
"step": 204
},
{
"epoch": 3.7407407407407405,
"grad_norm": 1.6093595543167938,
"learning_rate": 6.468580505779455e-07,
"loss": 1.6327,
"step": 205
},
{
"epoch": 3.7592592592592595,
"grad_norm": 1.9559640817344714,
"learning_rate": 6.373645595813596e-07,
"loss": 1.6376,
"step": 206
},
{
"epoch": 3.7777777777777777,
"grad_norm": 2.0405778845643288,
"learning_rate": 6.27908552874936e-07,
"loss": 2.1409,
"step": 207
},
{
"epoch": 3.7962962962962963,
"grad_norm": 1.230340254163767,
"learning_rate": 6.184910078912686e-07,
"loss": 1.686,
"step": 208
},
{
"epoch": 3.814814814814815,
"grad_norm": 2.171420345125834,
"learning_rate": 6.091128980873045e-07,
"loss": 1.9347,
"step": 209
},
{
"epoch": 3.8333333333333335,
"grad_norm": 1.8008532771859842,
"learning_rate": 5.997751928437219e-07,
"loss": 2.1292,
"step": 210
},
{
"epoch": 3.851851851851852,
"grad_norm": 1.502892647903443,
"learning_rate": 5.904788573647282e-07,
"loss": 1.7302,
"step": 211
},
{
"epoch": 3.8703703703703702,
"grad_norm": 1.4720170454603325,
"learning_rate": 5.812248525782901e-07,
"loss": 1.6652,
"step": 212
},
{
"epoch": 3.888888888888889,
"grad_norm": 1.4078435809618528,
"learning_rate": 5.720141350368072e-07,
"loss": 1.7847,
"step": 213
},
{
"epoch": 3.9074074074074074,
"grad_norm": 1.2860107867972834,
"learning_rate": 5.628476568182349e-07,
"loss": 1.818,
"step": 214
},
{
"epoch": 3.925925925925926,
"grad_norm": 1.5761560916907795,
"learning_rate": 5.537263654276743e-07,
"loss": 1.787,
"step": 215
},
{
"epoch": 3.9444444444444446,
"grad_norm": 1.463921943518727,
"learning_rate": 5.446512036994286e-07,
"loss": 1.9223,
"step": 216
},
{
"epoch": 3.962962962962963,
"grad_norm": 1.2770391505323755,
"learning_rate": 5.356231096995499e-07,
"loss": 1.5593,
"step": 217
},
{
"epoch": 3.9814814814814814,
"grad_norm": 1.4711865688844035,
"learning_rate": 5.266430166288704e-07,
"loss": 2.0863,
"step": 218
},
{
"epoch": 4.0,
"grad_norm": 1.1447313661292717,
"learning_rate": 5.177118527265437e-07,
"loss": 1.9428,
"step": 219
},
{
"epoch": 4.018518518518519,
"grad_norm": 1.6196943319397998,
"learning_rate": 5.088305411740965e-07,
"loss": 2.2068,
"step": 220
},
{
"epoch": 4.018518518518518,
"grad_norm": 1.2766493962889875,
"learning_rate": 5.000000000000002e-07,
"loss": 1.7437,
"step": 221
},
{
"epoch": 4.037037037037037,
"grad_norm": 1.594306405599087,
"learning_rate": 4.912211419847793e-07,
"loss": 2.0219,
"step": 222
},
{
"epoch": 4.055555555555555,
"grad_norm": 1.227716475966799,
"learning_rate": 4.82494874566662e-07,
"loss": 2.187,
"step": 223
},
{
"epoch": 4.074074074074074,
"grad_norm": 1.2852396998354376,
"learning_rate": 4.738220997477784e-07,
"loss": 1.8363,
"step": 224
},
{
"epoch": 4.092592592592593,
"grad_norm": 1.0923893050000644,
"learning_rate": 4.6520371400092584e-07,
"loss": 1.7177,
"step": 225
},
{
"epoch": 4.111111111111111,
"grad_norm": 1.1495819987216884,
"learning_rate": 4.5664060817690476e-07,
"loss": 2.0734,
"step": 226
},
{
"epoch": 4.12962962962963,
"grad_norm": 1.1120083230916684,
"learning_rate": 4.481336674124323e-07,
"loss": 1.7847,
"step": 227
},
{
"epoch": 4.148148148148148,
"grad_norm": 0.9789098979808262,
"learning_rate": 4.3968377103865016e-07,
"loss": 1.7989,
"step": 228
},
{
"epoch": 4.166666666666667,
"grad_norm": 0.9342477457439083,
"learning_rate": 4.3129179249023274e-07,
"loss": 1.6785,
"step": 229
},
{
"epoch": 4.185185185185185,
"grad_norm": 1.0718449337061247,
"learning_rate": 4.229585992151006e-07,
"loss": 1.7953,
"step": 230
},
{
"epoch": 4.203703703703703,
"grad_norm": 1.1500516991492213,
"learning_rate": 4.1468505258475784e-07,
"loss": 1.3975,
"step": 231
},
{
"epoch": 4.222222222222222,
"grad_norm": 0.9650831232767911,
"learning_rate": 4.0647200780525483e-07,
"loss": 1.8603,
"step": 232
},
{
"epoch": 4.2407407407407405,
"grad_norm": 1.0207088687244406,
"learning_rate": 3.983203138287876e-07,
"loss": 1.9807,
"step": 233
},
{
"epoch": 4.2592592592592595,
"grad_norm": 1.1991752171611891,
"learning_rate": 3.9023081326594564e-07,
"loss": 2.2322,
"step": 234
},
{
"epoch": 4.277777777777778,
"grad_norm": 1.0807801212200088,
"learning_rate": 3.822043422986153e-07,
"loss": 1.6295,
"step": 235
},
{
"epoch": 4.296296296296296,
"grad_norm": 1.0103392155699495,
"learning_rate": 3.742417305935442e-07,
"loss": 1.7882,
"step": 236
},
{
"epoch": 4.314814814814815,
"grad_norm": 1.0657639750720669,
"learning_rate": 3.663438012165848e-07,
"loss": 1.6027,
"step": 237
},
{
"epoch": 4.333333333333333,
"grad_norm": 0.9495451533397854,
"learning_rate": 3.5851137054761426e-07,
"loss": 1.8212,
"step": 238
},
{
"epoch": 4.351851851851852,
"grad_norm": 1.0780389016215326,
"learning_rate": 3.507452481961495e-07,
"loss": 1.6304,
"step": 239
},
{
"epoch": 4.37037037037037,
"grad_norm": 1.0244203325558825,
"learning_rate": 3.430462369176619e-07,
"loss": 1.9347,
"step": 240
},
{
"epoch": 4.388888888888889,
"grad_norm": 0.9762810523750869,
"learning_rate": 3.3541513253059726e-07,
"loss": 2.0351,
"step": 241
},
{
"epoch": 4.407407407407407,
"grad_norm": 0.8894982063199672,
"learning_rate": 3.278527238341163e-07,
"loss": 1.7788,
"step": 242
},
{
"epoch": 4.425925925925926,
"grad_norm": 0.9573443483478868,
"learning_rate": 3.2035979252655976e-07,
"loss": 1.6824,
"step": 243
},
{
"epoch": 4.444444444444445,
"grad_norm": 0.878347387417952,
"learning_rate": 3.129371131246459e-07,
"loss": 1.7893,
"step": 244
},
{
"epoch": 4.462962962962963,
"grad_norm": 1.2131347174643223,
"learning_rate": 3.05585452883412e-07,
"loss": 2.4755,
"step": 245
},
{
"epoch": 4.481481481481482,
"grad_norm": 0.9278993006726863,
"learning_rate": 2.9830557171690693e-07,
"loss": 2.051,
"step": 246
},
{
"epoch": 4.5,
"grad_norm": 0.9769923688632531,
"learning_rate": 2.910982221196404e-07,
"loss": 1.8307,
"step": 247
},
{
"epoch": 4.518518518518518,
"grad_norm": 1.0084007217465136,
"learning_rate": 2.8396414908880095e-07,
"loss": 2.0386,
"step": 248
},
{
"epoch": 4.537037037037037,
"grad_norm": 1.0273787706173494,
"learning_rate": 2.769040900472488e-07,
"loss": 1.9072,
"step": 249
},
{
"epoch": 4.555555555555555,
"grad_norm": 0.8621559648712259,
"learning_rate": 2.6991877476728985e-07,
"loss": 1.706,
"step": 250
},
{
"epoch": 4.574074074074074,
"grad_norm": 0.8247377172080764,
"learning_rate": 2.6300892529524264e-07,
"loss": 1.8414,
"step": 251
},
{
"epoch": 4.592592592592593,
"grad_norm": 0.8925073470001154,
"learning_rate": 2.56175255876804e-07,
"loss": 1.9007,
"step": 252
},
{
"epoch": 4.611111111111111,
"grad_norm": 0.7860274094152706,
"learning_rate": 2.494184728832179e-07,
"loss": 1.8654,
"step": 253
},
{
"epoch": 4.62962962962963,
"grad_norm": 0.8936613069940655,
"learning_rate": 2.427392747382623e-07,
"loss": 1.6996,
"step": 254
},
{
"epoch": 4.648148148148148,
"grad_norm": 1.0827181264619206,
"learning_rate": 2.3613835184605523e-07,
"loss": 1.9413,
"step": 255
},
{
"epoch": 4.666666666666667,
"grad_norm": 0.8918696543620299,
"learning_rate": 2.2961638651968974e-07,
"loss": 1.856,
"step": 256
},
{
"epoch": 4.685185185185185,
"grad_norm": 0.9976782397503938,
"learning_rate": 2.2317405291070567e-07,
"loss": 1.8228,
"step": 257
},
{
"epoch": 4.703703703703704,
"grad_norm": 1.021922767232776,
"learning_rate": 2.1681201693940666e-07,
"loss": 2.0057,
"step": 258
},
{
"epoch": 4.722222222222222,
"grad_norm": 1.1673943142630625,
"learning_rate": 2.1053093622602402e-07,
"loss": 1.9204,
"step": 259
},
{
"epoch": 4.7407407407407405,
"grad_norm": 1.0912049168909328,
"learning_rate": 2.043314600227425e-07,
"loss": 1.8173,
"step": 260
},
{
"epoch": 4.7592592592592595,
"grad_norm": 0.9358084522077252,
"learning_rate": 1.9821422914658957e-07,
"loss": 2.0846,
"step": 261
},
{
"epoch": 4.777777777777778,
"grad_norm": 1.0481784665647413,
"learning_rate": 1.921798759131953e-07,
"loss": 1.9789,
"step": 262
},
{
"epoch": 4.796296296296296,
"grad_norm": 0.983048254792995,
"learning_rate": 1.8622902407143392e-07,
"loss": 1.9294,
"step": 263
},
{
"epoch": 4.814814814814815,
"grad_norm": 0.8359638487960833,
"learning_rate": 1.8036228873894744e-07,
"loss": 1.7806,
"step": 264
},
{
"epoch": 4.833333333333333,
"grad_norm": 1.1295927764034195,
"learning_rate": 1.7458027633856475e-07,
"loss": 1.9495,
"step": 265
},
{
"epoch": 4.851851851851852,
"grad_norm": 1.1032897990848558,
"learning_rate": 1.6888358453561646e-07,
"loss": 2.0724,
"step": 266
},
{
"epoch": 4.87037037037037,
"grad_norm": 0.855002738874884,
"learning_rate": 1.632728021761579e-07,
"loss": 2.102,
"step": 267
},
{
"epoch": 4.888888888888889,
"grad_norm": 1.0646161730662291,
"learning_rate": 1.5774850922610116e-07,
"loss": 1.9046,
"step": 268
},
{
"epoch": 4.907407407407407,
"grad_norm": 1.0109654313968932,
"learning_rate": 1.5231127671126676e-07,
"loss": 2.0854,
"step": 269
},
{
"epoch": 4.925925925925926,
"grad_norm": 0.9390534047671891,
"learning_rate": 1.4696166665835852e-07,
"loss": 2.1436,
"step": 270
},
{
"epoch": 4.944444444444445,
"grad_norm": 0.9838446669064714,
"learning_rate": 1.4170023203686875e-07,
"loss": 1.9317,
"step": 271
},
{
"epoch": 4.962962962962963,
"grad_norm": 1.0678273880700424,
"learning_rate": 1.3652751670192075e-07,
"loss": 1.8309,
"step": 272
},
{
"epoch": 4.981481481481482,
"grad_norm": 1.1853311551704062,
"learning_rate": 1.3144405533805136e-07,
"loss": 1.948,
"step": 273
},
{
"epoch": 5.0,
"grad_norm": 1.0844767215232378,
"learning_rate": 1.2645037340394281e-07,
"loss": 2.1066,
"step": 274
},
{
"epoch": 5.018518518518518,
"grad_norm": 0.8509695959322425,
"learning_rate": 1.2154698707810928e-07,
"loss": 1.9217,
"step": 275
},
{
"epoch": 5.037037037037037,
"grad_norm": 0.9599815386335595,
"learning_rate": 1.167344032055394e-07,
"loss": 1.9898,
"step": 276
},
{
"epoch": 5.055555555555555,
"grad_norm": 0.9561022219351966,
"learning_rate": 1.1201311924530688e-07,
"loss": 1.6967,
"step": 277
},
{
"epoch": 5.074074074074074,
"grad_norm": 0.8614534074294055,
"learning_rate": 1.0738362321914995e-07,
"loss": 1.7586,
"step": 278
},
{
"epoch": 5.092592592592593,
"grad_norm": 0.884706815883145,
"learning_rate": 1.0284639366102598e-07,
"loss": 1.8692,
"step": 279
},
{
"epoch": 5.111111111111111,
"grad_norm": 0.8641496604329509,
"learning_rate": 9.840189956764677e-08,
"loss": 2.1101,
"step": 280
},
{
"epoch": 5.12962962962963,
"grad_norm": 0.8465414034017087,
"learning_rate": 9.405060035000134e-08,
"loss": 1.7827,
"step": 281
},
{
"epoch": 5.148148148148148,
"grad_norm": 0.6966794157650356,
"learning_rate": 8.979294578586738e-08,
"loss": 1.6446,
"step": 282
},
{
"epoch": 5.166666666666667,
"grad_norm": 0.8581271311276034,
"learning_rate": 8.562937597331898e-08,
"loss": 1.7243,
"step": 283
},
{
"epoch": 5.185185185185185,
"grad_norm": 0.9976947326325505,
"learning_rate": 8.156032128523694e-08,
"loss": 1.8994,
"step": 284
},
{
"epoch": 5.203703703703703,
"grad_norm": 0.9786757162446749,
"learning_rate": 7.758620232482083e-08,
"loss": 1.8625,
"step": 285
},
{
"epoch": 5.222222222222222,
"grad_norm": 0.7563393752170862,
"learning_rate": 7.370742988211364e-08,
"loss": 1.7512,
"step": 286
},
{
"epoch": 5.2407407407407405,
"grad_norm": 0.7955178168012043,
"learning_rate": 6.99244048915405e-08,
"loss": 2.2105,
"step": 287
},
{
"epoch": 5.2592592592592595,
"grad_norm": 0.8951178929520269,
"learning_rate": 6.623751839046455e-08,
"loss": 1.8276,
"step": 288
},
{
"epoch": 5.277777777777778,
"grad_norm": 0.9912120605663316,
"learning_rate": 6.264715147876742e-08,
"loss": 2.2784,
"step": 289
},
{
"epoch": 5.296296296296296,
"grad_norm": 0.759976000502015,
"learning_rate": 5.915367527945614e-08,
"loss": 1.9346,
"step": 290
},
{
"epoch": 5.314814814814815,
"grad_norm": 0.7423091105639062,
"learning_rate": 5.575745090030137e-08,
"loss": 1.8795,
"step": 291
},
{
"epoch": 5.333333333333333,
"grad_norm": 0.7811530255930925,
"learning_rate": 5.245882939651181e-08,
"loss": 2.0584,
"step": 292
},
{
"epoch": 5.351851851851852,
"grad_norm": 0.9202352755672565,
"learning_rate": 4.9258151734445694e-08,
"loss": 2.0563,
"step": 293
},
{
"epoch": 5.37037037037037,
"grad_norm": 0.7972657702760176,
"learning_rate": 4.6155748756367294e-08,
"loss": 1.8333,
"step": 294
},
{
"epoch": 5.388888888888889,
"grad_norm": 0.6829451582697305,
"learning_rate": 4.3151941146248873e-08,
"loss": 1.9896,
"step": 295
},
{
"epoch": 5.407407407407407,
"grad_norm": 0.7886670762082094,
"learning_rate": 4.0247039396622e-08,
"loss": 1.8183,
"step": 296
},
{
"epoch": 5.425925925925926,
"grad_norm": 0.8840244220041553,
"learning_rate": 3.7441343776484113e-08,
"loss": 1.9354,
"step": 297
},
{
"epoch": 5.444444444444445,
"grad_norm": 0.71587738270711,
"learning_rate": 3.4735144300260255e-08,
"loss": 2.0167,
"step": 298
},
{
"epoch": 5.462962962962963,
"grad_norm": 0.7108094024246895,
"learning_rate": 3.212872069782513e-08,
"loss": 1.7169,
"step": 299
},
{
"epoch": 5.481481481481482,
"grad_norm": 0.6662930242485889,
"learning_rate": 2.962234238558925e-08,
"loss": 2.2062,
"step": 300
},
{
"epoch": 5.5,
"grad_norm": 0.7122621954506775,
"learning_rate": 2.721626843864977e-08,
"loss": 2.0591,
"step": 301
},
{
"epoch": 5.518518518518518,
"grad_norm": 0.626318180659774,
"learning_rate": 2.491074756401068e-08,
"loss": 1.5866,
"step": 302
},
{
"epoch": 5.537037037037037,
"grad_norm": 0.6909592708288532,
"learning_rate": 2.2706018074875043e-08,
"loss": 1.9005,
"step": 303
},
{
"epoch": 5.555555555555555,
"grad_norm": 0.7144569439769612,
"learning_rate": 2.0602307866012246e-08,
"loss": 2.0294,
"step": 304
},
{
"epoch": 5.574074074074074,
"grad_norm": 0.684647174393133,
"learning_rate": 1.8599834390199853e-08,
"loss": 1.6046,
"step": 305
},
{
"epoch": 5.592592592592593,
"grad_norm": 0.7752801436279185,
"learning_rate": 1.6698804635747576e-08,
"loss": 1.7937,
"step": 306
},
{
"epoch": 5.611111111111111,
"grad_norm": 0.6862611972609113,
"learning_rate": 1.4899415105101066e-08,
"loss": 1.7256,
"step": 307
},
{
"epoch": 5.62962962962963,
"grad_norm": 0.6608135193001434,
"learning_rate": 1.3201851794530371e-08,
"loss": 1.7763,
"step": 308
},
{
"epoch": 5.648148148148148,
"grad_norm": 0.7625095579861546,
"learning_rate": 1.1606290174903888e-08,
"loss": 2.0082,
"step": 309
},
{
"epoch": 5.666666666666667,
"grad_norm": 0.6914220267730987,
"learning_rate": 1.0112895173551183e-08,
"loss": 1.9359,
"step": 310
},
{
"epoch": 5.685185185185185,
"grad_norm": 0.6505975431309626,
"learning_rate": 8.721821157214316e-09,
"loss": 1.9317,
"step": 311
},
{
"epoch": 5.703703703703704,
"grad_norm": 0.6947915176450158,
"learning_rate": 7.433211916092141e-09,
"loss": 1.6243,
"step": 312
},
{
"epoch": 5.722222222222222,
"grad_norm": 0.6360099423433963,
"learning_rate": 6.247200648976991e-09,
"loss": 1.9931,
"step": 313
},
{
"epoch": 5.7407407407407405,
"grad_norm": 0.6796797146249973,
"learning_rate": 5.163909949486233e-09,
"loss": 1.9858,
"step": 314
},
{
"epoch": 5.7592592592592595,
"grad_norm": 0.7636965994787633,
"learning_rate": 4.183451793390747e-09,
"loss": 1.8201,
"step": 315
},
{
"epoch": 5.777777777777778,
"grad_norm": 0.6434704456483539,
"learning_rate": 3.30592752703962e-09,
"loss": 1.5983,
"step": 316
},
{
"epoch": 5.796296296296296,
"grad_norm": 0.6697682736960676,
"learning_rate": 2.531427856885093e-09,
"loss": 1.985,
"step": 317
},
{
"epoch": 5.814814814814815,
"grad_norm": 0.657234650874368,
"learning_rate": 1.8600328401061627e-09,
"loss": 2.0918,
"step": 318
},
{
"epoch": 5.833333333333333,
"grad_norm": 0.6249721935624161,
"learning_rate": 1.2918118763335372e-09,
"loss": 2.1123,
"step": 319
},
{
"epoch": 5.851851851851852,
"grad_norm": 0.7274585554347512,
"learning_rate": 8.268237004757095e-10,
"loss": 2.2962,
"step": 320
},
{
"epoch": 5.87037037037037,
"grad_norm": 0.6060864357328691,
"learning_rate": 4.651163766484778e-10,
"loss": 1.6461,
"step": 321
},
{
"epoch": 5.888888888888889,
"grad_norm": 0.6626618247650778,
"learning_rate": 2.0672729320581063e-10,
"loss": 2.0178,
"step": 322
},
{
"epoch": 5.907407407407407,
"grad_norm": 0.5905608542721459,
"learning_rate": 5.1683158875936994e-11,
"loss": 1.7269,
"step": 323
},
{
"epoch": 5.925925925925926,
"grad_norm": 0.7138681736753105,
"learning_rate": 0.0,
"loss": 2.11,
"step": 324
}
],
"logging_steps": 1,
"max_steps": 324,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 54,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 128024720179200.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}