{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.20783550986385108,
"eval_steps": 34,
"global_step": 374,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005557099194220616,
"eval_loss": 2.150275230407715,
"eval_runtime": 385.5335,
"eval_samples_per_second": 7.862,
"eval_steps_per_second": 0.983,
"step": 1
},
{
"epoch": 0.0016671297582661851,
"grad_norm": 0.9782189726829529,
"learning_rate": 1.5e-05,
"loss": 8.4821,
"step": 3
},
{
"epoch": 0.0033342595165323703,
"grad_norm": 9.340497970581055,
"learning_rate": 3e-05,
"loss": 8.0841,
"step": 6
},
{
"epoch": 0.005001389274798555,
"grad_norm": 1.2668017148971558,
"learning_rate": 4.5e-05,
"loss": 9.0,
"step": 9
},
{
"epoch": 0.0066685190330647405,
"grad_norm": 1.2630752325057983,
"learning_rate": 4.999675562428437e-05,
"loss": 8.5273,
"step": 12
},
{
"epoch": 0.008335648791330925,
"grad_norm": 1.4245824813842773,
"learning_rate": 4.9979724954289244e-05,
"loss": 8.1841,
"step": 15
},
{
"epoch": 0.01000277854959711,
"grad_norm": 1.4201760292053223,
"learning_rate": 4.994810682835951e-05,
"loss": 7.569,
"step": 18
},
{
"epoch": 0.011669908307863295,
"grad_norm": 1.8734160661697388,
"learning_rate": 4.990191971059033e-05,
"loss": 7.0022,
"step": 21
},
{
"epoch": 0.013337038066129481,
"grad_norm": 1.1166455745697021,
"learning_rate": 4.984119057295783e-05,
"loss": 6.704,
"step": 24
},
{
"epoch": 0.015004167824395665,
"grad_norm": 1.3420771360397339,
"learning_rate": 4.976595487956823e-05,
"loss": 7.354,
"step": 27
},
{
"epoch": 0.01667129758266185,
"grad_norm": 1.4245537519454956,
"learning_rate": 4.967625656594782e-05,
"loss": 6.7949,
"step": 30
},
{
"epoch": 0.018338427340928037,
"grad_norm": 1.3899712562561035,
"learning_rate": 4.957214801338581e-05,
"loss": 6.8879,
"step": 33
},
{
"epoch": 0.018894137260350097,
"eval_loss": 1.6003342866897583,
"eval_runtime": 387.5811,
"eval_samples_per_second": 7.82,
"eval_steps_per_second": 0.978,
"step": 34
},
{
"epoch": 0.02000555709919422,
"grad_norm": 1.2631561756134033,
"learning_rate": 4.9453690018345144e-05,
"loss": 6.9872,
"step": 36
},
{
"epoch": 0.021672686857460405,
"grad_norm": 1.1402790546417236,
"learning_rate": 4.932095175695911e-05,
"loss": 6.2949,
"step": 39
},
{
"epoch": 0.02333981661572659,
"grad_norm": 1.0827226638793945,
"learning_rate": 4.917401074463441e-05,
"loss": 6.049,
"step": 42
},
{
"epoch": 0.025006946373992776,
"grad_norm": 1.1316773891448975,
"learning_rate": 4.901295279078431e-05,
"loss": 6.1119,
"step": 45
},
{
"epoch": 0.026674076132258962,
"grad_norm": 1.0732563734054565,
"learning_rate": 4.883787194871841e-05,
"loss": 5.899,
"step": 48
},
{
"epoch": 0.028341205890525144,
"grad_norm": 1.0603082180023193,
"learning_rate": 4.864887046071813e-05,
"loss": 5.7208,
"step": 51
},
{
"epoch": 0.03000833564879133,
"grad_norm": 1.1684924364089966,
"learning_rate": 4.8446058698330115e-05,
"loss": 6.4161,
"step": 54
},
{
"epoch": 0.03167546540705751,
"grad_norm": 1.147234320640564,
"learning_rate": 4.822955509791233e-05,
"loss": 6.0586,
"step": 57
},
{
"epoch": 0.0333425951653237,
"grad_norm": 1.4725079536437988,
"learning_rate": 4.799948609147061e-05,
"loss": 6.2541,
"step": 60
},
{
"epoch": 0.035009724923589884,
"grad_norm": 1.3403408527374268,
"learning_rate": 4.7755986032825864e-05,
"loss": 6.0201,
"step": 63
},
{
"epoch": 0.03667685468185607,
"grad_norm": 1.272765040397644,
"learning_rate": 4.74991971191553e-05,
"loss": 5.6225,
"step": 66
},
{
"epoch": 0.037788274520700195,
"eval_loss": 1.4521716833114624,
"eval_runtime": 387.3586,
"eval_samples_per_second": 7.825,
"eval_steps_per_second": 0.978,
"step": 68
},
{
"epoch": 0.038343984440122256,
"grad_norm": 1.3769108057022095,
"learning_rate": 4.7229269307953235e-05,
"loss": 5.515,
"step": 69
},
{
"epoch": 0.04001111419838844,
"grad_norm": 1.1973274946212769,
"learning_rate": 4.694636022946012e-05,
"loss": 5.7673,
"step": 72
},
{
"epoch": 0.04167824395665463,
"grad_norm": 1.3785252571105957,
"learning_rate": 4.665063509461097e-05,
"loss": 5.945,
"step": 75
},
{
"epoch": 0.04334537371492081,
"grad_norm": 1.3699363470077515,
"learning_rate": 4.6342266598556814e-05,
"loss": 5.5223,
"step": 78
},
{
"epoch": 0.045012503473187,
"grad_norm": 1.2516443729400635,
"learning_rate": 4.6021434819815555e-05,
"loss": 5.6879,
"step": 81
},
{
"epoch": 0.04667963323145318,
"grad_norm": 1.6110479831695557,
"learning_rate": 4.568832711511125e-05,
"loss": 5.6658,
"step": 84
},
{
"epoch": 0.048346762989719363,
"grad_norm": 1.2466659545898438,
"learning_rate": 4.534313800996299e-05,
"loss": 5.6652,
"step": 87
},
{
"epoch": 0.05001389274798555,
"grad_norm": 1.5405720472335815,
"learning_rate": 4.498606908508754e-05,
"loss": 5.4541,
"step": 90
},
{
"epoch": 0.051681022506251735,
"grad_norm": 1.4913195371627808,
"learning_rate": 4.46173288586818e-05,
"loss": 6.3391,
"step": 93
},
{
"epoch": 0.053348152264517924,
"grad_norm": 1.4445130825042725,
"learning_rate": 4.4237132664654154e-05,
"loss": 5.5362,
"step": 96
},
{
"epoch": 0.05501528202278411,
"grad_norm": 1.4353359937667847,
"learning_rate": 4.384570252687542e-05,
"loss": 5.6292,
"step": 99
},
{
"epoch": 0.05668241178105029,
"grad_norm": 1.515329122543335,
"learning_rate": 4.344326702952326e-05,
"loss": 5.9767,
"step": 102
},
{
"epoch": 0.05668241178105029,
"eval_loss": 1.3901065587997437,
"eval_runtime": 387.1962,
"eval_samples_per_second": 7.828,
"eval_steps_per_second": 0.979,
"step": 102
},
{
"epoch": 0.05834954153931648,
"grad_norm": 1.3371084928512573,
"learning_rate": 4.303006118359537e-05,
"loss": 5.0829,
"step": 105
},
{
"epoch": 0.06001667129758266,
"grad_norm": 1.3986601829528809,
"learning_rate": 4.260632628966974e-05,
"loss": 4.9783,
"step": 108
},
{
"epoch": 0.06168380105584885,
"grad_norm": 1.6927534341812134,
"learning_rate": 4.217230979699188e-05,
"loss": 5.5207,
"step": 111
},
{
"epoch": 0.06335093081411503,
"grad_norm": 1.4875972270965576,
"learning_rate": 4.172826515897146e-05,
"loss": 4.9577,
"step": 114
},
{
"epoch": 0.06501806057238121,
"grad_norm": 1.5819252729415894,
"learning_rate": 4.12744516851726e-05,
"loss": 5.5329,
"step": 117
},
{
"epoch": 0.0666851903306474,
"grad_norm": 1.7034679651260376,
"learning_rate": 4.0811134389884433e-05,
"loss": 5.8642,
"step": 120
},
{
"epoch": 0.06835232008891359,
"grad_norm": 1.4892022609710693,
"learning_rate": 4.0338583837360225e-05,
"loss": 4.9988,
"step": 123
},
{
"epoch": 0.07001944984717977,
"grad_norm": 1.8653110265731812,
"learning_rate": 3.985707598381544e-05,
"loss": 5.5333,
"step": 126
},
{
"epoch": 0.07168657960544596,
"grad_norm": 1.6267685890197754,
"learning_rate": 3.9366892016277096e-05,
"loss": 5.5853,
"step": 129
},
{
"epoch": 0.07335370936371215,
"grad_norm": 1.710310935974121,
"learning_rate": 3.886831818837847e-05,
"loss": 5.5769,
"step": 132
},
{
"epoch": 0.07502083912197832,
"grad_norm": 2.016270160675049,
"learning_rate": 3.8361645653195026e-05,
"loss": 5.5546,
"step": 135
},
{
"epoch": 0.07557654904140039,
"eval_loss": 1.3533298969268799,
"eval_runtime": 387.1489,
"eval_samples_per_second": 7.829,
"eval_steps_per_second": 0.979,
"step": 136
},
{
"epoch": 0.07668796888024451,
"grad_norm": 2.021454095840454,
"learning_rate": 3.784717029321922e-05,
"loss": 5.2784,
"step": 138
},
{
"epoch": 0.0783550986385107,
"grad_norm": 1.5546106100082397,
"learning_rate": 3.732519254757344e-05,
"loss": 5.21,
"step": 141
},
{
"epoch": 0.08002222839677688,
"grad_norm": 1.662156343460083,
"learning_rate": 3.679601723656205e-05,
"loss": 5.3778,
"step": 144
},
{
"epoch": 0.08168935815504307,
"grad_norm": 1.6201971769332886,
"learning_rate": 3.625995338366492e-05,
"loss": 5.4251,
"step": 147
},
{
"epoch": 0.08335648791330925,
"grad_norm": 1.6661242246627808,
"learning_rate": 3.5717314035076355e-05,
"loss": 5.292,
"step": 150
},
{
"epoch": 0.08502361767157544,
"grad_norm": 1.595383644104004,
"learning_rate": 3.516841607689501e-05,
"loss": 5.128,
"step": 153
},
{
"epoch": 0.08669074742984162,
"grad_norm": 1.7640243768692017,
"learning_rate": 3.461358005007128e-05,
"loss": 5.2638,
"step": 156
},
{
"epoch": 0.08835787718810781,
"grad_norm": 2.1148338317871094,
"learning_rate": 3.405312996322042e-05,
"loss": 5.1089,
"step": 159
},
{
"epoch": 0.090025006946374,
"grad_norm": 1.9622715711593628,
"learning_rate": 3.348739310341068e-05,
"loss": 4.7301,
"step": 162
},
{
"epoch": 0.09169213670464017,
"grad_norm": 1.920733094215393,
"learning_rate": 3.2916699845036816e-05,
"loss": 5.0066,
"step": 165
},
{
"epoch": 0.09335926646290636,
"grad_norm": 1.6611617803573608,
"learning_rate": 3.234138345689077e-05,
"loss": 5.198,
"step": 168
},
{
"epoch": 0.09447068630175048,
"eval_loss": 1.3297733068466187,
"eval_runtime": 387.3661,
"eval_samples_per_second": 7.825,
"eval_steps_per_second": 0.978,
"step": 170
},
{
"epoch": 0.09502639622117255,
"grad_norm": 1.5192731618881226,
"learning_rate": 3.17617799075421e-05,
"loss": 4.9727,
"step": 171
},
{
"epoch": 0.09669352597943873,
"grad_norm": 2.1325037479400635,
"learning_rate": 3.1178227669141744e-05,
"loss": 5.287,
"step": 174
},
{
"epoch": 0.09836065573770492,
"grad_norm": 1.6394548416137695,
"learning_rate": 3.0591067519763895e-05,
"loss": 5.0878,
"step": 177
},
{
"epoch": 0.1000277854959711,
"grad_norm": 1.954785704612732,
"learning_rate": 3.0000642344401113e-05,
"loss": 5.7474,
"step": 180
},
{
"epoch": 0.1016949152542373,
"grad_norm": 1.7333064079284668,
"learning_rate": 2.9407296934729227e-05,
"loss": 5.2069,
"step": 183
},
{
"epoch": 0.10336204501250347,
"grad_norm": 1.7775465250015259,
"learning_rate": 2.8811377787758636e-05,
"loss": 4.8365,
"step": 186
},
{
"epoch": 0.10502917477076966,
"grad_norm": 1.766340970993042,
"learning_rate": 2.8213232903489865e-05,
"loss": 4.8806,
"step": 189
},
{
"epoch": 0.10669630452903585,
"grad_norm": 2.064275026321411,
"learning_rate": 2.761321158169134e-05,
"loss": 5.1876,
"step": 192
},
{
"epoch": 0.10836343428730202,
"grad_norm": 1.731985330581665,
"learning_rate": 2.7011664217918154e-05,
"loss": 4.6924,
"step": 195
},
{
"epoch": 0.11003056404556821,
"grad_norm": 1.8852187395095825,
"learning_rate": 2.6408942098890936e-05,
"loss": 5.0911,
"step": 198
},
{
"epoch": 0.1116976938038344,
"grad_norm": 1.8446505069732666,
"learning_rate": 2.580539719735433e-05,
"loss": 5.0379,
"step": 201
},
{
"epoch": 0.11336482356210058,
"grad_norm": 1.863871455192566,
"learning_rate": 2.5201381966534748e-05,
"loss": 5.3173,
"step": 204
},
{
"epoch": 0.11336482356210058,
"eval_loss": 1.3148518800735474,
"eval_runtime": 387.4809,
"eval_samples_per_second": 7.822,
"eval_steps_per_second": 0.978,
"step": 204
},
{
"epoch": 0.11503195332036677,
"grad_norm": 2.1243629455566406,
"learning_rate": 2.459724913431772e-05,
"loss": 5.1268,
"step": 207
},
{
"epoch": 0.11669908307863296,
"grad_norm": 1.8287479877471924,
"learning_rate": 2.399335149726463e-05,
"loss": 4.8911,
"step": 210
},
{
"epoch": 0.11836621283689913,
"grad_norm": 1.827951431274414,
"learning_rate": 2.3390041714589514e-05,
"loss": 5.0788,
"step": 213
},
{
"epoch": 0.12003334259516532,
"grad_norm": 1.9181324243545532,
"learning_rate": 2.2787672102216042e-05,
"loss": 5.2716,
"step": 216
},
{
"epoch": 0.12170047235343151,
"grad_norm": 2.334996461868286,
"learning_rate": 2.2186594427034864e-05,
"loss": 5.4529,
"step": 219
},
{
"epoch": 0.1233676021116977,
"grad_norm": 2.042280673980713,
"learning_rate": 2.1587159701481716e-05,
"loss": 4.8902,
"step": 222
},
{
"epoch": 0.1250347318699639,
"grad_norm": 1.9080718755722046,
"learning_rate": 2.098971797855599e-05,
"loss": 4.933,
"step": 225
},
{
"epoch": 0.12670186162823005,
"grad_norm": 1.7101670503616333,
"learning_rate": 2.0394618147399713e-05,
"loss": 5.0186,
"step": 228
},
{
"epoch": 0.12836899138649624,
"grad_norm": 2.011359453201294,
"learning_rate": 1.980220772955602e-05,
"loss": 4.7936,
"step": 231
},
{
"epoch": 0.13003612114476243,
"grad_norm": 2.302273750305176,
"learning_rate": 1.921283267602643e-05,
"loss": 5.1487,
"step": 234
},
{
"epoch": 0.13170325090302862,
"grad_norm": 2.4797189235687256,
"learning_rate": 1.8626837165245165e-05,
"loss": 5.2862,
"step": 237
},
{
"epoch": 0.13225896082245067,
"eval_loss": 1.303543210029602,
"eval_runtime": 387.3037,
"eval_samples_per_second": 7.826,
"eval_steps_per_second": 0.979,
"step": 238
},
{
"epoch": 0.1333703806612948,
"grad_norm": 1.8348701000213623,
"learning_rate": 1.8044563402088684e-05,
"loss": 5.0605,
"step": 240
},
{
"epoch": 0.135037510419561,
"grad_norm": 2.109149217605591,
"learning_rate": 1.746635141803761e-05,
"loss": 4.9242,
"step": 243
},
{
"epoch": 0.13670464017782719,
"grad_norm": 2.1694352626800537,
"learning_rate": 1.6892538872607937e-05,
"loss": 5.0852,
"step": 246
},
{
"epoch": 0.13837176993609335,
"grad_norm": 2.145925998687744,
"learning_rate": 1.6323460856167426e-05,
"loss": 4.9473,
"step": 249
},
{
"epoch": 0.14003889969435954,
"grad_norm": 2.4143218994140625,
"learning_rate": 1.5759449694252226e-05,
"loss": 5.2909,
"step": 252
},
{
"epoch": 0.14170602945262573,
"grad_norm": 2.154897689819336,
"learning_rate": 1.5200834753498128e-05,
"loss": 5.2477,
"step": 255
},
{
"epoch": 0.14337315921089192,
"grad_norm": 2.189666986465454,
"learning_rate": 1.4647942249299707e-05,
"loss": 5.4482,
"step": 258
},
{
"epoch": 0.1450402889691581,
"grad_norm": 1.9108495712280273,
"learning_rate": 1.4101095055309746e-05,
"loss": 5.1046,
"step": 261
},
{
"epoch": 0.1467074187274243,
"grad_norm": 1.8444137573242188,
"learning_rate": 1.356061251489012e-05,
"loss": 5.0423,
"step": 264
},
{
"epoch": 0.14837454848569048,
"grad_norm": 2.031024694442749,
"learning_rate": 1.302681025462424e-05,
"loss": 4.5857,
"step": 267
},
{
"epoch": 0.15004167824395664,
"grad_norm": 2.1987197399139404,
"learning_rate": 1.2500000000000006e-05,
"loss": 5.381,
"step": 270
},
{
"epoch": 0.15115309808280078,
"eval_loss": 1.2963863611221313,
"eval_runtime": 387.5334,
"eval_samples_per_second": 7.821,
"eval_steps_per_second": 0.978,
"step": 272
},
{
"epoch": 0.15170880800222283,
"grad_norm": 1.8340719938278198,
"learning_rate": 1.1980489393370938e-05,
"loss": 5.4333,
"step": 273
},
{
"epoch": 0.15337593776048902,
"grad_norm": 2.617314577102661,
"learning_rate": 1.1468581814301717e-05,
"loss": 5.757,
"step": 276
},
{
"epoch": 0.1550430675187552,
"grad_norm": 2.0526530742645264,
"learning_rate": 1.096457620240298e-05,
"loss": 4.9528,
"step": 279
},
{
"epoch": 0.1567101972770214,
"grad_norm": 2.3239846229553223,
"learning_rate": 1.0468766882759094e-05,
"loss": 5.2481,
"step": 282
},
{
"epoch": 0.1583773270352876,
"grad_norm": 2.1533966064453125,
"learning_rate": 9.981443394050525e-06,
"loss": 5.6509,
"step": 285
},
{
"epoch": 0.16004445679355375,
"grad_norm": 1.9592647552490234,
"learning_rate": 9.502890319471491e-06,
"loss": 4.9243,
"step": 288
},
{
"epoch": 0.16171158655181994,
"grad_norm": 2.204939126968384,
"learning_rate": 9.033387120541306e-06,
"loss": 5.2745,
"step": 291
},
{
"epoch": 0.16337871631008613,
"grad_norm": 2.236279010772705,
"learning_rate": 8.573207973906735e-06,
"loss": 5.4374,
"step": 294
},
{
"epoch": 0.16504584606835232,
"grad_norm": 2.4140145778656006,
"learning_rate": 8.1226216112306e-06,
"loss": 5.4428,
"step": 297
},
{
"epoch": 0.1667129758266185,
"grad_norm": 2.0701277256011963,
"learning_rate": 7.681891162260015e-06,
"loss": 5.4502,
"step": 300
},
{
"epoch": 0.1683801055848847,
"grad_norm": 2.154461622238159,
"learning_rate": 7.251274001166044e-06,
"loss": 5.0715,
"step": 303
},
{
"epoch": 0.1700472353431509,
"grad_norm": 2.3085010051727295,
"learning_rate": 6.831021596244424e-06,
"loss": 4.8451,
"step": 306
},
{
"epoch": 0.1700472353431509,
"eval_loss": 1.2917685508728027,
"eval_runtime": 387.2177,
"eval_samples_per_second": 7.828,
"eval_steps_per_second": 0.979,
"step": 306
},
{
"epoch": 0.17171436510141705,
"grad_norm": 2.220491886138916,
"learning_rate": 6.421379363065142e-06,
"loss": 5.3908,
"step": 309
},
{
"epoch": 0.17338149485968324,
"grad_norm": 2.2047910690307617,
"learning_rate": 6.022586521156715e-06,
"loss": 5.2721,
"step": 312
},
{
"epoch": 0.17504862461794943,
"grad_norm": 2.1623401641845703,
"learning_rate": 5.634875954308638e-06,
"loss": 5.5073,
"step": 315
},
{
"epoch": 0.17671575437621562,
"grad_norm": 1.9954192638397217,
"learning_rate": 5.258474074573877e-06,
"loss": 5.1791,
"step": 318
},
{
"epoch": 0.1783828841344818,
"grad_norm": 2.24808669090271,
"learning_rate": 4.893600690050579e-06,
"loss": 5.0704,
"step": 321
},
{
"epoch": 0.180050013892748,
"grad_norm": 2.2592039108276367,
"learning_rate": 4.540468876520323e-06,
"loss": 5.0177,
"step": 324
},
{
"epoch": 0.18171714365101416,
"grad_norm": 1.9192252159118652,
"learning_rate": 4.199284853017896e-06,
"loss": 5.2738,
"step": 327
},
{
"epoch": 0.18338427340928035,
"grad_norm": 2.021440267562866,
"learning_rate": 3.8702478614051355e-06,
"loss": 4.6439,
"step": 330
},
{
"epoch": 0.18505140316754654,
"grad_norm": 2.309406042098999,
"learning_rate": 3.5535500500193357e-06,
"loss": 5.4409,
"step": 333
},
{
"epoch": 0.18671853292581272,
"grad_norm": 2.2390878200531006,
"learning_rate": 3.249376361464021e-06,
"loss": 5.1074,
"step": 336
},
{
"epoch": 0.18838566268407891,
"grad_norm": 2.540015697479248,
"learning_rate": 2.957904424607652e-06,
"loss": 5.2675,
"step": 339
},
{
"epoch": 0.18894137260350097,
"eval_loss": 1.2895426750183105,
"eval_runtime": 387.2989,
"eval_samples_per_second": 7.826,
"eval_steps_per_second": 0.979,
"step": 340
},
{
"epoch": 0.1900527924423451,
"grad_norm": 2.492077112197876,
"learning_rate": 2.679304450853401e-06,
"loss": 5.211,
"step": 342
},
{
"epoch": 0.1917199222006113,
"grad_norm": 1.7696568965911865,
"learning_rate": 2.4137391347404476e-06,
"loss": 5.3133,
"step": 345
},
{
"epoch": 0.19338705195887745,
"grad_norm": 1.9813653230667114,
"learning_rate": 2.1613635589349756e-06,
"loss": 5.0046,
"step": 348
},
{
"epoch": 0.19505418171714364,
"grad_norm": 2.0188918113708496,
"learning_rate": 1.922325103666281e-06,
"loss": 5.1457,
"step": 351
},
{
"epoch": 0.19672131147540983,
"grad_norm": 2.231536865234375,
"learning_rate": 1.696763360660808e-06,
"loss": 4.8438,
"step": 354
},
{
"epoch": 0.19838844123367602,
"grad_norm": 2.102440357208252,
"learning_rate": 1.4848100516245717e-06,
"loss": 5.4079,
"step": 357
},
{
"epoch": 0.2000555709919422,
"grad_norm": 2.369569778442383,
"learning_rate": 1.286588951321363e-06,
"loss": 5.661,
"step": 360
},
{
"epoch": 0.2017227007502084,
"grad_norm": 1.8659355640411377,
"learning_rate": 1.102215815291774e-06,
"loss": 4.9097,
"step": 363
},
{
"epoch": 0.2033898305084746,
"grad_norm": 2.071213722229004,
"learning_rate": 9.317983122552332e-07,
"loss": 5.0217,
"step": 366
},
{
"epoch": 0.20505696026674075,
"grad_norm": 2.435093402862549,
"learning_rate": 7.754359612344859e-07,
"loss": 5.44,
"step": 369
},
{
"epoch": 0.20672409002500694,
"grad_norm": 2.3098435401916504,
"learning_rate": 6.332200734393057e-07,
"loss": 5.2669,
"step": 372
},
{
"epoch": 0.20783550986385108,
"eval_loss": 1.2886923551559448,
"eval_runtime": 387.521,
"eval_samples_per_second": 7.822,
"eval_steps_per_second": 0.978,
"step": 374
}
],
"logging_steps": 3,
"max_steps": 400,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 34,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.43602420813824e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}