Training in progress, step 448, checkpoint

Browse files

Files changed (6) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +1537 -4
last-checkpoint/training_args.bin +1 -1

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ab8d748fd8f20e14a3cda2d877038ce59887130178c068dcafb79817213c24e1
 size 891644712

 version https://git-lfs.github.com/spec/v1
+oid sha256:59dd4181cb7b12c18778fce401b831547648cbbc21eaca760b5c6c36b98a4637
 size 891644712

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:00bca8fedb1bc6c25e6e507abe70894fe1954894c658291feb63df23505cbe3e
 size 1783444794

 version https://git-lfs.github.com/spec/v1
+oid sha256:d37828ba5932074afc8a3ca7276b64a27b5b48490193b1197d05ecc97f5a3313
 size 1783444794

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:70357fb90a1dcb5ff229b8259118cfc08b286f367541b00f95c942e294080e49
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:6ba0efe6975e5f5bc0ac8e3f88ad39f73f4c39d66d2d03cc9bdba96c5706b8d3
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d7c5edc54ecb4a3dbb1bb6fd6f91c69141389679273cc6bb632b191180900f3c
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:7fc32074b9fff6edfb63f80aa132a15f80c948014a8e656aa4df28416ac2e074
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.005546311702717693,
   "eval_steps": 500,
-  "global_step": 10,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -42,13 +42,1546 @@
       "learning_rate": 1.834862385321101e-05,
       "loss": 12.5906,
       "step": 10
     }
   ],
   "logging_steps": 2,
   "max_steps": 3606,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 2,
-  "save_steps": 10,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
@@ -61,7 +1594,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 24358315622400.0,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.24847476428175264,
   "eval_steps": 500,
+  "global_step": 448,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "learning_rate": 1.834862385321101e-05,
       "loss": 12.5906,
       "step": 10
+    },
+    {
+      "epoch": 0.0066555740432612314,
+      "grad_norm": 44.9824333190918,
+      "learning_rate": 2.2018348623853213e-05,
+      "loss": 11.0436,
+      "step": 12
+    },
+    {
+      "epoch": 0.00776483638380477,
+      "grad_norm": 34.28094482421875,
+      "learning_rate": 2.5688073394495416e-05,
+      "loss": 9.9942,
+      "step": 14
+    },
+    {
+      "epoch": 0.00887409872434831,
+      "grad_norm": 28.417001724243164,
+      "learning_rate": 2.9357798165137618e-05,
+      "loss": 9.6041,
+      "step": 16
+    },
+    {
+      "epoch": 0.009983361064891847,
+      "grad_norm": 26.446611404418945,
+      "learning_rate": 3.302752293577982e-05,
+      "loss": 8.4555,
+      "step": 18
+    },
+    {
+      "epoch": 0.011092623405435386,
+      "grad_norm": 29.56342315673828,
+      "learning_rate": 3.669724770642202e-05,
+      "loss": 7.7889,
+      "step": 20
+    },
+    {
+      "epoch": 0.012201885745978924,
+      "grad_norm": 62.20895004272461,
+      "learning_rate": 4.036697247706422e-05,
+      "loss": 6.0004,
+      "step": 22
+    },
+    {
+      "epoch": 0.013311148086522463,
+      "grad_norm": 26.70697784423828,
+      "learning_rate": 4.403669724770643e-05,
+      "loss": 5.2102,
+      "step": 24
+    },
+    {
+      "epoch": 0.014420410427066,
+      "grad_norm": 20.650150299072266,
+      "learning_rate": 4.7706422018348626e-05,
+      "loss": 3.786,
+      "step": 26
+    },
+    {
+      "epoch": 0.01552967276760954,
+      "grad_norm": 18.51991081237793,
+      "learning_rate": 5.137614678899083e-05,
+      "loss": 2.7969,
+      "step": 28
+    },
+    {
+      "epoch": 0.016638935108153077,
+      "grad_norm": 11.624157905578613,
+      "learning_rate": 5.504587155963303e-05,
+      "loss": 1.9075,
+      "step": 30
+    },
+    {
+      "epoch": 0.01774819744869662,
+      "grad_norm": 4.947307109832764,
+      "learning_rate": 5.8715596330275236e-05,
+      "loss": 1.4263,
+      "step": 32
+    },
+    {
+      "epoch": 0.018857459789240156,
+      "grad_norm": 2.2392783164978027,
+      "learning_rate": 6.238532110091744e-05,
+      "loss": 1.3292,
+      "step": 34
+    },
+    {
+      "epoch": 0.019966722129783693,
+      "grad_norm": 1.7417014837265015,
+      "learning_rate": 6.605504587155963e-05,
+      "loss": 1.128,
+      "step": 36
+    },
+    {
+      "epoch": 0.02107598447032723,
+      "grad_norm": 1.061747431755066,
+      "learning_rate": 6.972477064220184e-05,
+      "loss": 0.9133,
+      "step": 38
+    },
+    {
+      "epoch": 0.022185246810870772,
+      "grad_norm": 1.0621497631072998,
+      "learning_rate": 7.339449541284404e-05,
+      "loss": 0.8753,
+      "step": 40
+    },
+    {
+      "epoch": 0.02329450915141431,
+      "grad_norm": 0.772523045539856,
+      "learning_rate": 7.706422018348625e-05,
+      "loss": 0.6989,
+      "step": 42
+    },
+    {
+      "epoch": 0.024403771491957847,
+      "grad_norm": 0.6044029593467712,
+      "learning_rate": 8.073394495412844e-05,
+      "loss": 0.7296,
+      "step": 44
+    },
+    {
+      "epoch": 0.025513033832501388,
+      "grad_norm": 0.5906920433044434,
+      "learning_rate": 8.440366972477065e-05,
+      "loss": 0.6963,
+      "step": 46
+    },
+    {
+      "epoch": 0.026622296173044926,
+      "grad_norm": 0.5078668594360352,
+      "learning_rate": 8.807339449541285e-05,
+      "loss": 0.6978,
+      "step": 48
+    },
+    {
+      "epoch": 0.027731558513588463,
+      "grad_norm": 0.43668779730796814,
+      "learning_rate": 9.174311926605506e-05,
+      "loss": 0.6444,
+      "step": 50
+    },
+    {
+      "epoch": 0.028840820854132,
+      "grad_norm": 0.5769929885864258,
+      "learning_rate": 9.541284403669725e-05,
+      "loss": 0.6321,
+      "step": 52
+    },
+    {
+      "epoch": 0.029950083194675542,
+      "grad_norm": 0.6032134890556335,
+      "learning_rate": 9.908256880733946e-05,
+      "loss": 0.6399,
+      "step": 54
+    },
+    {
+      "epoch": 0.03105934553521908,
+      "grad_norm": 0.5968932509422302,
+      "learning_rate": 0.00010275229357798166,
+      "loss": 0.6707,
+      "step": 56
+    },
+    {
+      "epoch": 0.03216860787576262,
+      "grad_norm": 0.5290119051933289,
+      "learning_rate": 0.00010642201834862387,
+      "loss": 0.8516,
+      "step": 58
+    },
+    {
+      "epoch": 0.033277870216306155,
+      "grad_norm": 0.4980998635292053,
+      "learning_rate": 0.00011009174311926606,
+      "loss": 0.6211,
+      "step": 60
+    },
+    {
+      "epoch": 0.03438713255684969,
+      "grad_norm": 0.6519685387611389,
+      "learning_rate": 0.00011376146788990827,
+      "loss": 0.7251,
+      "step": 62
+    },
+    {
+      "epoch": 0.03549639489739324,
+      "grad_norm": 0.6212429404258728,
+      "learning_rate": 0.00011743119266055047,
+      "loss": 0.4878,
+      "step": 64
+    },
+    {
+      "epoch": 0.036605657237936774,
+      "grad_norm": 0.7839425802230835,
+      "learning_rate": 0.00012110091743119268,
+      "loss": 0.6157,
+      "step": 66
+    },
+    {
+      "epoch": 0.03771491957848031,
+      "grad_norm": 0.5585289597511292,
+      "learning_rate": 0.00012477064220183488,
+      "loss": 0.6334,
+      "step": 68
+    },
+    {
+      "epoch": 0.03882418191902385,
+      "grad_norm": 0.4180700480937958,
+      "learning_rate": 0.00012844036697247707,
+      "loss": 0.639,
+      "step": 70
+    },
+    {
+      "epoch": 0.03993344425956739,
+      "grad_norm": 0.4288474917411804,
+      "learning_rate": 0.00013211009174311927,
+      "loss": 0.6489,
+      "step": 72
+    },
+    {
+      "epoch": 0.041042706600110924,
+      "grad_norm": 0.391681045293808,
+      "learning_rate": 0.00013577981651376149,
+      "loss": 0.53,
+      "step": 74
+    },
+    {
+      "epoch": 0.04215196894065446,
+      "grad_norm": 0.4582154154777527,
+      "learning_rate": 0.00013944954128440368,
+      "loss": 0.6368,
+      "step": 76
+    },
+    {
+      "epoch": 0.04326123128119801,
+      "grad_norm": 0.5818586945533752,
+      "learning_rate": 0.0001431192660550459,
+      "loss": 0.5243,
+      "step": 78
+    },
+    {
+      "epoch": 0.044370493621741544,
+      "grad_norm": 0.5001092553138733,
+      "learning_rate": 0.0001467889908256881,
+      "loss": 0.4512,
+      "step": 80
+    },
+    {
+      "epoch": 0.04547975596228508,
+      "grad_norm": 0.37270447611808777,
+      "learning_rate": 0.00015045871559633028,
+      "loss": 0.582,
+      "step": 82
+    },
+    {
+      "epoch": 0.04658901830282862,
+      "grad_norm": 0.3746323585510254,
+      "learning_rate": 0.0001541284403669725,
+      "loss": 0.4406,
+      "step": 84
+    },
+    {
+      "epoch": 0.04769828064337216,
+      "grad_norm": 0.3230627775192261,
+      "learning_rate": 0.0001577981651376147,
+      "loss": 0.4755,
+      "step": 86
+    },
+    {
+      "epoch": 0.048807542983915694,
+      "grad_norm": 0.46364355087280273,
+      "learning_rate": 0.00016146788990825688,
+      "loss": 0.7201,
+      "step": 88
+    },
+    {
+      "epoch": 0.04991680532445923,
+      "grad_norm": 0.37719470262527466,
+      "learning_rate": 0.0001651376146788991,
+      "loss": 0.5318,
+      "step": 90
+    },
+    {
+      "epoch": 0.051026067665002776,
+      "grad_norm": 0.3767435550689697,
+      "learning_rate": 0.0001688073394495413,
+      "loss": 0.4638,
+      "step": 92
+    },
+    {
+      "epoch": 0.052135330005546314,
+      "grad_norm": 0.42447295784950256,
+      "learning_rate": 0.00017247706422018351,
+      "loss": 0.6399,
+      "step": 94
+    },
+    {
+      "epoch": 0.05324459234608985,
+      "grad_norm": 0.33117732405662537,
+      "learning_rate": 0.0001761467889908257,
+      "loss": 0.4514,
+      "step": 96
+    },
+    {
+      "epoch": 0.05435385468663339,
+      "grad_norm": 0.32511624693870544,
+      "learning_rate": 0.0001798165137614679,
+      "loss": 0.5366,
+      "step": 98
+    },
+    {
+      "epoch": 0.05546311702717693,
+      "grad_norm": 0.34356439113616943,
+      "learning_rate": 0.00018348623853211012,
+      "loss": 0.6371,
+      "step": 100
+    },
+    {
+      "epoch": 0.056572379367720464,
+      "grad_norm": 0.4009954035282135,
+      "learning_rate": 0.0001871559633027523,
+      "loss": 0.5844,
+      "step": 102
+    },
+    {
+      "epoch": 0.057681641708264,
+      "grad_norm": 0.2896111309528351,
+      "learning_rate": 0.0001908256880733945,
+      "loss": 0.4478,
+      "step": 104
+    },
+    {
+      "epoch": 0.058790904048807546,
+      "grad_norm": 0.31993618607521057,
+      "learning_rate": 0.00019449541284403672,
+      "loss": 0.3886,
+      "step": 106
+    },
+    {
+      "epoch": 0.059900166389351084,
+      "grad_norm": 0.44810792803764343,
+      "learning_rate": 0.0001981651376146789,
+      "loss": 0.5448,
+      "step": 108
+    },
+    {
+      "epoch": 0.06100942872989462,
+      "grad_norm": 0.31323954463005066,
+      "learning_rate": 0.00019999995964675577,
+      "loss": 0.5082,
+      "step": 110
+    },
+    {
+      "epoch": 0.06211869107043816,
+      "grad_norm": 0.3845660090446472,
+      "learning_rate": 0.00019999963682099735,
+      "loss": 0.52,
+      "step": 112
+    },
+    {
+      "epoch": 0.0632279534109817,
+      "grad_norm": 0.41813963651657104,
+      "learning_rate": 0.00019999899117052264,
+      "loss": 0.6146,
+      "step": 114
+    },
+    {
+      "epoch": 0.06433721575152523,
+      "grad_norm": 0.35359835624694824,
+      "learning_rate": 0.000199998022697416,
+      "loss": 0.5561,
+      "step": 116
+    },
+    {
+      "epoch": 0.06544647809206877,
+      "grad_norm": 0.3362332284450531,
+      "learning_rate": 0.0001999967314048039,
+      "loss": 0.4836,
+      "step": 118
+    },
+    {
+      "epoch": 0.06655574043261231,
+      "grad_norm": 0.5304926633834839,
+      "learning_rate": 0.000199995117296855,
+      "loss": 0.5111,
+      "step": 120
+    },
+    {
+      "epoch": 0.06766500277315585,
+      "grad_norm": 0.2606920599937439,
+      "learning_rate": 0.00019999318037877995,
+      "loss": 0.5379,
+      "step": 122
+    },
+    {
+      "epoch": 0.06877426511369938,
+      "grad_norm": 0.47892895340919495,
+      "learning_rate": 0.0001999909206568318,
+      "loss": 0.5963,
+      "step": 124
+    },
+    {
+      "epoch": 0.06988352745424292,
+      "grad_norm": 0.30522894859313965,
+      "learning_rate": 0.00019998833813830534,
+      "loss": 0.6176,
+      "step": 126
+    },
+    {
+      "epoch": 0.07099278979478647,
+      "grad_norm": 0.41433125734329224,
+      "learning_rate": 0.00019998543283153772,
+      "loss": 0.5134,
+      "step": 128
+    },
+    {
+      "epoch": 0.07210205213533001,
+      "grad_norm": 0.27291831374168396,
+      "learning_rate": 0.000199982204745908,
+      "loss": 0.5248,
+      "step": 130
+    },
+    {
+      "epoch": 0.07321131447587355,
+      "grad_norm": 0.26184335350990295,
+      "learning_rate": 0.0001999786538918372,
+      "loss": 0.4302,
+      "step": 132
+    },
+    {
+      "epoch": 0.07432057681641709,
+      "grad_norm": 0.2925998866558075,
+      "learning_rate": 0.00019997478028078853,
+      "loss": 0.4643,
+      "step": 134
+    },
+    {
+      "epoch": 0.07542983915696062,
+      "grad_norm": 0.49747079610824585,
+      "learning_rate": 0.0001999705839252669,
+      "loss": 0.6133,
+      "step": 136
+    },
+    {
+      "epoch": 0.07653910149750416,
+      "grad_norm": 0.37335023283958435,
+      "learning_rate": 0.0001999660648388193,
+      "loss": 0.508,
+      "step": 138
+    },
+    {
+      "epoch": 0.0776483638380477,
+      "grad_norm": 0.383662611246109,
+      "learning_rate": 0.00019996122303603446,
+      "loss": 0.5049,
+      "step": 140
+    },
+    {
+      "epoch": 0.07875762617859124,
+      "grad_norm": 0.28099021315574646,
+      "learning_rate": 0.000199956058532543,
+      "loss": 0.4487,
+      "step": 142
+    },
+    {
+      "epoch": 0.07986688851913477,
+      "grad_norm": 0.2824121415615082,
+      "learning_rate": 0.00019995057134501726,
+      "loss": 0.4544,
+      "step": 144
+    },
+    {
+      "epoch": 0.08097615085967831,
+      "grad_norm": 0.33720389008522034,
+      "learning_rate": 0.00019994476149117133,
+      "loss": 0.5987,
+      "step": 146
+    },
+    {
+      "epoch": 0.08208541320022185,
+      "grad_norm": 6.722891330718994,
+      "learning_rate": 0.0001999386289897609,
+      "loss": 0.705,
+      "step": 148
+    },
+    {
+      "epoch": 0.08319467554076539,
+      "grad_norm": 0.3853927254676819,
+      "learning_rate": 0.00019993217386058326,
+      "loss": 0.6514,
+      "step": 150
+    },
+    {
+      "epoch": 0.08430393788130892,
+      "grad_norm": 0.32647258043289185,
+      "learning_rate": 0.0001999253961244773,
+      "loss": 0.4882,
+      "step": 152
+    },
+    {
+      "epoch": 0.08541320022185246,
+      "grad_norm": 0.33191072940826416,
+      "learning_rate": 0.0001999182958033232,
+      "loss": 0.55,
+      "step": 154
+    },
+    {
+      "epoch": 0.08652246256239601,
+      "grad_norm": 0.3368849456310272,
+      "learning_rate": 0.00019991087292004273,
+      "loss": 0.4264,
+      "step": 156
+    },
+    {
+      "epoch": 0.08763172490293955,
+      "grad_norm": 0.2839064598083496,
+      "learning_rate": 0.00019990312749859886,
+      "loss": 0.4845,
+      "step": 158
+    },
+    {
+      "epoch": 0.08874098724348309,
+      "grad_norm": 0.34918418526649475,
+      "learning_rate": 0.00019989505956399578,
+      "loss": 0.4774,
+      "step": 160
+    },
+    {
+      "epoch": 0.08985024958402663,
+      "grad_norm": 0.5855952501296997,
+      "learning_rate": 0.0001998866691422789,
+      "loss": 0.5082,
+      "step": 162
+    },
+    {
+      "epoch": 0.09095951192457016,
+      "grad_norm": 0.3832476735115051,
+      "learning_rate": 0.00019987795626053468,
+      "loss": 0.5372,
+      "step": 164
+    },
+    {
+      "epoch": 0.0920687742651137,
+      "grad_norm": 0.2780646085739136,
+      "learning_rate": 0.00019986892094689052,
+      "loss": 0.4893,
+      "step": 166
+    },
+    {
+      "epoch": 0.09317803660565724,
+      "grad_norm": 0.3530760109424591,
+      "learning_rate": 0.00019985956323051478,
+      "loss": 0.4235,
+      "step": 168
+    },
+    {
+      "epoch": 0.09428729894620078,
+      "grad_norm": 0.3618020713329315,
+      "learning_rate": 0.00019984988314161658,
+      "loss": 0.4818,
+      "step": 170
+    },
+    {
+      "epoch": 0.09539656128674431,
+      "grad_norm": 0.360500693321228,
+      "learning_rate": 0.00019983988071144574,
+      "loss": 0.602,
+      "step": 172
+    },
+    {
+      "epoch": 0.09650582362728785,
+      "grad_norm": 0.2537604570388794,
+      "learning_rate": 0.00019982955597229275,
+      "loss": 0.5386,
+      "step": 174
+    },
+    {
+      "epoch": 0.09761508596783139,
+      "grad_norm": 0.31803157925605774,
+      "learning_rate": 0.0001998189089574885,
+      "loss": 0.4198,
+      "step": 176
+    },
+    {
+      "epoch": 0.09872434830837493,
+      "grad_norm": 0.404077410697937,
+      "learning_rate": 0.0001998079397014043,
+      "loss": 0.4918,
+      "step": 178
+    },
+    {
+      "epoch": 0.09983361064891846,
+      "grad_norm": 0.3178037703037262,
+      "learning_rate": 0.00019979664823945178,
+      "loss": 0.5349,
+      "step": 180
+    },
+    {
+      "epoch": 0.100942872989462,
+      "grad_norm": 0.3245227336883545,
+      "learning_rate": 0.0001997850346080827,
+      "loss": 0.4801,
+      "step": 182
+    },
+    {
+      "epoch": 0.10205213533000555,
+      "grad_norm": 0.33926212787628174,
+      "learning_rate": 0.00019977309884478879,
+      "loss": 0.4323,
+      "step": 184
+    },
+    {
+      "epoch": 0.10316139767054909,
+      "grad_norm": 0.41488340497016907,
+      "learning_rate": 0.0001997608409881019,
+      "loss": 0.557,
+      "step": 186
+    },
+    {
+      "epoch": 0.10427066001109263,
+      "grad_norm": 0.40327388048171997,
+      "learning_rate": 0.0001997482610775935,
+      "loss": 0.4811,
+      "step": 188
+    },
+    {
+      "epoch": 0.10537992235163617,
+      "grad_norm": 0.2919729948043823,
+      "learning_rate": 0.0001997353591538748,
+      "loss": 0.3776,
+      "step": 190
+    },
+    {
+      "epoch": 0.1064891846921797,
+      "grad_norm": 0.280122846364975,
+      "learning_rate": 0.00019972213525859658,
+      "loss": 0.4697,
+      "step": 192
+    },
+    {
+      "epoch": 0.10759844703272324,
+      "grad_norm": 0.41433635354042053,
+      "learning_rate": 0.00019970858943444897,
+      "loss": 0.591,
+      "step": 194
+    },
+    {
+      "epoch": 0.10870770937326678,
+      "grad_norm": 0.33640623092651367,
+      "learning_rate": 0.00019969472172516142,
+      "loss": 0.523,
+      "step": 196
+    },
+    {
+      "epoch": 0.10981697171381032,
+      "grad_norm": 0.3238784670829773,
+      "learning_rate": 0.0001996805321755024,
+      "loss": 0.5112,
+      "step": 198
+    },
+    {
+      "epoch": 0.11092623405435385,
+      "grad_norm": 0.33250951766967773,
+      "learning_rate": 0.0001996660208312796,
+      "loss": 0.4554,
+      "step": 200
+    },
+    {
+      "epoch": 0.11203549639489739,
+      "grad_norm": 0.2348429262638092,
+      "learning_rate": 0.0001996511877393393,
+      "loss": 0.4454,
+      "step": 202
+    },
+    {
+      "epoch": 0.11314475873544093,
+      "grad_norm": 0.3479189872741699,
+      "learning_rate": 0.00019963603294756657,
+      "loss": 0.4648,
+      "step": 204
+    },
+    {
+      "epoch": 0.11425402107598447,
+      "grad_norm": 0.2339608371257782,
+      "learning_rate": 0.00019962055650488502,
+      "loss": 0.4383,
+      "step": 206
+    },
+    {
+      "epoch": 0.115363283416528,
+      "grad_norm": 0.3076520562171936,
+      "learning_rate": 0.0001996047584612566,
+      "loss": 0.4886,
+      "step": 208
+    },
+    {
+      "epoch": 0.11647254575707154,
+      "grad_norm": 0.3302302658557892,
+      "learning_rate": 0.00019958863886768147,
+      "loss": 0.5678,
+      "step": 210
+    },
+    {
+      "epoch": 0.11758180809761509,
+      "grad_norm": 0.3337957561016083,
+      "learning_rate": 0.00019957219777619786,
+      "loss": 0.4573,
+      "step": 212
+    },
+    {
+      "epoch": 0.11869107043815863,
+      "grad_norm": 0.31091246008872986,
+      "learning_rate": 0.0001995554352398819,
+      "loss": 0.5596,
+      "step": 214
+    },
+    {
+      "epoch": 0.11980033277870217,
+      "grad_norm": 0.27653515338897705,
+      "learning_rate": 0.00019953835131284738,
+      "loss": 0.4732,
+      "step": 216
+    },
+    {
+      "epoch": 0.1209095951192457,
+      "grad_norm": 0.45777037739753723,
+      "learning_rate": 0.00019952094605024562,
+      "loss": 0.5798,
+      "step": 218
+    },
+    {
+      "epoch": 0.12201885745978924,
+      "grad_norm": 0.30545416474342346,
+      "learning_rate": 0.00019950321950826534,
+      "loss": 0.4263,
+      "step": 220
+    },
+    {
+      "epoch": 0.12312811980033278,
+      "grad_norm": 0.29590895771980286,
+      "learning_rate": 0.00019948517174413238,
+      "loss": 0.4797,
+      "step": 222
+    },
+    {
+      "epoch": 0.12423738214087632,
+      "grad_norm": 0.387412965297699,
+      "learning_rate": 0.0001994668028161096,
+      "loss": 0.4482,
+      "step": 224
+    },
+    {
+      "epoch": 0.12534664448141986,
+      "grad_norm": 0.337591290473938,
+      "learning_rate": 0.00019944811278349667,
+      "loss": 0.5011,
+      "step": 226
+    },
+    {
+      "epoch": 0.1264559068219634,
+      "grad_norm": 0.3025722801685333,
+      "learning_rate": 0.00019942910170662987,
+      "loss": 0.6154,
+      "step": 228
+    },
+    {
+      "epoch": 0.12756516916250693,
+      "grad_norm": 0.2529117465019226,
+      "learning_rate": 0.00019940976964688182,
+      "loss": 0.4412,
+      "step": 230
+    },
+    {
+      "epoch": 0.12867443150305047,
+      "grad_norm": 0.3442631959915161,
+      "learning_rate": 0.0001993901166666615,
+      "loss": 0.5209,
+      "step": 232
+    },
+    {
+      "epoch": 0.129783693843594,
+      "grad_norm": 0.30931201577186584,
+      "learning_rate": 0.00019937014282941373,
+      "loss": 0.6154,
+      "step": 234
+    },
+    {
+      "epoch": 0.13089295618413754,
+      "grad_norm": 0.3555050790309906,
+      "learning_rate": 0.00019934984819961927,
+      "loss": 0.5173,
+      "step": 236
+    },
+    {
+      "epoch": 0.13200221852468108,
+      "grad_norm": 0.44570550322532654,
+      "learning_rate": 0.00019932923284279446,
+      "loss": 0.5319,
+      "step": 238
+    },
+    {
+      "epoch": 0.13311148086522462,
+      "grad_norm": 0.3192990720272064,
+      "learning_rate": 0.00019930829682549095,
+      "loss": 0.581,
+      "step": 240
+    },
+    {
+      "epoch": 0.13422074320576816,
+      "grad_norm": 0.24486322700977325,
+      "learning_rate": 0.00019928704021529567,
+      "loss": 0.403,
+      "step": 242
+    },
+    {
+      "epoch": 0.1353300055463117,
+      "grad_norm": 0.2888510823249817,
+      "learning_rate": 0.00019926546308083047,
+      "loss": 0.4869,
+      "step": 244
+    },
+    {
+      "epoch": 0.13643926788685523,
+      "grad_norm": 0.3472958207130432,
+      "learning_rate": 0.00019924356549175188,
+      "loss": 0.6019,
+      "step": 246
+    },
+    {
+      "epoch": 0.13754853022739877,
+      "grad_norm": 0.2682150900363922,
+      "learning_rate": 0.00019922134751875102,
+      "loss": 0.4816,
+      "step": 248
+    },
+    {
+      "epoch": 0.1386577925679423,
+      "grad_norm": 0.24147269129753113,
+      "learning_rate": 0.00019919880923355323,
+      "loss": 0.4681,
+      "step": 250
+    },
+    {
+      "epoch": 0.13976705490848584,
+      "grad_norm": 0.3548056185245514,
+      "learning_rate": 0.00019917595070891798,
+      "loss": 0.568,
+      "step": 252
+    },
+    {
+      "epoch": 0.1408763172490294,
+      "grad_norm": 0.299034982919693,
+      "learning_rate": 0.00019915277201863844,
+      "loss": 0.4479,
+      "step": 254
+    },
+    {
+      "epoch": 0.14198557958957295,
+      "grad_norm": 0.32234734296798706,
+      "learning_rate": 0.00019912927323754146,
+      "loss": 0.542,
+      "step": 256
+    },
+    {
+      "epoch": 0.14309484193011648,
+      "grad_norm": 0.31738007068634033,
+      "learning_rate": 0.00019910545444148722,
+      "loss": 0.4458,
+      "step": 258
+    },
+    {
+      "epoch": 0.14420410427066002,
+      "grad_norm": 0.2500552833080292,
+      "learning_rate": 0.0001990813157073689,
+      "loss": 0.4774,
+      "step": 260
+    },
+    {
+      "epoch": 0.14531336661120356,
+      "grad_norm": 0.2912147641181946,
+      "learning_rate": 0.0001990568571131126,
+      "loss": 0.523,
+      "step": 262
+    },
+    {
+      "epoch": 0.1464226289517471,
+      "grad_norm": 0.35038962960243225,
+      "learning_rate": 0.00019903207873767705,
+      "loss": 0.5325,
+      "step": 264
+    },
+    {
+      "epoch": 0.14753189129229063,
+      "grad_norm": 0.6067700386047363,
+      "learning_rate": 0.00019900698066105317,
+      "loss": 0.5304,
+      "step": 266
+    },
+    {
+      "epoch": 0.14864115363283417,
+      "grad_norm": 0.8355000615119934,
+      "learning_rate": 0.00019898156296426414,
+      "loss": 0.6244,
+      "step": 268
+    },
+    {
+      "epoch": 0.1497504159733777,
+      "grad_norm": 0.2898505926132202,
+      "learning_rate": 0.00019895582572936475,
+      "loss": 0.4361,
+      "step": 270
+    },
+    {
+      "epoch": 0.15085967831392125,
+      "grad_norm": 0.2776695191860199,
+      "learning_rate": 0.0001989297690394416,
+      "loss": 0.5019,
+      "step": 272
+    },
+    {
+      "epoch": 0.15196894065446478,
+      "grad_norm": 0.310939222574234,
+      "learning_rate": 0.0001989033929786123,
+      "loss": 0.44,
+      "step": 274
+    },
+    {
+      "epoch": 0.15307820299500832,
+      "grad_norm": 0.37729912996292114,
+      "learning_rate": 0.00019887669763202567,
+      "loss": 0.6571,
+      "step": 276
+    },
+    {
+      "epoch": 0.15418746533555186,
+      "grad_norm": 0.3895605802536011,
+      "learning_rate": 0.0001988496830858612,
+      "loss": 0.4787,
+      "step": 278
+    },
+    {
+      "epoch": 0.1552967276760954,
+      "grad_norm": 0.27666375041007996,
+      "learning_rate": 0.00019882234942732882,
+      "loss": 0.4677,
+      "step": 280
+    },
+    {
+      "epoch": 0.15640599001663893,
+      "grad_norm": 0.5543416142463684,
+      "learning_rate": 0.00019879469674466868,
+      "loss": 0.5272,
+      "step": 282
+    },
+    {
+      "epoch": 0.15751525235718247,
+      "grad_norm": 0.46799716353416443,
+      "learning_rate": 0.00019876672512715078,
+      "loss": 0.3936,
+      "step": 284
+    },
+    {
+      "epoch": 0.158624514697726,
+      "grad_norm": 0.47091013193130493,
+      "learning_rate": 0.00019873843466507475,
+      "loss": 0.5349,
+      "step": 286
+    },
+    {
+      "epoch": 0.15973377703826955,
+      "grad_norm": 0.2690078020095825,
+      "learning_rate": 0.0001987098254497695,
+      "loss": 0.4377,
+      "step": 288
+    },
+    {
+      "epoch": 0.16084303937881309,
+      "grad_norm": 0.8880047798156738,
+      "learning_rate": 0.000198680897573593,
+      "loss": 0.4537,
+      "step": 290
+    },
+    {
+      "epoch": 0.16195230171935662,
+      "grad_norm": 0.25852563977241516,
+      "learning_rate": 0.00019865165112993195,
+      "loss": 0.4001,
+      "step": 292
+    },
+    {
+      "epoch": 0.16306156405990016,
+      "grad_norm": 0.3986980617046356,
+      "learning_rate": 0.00019862208621320142,
+      "loss": 0.5538,
+      "step": 294
+    },
+    {
+      "epoch": 0.1641708264004437,
+      "grad_norm": 0.2550656795501709,
+      "learning_rate": 0.00019859220291884458,
+      "loss": 0.4261,
+      "step": 296
+    },
+    {
+      "epoch": 0.16528008874098724,
+      "grad_norm": 0.2604806125164032,
+      "learning_rate": 0.0001985620013433325,
+      "loss": 0.5003,
+      "step": 298
+    },
+    {
+      "epoch": 0.16638935108153077,
+      "grad_norm": 0.3848886787891388,
+      "learning_rate": 0.0001985314815841637,
+      "loss": 0.4393,
+      "step": 300
+    },
+    {
+      "epoch": 0.1674986134220743,
+      "grad_norm": 0.37041744589805603,
+      "learning_rate": 0.00019850064373986377,
+      "loss": 0.4831,
+      "step": 302
+    },
+    {
+      "epoch": 0.16860787576261785,
+      "grad_norm": 0.4813826382160187,
+      "learning_rate": 0.0001984694879099853,
+      "loss": 0.5145,
+      "step": 304
+    },
+    {
+      "epoch": 0.16971713810316139,
+      "grad_norm": 0.3015052080154419,
+      "learning_rate": 0.00019843801419510744,
+      "loss": 0.4991,
+      "step": 306
+    },
+    {
+      "epoch": 0.17082640044370492,
+      "grad_norm": 0.33228328824043274,
+      "learning_rate": 0.00019840622269683538,
+      "loss": 0.4475,
+      "step": 308
+    },
+    {
+      "epoch": 0.1719356627842485,
+      "grad_norm": 0.3386059105396271,
+      "learning_rate": 0.00019837411351780038,
+      "loss": 0.5565,
+      "step": 310
+    },
+    {
+      "epoch": 0.17304492512479203,
+      "grad_norm": 0.38623613119125366,
+      "learning_rate": 0.00019834168676165917,
+      "loss": 0.5547,
+      "step": 312
+    },
+    {
+      "epoch": 0.17415418746533556,
+      "grad_norm": 0.32846319675445557,
+      "learning_rate": 0.0001983089425330937,
+      "loss": 0.547,
+      "step": 314
+    },
+    {
+      "epoch": 0.1752634498058791,
+      "grad_norm": 0.3607962429523468,
+      "learning_rate": 0.00019827588093781083,
+      "loss": 0.5914,
+      "step": 316
+    },
+    {
+      "epoch": 0.17637271214642264,
+      "grad_norm": 0.36206066608428955,
+      "learning_rate": 0.00019824250208254194,
+      "loss": 0.4694,
+      "step": 318
+    },
+    {
+      "epoch": 0.17748197448696618,
+      "grad_norm": 0.24287384748458862,
+      "learning_rate": 0.0001982088060750426,
+      "loss": 0.3787,
+      "step": 320
+    },
+    {
+      "epoch": 0.17859123682750971,
+      "grad_norm": 0.35104668140411377,
+      "learning_rate": 0.00019817479302409227,
+      "loss": 0.4743,
+      "step": 322
+    },
+    {
+      "epoch": 0.17970049916805325,
+      "grad_norm": 0.29966557025909424,
+      "learning_rate": 0.0001981404630394939,
+      "loss": 0.5124,
+      "step": 324
+    },
+    {
+      "epoch": 0.1808097615085968,
+      "grad_norm": 0.35917168855667114,
+      "learning_rate": 0.0001981058162320735,
+      "loss": 0.5524,
+      "step": 326
+    },
+    {
+      "epoch": 0.18191902384914033,
+      "grad_norm": 0.38302454352378845,
+      "learning_rate": 0.00019807085271368005,
+      "loss": 0.6019,
+      "step": 328
+    },
+    {
+      "epoch": 0.18302828618968386,
+      "grad_norm": 0.24732793867588043,
+      "learning_rate": 0.0001980355725971847,
+      "loss": 0.4952,
+      "step": 330
+    },
+    {
+      "epoch": 0.1841375485302274,
+      "grad_norm": 0.279240220785141,
+      "learning_rate": 0.0001979999759964809,
+      "loss": 0.3949,
+      "step": 332
+    },
+    {
+      "epoch": 0.18524681087077094,
+      "grad_norm": 0.2902393341064453,
+      "learning_rate": 0.00019796406302648368,
+      "loss": 0.4938,
+      "step": 334
+    },
+    {
+      "epoch": 0.18635607321131448,
+      "grad_norm": 0.3428916037082672,
+      "learning_rate": 0.00019792783380312936,
+      "loss": 0.4401,
+      "step": 336
+    },
+    {
+      "epoch": 0.18746533555185801,
+      "grad_norm": 0.38825634121894836,
+      "learning_rate": 0.00019789128844337528,
+      "loss": 0.5968,
+      "step": 338
+    },
+    {
+      "epoch": 0.18857459789240155,
+      "grad_norm": 0.26601892709732056,
+      "learning_rate": 0.0001978544270651993,
+      "loss": 0.3993,
+      "step": 340
+    },
+    {
+      "epoch": 0.1896838602329451,
+      "grad_norm": 0.29738616943359375,
+      "learning_rate": 0.00019781724978759955,
+      "loss": 0.4975,
+      "step": 342
+    },
+    {
+      "epoch": 0.19079312257348863,
+      "grad_norm": 0.3405514061450958,
+      "learning_rate": 0.00019777975673059383,
+      "loss": 0.5674,
+      "step": 344
+    },
+    {
+      "epoch": 0.19190238491403216,
+      "grad_norm": 0.29235440492630005,
+      "learning_rate": 0.00019774194801521947,
+      "loss": 0.4854,
+      "step": 346
+    },
+    {
+      "epoch": 0.1930116472545757,
+      "grad_norm": 0.279852032661438,
+      "learning_rate": 0.00019770382376353284,
+      "loss": 0.3933,
+      "step": 348
+    },
+    {
+      "epoch": 0.19412090959511924,
+      "grad_norm": 0.3054843544960022,
+      "learning_rate": 0.00019766538409860882,
+      "loss": 0.5125,
+      "step": 350
+    },
+    {
+      "epoch": 0.19523017193566278,
+      "grad_norm": 0.2959388494491577,
+      "learning_rate": 0.00019762662914454065,
+      "loss": 0.5114,
+      "step": 352
+    },
+    {
+      "epoch": 0.19633943427620631,
+      "grad_norm": 0.42934173345565796,
+      "learning_rate": 0.0001975875590264393,
+      "loss": 0.5157,
+      "step": 354
+    },
+    {
+      "epoch": 0.19744869661674985,
+      "grad_norm": 0.28414642810821533,
+      "learning_rate": 0.00019754817387043327,
+      "loss": 0.4952,
+      "step": 356
+    },
+    {
+      "epoch": 0.1985579589572934,
+      "grad_norm": 0.27035775780677795,
+      "learning_rate": 0.00019750847380366806,
+      "loss": 0.3946,
+      "step": 358
+    },
+    {
+      "epoch": 0.19966722129783693,
+      "grad_norm": 0.23628903925418854,
+      "learning_rate": 0.0001974684589543057,
+      "loss": 0.3691,
+      "step": 360
+    },
+    {
+      "epoch": 0.20077648363838047,
+      "grad_norm": 0.3877003788948059,
+      "learning_rate": 0.0001974281294515245,
+      "loss": 0.5729,
+      "step": 362
+    },
+    {
+      "epoch": 0.201885745978924,
+      "grad_norm": 0.3051539659500122,
+      "learning_rate": 0.00019738748542551861,
+      "loss": 0.4378,
+      "step": 364
+    },
+    {
+      "epoch": 0.20299500831946754,
+      "grad_norm": 0.27406755089759827,
+      "learning_rate": 0.00019734652700749737,
+      "loss": 0.505,
+      "step": 366
+    },
+    {
+      "epoch": 0.2041042706600111,
+      "grad_norm": 2.0898008346557617,
+      "learning_rate": 0.0001973052543296852,
+      "loss": 0.5647,
+      "step": 368
+    },
+    {
+      "epoch": 0.20521353300055464,
+      "grad_norm": 0.24537453055381775,
+      "learning_rate": 0.000197263667525321,
+      "loss": 0.4543,
+      "step": 370
+    },
+    {
+      "epoch": 0.20632279534109818,
+      "grad_norm": 0.26961207389831543,
+      "learning_rate": 0.0001972217667286577,
+      "loss": 0.4595,
+      "step": 372
+    },
+    {
+      "epoch": 0.20743205768164172,
+      "grad_norm": 0.24330930411815643,
+      "learning_rate": 0.00019717955207496196,
+      "loss": 0.4383,
+      "step": 374
+    },
+    {
+      "epoch": 0.20854132002218526,
+      "grad_norm": 0.42495131492614746,
+      "learning_rate": 0.0001971370237005136,
+      "loss": 0.6703,
+      "step": 376
+    },
+    {
+      "epoch": 0.2096505823627288,
+      "grad_norm": 0.28645631670951843,
+      "learning_rate": 0.0001970941817426052,
+      "loss": 0.4693,
+      "step": 378
+    },
+    {
+      "epoch": 0.21075984470327233,
+      "grad_norm": 0.27761217951774597,
+      "learning_rate": 0.00019705102633954172,
+      "loss": 0.5217,
+      "step": 380
+    },
+    {
+      "epoch": 0.21186910704381587,
+      "grad_norm": 0.27868810296058655,
+      "learning_rate": 0.00019700755763063998,
+      "loss": 0.4173,
+      "step": 382
+    },
+    {
+      "epoch": 0.2129783693843594,
+      "grad_norm": 0.3163412809371948,
+      "learning_rate": 0.0001969637757562282,
+      "loss": 0.5144,
+      "step": 384
+    },
+    {
+      "epoch": 0.21408763172490294,
+      "grad_norm": 0.2901851534843445,
+      "learning_rate": 0.00019691968085764562,
+      "loss": 0.5128,
+      "step": 386
+    },
+    {
+      "epoch": 0.21519689406544648,
+      "grad_norm": 0.3356168270111084,
+      "learning_rate": 0.00019687527307724197,
+      "loss": 0.4398,
+      "step": 388
+    },
+    {
+      "epoch": 0.21630615640599002,
+      "grad_norm": 0.33736830949783325,
+      "learning_rate": 0.0001968305525583771,
+      "loss": 0.4771,
+      "step": 390
+    },
+    {
+      "epoch": 0.21741541874653356,
+      "grad_norm": 0.2211776226758957,
+      "learning_rate": 0.00019678551944542037,
+      "loss": 0.3975,
+      "step": 392
+    },
+    {
+      "epoch": 0.2185246810870771,
+      "grad_norm": 0.2705392837524414,
+      "learning_rate": 0.00019674017388375038,
+      "loss": 0.4347,
+      "step": 394
+    },
+    {
+      "epoch": 0.21963394342762063,
+      "grad_norm": 0.2636098861694336,
+      "learning_rate": 0.0001966945160197543,
+      "loss": 0.5036,
+      "step": 396
+    },
+    {
+      "epoch": 0.22074320576816417,
+      "grad_norm": 0.26003068685531616,
+      "learning_rate": 0.00019664854600082756,
+      "loss": 0.458,
+      "step": 398
+    },
+    {
+      "epoch": 0.2218524681087077,
+      "grad_norm": 0.1831740140914917,
+      "learning_rate": 0.00019660226397537326,
+      "loss": 0.4322,
+      "step": 400
+    },
+    {
+      "epoch": 0.22296173044925124,
+      "grad_norm": 1.4381052255630493,
+      "learning_rate": 0.00019655567009280178,
+      "loss": 0.545,
+      "step": 402
+    },
+    {
+      "epoch": 0.22407099278979478,
+      "grad_norm": 0.3401663303375244,
+      "learning_rate": 0.00019650876450353022,
+      "loss": 0.5524,
+      "step": 404
+    },
+    {
+      "epoch": 0.22518025513033832,
+      "grad_norm": 0.294527530670166,
+      "learning_rate": 0.00019646154735898202,
+      "loss": 0.4059,
+      "step": 406
+    },
+    {
+      "epoch": 0.22628951747088186,
+      "grad_norm": 0.3033091127872467,
+      "learning_rate": 0.00019641401881158625,
+      "loss": 0.4119,
+      "step": 408
+    },
+    {
+      "epoch": 0.2273987798114254,
+      "grad_norm": 0.3446100950241089,
+      "learning_rate": 0.00019636617901477746,
+      "loss": 0.4299,
+      "step": 410
+    },
+    {
+      "epoch": 0.22850804215196893,
+      "grad_norm": 0.2955077886581421,
+      "learning_rate": 0.00019631802812299483,
+      "loss": 0.4496,
+      "step": 412
+    },
+    {
+      "epoch": 0.22961730449251247,
+      "grad_norm": 0.2663453221321106,
+      "learning_rate": 0.00019626956629168192,
+      "loss": 0.5373,
+      "step": 414
+    },
+    {
+      "epoch": 0.230726566833056,
+      "grad_norm": 0.31393304467201233,
+      "learning_rate": 0.0001962207936772861,
+      "loss": 0.4856,
+      "step": 416
+    },
+    {
+      "epoch": 0.23183582917359954,
+      "grad_norm": 0.3716754615306854,
+      "learning_rate": 0.00019617171043725796,
+      "loss": 0.6161,
+      "step": 418
+    },
+    {
+      "epoch": 0.23294509151414308,
+      "grad_norm": 0.3159737288951874,
+      "learning_rate": 0.00019612231673005092,
+      "loss": 0.4493,
+      "step": 420
+    },
+    {
+      "epoch": 0.23405435385468662,
+      "grad_norm": 0.3315045237541199,
+      "learning_rate": 0.00019607261271512068,
+      "loss": 0.4695,
+      "step": 422
+    },
+    {
+      "epoch": 0.23516361619523019,
+      "grad_norm": 0.27597951889038086,
+      "learning_rate": 0.0001960225985529246,
+      "loss": 0.3391,
+      "step": 424
+    },
+    {
+      "epoch": 0.23627287853577372,
+      "grad_norm": 0.4410407543182373,
+      "learning_rate": 0.00019597227440492143,
+      "loss": 0.5034,
+      "step": 426
+    },
+    {
+      "epoch": 0.23738214087631726,
+      "grad_norm": 0.2749500572681427,
+      "learning_rate": 0.00019592164043357046,
+      "loss": 0.4934,
+      "step": 428
+    },
+    {
+      "epoch": 0.2384914032168608,
+      "grad_norm": 0.32692408561706543,
+      "learning_rate": 0.00019587069680233134,
+      "loss": 0.4589,
+      "step": 430
+    },
+    {
+      "epoch": 0.23960066555740434,
+      "grad_norm": 0.2124568372964859,
+      "learning_rate": 0.00019581944367566326,
+      "loss": 0.4367,
+      "step": 432
+    },
+    {
+      "epoch": 0.24070992789794787,
+      "grad_norm": 0.26193922758102417,
+      "learning_rate": 0.00019576788121902457,
+      "loss": 0.4172,
+      "step": 434
+    },
+    {
+      "epoch": 0.2418191902384914,
+      "grad_norm": 0.25930801033973694,
+      "learning_rate": 0.00019571600959887223,
+      "loss": 0.3983,
+      "step": 436
+    },
+    {
+      "epoch": 0.24292845257903495,
+      "grad_norm": 0.30770912766456604,
+      "learning_rate": 0.0001956638289826613,
+      "loss": 0.4245,
+      "step": 438
+    },
+    {
+      "epoch": 0.24403771491957849,
+      "grad_norm": 0.26078179478645325,
+      "learning_rate": 0.00019561133953884427,
+      "loss": 0.3807,
+      "step": 440
+    },
+    {
+      "epoch": 0.24514697726012202,
+      "grad_norm": 0.3434695303440094,
+      "learning_rate": 0.00019555854143687068,
+      "loss": 0.6136,
+      "step": 442
+    },
+    {
+      "epoch": 0.24625623960066556,
+      "grad_norm": 0.3655042350292206,
+      "learning_rate": 0.00019550543484718648,
+      "loss": 0.4969,
+      "step": 444
+    },
+    {
+      "epoch": 0.2473655019412091,
+      "grad_norm": 0.2804737389087677,
+      "learning_rate": 0.00019545201994123344,
+      "loss": 0.4154,
+      "step": 446
+    },
+    {
+      "epoch": 0.24847476428175264,
+      "grad_norm": 0.23757784068584442,
+      "learning_rate": 0.00019539829689144876,
+      "loss": 0.467,
+      "step": 448
     }
   ],
   "logging_steps": 2,
   "max_steps": 3606,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 2,
+  "save_steps": 64,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
       "attributes": {}
     }
   },
+  "total_flos": 1091252539883520.0,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:626d772bac197cb38cc6281159e89ab89ec3be67c70cfa94c2b35d721cf3214f
 size 5304

 version https://git-lfs.github.com/spec/v1
+oid sha256:81deb56b5e82d7001bcc888a41a74fc8c2ddc0612f58053768650f477b3da027
 size 5304