End of training

Browse files

Files changed (6) hide show

README.md +4 -3
all_results.json +11 -11
eval_results.json +6 -6
runs/May14_15-54-09_cs-Precision-7960-Tower/events.out.tfevents.1747253123.cs-Precision-7960-Tower.127892.1 +3 -0
train_results.json +6 -6
trainer_state.json +715 -968

README.md CHANGED Viewed

@@ -3,6 +3,7 @@ library_name: transformers
 license: apache-2.0
 base_model: facebook/wav2vec2-base
 tags:
 - generated_from_trainer
 datasets:
 - superb
@@ -23,7 +24,7 @@ model-index:
     metrics:
     - name: Accuracy
       type: accuracy
-      value: 0.9824948514268903
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -33,8 +34,8 @@ should probably proofread and complete it, then remove this comment. -->
 This model is a fine-tuned version of [facebook/wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base) on the superb dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.0926
-- Accuracy: 0.9825
 ## Model description

 license: apache-2.0
 base_model: facebook/wav2vec2-base
 tags:
+- audio-classification
 - generated_from_trainer
 datasets:
 - superb
     metrics:
     - name: Accuracy
       type: accuracy
+      value: 0.9830832597822889
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 This model is a fine-tuned version of [facebook/wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base) on the superb dataset.
 It achieves the following results on the evaluation set:
+- Loss: 0.0956
+- Accuracy: 0.9831
 ## Model description

all_results.json CHANGED Viewed

@@ -1,13 +1,13 @@
 {
-    "epoch": 4.989355040701315,
-    "eval_accuracy": 0.9814651368049426,
-    "eval_loss": 0.09822726994752884,
-    "eval_runtime": 5.2666,
-    "eval_samples_per_second": 1290.774,
-    "eval_steps_per_second": 40.443,
-    "total_flos": 2.357895379209216e+18,
-    "train_loss": 0.5734889231528854,
-    "train_runtime": 656.7578,
-    "train_samples_per_second": 388.987,
-    "train_steps_per_second": 3.038
 }

 {
+    "epoch": 7.996245306633291,
+    "eval_accuracy": 0.9830832597822889,
+    "eval_loss": 0.0956372618675232,
+    "eval_runtime": 5.6145,
+    "eval_samples_per_second": 1210.799,
+    "eval_steps_per_second": 37.938,
+    "total_flos": 3.777723239743488e+18,
+    "train_loss": 0.596273283347787,
+    "train_runtime": 640.7753,
+    "train_samples_per_second": 637.902,
+    "train_steps_per_second": 2.484
 }

eval_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
-    "epoch": 4.989355040701315,
-    "eval_accuracy": 0.9814651368049426,
-    "eval_loss": 0.09822726994752884,
-    "eval_runtime": 5.2666,
-    "eval_samples_per_second": 1290.774,
-    "eval_steps_per_second": 40.443
 }

 {
+    "epoch": 7.996245306633291,
+    "eval_accuracy": 0.9830832597822889,
+    "eval_loss": 0.0956372618675232,
+    "eval_runtime": 5.6145,
+    "eval_samples_per_second": 1210.799,
+    "eval_steps_per_second": 37.938
 }

runs/May14_15-54-09_cs-Precision-7960-Tower/events.out.tfevents.1747253123.cs-Precision-7960-Tower.127892.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:15845e83e76e501dc7170d7b668f0ca3730974474a167df2d31342c8c5348fb8
+size 411

train_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
-    "epoch": 4.989355040701315,
-    "total_flos": 2.357895379209216e+18,
-    "train_loss": 0.5734889231528854,
-    "train_runtime": 656.7578,
-    "train_samples_per_second": 388.987,
-    "train_steps_per_second": 3.038
 }

 {
+    "epoch": 7.996245306633291,
+    "total_flos": 3.777723239743488e+18,
+    "train_loss": 0.596273283347787,
+    "train_runtime": 640.7753,
+    "train_samples_per_second": 637.902,
+    "train_steps_per_second": 2.484
 }

trainer_state.json CHANGED Viewed

@@ -1,1465 +1,1212 @@
 {
-  "best_metric": 0.9814651368049426,
-  "best_model_checkpoint": "wav2vec2-base-ft-keyword-spotting/checkpoint-1995",
-  "epoch": 4.989355040701315,
   "eval_steps": 500,
-  "global_step": 1995,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.025046963055729492,
-      "grad_norm": 2.144172430038452,
-      "learning_rate": 1.5e-06,
-      "loss": 3.8317,
       "step": 10
     },
     {
-      "epoch": 0.050093926111458985,
-      "grad_norm": 3.0447957515716553,
-      "learning_rate": 3e-06,
-      "loss": 4.1331,
       "step": 20
     },
     {
-      "epoch": 0.07514088916718847,
-      "grad_norm": 3.1870126724243164,
-      "learning_rate": 4.5e-06,
-      "loss": 4.0889,
       "step": 30
     },
     {
-      "epoch": 0.10018785222291797,
-      "grad_norm": 4.074451923370361,
-      "learning_rate": 6e-06,
-      "loss": 3.9025,
       "step": 40
     },
     {
-      "epoch": 0.12523481527864747,
-      "grad_norm": 5.182351112365723,
-      "learning_rate": 7.5e-06,
-      "loss": 3.6201,
       "step": 50
     },
     {
-      "epoch": 0.15028177833437695,
-      "grad_norm": 5.756130218505859,
-      "learning_rate": 9e-06,
-      "loss": 3.1977,
       "step": 60
     },
     {
-      "epoch": 0.17532874139010646,
-      "grad_norm": 5.65469217300415,
-      "learning_rate": 1.05e-05,
-      "loss": 2.7121,
       "step": 70
     },
     {
-      "epoch": 0.20037570444583594,
-      "grad_norm": 5.120871067047119,
-      "learning_rate": 1.2e-05,
-      "loss": 2.4593,
       "step": 80
     },
     {
-      "epoch": 0.22542266750156542,
-      "grad_norm": 4.952624320983887,
-      "learning_rate": 1.3500000000000001e-05,
-      "loss": 2.2216,
       "step": 90
     },
     {
-      "epoch": 0.25046963055729493,
-      "grad_norm": 4.202530384063721,
-      "learning_rate": 1.5e-05,
-      "loss": 2.0977,
       "step": 100
     },
     {
-      "epoch": 0.27551659361302444,
-      "grad_norm": 3.235758066177368,
-      "learning_rate": 1.65e-05,
-      "loss": 2.0442,
       "step": 110
     },
     {
-      "epoch": 0.3005635566687539,
-      "grad_norm": 3.456002712249756,
-      "learning_rate": 1.8e-05,
-      "loss": 1.8601,
       "step": 120
     },
     {
-      "epoch": 0.3256105197244834,
-      "grad_norm": 1.5474969148635864,
-      "learning_rate": 1.95e-05,
-      "loss": 1.7964,
       "step": 130
     },
     {
-      "epoch": 0.3506574827802129,
-      "grad_norm": 1.7901959419250488,
-      "learning_rate": 2.1e-05,
-      "loss": 1.8098,
       "step": 140
     },
     {
-      "epoch": 0.37570444583594237,
-      "grad_norm": 1.2036372423171997,
-      "learning_rate": 2.25e-05,
-      "loss": 1.7429,
       "step": 150
     },
     {
-      "epoch": 0.4007514088916719,
-      "grad_norm": 0.6613264083862305,
-      "learning_rate": 2.4e-05,
-      "loss": 1.7313,
       "step": 160
     },
     {
-      "epoch": 0.4257983719474014,
-      "grad_norm": 1.468883991241455,
-      "learning_rate": 2.55e-05,
-      "loss": 1.7528,
       "step": 170
     },
     {
-      "epoch": 0.45084533500313084,
-      "grad_norm": 2.4236254692077637,
-      "learning_rate": 2.7000000000000002e-05,
-      "loss": 1.642,
       "step": 180
     },
     {
-      "epoch": 0.47589229805886035,
-      "grad_norm": 10.500153541564941,
-      "learning_rate": 2.8499999999999998e-05,
-      "loss": 1.6462,
       "step": 190
     },
     {
-      "epoch": 0.5009392611145899,
-      "grad_norm": 8.173652648925781,
-      "learning_rate": 3e-05,
-      "loss": 1.6049,
       "step": 200
     },
     {
-      "epoch": 0.5259862241703194,
-      "grad_norm": 4.767404556274414,
-      "learning_rate": 2.9832869080779945e-05,
-      "loss": 1.5754,
       "step": 210
     },
     {
-      "epoch": 0.5510331872260489,
-      "grad_norm": 2.0099925994873047,
-      "learning_rate": 2.9665738161559886e-05,
-      "loss": 1.5001,
       "step": 220
     },
     {
-      "epoch": 0.5760801502817783,
-      "grad_norm": 2.7814888954162598,
-      "learning_rate": 2.9498607242339834e-05,
-      "loss": 1.4049,
       "step": 230
     },
     {
-      "epoch": 0.6011271133375078,
-      "grad_norm": 4.7790846824646,
-      "learning_rate": 2.933147632311978e-05,
-      "loss": 1.5321,
       "step": 240
     },
     {
-      "epoch": 0.6261740763932373,
-      "grad_norm": 3.758580207824707,
-      "learning_rate": 2.916434540389972e-05,
-      "loss": 1.4002,
       "step": 250
     },
     {
-      "epoch": 0.6512210394489668,
-      "grad_norm": 5.455554008483887,
-      "learning_rate": 2.8997214484679665e-05,
-      "loss": 1.4013,
       "step": 260
     },
     {
-      "epoch": 0.6762680025046963,
-      "grad_norm": 5.183338165283203,
-      "learning_rate": 2.8830083565459613e-05,
-      "loss": 1.2016,
       "step": 270
     },
     {
-      "epoch": 0.7013149655604258,
-      "grad_norm": 3.5465261936187744,
-      "learning_rate": 2.8662952646239554e-05,
-      "loss": 1.1541,
       "step": 280
     },
     {
-      "epoch": 0.7263619286161553,
-      "grad_norm": 8.002464294433594,
-      "learning_rate": 2.84958217270195e-05,
-      "loss": 1.1754,
       "step": 290
     },
     {
-      "epoch": 0.7514088916718847,
-      "grad_norm": 4.145716190338135,
-      "learning_rate": 2.8328690807799443e-05,
-      "loss": 1.1651,
       "step": 300
     },
     {
-      "epoch": 0.7764558547276142,
-      "grad_norm": 8.420044898986816,
-      "learning_rate": 2.8161559888579388e-05,
-      "loss": 1.2086,
       "step": 310
     },
     {
-      "epoch": 0.8015028177833438,
-      "grad_norm": 2.530792713165283,
-      "learning_rate": 2.7994428969359332e-05,
-      "loss": 1.1337,
       "step": 320
     },
     {
-      "epoch": 0.8265497808390733,
-      "grad_norm": 3.45489501953125,
-      "learning_rate": 2.7827298050139277e-05,
-      "loss": 1.0499,
       "step": 330
     },
     {
-      "epoch": 0.8515967438948028,
-      "grad_norm": 5.169933795928955,
-      "learning_rate": 2.7660167130919218e-05,
-      "loss": 1.1298,
       "step": 340
     },
     {
-      "epoch": 0.8766437069505323,
-      "grad_norm": 5.91841459274292,
-      "learning_rate": 2.7493036211699166e-05,
-      "loss": 1.1476,
       "step": 350
     },
     {
-      "epoch": 0.9016906700062617,
-      "grad_norm": 4.016351699829102,
-      "learning_rate": 2.732590529247911e-05,
-      "loss": 1.0944,
       "step": 360
     },
     {
-      "epoch": 0.9267376330619912,
-      "grad_norm": 5.553752899169922,
-      "learning_rate": 2.7158774373259055e-05,
-      "loss": 1.0822,
       "step": 370
     },
     {
-      "epoch": 0.9517845961177207,
-      "grad_norm": 4.519126892089844,
-      "learning_rate": 2.7008356545961002e-05,
-      "loss": 1.0057,
       "step": 380
     },
     {
-      "epoch": 0.9768315591734502,
-      "grad_norm": 2.7625792026519775,
-      "learning_rate": 2.6841225626740946e-05,
-      "loss": 1.009,
       "step": 390
     },
     {
-      "epoch": 1.0,
-      "grad_norm": 1.0362297296524048,
-      "learning_rate": 2.6674094707520894e-05,
-      "loss": 0.8747,
-      "step": 400
     },
     {
-      "epoch": 1.0,
-      "eval_accuracy": 0.9340982641953516,
-      "eval_loss": 0.806678831577301,
-      "eval_runtime": 5.088,
-      "eval_samples_per_second": 1336.08,
-      "eval_steps_per_second": 41.863,
       "step": 400
     },
     {
-      "epoch": 1.0250469630557295,
-      "grad_norm": 7.034424304962158,
-      "learning_rate": 2.6506963788300836e-05,
-      "loss": 0.8779,
       "step": 410
     },
     {
-      "epoch": 1.050093926111459,
-      "grad_norm": 5.3810014724731445,
-      "learning_rate": 2.633983286908078e-05,
-      "loss": 0.7878,
       "step": 420
     },
     {
-      "epoch": 1.0751408891671885,
-      "grad_norm": 3.344200372695923,
-      "learning_rate": 2.6172701949860725e-05,
-      "loss": 0.762,
       "step": 430
     },
     {
-      "epoch": 1.100187852222918,
-      "grad_norm": 6.402628421783447,
-      "learning_rate": 2.600557103064067e-05,
-      "loss": 0.7631,
       "step": 440
     },
     {
-      "epoch": 1.1252348152786475,
-      "grad_norm": 4.863397121429443,
-      "learning_rate": 2.5838440111420614e-05,
-      "loss": 0.6772,
       "step": 450
     },
     {
-      "epoch": 1.150281778334377,
-      "grad_norm": 6.484178066253662,
-      "learning_rate": 2.567130919220056e-05,
-      "loss": 0.6461,
       "step": 460
     },
     {
-      "epoch": 1.1753287413901066,
-      "grad_norm": 3.2635133266448975,
-      "learning_rate": 2.55041782729805e-05,
-      "loss": 0.5984,
       "step": 470
     },
     {
-      "epoch": 1.2003757044458359,
-      "grad_norm": 2.40271258354187,
-      "learning_rate": 2.5337047353760448e-05,
-      "loss": 0.606,
       "step": 480
     },
     {
-      "epoch": 1.2254226675015654,
-      "grad_norm": 2.1505284309387207,
-      "learning_rate": 2.5169916434540392e-05,
-      "loss": 0.5553,
       "step": 490
     },
     {
-      "epoch": 1.2504696305572949,
-      "grad_norm": 2.476496934890747,
-      "learning_rate": 2.5002785515320333e-05,
-      "loss": 0.5238,
       "step": 500
     },
     {
-      "epoch": 1.2755165936130244,
-      "grad_norm": 4.137519359588623,
-      "learning_rate": 2.4835654596100278e-05,
-      "loss": 0.5148,
       "step": 510
     },
     {
-      "epoch": 1.300563556668754,
-      "grad_norm": 5.227903842926025,
-      "learning_rate": 2.4668523676880226e-05,
-      "loss": 0.4863,
       "step": 520
     },
     {
-      "epoch": 1.3256105197244834,
-      "grad_norm": 4.056149482727051,
-      "learning_rate": 2.4501392757660167e-05,
-      "loss": 0.49,
       "step": 530
     },
     {
-      "epoch": 1.350657482780213,
-      "grad_norm": 6.162842273712158,
-      "learning_rate": 2.433426183844011e-05,
-      "loss": 0.4264,
       "step": 540
     },
     {
-      "epoch": 1.3757044458359424,
-      "grad_norm": 4.439515113830566,
-      "learning_rate": 2.4167130919220056e-05,
-      "loss": 0.4449,
       "step": 550
     },
     {
-      "epoch": 1.400751408891672,
-      "grad_norm": 4.906720161437988,
-      "learning_rate": 2.4e-05,
-      "loss": 0.408,
       "step": 560
     },
     {
-      "epoch": 1.4257983719474014,
-      "grad_norm": 5.718549728393555,
-      "learning_rate": 2.3832869080779945e-05,
-      "loss": 0.4143,
       "step": 570
     },
     {
-      "epoch": 1.4508453350031307,
-      "grad_norm": 3.9077138900756836,
-      "learning_rate": 2.366573816155989e-05,
-      "loss": 0.3716,
       "step": 580
     },
     {
-      "epoch": 1.4758922980588602,
-      "grad_norm": 3.5204200744628906,
-      "learning_rate": 2.349860724233983e-05,
-      "loss": 0.3932,
       "step": 590
     },
     {
-      "epoch": 1.5009392611145898,
-      "grad_norm": 5.870133399963379,
-      "learning_rate": 2.333147632311978e-05,
-      "loss": 0.3932,
       "step": 600
     },
     {
-      "epoch": 1.5259862241703193,
-      "grad_norm": 5.287498950958252,
-      "learning_rate": 2.3164345403899723e-05,
-      "loss": 0.4031,
       "step": 610
     },
     {
-      "epoch": 1.5510331872260488,
-      "grad_norm": 5.271251201629639,
-      "learning_rate": 2.2997214484679665e-05,
-      "loss": 0.3467,
       "step": 620
     },
     {
-      "epoch": 1.5760801502817783,
-      "grad_norm": 5.845817565917969,
-      "learning_rate": 2.283008356545961e-05,
-      "loss": 0.3593,
       "step": 630
     },
     {
-      "epoch": 1.6011271133375078,
-      "grad_norm": 3.02872896194458,
-      "learning_rate": 2.2662952646239557e-05,
-      "loss": 0.3535,
       "step": 640
     },
     {
-      "epoch": 1.6261740763932373,
-      "grad_norm": 2.3705966472625732,
-      "learning_rate": 2.2495821727019498e-05,
-      "loss": 0.3313,
       "step": 650
     },
     {
-      "epoch": 1.6512210394489668,
-      "grad_norm": 3.9336166381835938,
-      "learning_rate": 2.2328690807799443e-05,
-      "loss": 0.374,
       "step": 660
     },
     {
-      "epoch": 1.6762680025046963,
-      "grad_norm": 6.896333694458008,
-      "learning_rate": 2.2161559888579387e-05,
-      "loss": 0.359,
       "step": 670
     },
     {
-      "epoch": 1.7013149655604258,
-      "grad_norm": 3.1803808212280273,
-      "learning_rate": 2.1994428969359335e-05,
-      "loss": 0.3215,
       "step": 680
     },
     {
-      "epoch": 1.7263619286161553,
-      "grad_norm": 3.6727025508880615,
-      "learning_rate": 2.1827298050139276e-05,
-      "loss": 0.3313,
       "step": 690
     },
     {
-      "epoch": 1.7514088916718848,
-      "grad_norm": 4.481452941894531,
-      "learning_rate": 2.166016713091922e-05,
-      "loss": 0.3075,
       "step": 700
     },
     {
-      "epoch": 1.7764558547276144,
-      "grad_norm": 4.977258205413818,
-      "learning_rate": 2.1493036211699166e-05,
-      "loss": 0.2814,
       "step": 710
     },
     {
-      "epoch": 1.8015028177833439,
-      "grad_norm": 4.018652439117432,
-      "learning_rate": 2.1325905292479107e-05,
-      "loss": 0.2902,
       "step": 720
     },
     {
-      "epoch": 1.8265497808390734,
-      "grad_norm": 6.154052734375,
-      "learning_rate": 2.1158774373259055e-05,
-      "loss": 0.3123,
       "step": 730
     },
     {
-      "epoch": 1.8515967438948029,
-      "grad_norm": 3.7956981658935547,
-      "learning_rate": 2.0991643454039e-05,
-      "loss": 0.3119,
       "step": 740
     },
     {
-      "epoch": 1.8766437069505324,
-      "grad_norm": 6.320951461791992,
-      "learning_rate": 2.082451253481894e-05,
-      "loss": 0.2874,
       "step": 750
     },
     {
-      "epoch": 1.9016906700062617,
-      "grad_norm": 2.6970086097717285,
-      "learning_rate": 2.0657381615598885e-05,
-      "loss": 0.2889,
       "step": 760
     },
     {
-      "epoch": 1.9267376330619912,
-      "grad_norm": 4.386446952819824,
-      "learning_rate": 2.0490250696378833e-05,
-      "loss": 0.2529,
       "step": 770
     },
     {
-      "epoch": 1.9517845961177207,
-      "grad_norm": 5.870710372924805,
-      "learning_rate": 2.0323119777158774e-05,
-      "loss": 0.2819,
       "step": 780
     },
     {
-      "epoch": 1.9768315591734502,
-      "grad_norm": 5.930877685546875,
-      "learning_rate": 2.015598885793872e-05,
-      "loss": 0.2725,
       "step": 790
     },
     {
-      "epoch": 2.0,
-      "grad_norm": 0.3753320872783661,
-      "learning_rate": 1.9988857938718663e-05,
-      "loss": 0.2332,
-      "step": 800
     },
     {
-      "epoch": 2.0,
-      "eval_accuracy": 0.9745513386290086,
-      "eval_loss": 0.17788007855415344,
-      "eval_runtime": 4.7999,
-      "eval_samples_per_second": 1416.28,
-      "eval_steps_per_second": 44.376,
       "step": 800
     },
     {
-      "epoch": 2.0250469630557295,
-      "grad_norm": 6.34937047958374,
-      "learning_rate": 1.9821727019498608e-05,
-      "loss": 0.2545,
       "step": 810
     },
     {
-      "epoch": 2.050093926111459,
-      "grad_norm": 4.1271138191223145,
-      "learning_rate": 1.9654596100278552e-05,
-      "loss": 0.2497,
       "step": 820
     },
     {
-      "epoch": 2.0751408891671885,
-      "grad_norm": 5.419626712799072,
-      "learning_rate": 1.9487465181058497e-05,
-      "loss": 0.2502,
       "step": 830
     },
     {
-      "epoch": 2.100187852222918,
-      "grad_norm": 6.219632148742676,
-      "learning_rate": 1.9320334261838438e-05,
-      "loss": 0.2492,
       "step": 840
     },
     {
-      "epoch": 2.1252348152786475,
-      "grad_norm": 6.462090492248535,
-      "learning_rate": 1.9153203342618386e-05,
-      "loss": 0.2517,
       "step": 850
     },
     {
-      "epoch": 2.150281778334377,
-      "grad_norm": 3.6385743618011475,
-      "learning_rate": 1.898607242339833e-05,
-      "loss": 0.2393,
       "step": 860
     },
     {
-      "epoch": 2.1753287413901066,
-      "grad_norm": 4.627376556396484,
-      "learning_rate": 1.8818941504178272e-05,
-      "loss": 0.2414,
       "step": 870
     },
     {
-      "epoch": 2.200375704445836,
-      "grad_norm": 5.165160179138184,
-      "learning_rate": 1.8651810584958216e-05,
-      "loss": 0.2357,
       "step": 880
     },
     {
-      "epoch": 2.2254226675015656,
-      "grad_norm": 1.4684484004974365,
-      "learning_rate": 1.8484679665738164e-05,
-      "loss": 0.2548,
       "step": 890
     },
     {
-      "epoch": 2.250469630557295,
-      "grad_norm": 3.5594701766967773,
-      "learning_rate": 1.8317548746518105e-05,
-      "loss": 0.2403,
       "step": 900
     },
     {
-      "epoch": 2.2755165936130246,
-      "grad_norm": 3.314188003540039,
-      "learning_rate": 1.815041782729805e-05,
-      "loss": 0.2788,
       "step": 910
     },
     {
-      "epoch": 2.300563556668754,
-      "grad_norm": 2.3456945419311523,
-      "learning_rate": 1.7983286908077995e-05,
-      "loss": 0.2707,
       "step": 920
     },
     {
-      "epoch": 2.325610519724483,
-      "grad_norm": 3.4486682415008545,
-      "learning_rate": 1.781615598885794e-05,
-      "loss": 0.2304,
       "step": 930
     },
     {
-      "epoch": 2.350657482780213,
-      "grad_norm": 3.3779501914978027,
-      "learning_rate": 1.7649025069637884e-05,
-      "loss": 0.2318,
       "step": 940
     },
     {
-      "epoch": 2.375704445835942,
-      "grad_norm": 1.7540189027786255,
-      "learning_rate": 1.7481894150417828e-05,
-      "loss": 0.2125,
       "step": 950
     },
     {
-      "epoch": 2.4007514088916717,
-      "grad_norm": 8.057242393493652,
-      "learning_rate": 1.731476323119777e-05,
-      "loss": 0.2407,
       "step": 960
     },
     {
-      "epoch": 2.425798371947401,
-      "grad_norm": 3.5279042720794678,
-      "learning_rate": 1.7147632311977717e-05,
-      "loss": 0.2432,
       "step": 970
     },
     {
-      "epoch": 2.4508453350031307,
-      "grad_norm": 4.324340343475342,
-      "learning_rate": 1.6980501392757662e-05,
-      "loss": 0.2316,
       "step": 980
     },
     {
-      "epoch": 2.4758922980588602,
-      "grad_norm": 3.12505841255188,
-      "learning_rate": 1.6813370473537606e-05,
-      "loss": 0.2226,
       "step": 990
     },
     {
-      "epoch": 2.5009392611145898,
-      "grad_norm": 3.6332638263702393,
-      "learning_rate": 1.6646239554317548e-05,
-      "loss": 0.2305,
       "step": 1000
     },
     {
-      "epoch": 2.5259862241703193,
-      "grad_norm": 3.011505126953125,
-      "learning_rate": 1.6479108635097496e-05,
-      "loss": 0.2361,
       "step": 1010
     },
     {
-      "epoch": 2.5510331872260488,
-      "grad_norm": 3.6663644313812256,
-      "learning_rate": 1.631197771587744e-05,
-      "loss": 0.2398,
       "step": 1020
     },
     {
-      "epoch": 2.5760801502817783,
-      "grad_norm": 3.6134729385375977,
-      "learning_rate": 1.614484679665738e-05,
-      "loss": 0.1777,
       "step": 1030
     },
     {
-      "epoch": 2.601127113337508,
-      "grad_norm": 3.180910587310791,
-      "learning_rate": 1.5977715877437326e-05,
-      "loss": 0.2304,
       "step": 1040
     },
     {
-      "epoch": 2.6261740763932373,
-      "grad_norm": 2.667623281478882,
-      "learning_rate": 1.581058495821727e-05,
-      "loss": 0.1787,
       "step": 1050
     },
     {
-      "epoch": 2.651221039448967,
-      "grad_norm": 6.972463130950928,
-      "learning_rate": 1.5643454038997215e-05,
-      "loss": 0.2026,
       "step": 1060
     },
     {
-      "epoch": 2.6762680025046963,
-      "grad_norm": 1.4821382761001587,
-      "learning_rate": 1.547632311977716e-05,
-      "loss": 0.2295,
       "step": 1070
     },
     {
-      "epoch": 2.701314965560426,
-      "grad_norm": 3.339320421218872,
-      "learning_rate": 1.5309192200557104e-05,
-      "loss": 0.2234,
       "step": 1080
     },
     {
-      "epoch": 2.7263619286161553,
-      "grad_norm": 3.3274917602539062,
-      "learning_rate": 1.5142061281337047e-05,
-      "loss": 0.1899,
       "step": 1090
     },
     {
-      "epoch": 2.751408891671885,
-      "grad_norm": 3.077637195587158,
-      "learning_rate": 1.4974930362116992e-05,
-      "loss": 0.1747,
       "step": 1100
     },
     {
-      "epoch": 2.7764558547276144,
-      "grad_norm": 4.980368137359619,
-      "learning_rate": 1.4807799442896936e-05,
-      "loss": 0.2359,
       "step": 1110
     },
     {
-      "epoch": 2.801502817783344,
-      "grad_norm": 3.46724534034729,
-      "learning_rate": 1.464066852367688e-05,
-      "loss": 0.1945,
       "step": 1120
     },
     {
-      "epoch": 2.8265497808390734,
-      "grad_norm": 6.3585710525512695,
-      "learning_rate": 1.4473537604456825e-05,
-      "loss": 0.2356,
       "step": 1130
     },
     {
-      "epoch": 2.851596743894803,
-      "grad_norm": 3.6611506938934326,
-      "learning_rate": 1.4306406685236768e-05,
-      "loss": 0.223,
       "step": 1140
     },
     {
-      "epoch": 2.8766437069505324,
-      "grad_norm": 3.0758209228515625,
-      "learning_rate": 1.4139275766016714e-05,
-      "loss": 0.2675,
       "step": 1150
     },
     {
-      "epoch": 2.9016906700062615,
-      "grad_norm": 3.0930421352386475,
-      "learning_rate": 1.3972144846796657e-05,
-      "loss": 0.2342,
       "step": 1160
     },
     {
-      "epoch": 2.9267376330619914,
-      "grad_norm": 3.915057897567749,
-      "learning_rate": 1.3805013927576602e-05,
-      "loss": 0.2089,
       "step": 1170
     },
     {
-      "epoch": 2.9517845961177205,
-      "grad_norm": 3.755885601043701,
-      "learning_rate": 1.3637883008356546e-05,
-      "loss": 0.2251,
       "step": 1180
     },
     {
-      "epoch": 2.9768315591734504,
-      "grad_norm": 3.1619045734405518,
-      "learning_rate": 1.3470752089136491e-05,
-      "loss": 0.1927,
       "step": 1190
     },
     {
-      "epoch": 3.0,
-      "grad_norm": 3.681858539581299,
-      "learning_rate": 1.3303621169916434e-05,
-      "loss": 0.217,
-      "step": 1200
     },
     {
-      "epoch": 3.0,
-      "eval_accuracy": 0.9766107678729038,
-      "eval_loss": 0.1263045072555542,
-      "eval_runtime": 4.7521,
-      "eval_samples_per_second": 1430.516,
-      "eval_steps_per_second": 44.822,
       "step": 1200
     },
     {
-      "epoch": 3.0250469630557295,
-      "grad_norm": 4.34127140045166,
-      "learning_rate": 1.313649025069638e-05,
-      "loss": 0.2222,
       "step": 1210
     },
     {
-      "epoch": 3.050093926111459,
-      "grad_norm": 2.2270660400390625,
-      "learning_rate": 1.2969359331476323e-05,
-      "loss": 0.1822,
       "step": 1220
     },
     {
-      "epoch": 3.0751408891671885,
-      "grad_norm": 3.879969358444214,
-      "learning_rate": 1.2802228412256267e-05,
-      "loss": 0.1947,
       "step": 1230
     },
     {
-      "epoch": 3.100187852222918,
-      "grad_norm": 4.284245491027832,
-      "learning_rate": 1.2635097493036212e-05,
-      "loss": 0.2126,
       "step": 1240
     },
     {
-      "epoch": 3.1252348152786475,
-      "grad_norm": 3.0802762508392334,
-      "learning_rate": 1.2467966573816157e-05,
-      "loss": 0.1954,
       "step": 1250
     },
     {
-      "epoch": 3.150281778334377,
-      "grad_norm": 4.699860095977783,
-      "learning_rate": 1.2300835654596101e-05,
-      "loss": 0.2189,
       "step": 1260
     },
     {
-      "epoch": 3.1753287413901066,
-      "grad_norm": 4.755823612213135,
-      "learning_rate": 1.2133704735376046e-05,
-      "loss": 0.2019,
       "step": 1270
     },
     {
-      "epoch": 3.200375704445836,
-      "grad_norm": 4.778765678405762,
-      "learning_rate": 1.1966573816155989e-05,
-      "loss": 0.1987,
       "step": 1280
     },
     {
-      "epoch": 3.2254226675015656,
-      "grad_norm": 4.719218730926514,
-      "learning_rate": 1.1799442896935935e-05,
-      "loss": 0.1947,
       "step": 1290
     },
     {
-      "epoch": 3.250469630557295,
-      "grad_norm": 4.547497272491455,
-      "learning_rate": 1.1632311977715878e-05,
-      "loss": 0.2097,
       "step": 1300
     },
     {
-      "epoch": 3.2755165936130246,
-      "grad_norm": 2.1130096912384033,
-      "learning_rate": 1.1465181058495822e-05,
-      "loss": 0.1327,
       "step": 1310
     },
     {
-      "epoch": 3.300563556668754,
-      "grad_norm": 4.512012958526611,
-      "learning_rate": 1.1298050139275767e-05,
-      "loss": 0.178,
       "step": 1320
     },
     {
-      "epoch": 3.325610519724483,
-      "grad_norm": 2.9694018363952637,
-      "learning_rate": 1.1130919220055711e-05,
-      "loss": 0.2077,
       "step": 1330
     },
     {
-      "epoch": 3.350657482780213,
-      "grad_norm": 2.5430564880371094,
-      "learning_rate": 1.0963788300835654e-05,
-      "loss": 0.1774,
       "step": 1340
     },
     {
-      "epoch": 3.375704445835942,
-      "grad_norm": 5.131649971008301,
-      "learning_rate": 1.0796657381615599e-05,
-      "loss": 0.2013,
       "step": 1350
     },
     {
-      "epoch": 3.4007514088916717,
-      "grad_norm": 3.391754627227783,
-      "learning_rate": 1.0629526462395543e-05,
-      "loss": 0.1772,
       "step": 1360
     },
     {
-      "epoch": 3.425798371947401,
-      "grad_norm": 3.4772632122039795,
-      "learning_rate": 1.0462395543175486e-05,
-      "loss": 0.1597,
       "step": 1370
     },
     {
-      "epoch": 3.4508453350031307,
-      "grad_norm": 3.5944013595581055,
-      "learning_rate": 1.0295264623955432e-05,
-      "loss": 0.2192,
       "step": 1380
     },
     {
-      "epoch": 3.4758922980588602,
-      "grad_norm": 2.526901960372925,
-      "learning_rate": 1.0128133704735375e-05,
-      "loss": 0.168,
       "step": 1390
     },
     {
-      "epoch": 3.5009392611145898,
-      "grad_norm": 3.234485626220703,
-      "learning_rate": 9.961002785515322e-06,
-      "loss": 0.2007,
       "step": 1400
     },
     {
-      "epoch": 3.5259862241703193,
-      "grad_norm": 4.3917622566223145,
-      "learning_rate": 9.793871866295264e-06,
-      "loss": 0.1766,
       "step": 1410
     },
     {
-      "epoch": 3.5510331872260488,
-      "grad_norm": 1.4780800342559814,
-      "learning_rate": 9.626740947075209e-06,
-      "loss": 0.1738,
       "step": 1420
     },
     {
-      "epoch": 3.5760801502817783,
-      "grad_norm": 3.670740842819214,
-      "learning_rate": 9.459610027855154e-06,
-      "loss": 0.2033,
       "step": 1430
     },
     {
-      "epoch": 3.601127113337508,
-      "grad_norm": 4.08475923538208,
-      "learning_rate": 9.292479108635098e-06,
-      "loss": 0.1638,
       "step": 1440
     },
     {
-      "epoch": 3.6261740763932373,
-      "grad_norm": 3.445945978164673,
-      "learning_rate": 9.125348189415041e-06,
-      "loss": 0.1854,
       "step": 1450
     },
     {
-      "epoch": 3.651221039448967,
-      "grad_norm": 3.1884312629699707,
-      "learning_rate": 8.958217270194987e-06,
-      "loss": 0.1967,
       "step": 1460
     },
     {
-      "epoch": 3.6762680025046963,
-      "grad_norm": 1.9130624532699585,
-      "learning_rate": 8.79108635097493e-06,
-      "loss": 0.1618,
       "step": 1470
     },
     {
-      "epoch": 3.701314965560426,
-      "grad_norm": 1.0646212100982666,
-      "learning_rate": 8.623955431754875e-06,
-      "loss": 0.1816,
       "step": 1480
     },
     {
-      "epoch": 3.7263619286161553,
-      "grad_norm": 3.629429817199707,
-      "learning_rate": 8.45682451253482e-06,
-      "loss": 0.1432,
       "step": 1490
     },
     {
-      "epoch": 3.751408891671885,
-      "grad_norm": 2.1418120861053467,
-      "learning_rate": 8.289693593314764e-06,
-      "loss": 0.1662,
       "step": 1500
     },
     {
-      "epoch": 3.7764558547276144,
-      "grad_norm": 3.682490825653076,
-      "learning_rate": 8.122562674094707e-06,
-      "loss": 0.1819,
       "step": 1510
     },
     {
-      "epoch": 3.801502817783344,
-      "grad_norm": 2.9112191200256348,
-      "learning_rate": 7.955431754874653e-06,
-      "loss": 0.1785,
       "step": 1520
     },
     {
-      "epoch": 3.8265497808390734,
-      "grad_norm": 3.727522134780884,
-      "learning_rate": 7.788300835654596e-06,
-      "loss": 0.1476,
       "step": 1530
     },
     {
-      "epoch": 3.851596743894803,
-      "grad_norm": 4.77044153213501,
-      "learning_rate": 7.621169916434541e-06,
-      "loss": 0.1668,
       "step": 1540
     },
     {
-      "epoch": 3.8766437069505324,
-      "grad_norm": 2.953248977661133,
-      "learning_rate": 7.454038997214485e-06,
-      "loss": 0.1712,
       "step": 1550
     },
     {
-      "epoch": 3.9016906700062615,
-      "grad_norm": 4.06650972366333,
-      "learning_rate": 7.2869080779944286e-06,
-      "loss": 0.1621,
       "step": 1560
     },
     {
-      "epoch": 3.9267376330619914,
-      "grad_norm": 4.628715991973877,
-      "learning_rate": 7.119777158774373e-06,
-      "loss": 0.1513,
       "step": 1570
     },
     {
-      "epoch": 3.9517845961177205,
-      "grad_norm": 2.5671701431274414,
-      "learning_rate": 6.952646239554318e-06,
-      "loss": 0.1918,
       "step": 1580
     },
     {
-      "epoch": 3.9768315591734504,
-      "grad_norm": 4.511129379272461,
-      "learning_rate": 6.785515320334261e-06,
-      "loss": 0.1957,
       "step": 1590
     },
     {
-      "epoch": 4.0,
-      "grad_norm": 1.4342707395553589,
-      "learning_rate": 6.618384401114206e-06,
-      "loss": 0.1529,
-      "step": 1600
-    },
-    {
-      "epoch": 4.0,
-      "eval_accuracy": 0.9805825242718447,
-      "eval_loss": 0.10449180752038956,
-      "eval_runtime": 5.032,
-      "eval_samples_per_second": 1350.958,
-      "eval_steps_per_second": 42.329,
-      "step": 1600
-    },
-    {
-      "epoch": 4.025046963055729,
-      "grad_norm": 3.8492510318756104,
-      "learning_rate": 6.4512534818941505e-06,
-      "loss": 0.1797,
-      "step": 1610
-    },
-    {
-      "epoch": 4.050093926111459,
-      "grad_norm": 4.300637245178223,
-      "learning_rate": 6.284122562674095e-06,
-      "loss": 0.1598,
-      "step": 1620
-    },
-    {
-      "epoch": 4.075140889167188,
-      "grad_norm": 1.9370712041854858,
-      "learning_rate": 6.116991643454039e-06,
-      "loss": 0.1746,
-      "step": 1630
-    },
-    {
-      "epoch": 4.100187852222918,
-      "grad_norm": 2.7867672443389893,
-      "learning_rate": 5.949860724233983e-06,
-      "loss": 0.1913,
-      "step": 1640
-    },
-    {
-      "epoch": 4.125234815278647,
-      "grad_norm": 3.536440372467041,
-      "learning_rate": 5.782729805013928e-06,
-      "loss": 0.1731,
-      "step": 1650
-    },
-    {
-      "epoch": 4.150281778334377,
-      "grad_norm": 3.2297356128692627,
-      "learning_rate": 5.615598885793872e-06,
-      "loss": 0.2019,
-      "step": 1660
-    },
-    {
-      "epoch": 4.175328741390106,
-      "grad_norm": 5.139032363891602,
-      "learning_rate": 5.448467966573816e-06,
-      "loss": 0.1793,
-      "step": 1670
-    },
-    {
-      "epoch": 4.200375704445836,
-      "grad_norm": 3.031764030456543,
-      "learning_rate": 5.281337047353761e-06,
-      "loss": 0.1836,
-      "step": 1680
-    },
-    {
-      "epoch": 4.225422667501565,
-      "grad_norm": 2.6612586975097656,
-      "learning_rate": 5.114206128133705e-06,
-      "loss": 0.1629,
-      "step": 1690
-    },
-    {
-      "epoch": 4.250469630557295,
-      "grad_norm": 2.8296072483062744,
-      "learning_rate": 4.947075208913649e-06,
-      "loss": 0.1691,
-      "step": 1700
-    },
-    {
-      "epoch": 4.275516593613024,
-      "grad_norm": 4.683578968048096,
-      "learning_rate": 4.7799442896935936e-06,
-      "loss": 0.1489,
-      "step": 1710
-    },
-    {
-      "epoch": 4.300563556668754,
-      "grad_norm": 3.7361319065093994,
-      "learning_rate": 4.612813370473538e-06,
-      "loss": 0.1453,
-      "step": 1720
-    },
-    {
-      "epoch": 4.325610519724483,
-      "grad_norm": 4.293661117553711,
-      "learning_rate": 4.445682451253482e-06,
-      "loss": 0.1445,
-      "step": 1730
-    },
-    {
-      "epoch": 4.350657482780213,
-      "grad_norm": 3.1756207942962646,
-      "learning_rate": 4.278551532033426e-06,
-      "loss": 0.1523,
-      "step": 1740
-    },
-    {
-      "epoch": 4.375704445835942,
-      "grad_norm": 3.921405792236328,
-      "learning_rate": 4.111420612813371e-06,
-      "loss": 0.1603,
-      "step": 1750
-    },
-    {
-      "epoch": 4.400751408891672,
-      "grad_norm": 2.8336334228515625,
-      "learning_rate": 3.944289693593315e-06,
-      "loss": 0.1764,
-      "step": 1760
-    },
-    {
-      "epoch": 4.425798371947401,
-      "grad_norm": 3.7013275623321533,
-      "learning_rate": 3.7771587743732592e-06,
-      "loss": 0.1481,
-      "step": 1770
-    },
-    {
-      "epoch": 4.450845335003131,
-      "grad_norm": 1.9302759170532227,
-      "learning_rate": 3.6100278551532034e-06,
-      "loss": 0.1526,
-      "step": 1780
-    },
-    {
-      "epoch": 4.47589229805886,
-      "grad_norm": 4.772688865661621,
-      "learning_rate": 3.4428969359331475e-06,
-      "loss": 0.1595,
-      "step": 1790
-    },
-    {
-      "epoch": 4.50093926111459,
-      "grad_norm": 3.1509013175964355,
-      "learning_rate": 3.275766016713092e-06,
-      "loss": 0.1384,
-      "step": 1800
-    },
-    {
-      "epoch": 4.525986224170319,
-      "grad_norm": 2.2213551998138428,
-      "learning_rate": 3.108635097493036e-06,
-      "loss": 0.1542,
-      "step": 1810
-    },
-    {
-      "epoch": 4.551033187226049,
-      "grad_norm": 2.7349748611450195,
-      "learning_rate": 2.9415041782729803e-06,
-      "loss": 0.1739,
-      "step": 1820
-    },
-    {
-      "epoch": 4.576080150281778,
-      "grad_norm": 5.009521961212158,
-      "learning_rate": 2.774373259052925e-06,
-      "loss": 0.1355,
-      "step": 1830
-    },
-    {
-      "epoch": 4.601127113337508,
-      "grad_norm": 3.5362050533294678,
-      "learning_rate": 2.607242339832869e-06,
-      "loss": 0.1335,
-      "step": 1840
-    },
-    {
-      "epoch": 4.626174076393237,
-      "grad_norm": 4.31157922744751,
-      "learning_rate": 2.4401114206128136e-06,
-      "loss": 0.1848,
-      "step": 1850
-    },
-    {
-      "epoch": 4.651221039448966,
-      "grad_norm": 3.6075448989868164,
-      "learning_rate": 2.2729805013927577e-06,
-      "loss": 0.1763,
-      "step": 1860
-    },
-    {
-      "epoch": 4.676268002504696,
-      "grad_norm": 3.1636250019073486,
-      "learning_rate": 2.1058495821727023e-06,
-      "loss": 0.1672,
-      "step": 1870
-    },
-    {
-      "epoch": 4.701314965560426,
-      "grad_norm": 0.7837923169136047,
-      "learning_rate": 1.9387186629526464e-06,
-      "loss": 0.1223,
-      "step": 1880
-    },
-    {
-      "epoch": 4.726361928616155,
-      "grad_norm": 3.2040278911590576,
-      "learning_rate": 1.7715877437325906e-06,
-      "loss": 0.1676,
-      "step": 1890
-    },
-    {
-      "epoch": 4.751408891671884,
-      "grad_norm": 2.6808199882507324,
-      "learning_rate": 1.604456824512535e-06,
-      "loss": 0.1606,
-      "step": 1900
-    },
-    {
-      "epoch": 4.776455854727614,
-      "grad_norm": 2.356374979019165,
-      "learning_rate": 1.437325905292479e-06,
-      "loss": 0.1655,
-      "step": 1910
-    },
-    {
-      "epoch": 4.801502817783343,
-      "grad_norm": 3.077162504196167,
-      "learning_rate": 1.2701949860724234e-06,
-      "loss": 0.1332,
-      "step": 1920
-    },
-    {
-      "epoch": 4.826549780839073,
-      "grad_norm": 3.580504894256592,
-      "learning_rate": 1.1030640668523677e-06,
-      "loss": 0.1687,
-      "step": 1930
-    },
-    {
-      "epoch": 4.851596743894802,
-      "grad_norm": 4.193363189697266,
-      "learning_rate": 9.35933147632312e-07,
-      "loss": 0.163,
-      "step": 1940
-    },
-    {
-      "epoch": 4.876643706950532,
-      "grad_norm": 3.2785427570343018,
-      "learning_rate": 7.688022284122563e-07,
-      "loss": 0.1546,
-      "step": 1950
-    },
-    {
-      "epoch": 4.9016906700062615,
-      "grad_norm": 2.1263206005096436,
-      "learning_rate": 6.016713091922006e-07,
-      "loss": 0.1445,
-      "step": 1960
-    },
-    {
-      "epoch": 4.926737633061991,
-      "grad_norm": 1.8054914474487305,
-      "learning_rate": 4.3454038997214486e-07,
-      "loss": 0.1489,
-      "step": 1970
-    },
-    {
-      "epoch": 4.9517845961177205,
-      "grad_norm": 2.179539680480957,
-      "learning_rate": 2.6740947075208915e-07,
-      "loss": 0.1632,
-      "step": 1980
-    },
-    {
-      "epoch": 4.97683155917345,
-      "grad_norm": 2.8142571449279785,
-      "learning_rate": 1.0027855153203343e-07,
-      "loss": 0.15,
-      "step": 1990
-    },
-    {
-      "epoch": 4.989355040701315,
-      "eval_accuracy": 0.9814651368049426,
-      "eval_loss": 0.09822726994752884,
-      "eval_runtime": 5.1844,
-      "eval_samples_per_second": 1311.237,
-      "eval_steps_per_second": 41.085,
-      "step": 1995
     },
     {
-      "epoch": 4.989355040701315,
-      "step": 1995,
-      "total_flos": 2.357895379209216e+18,
-      "train_loss": 0.5734889231528854,
-      "train_runtime": 656.7578,
-      "train_samples_per_second": 388.987,
-      "train_steps_per_second": 3.038
     }
   ],
   "logging_steps": 10,
-  "max_steps": 1995,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 5,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -1473,8 +1220,8 @@
       "attributes": {}
     }
   },
-  "total_flos": 2.357895379209216e+18,
-  "train_batch_size": 32,
   "trial_name": null,
   "trial_params": null
 }

 {
+  "best_metric": 0.9830832597822889,
+  "best_model_checkpoint": "wav2vec2-base-ft-keyword-spotting/checkpoint-1393",
+  "epoch": 7.996245306633291,
   "eval_steps": 500,
+  "global_step": 1592,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.05006257822277847,
+      "grad_norm": 2.299422264099121,
+      "learning_rate": 1.875e-06,
+      "loss": 4.1412,
       "step": 10
     },
     {
+      "epoch": 0.10012515644555695,
+      "grad_norm": 3.199314832687378,
+      "learning_rate": 3.75e-06,
+      "loss": 4.1637,
       "step": 20
     },
     {
+      "epoch": 0.15018773466833543,
+      "grad_norm": 3.3083832263946533,
+      "learning_rate": 5.625e-06,
+      "loss": 4.0438,
       "step": 30
     },
     {
+      "epoch": 0.2002503128911139,
+      "grad_norm": 4.229264736175537,
+      "learning_rate": 7.5e-06,
+      "loss": 3.8012,
       "step": 40
     },
     {
+      "epoch": 0.2503128911138924,
+      "grad_norm": 5.718367576599121,
+      "learning_rate": 9.375000000000001e-06,
+      "loss": 3.3779,
       "step": 50
     },
     {
+      "epoch": 0.30037546933667086,
+      "grad_norm": 6.0788254737854,
+      "learning_rate": 1.125e-05,
+      "loss": 2.8533,
       "step": 60
     },
     {
+      "epoch": 0.3504380475594493,
+      "grad_norm": 5.600748538970947,
+      "learning_rate": 1.3125e-05,
+      "loss": 2.4796,
       "step": 70
     },
     {
+      "epoch": 0.4005006257822278,
+      "grad_norm": 5.035912990570068,
+      "learning_rate": 1.5e-05,
+      "loss": 2.253,
       "step": 80
     },
     {
+      "epoch": 0.45056320400500627,
+      "grad_norm": 4.351953983306885,
+      "learning_rate": 1.6875e-05,
+      "loss": 2.0939,
       "step": 90
     },
     {
+      "epoch": 0.5006257822277848,
+      "grad_norm": 3.4278855323791504,
+      "learning_rate": 1.8750000000000002e-05,
+      "loss": 1.9864,
       "step": 100
     },
     {
+      "epoch": 0.5506883604505632,
+      "grad_norm": 2.862748384475708,
+      "learning_rate": 2.0625e-05,
+      "loss": 1.8611,
       "step": 110
     },
     {
+      "epoch": 0.6007509386733417,
+      "grad_norm": 1.2488276958465576,
+      "learning_rate": 2.25e-05,
+      "loss": 1.8302,
       "step": 120
     },
     {
+      "epoch": 0.6508135168961201,
+      "grad_norm": 0.6365911960601807,
+      "learning_rate": 2.4375e-05,
+      "loss": 1.7982,
       "step": 130
     },
     {
+      "epoch": 0.7008760951188986,
+      "grad_norm": 0.5073445439338684,
+      "learning_rate": 2.625e-05,
+      "loss": 1.6792,
       "step": 140
     },
     {
+      "epoch": 0.7509386733416771,
+      "grad_norm": 0.9256235361099243,
+      "learning_rate": 2.8125e-05,
+      "loss": 1.7528,
       "step": 150
     },
     {
+      "epoch": 0.8010012515644556,
+      "grad_norm": 5.669793128967285,
+      "learning_rate": 3e-05,
+      "loss": 1.7688,
       "step": 160
     },
     {
+      "epoch": 0.851063829787234,
+      "grad_norm": 10.126972198486328,
+      "learning_rate": 2.979050279329609e-05,
+      "loss": 1.6876,
       "step": 170
     },
     {
+      "epoch": 0.9011264080100125,
+      "grad_norm": 1.5617619752883911,
+      "learning_rate": 2.958100558659218e-05,
+      "loss": 1.6298,
       "step": 180
     },
     {
+      "epoch": 0.951188986232791,
+      "grad_norm": 1.5987392663955688,
+      "learning_rate": 2.937150837988827e-05,
+      "loss": 1.6106,
       "step": 190
     },
     {
+      "epoch": 0.9962453066332916,
+      "eval_accuracy": 0.6209179170344219,
+      "eval_loss": 1.4251551628112793,
+      "eval_runtime": 4.8605,
+      "eval_samples_per_second": 1398.631,
+      "eval_steps_per_second": 43.823,
+      "step": 199
+    },
+    {
+      "epoch": 1.0050062578222778,
+      "grad_norm": 3.5913760662078857,
+      "learning_rate": 2.9162011173184356e-05,
+      "loss": 1.6815,
       "step": 200
     },
     {
+      "epoch": 1.0550688360450564,
+      "grad_norm": 2.8698642253875732,
+      "learning_rate": 2.895251396648045e-05,
+      "loss": 1.4457,
       "step": 210
     },
     {
+      "epoch": 1.1051314142678348,
+      "grad_norm": 2.3613011837005615,
+      "learning_rate": 2.8743016759776535e-05,
+      "loss": 1.4503,
       "step": 220
     },
     {
+      "epoch": 1.1551939924906134,
+      "grad_norm": 3.4527103900909424,
+      "learning_rate": 2.8533519553072625e-05,
+      "loss": 1.2686,
       "step": 230
     },
     {
+      "epoch": 1.2052565707133918,
+      "grad_norm": 4.879206657409668,
+      "learning_rate": 2.8324022346368715e-05,
+      "loss": 1.2226,
       "step": 240
     },
     {
+      "epoch": 1.2553191489361701,
+      "grad_norm": 3.2351438999176025,
+      "learning_rate": 2.8114525139664805e-05,
+      "loss": 1.1545,
       "step": 250
     },
     {
+      "epoch": 1.3053817271589487,
+      "grad_norm": 5.1034464836120605,
+      "learning_rate": 2.7905027932960894e-05,
+      "loss": 1.1284,
       "step": 260
     },
     {
+      "epoch": 1.355444305381727,
+      "grad_norm": 2.128084421157837,
+      "learning_rate": 2.7695530726256984e-05,
+      "loss": 1.0926,
       "step": 270
     },
     {
+      "epoch": 1.4055068836045057,
+      "grad_norm": 5.853870391845703,
+      "learning_rate": 2.7486033519553074e-05,
+      "loss": 1.075,
       "step": 280
     },
     {
+      "epoch": 1.455569461827284,
+      "grad_norm": 2.4751949310302734,
+      "learning_rate": 2.7276536312849163e-05,
+      "loss": 0.9992,
       "step": 290
     },
     {
+      "epoch": 1.5056320400500627,
+      "grad_norm": 3.3400278091430664,
+      "learning_rate": 2.7067039106145253e-05,
+      "loss": 0.9649,
       "step": 300
     },
     {
+      "epoch": 1.555694618272841,
+      "grad_norm": 2.893463611602783,
+      "learning_rate": 2.685754189944134e-05,
+      "loss": 1.0066,
       "step": 310
     },
     {
+      "epoch": 1.6057571964956194,
+      "grad_norm": 2.179349660873413,
+      "learning_rate": 2.6648044692737432e-05,
+      "loss": 0.9203,
       "step": 320
     },
     {
+      "epoch": 1.655819774718398,
+      "grad_norm": 4.882504463195801,
+      "learning_rate": 2.643854748603352e-05,
+      "loss": 0.9159,
       "step": 330
     },
     {
+      "epoch": 1.7058823529411766,
+      "grad_norm": 4.354543685913086,
+      "learning_rate": 2.622905027932961e-05,
+      "loss": 0.8608,
       "step": 340
     },
     {
+      "epoch": 1.7559449311639548,
+      "grad_norm": 3.870502233505249,
+      "learning_rate": 2.60195530726257e-05,
+      "loss": 0.8231,
       "step": 350
     },
     {
+      "epoch": 1.8060075093867334,
+      "grad_norm": 3.579007148742676,
+      "learning_rate": 2.5810055865921788e-05,
+      "loss": 0.7965,
       "step": 360
     },
     {
+      "epoch": 1.856070087609512,
+      "grad_norm": 4.881648540496826,
+      "learning_rate": 2.5600558659217877e-05,
+      "loss": 0.7647,
       "step": 370
     },
     {
+      "epoch": 1.9061326658322904,
+      "grad_norm": 2.9336421489715576,
+      "learning_rate": 2.5391061452513967e-05,
+      "loss": 0.7005,
       "step": 380
     },
     {
+      "epoch": 1.9561952440550687,
+      "grad_norm": 2.542874813079834,
+      "learning_rate": 2.5181564245810057e-05,
+      "loss": 0.6495,
       "step": 390
     },
     {
+      "epoch": 1.9962453066332917,
+      "eval_accuracy": 0.9682259488084731,
+      "eval_loss": 0.5032486915588379,
+      "eval_runtime": 4.9258,
+      "eval_samples_per_second": 1380.074,
+      "eval_steps_per_second": 43.242,
+      "step": 398
     },
     {
+      "epoch": 2.0100125156445556,
+      "grad_norm": 1.9056552648544312,
+      "learning_rate": 2.4972067039106143e-05,
+      "loss": 0.6495,
       "step": 400
     },
     {
+      "epoch": 2.0600750938673342,
+      "grad_norm": 3.567265033721924,
+      "learning_rate": 2.4762569832402236e-05,
+      "loss": 0.5869,
       "step": 410
     },
     {
+      "epoch": 2.110137672090113,
+      "grad_norm": 2.240018844604492,
+      "learning_rate": 2.4553072625698326e-05,
+      "loss": 0.5728,
       "step": 420
     },
     {
+      "epoch": 2.160200250312891,
+      "grad_norm": 2.6313724517822266,
+      "learning_rate": 2.4343575418994412e-05,
+      "loss": 0.5028,
       "step": 430
     },
     {
+      "epoch": 2.2102628285356696,
+      "grad_norm": 3.360229015350342,
+      "learning_rate": 2.4134078212290505e-05,
+      "loss": 0.4928,
       "step": 440
     },
     {
+      "epoch": 2.260325406758448,
+      "grad_norm": 5.249541282653809,
+      "learning_rate": 2.392458100558659e-05,
+      "loss": 0.4773,
       "step": 450
     },
     {
+      "epoch": 2.3103879849812268,
+      "grad_norm": 3.6117191314697266,
+      "learning_rate": 2.3715083798882685e-05,
+      "loss": 0.4852,
       "step": 460
     },
     {
+      "epoch": 2.360450563204005,
+      "grad_norm": 4.820945739746094,
+      "learning_rate": 2.350558659217877e-05,
+      "loss": 0.4331,
       "step": 470
     },
     {
+      "epoch": 2.4105131414267835,
+      "grad_norm": 4.089610576629639,
+      "learning_rate": 2.329608938547486e-05,
+      "loss": 0.4246,
       "step": 480
     },
     {
+      "epoch": 2.460575719649562,
+      "grad_norm": 4.083464622497559,
+      "learning_rate": 2.308659217877095e-05,
+      "loss": 0.3752,
       "step": 490
     },
     {
+      "epoch": 2.5106382978723403,
+      "grad_norm": 4.422226428985596,
+      "learning_rate": 2.287709497206704e-05,
+      "loss": 0.3916,
       "step": 500
     },
     {
+      "epoch": 2.560700876095119,
+      "grad_norm": 2.952890634536743,
+      "learning_rate": 2.266759776536313e-05,
+      "loss": 0.3973,
       "step": 510
     },
     {
+      "epoch": 2.6107634543178975,
+      "grad_norm": 3.720259428024292,
+      "learning_rate": 2.245810055865922e-05,
+      "loss": 0.3432,
       "step": 520
     },
     {
+      "epoch": 2.660826032540676,
+      "grad_norm": 4.10168981552124,
+      "learning_rate": 2.224860335195531e-05,
+      "loss": 0.3479,
       "step": 530
     },
     {
+      "epoch": 2.710888610763454,
+      "grad_norm": 4.39931058883667,
+      "learning_rate": 2.2039106145251395e-05,
+      "loss": 0.3418,
       "step": 540
     },
     {
+      "epoch": 2.760951188986233,
+      "grad_norm": 2.6174728870391846,
+      "learning_rate": 2.182960893854749e-05,
+      "loss": 0.3153,
       "step": 550
     },
     {
+      "epoch": 2.8110137672090114,
+      "grad_norm": 3.489020347595215,
+      "learning_rate": 2.1620111731843575e-05,
+      "loss": 0.3242,
       "step": 560
     },
     {
+      "epoch": 2.8610763454317896,
+      "grad_norm": 3.2841830253601074,
+      "learning_rate": 2.1410614525139664e-05,
+      "loss": 0.3016,
       "step": 570
     },
     {
+      "epoch": 2.911138923654568,
+      "grad_norm": 4.06994104385376,
+      "learning_rate": 2.1201117318435754e-05,
+      "loss": 0.3237,
       "step": 580
     },
     {
+      "epoch": 2.9612015018773468,
+      "grad_norm": 2.4799962043762207,
+      "learning_rate": 2.0991620111731844e-05,
+      "loss": 0.2978,
       "step": 590
     },
     {
+      "epoch": 2.9962453066332917,
+      "eval_accuracy": 0.9782288908502501,
+      "eval_loss": 0.19027507305145264,
+      "eval_runtime": 4.8737,
+      "eval_samples_per_second": 1394.823,
+      "eval_steps_per_second": 43.704,
+      "step": 597
+    },
+    {
+      "epoch": 3.0150187734668337,
+      "grad_norm": 2.1213083267211914,
+      "learning_rate": 2.0782122905027933e-05,
+      "loss": 0.3231,
       "step": 600
     },
     {
+      "epoch": 3.065081351689612,
+      "grad_norm": 4.8361945152282715,
+      "learning_rate": 2.0572625698324023e-05,
+      "loss": 0.302,
       "step": 610
     },
     {
+      "epoch": 3.1151439299123904,
+      "grad_norm": 3.293104887008667,
+      "learning_rate": 2.0363128491620113e-05,
+      "loss": 0.2883,
       "step": 620
     },
     {
+      "epoch": 3.165206508135169,
+      "grad_norm": 3.274291515350342,
+      "learning_rate": 2.01536312849162e-05,
+      "loss": 0.2961,
       "step": 630
     },
     {
+      "epoch": 3.2152690863579476,
+      "grad_norm": 3.7976105213165283,
+      "learning_rate": 1.9944134078212292e-05,
+      "loss": 0.2688,
       "step": 640
     },
     {
+      "epoch": 3.2653316645807258,
+      "grad_norm": 2.9893229007720947,
+      "learning_rate": 1.973463687150838e-05,
+      "loss": 0.2446,
       "step": 650
     },
     {
+      "epoch": 3.3153942428035044,
+      "grad_norm": 2.2266604900360107,
+      "learning_rate": 1.952513966480447e-05,
+      "loss": 0.2613,
       "step": 660
     },
     {
+      "epoch": 3.365456821026283,
+      "grad_norm": 3.621093511581421,
+      "learning_rate": 1.9315642458100558e-05,
+      "loss": 0.2512,
       "step": 670
     },
     {
+      "epoch": 3.415519399249061,
+      "grad_norm": 2.952971935272217,
+      "learning_rate": 1.9106145251396648e-05,
+      "loss": 0.2536,
       "step": 680
     },
     {
+      "epoch": 3.4655819774718397,
+      "grad_norm": 2.7361905574798584,
+      "learning_rate": 1.889664804469274e-05,
+      "loss": 0.2396,
       "step": 690
     },
     {
+      "epoch": 3.5156445556946183,
+      "grad_norm": 2.3844313621520996,
+      "learning_rate": 1.8687150837988827e-05,
+      "loss": 0.2518,
       "step": 700
     },
     {
+      "epoch": 3.565707133917397,
+      "grad_norm": 3.0508193969726562,
+      "learning_rate": 1.8477653631284917e-05,
+      "loss": 0.2325,
       "step": 710
     },
     {
+      "epoch": 3.615769712140175,
+      "grad_norm": 3.923941135406494,
+      "learning_rate": 1.8268156424581006e-05,
+      "loss": 0.2277,
       "step": 720
     },
     {
+      "epoch": 3.6658322903629537,
+      "grad_norm": 2.638787031173706,
+      "learning_rate": 1.8058659217877096e-05,
+      "loss": 0.2292,
       "step": 730
     },
     {
+      "epoch": 3.7158948685857323,
+      "grad_norm": 2.75313138961792,
+      "learning_rate": 1.7849162011173182e-05,
+      "loss": 0.2364,
       "step": 740
     },
     {
+      "epoch": 3.7659574468085104,
+      "grad_norm": 3.686354398727417,
+      "learning_rate": 1.7639664804469275e-05,
+      "loss": 0.2409,
       "step": 750
     },
     {
+      "epoch": 3.816020025031289,
+      "grad_norm": 4.230103969573975,
+      "learning_rate": 1.7430167597765365e-05,
+      "loss": 0.2293,
       "step": 760
     },
     {
+      "epoch": 3.8660826032540676,
+      "grad_norm": 4.4972100257873535,
+      "learning_rate": 1.722067039106145e-05,
+      "loss": 0.2431,
       "step": 770
     },
     {
+      "epoch": 3.916145181476846,
+      "grad_norm": 3.6224372386932373,
+      "learning_rate": 1.7011173184357544e-05,
+      "loss": 0.2099,
       "step": 780
     },
     {
+      "epoch": 3.966207759699625,
+      "grad_norm": 3.072998285293579,
+      "learning_rate": 1.680167597765363e-05,
+      "loss": 0.2273,
       "step": 790
     },
     {
+      "epoch": 3.9962453066332917,
+      "eval_accuracy": 0.9771991762283024,
+      "eval_loss": 0.14364813268184662,
+      "eval_runtime": 4.9858,
+      "eval_samples_per_second": 1363.468,
+      "eval_steps_per_second": 42.721,
+      "step": 796
     },
     {
+      "epoch": 4.020025031289111,
+      "grad_norm": 3.3532874584198,
+      "learning_rate": 1.659217877094972e-05,
+      "loss": 0.23,
       "step": 800
     },
     {
+      "epoch": 4.07008760951189,
+      "grad_norm": 2.384181261062622,
+      "learning_rate": 1.638268156424581e-05,
+      "loss": 0.2157,
       "step": 810
     },
     {
+      "epoch": 4.1201501877346685,
+      "grad_norm": 4.237877368927002,
+      "learning_rate": 1.61731843575419e-05,
+      "loss": 0.2141,
       "step": 820
     },
     {
+      "epoch": 4.170212765957447,
+      "grad_norm": 3.8752825260162354,
+      "learning_rate": 1.5963687150837986e-05,
+      "loss": 0.214,
       "step": 830
     },
     {
+      "epoch": 4.220275344180226,
+      "grad_norm": 2.571617364883423,
+      "learning_rate": 1.575418994413408e-05,
+      "loss": 0.2124,
       "step": 840
     },
     {
+      "epoch": 4.270337922403003,
+      "grad_norm": 1.8986117839813232,
+      "learning_rate": 1.554469273743017e-05,
+      "loss": 0.2142,
       "step": 850
     },
     {
+      "epoch": 4.320400500625782,
+      "grad_norm": 3.71712589263916,
+      "learning_rate": 1.533519553072626e-05,
+      "loss": 0.1939,
       "step": 860
     },
     {
+      "epoch": 4.370463078848561,
+      "grad_norm": 2.1387696266174316,
+      "learning_rate": 1.5125698324022348e-05,
+      "loss": 0.1905,
       "step": 870
     },
     {
+      "epoch": 4.420525657071339,
+      "grad_norm": 3.3053841590881348,
+      "learning_rate": 1.4916201117318435e-05,
+      "loss": 0.2156,
       "step": 880
     },
     {
+      "epoch": 4.470588235294118,
+      "grad_norm": 2.574657440185547,
+      "learning_rate": 1.4706703910614526e-05,
+      "loss": 0.198,
       "step": 890
     },
     {
+      "epoch": 4.520650813516896,
+      "grad_norm": 2.290309429168701,
+      "learning_rate": 1.4497206703910616e-05,
+      "loss": 0.1754,
       "step": 900
     },
     {
+      "epoch": 4.570713391739675,
+      "grad_norm": 2.4253950119018555,
+      "learning_rate": 1.4287709497206705e-05,
+      "loss": 0.1971,
       "step": 910
     },
     {
+      "epoch": 4.6207759699624535,
+      "grad_norm": 3.070322275161743,
+      "learning_rate": 1.4078212290502793e-05,
+      "loss": 0.1928,
       "step": 920
     },
     {
+      "epoch": 4.670838548185231,
+      "grad_norm": 1.5523446798324585,
+      "learning_rate": 1.3868715083798883e-05,
+      "loss": 0.1948,
       "step": 930
     },
     {
+      "epoch": 4.72090112640801,
+      "grad_norm": 3.076679229736328,
+      "learning_rate": 1.3659217877094973e-05,
+      "loss": 0.1822,
       "step": 940
     },
     {
+      "epoch": 4.7709637046307884,
+      "grad_norm": 3.6825084686279297,
+      "learning_rate": 1.344972067039106e-05,
+      "loss": 0.1954,
       "step": 950
     },
     {
+      "epoch": 4.821026282853567,
+      "grad_norm": 4.037261009216309,
+      "learning_rate": 1.324022346368715e-05,
+      "loss": 0.1946,
       "step": 960
     },
     {
+      "epoch": 4.871088861076346,
+      "grad_norm": 3.026543378829956,
+      "learning_rate": 1.3030726256983242e-05,
+      "loss": 0.2155,
       "step": 970
     },
     {
+      "epoch": 4.921151439299124,
+      "grad_norm": 2.0362164974212646,
+      "learning_rate": 1.282122905027933e-05,
+      "loss": 0.1851,
       "step": 980
     },
     {
+      "epoch": 4.971214017521902,
+      "grad_norm": 1.8788719177246094,
+      "learning_rate": 1.261173184357542e-05,
+      "loss": 0.1866,
       "step": 990
     },
     {
+      "epoch": 4.996245306633291,
+      "eval_accuracy": 0.981759340982642,
+      "eval_loss": 0.11026876419782639,
+      "eval_runtime": 4.7749,
+      "eval_samples_per_second": 1423.702,
+      "eval_steps_per_second": 44.608,
+      "step": 995
+    },
+    {
+      "epoch": 5.025031289111389,
+      "grad_norm": 3.612476110458374,
+      "learning_rate": 1.2402234636871509e-05,
+      "loss": 0.1911,
       "step": 1000
     },
     {
+      "epoch": 5.075093867334168,
+      "grad_norm": 1.3886641263961792,
+      "learning_rate": 1.2192737430167599e-05,
+      "loss": 0.1768,
       "step": 1010
     },
     {
+      "epoch": 5.1251564455569465,
+      "grad_norm": 3.217656135559082,
+      "learning_rate": 1.1983240223463687e-05,
+      "loss": 0.1835,
       "step": 1020
     },
     {
+      "epoch": 5.175219023779725,
+      "grad_norm": 2.281695604324341,
+      "learning_rate": 1.1773743016759776e-05,
+      "loss": 0.1858,
       "step": 1030
     },
     {
+      "epoch": 5.225281602002503,
+      "grad_norm": 2.7055630683898926,
+      "learning_rate": 1.1564245810055866e-05,
+      "loss": 0.1718,
       "step": 1040
     },
     {
+      "epoch": 5.275344180225281,
+      "grad_norm": 2.00937819480896,
+      "learning_rate": 1.1354748603351954e-05,
+      "loss": 0.1479,
       "step": 1050
     },
     {
+      "epoch": 5.32540675844806,
+      "grad_norm": 2.65446138381958,
+      "learning_rate": 1.1145251396648046e-05,
+      "loss": 0.1664,
       "step": 1060
     },
     {
+      "epoch": 5.375469336670839,
+      "grad_norm": 2.499176502227783,
+      "learning_rate": 1.0935754189944135e-05,
+      "loss": 0.1882,
       "step": 1070
     },
     {
+      "epoch": 5.425531914893617,
+      "grad_norm": 3.318516492843628,
+      "learning_rate": 1.0726256983240223e-05,
+      "loss": 0.1823,
       "step": 1080
     },
     {
+      "epoch": 5.475594493116396,
+      "grad_norm": 2.1236233711242676,
+      "learning_rate": 1.0516759776536313e-05,
+      "loss": 0.1657,
       "step": 1090
     },
     {
+      "epoch": 5.5256570713391735,
+      "grad_norm": 3.342689037322998,
+      "learning_rate": 1.0307262569832403e-05,
+      "loss": 0.1823,
       "step": 1100
     },
     {
+      "epoch": 5.575719649561952,
+      "grad_norm": 2.687920331954956,
+      "learning_rate": 1.0097765363128492e-05,
+      "loss": 0.1888,
       "step": 1110
     },
     {
+      "epoch": 5.625782227784731,
+      "grad_norm": 3.692422866821289,
+      "learning_rate": 9.88826815642458e-06,
+      "loss": 0.2053,
       "step": 1120
     },
     {
+      "epoch": 5.675844806007509,
+      "grad_norm": 3.453005790710449,
+      "learning_rate": 9.67877094972067e-06,
+      "loss": 0.179,
       "step": 1130
     },
     {
+      "epoch": 5.725907384230288,
+      "grad_norm": 4.005608081817627,
+      "learning_rate": 9.46927374301676e-06,
+      "loss": 0.1748,
       "step": 1140
     },
     {
+      "epoch": 5.7759699624530665,
+      "grad_norm": 2.1113505363464355,
+      "learning_rate": 9.25977653631285e-06,
+      "loss": 0.1574,
       "step": 1150
     },
     {
+      "epoch": 5.826032540675845,
+      "grad_norm": 4.529311180114746,
+      "learning_rate": 9.050279329608939e-06,
+      "loss": 0.1599,
       "step": 1160
     },
     {
+      "epoch": 5.876095118898624,
+      "grad_norm": 1.885956048965454,
+      "learning_rate": 8.840782122905029e-06,
+      "loss": 0.1909,
       "step": 1170
     },
     {
+      "epoch": 5.926157697121401,
+      "grad_norm": 2.369316816329956,
+      "learning_rate": 8.631284916201118e-06,
+      "loss": 0.1603,
       "step": 1180
     },
     {
+      "epoch": 5.97622027534418,
+      "grad_norm": 1.402648687362671,
+      "learning_rate": 8.421787709497206e-06,
+      "loss": 0.1616,
       "step": 1190
     },
     {
+      "epoch": 5.996245306633291,
+      "eval_accuracy": 0.9819064430714917,
+      "eval_loss": 0.0981006771326065,
+      "eval_runtime": 4.775,
+      "eval_samples_per_second": 1423.653,
+      "eval_steps_per_second": 44.607,
+      "step": 1194
     },
     {
+      "epoch": 6.030037546933667,
+      "grad_norm": 1.6693238019943237,
+      "learning_rate": 8.212290502793296e-06,
+      "loss": 0.1931,
       "step": 1200
     },
     {
+      "epoch": 6.080100125156446,
+      "grad_norm": 2.3462257385253906,
+      "learning_rate": 8.002793296089386e-06,
+      "loss": 0.1506,
       "step": 1210
     },
     {
+      "epoch": 6.130162703379224,
+      "grad_norm": 1.6939945220947266,
+      "learning_rate": 7.793296089385474e-06,
+      "loss": 0.1677,
       "step": 1220
     },
     {
+      "epoch": 6.180225281602002,
+      "grad_norm": 1.728092908859253,
+      "learning_rate": 7.583798882681565e-06,
+      "loss": 0.1569,
       "step": 1230
     },
     {
+      "epoch": 6.230287859824781,
+      "grad_norm": 1.6664111614227295,
+      "learning_rate": 7.374301675977653e-06,
+      "loss": 0.1564,
       "step": 1240
     },
     {
+      "epoch": 6.280350438047559,
+      "grad_norm": 2.0160274505615234,
+      "learning_rate": 7.164804469273744e-06,
+      "loss": 0.1513,
       "step": 1250
     },
     {
+      "epoch": 6.330413016270338,
+      "grad_norm": 4.013051986694336,
+      "learning_rate": 6.9553072625698325e-06,
+      "loss": 0.1594,
       "step": 1260
     },
     {
+      "epoch": 6.380475594493117,
+      "grad_norm": 3.11110258102417,
+      "learning_rate": 6.745810055865922e-06,
+      "loss": 0.1445,
       "step": 1270
     },
     {
+      "epoch": 6.430538172715895,
+      "grad_norm": 3.418999433517456,
+      "learning_rate": 6.536312849162011e-06,
+      "loss": 0.1679,
       "step": 1280
     },
     {
+      "epoch": 6.480600750938673,
+      "grad_norm": 2.4514362812042236,
+      "learning_rate": 6.326815642458101e-06,
+      "loss": 0.152,
       "step": 1290
     },
     {
+      "epoch": 6.5306633291614515,
+      "grad_norm": 3.2242462635040283,
+      "learning_rate": 6.1173184357541904e-06,
+      "loss": 0.1676,
       "step": 1300
     },
     {
+      "epoch": 6.58072590738423,
+      "grad_norm": 4.046393871307373,
+      "learning_rate": 5.907821229050279e-06,
+      "loss": 0.1615,
       "step": 1310
     },
     {
+      "epoch": 6.630788485607009,
+      "grad_norm": 1.9088122844696045,
+      "learning_rate": 5.698324022346369e-06,
+      "loss": 0.1465,
       "step": 1320
     },
     {
+      "epoch": 6.680851063829787,
+      "grad_norm": 2.5699033737182617,
+      "learning_rate": 5.488826815642458e-06,
+      "loss": 0.1472,
       "step": 1330
     },
     {
+      "epoch": 6.730913642052566,
+      "grad_norm": 1.9140872955322266,
+      "learning_rate": 5.2793296089385475e-06,
+      "loss": 0.1585,
       "step": 1340
     },
     {
+      "epoch": 6.7809762202753445,
+      "grad_norm": 2.022095203399658,
+      "learning_rate": 5.069832402234637e-06,
+      "loss": 0.1455,
       "step": 1350
     },
     {
+      "epoch": 6.831038798498122,
+      "grad_norm": 2.5366971492767334,
+      "learning_rate": 4.860335195530726e-06,
+      "loss": 0.1542,
       "step": 1360
     },
     {
+      "epoch": 6.881101376720901,
+      "grad_norm": 1.6112697124481201,
+      "learning_rate": 4.650837988826816e-06,
+      "loss": 0.1569,
       "step": 1370
     },
     {
+      "epoch": 6.931163954943679,
+      "grad_norm": 2.8735201358795166,
+      "learning_rate": 4.441340782122905e-06,
+      "loss": 0.1628,
       "step": 1380
     },
     {
+      "epoch": 6.981226533166458,
+      "grad_norm": 2.044304132461548,
+      "learning_rate": 4.231843575418994e-06,
+      "loss": 0.1385,
       "step": 1390
     },
     {
+      "epoch": 6.996245306633291,
+      "eval_accuracy": 0.9830832597822889,
+      "eval_loss": 0.0956372618675232,
+      "eval_runtime": 4.8597,
+      "eval_samples_per_second": 1398.844,
+      "eval_steps_per_second": 43.83,
+      "step": 1393
+    },
+    {
+      "epoch": 7.035043804755945,
+      "grad_norm": 1.4923343658447266,
+      "learning_rate": 4.022346368715084e-06,
+      "loss": 0.1558,
       "step": 1400
     },
     {
+      "epoch": 7.085106382978723,
+      "grad_norm": 3.339181423187256,
+      "learning_rate": 3.812849162011173e-06,
+      "loss": 0.1391,
       "step": 1410
     },
     {
+      "epoch": 7.135168961201502,
+      "grad_norm": 2.091777801513672,
+      "learning_rate": 3.6033519553072625e-06,
+      "loss": 0.1541,
       "step": 1420
     },
     {
+      "epoch": 7.18523153942428,
+      "grad_norm": 2.6580100059509277,
+      "learning_rate": 3.393854748603352e-06,
+      "loss": 0.1379,
       "step": 1430
     },
     {
+      "epoch": 7.235294117647059,
+      "grad_norm": 2.4065537452697754,
+      "learning_rate": 3.1843575418994414e-06,
+      "loss": 0.1475,
       "step": 1440
     },
     {
+      "epoch": 7.2853566958698375,
+      "grad_norm": 2.618218183517456,
+      "learning_rate": 2.974860335195531e-06,
+      "loss": 0.1828,
       "step": 1450
     },
     {
+      "epoch": 7.335419274092616,
+      "grad_norm": 3.5904743671417236,
+      "learning_rate": 2.7653631284916204e-06,
+      "loss": 0.1365,
       "step": 1460
     },
     {
+      "epoch": 7.385481852315394,
+      "grad_norm": 2.245260000228882,
+      "learning_rate": 2.555865921787709e-06,
+      "loss": 0.1394,
       "step": 1470
     },
     {
+      "epoch": 7.435544430538172,
+      "grad_norm": 2.558086395263672,
+      "learning_rate": 2.346368715083799e-06,
+      "loss": 0.1533,
       "step": 1480
     },
     {
+      "epoch": 7.485607008760951,
+      "grad_norm": 2.851020097732544,
+      "learning_rate": 2.136871508379888e-06,
+      "loss": 0.1313,
       "step": 1490
     },
     {
+      "epoch": 7.53566958698373,
+      "grad_norm": 1.7011760473251343,
+      "learning_rate": 1.927374301675978e-06,
+      "loss": 0.1509,
       "step": 1500
     },
     {
+      "epoch": 7.585732165206508,
+      "grad_norm": 2.6264467239379883,
+      "learning_rate": 1.717877094972067e-06,
+      "loss": 0.1515,
       "step": 1510
     },
     {
+      "epoch": 7.635794743429287,
+      "grad_norm": 1.6332521438598633,
+      "learning_rate": 1.5083798882681566e-06,
+      "loss": 0.1489,
       "step": 1520
     },
     {
+      "epoch": 7.685857321652065,
+      "grad_norm": 2.0622401237487793,
+      "learning_rate": 1.2988826815642458e-06,
+      "loss": 0.1594,
       "step": 1530
     },
     {
+      "epoch": 7.735919899874844,
+      "grad_norm": 2.3861618041992188,
+      "learning_rate": 1.0893854748603353e-06,
+      "loss": 0.1669,
       "step": 1540
     },
     {
+      "epoch": 7.785982478097622,
+      "grad_norm": 4.30822229385376,
+      "learning_rate": 8.798882681564246e-07,
+      "loss": 0.1759,
       "step": 1550
     },
     {
+      "epoch": 7.8360450563204,
+      "grad_norm": 1.4631046056747437,
+      "learning_rate": 6.70391061452514e-07,
+      "loss": 0.1645,
       "step": 1560
     },
     {
+      "epoch": 7.886107634543179,
+      "grad_norm": 2.452613115310669,
+      "learning_rate": 4.608938547486033e-07,
+      "loss": 0.1575,
       "step": 1570
     },
     {
+      "epoch": 7.9361702127659575,
+      "grad_norm": 1.279895305633545,
+      "learning_rate": 2.5139664804469275e-07,
+      "loss": 0.149,
       "step": 1580
     },
     {
+      "epoch": 7.986232790988736,
+      "grad_norm": 2.141481399536133,
+      "learning_rate": 4.189944134078212e-08,
+      "loss": 0.1524,
       "step": 1590
     },
     {
+      "epoch": 7.996245306633291,
+      "eval_accuracy": 0.9824948514268903,
+      "eval_loss": 0.09257339686155319,
+      "eval_runtime": 5.6151,
+      "eval_samples_per_second": 1210.67,
+      "eval_steps_per_second": 37.934,
+      "step": 1592
     },
     {
+      "epoch": 7.996245306633291,
+      "step": 1592,
+      "total_flos": 3.777723239743488e+18,
+      "train_loss": 0.596273283347787,
+      "train_runtime": 640.7753,
+      "train_samples_per_second": 637.902,
+      "train_steps_per_second": 2.484
     }
   ],
   "logging_steps": 10,
+  "max_steps": 1592,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
       "attributes": {}
     }
   },
+  "total_flos": 3.777723239743488e+18,
+  "train_batch_size": 64,
   "trial_name": null,
   "trial_params": null
 }